def set_table_delimiter(self, file_path):
    """Get the delimiter from the data file and set it."""
    if os.name == "nt":
        dataset_file = open_fr(file_path)
    else:
        dataset_file = open_fr(file_path, encoding=self.encoding)
    self.auto_get_delimiter(dataset_file.readline())
    dataset_file.close()
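# A standalone sketch of the delimiter sniffing that set_table_delimiter
# relies on. auto_get_delimiter is assumed to pick the separator from the
# first line of the file; the stdlib csv.Sniffer does the equivalent:
import csv

def sniff_delimiter(sample_line, candidates=",\t;|"):
    """Return the most likely delimiter in sample_line, or None."""
    try:
        return csv.Sniffer().sniff(sample_line, delimiters=candidates).delimiter
    except csv.Error:
        return None

# sniff_delimiter("site|year|count\n") == "|"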
def json2csv(input_file, output_file=None, header_values=None, encoding=ENCODING):
    """Convert a JSON file to CSV.

    Used for testing only; the whole file is read into memory, so it is
    only suitable for small files.
    """
    file_out = open_fr(input_file, encoding=encoding)
    # set output file name and write header
    if output_file is None:
        output_file = os.path.splitext(os.path.basename(input_file))[0] + ".csv"
    csv_out = open_fw(output_file, encoding=encoding)
    if os.name == 'nt':
        outfile = csv.DictWriter(csv_out,
                                 dialect='excel',
                                 escapechar="\\",
                                 lineterminator='\n',
                                 fieldnames=header_values)
    else:
        outfile = csv.DictWriter(csv_out,
                                 dialect='excel',
                                 escapechar="\\",
                                 fieldnames=header_values)
    raw_data = json.loads(file_out.read())
    outfile.writeheader()

    for item in raw_data:
        outfile.writerow(item)
    file_out.close()
    os.remove(input_file)  # portable replacement for `rm -r input_file`
    return output_file
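# Example use of json2csv above (a sketch; assumes json2csv and its
# open_fr/open_fw helpers are in scope, and note that the function deletes
# its input file, so it is run on a throwaway copy):
import json

with open("sample.json", "w") as sample:
    json.dump([{"a": 1, "b": 2}, {"a": 3, "b": 4}], sample)

# Writes "sample.csv" with the header "a,b" followed by the two rows,
# then removes "sample.json".
json2csv("sample.json", header_values=["a", "b"])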
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    for key in self.urls:
        original_file_name = "trade_prdct_{}.txt".format(key)
        new_file_name = "trade_prdct_{}.csv".format(key)
        engine.download_file(self.urls[key], original_file_name)
        old_path = self.engine.format_filename(original_file_name)
        new_path = self.engine.format_filename(new_file_name)

        # Re-write the file with a single delimiter
        old_data = open_fr(old_path)
        new_data = open_fw(new_path)

        # Read the header line and convert "," to "|"
        line1 = old_data.readline().strip().replace(",", "|")
        new_data.write(line1 + "\n")
        for line in old_data:
            # Remove the leading "|" from the data
            new_data.write(line.strip("|"))
        new_data.close()
        old_data.close()
        table = Table(key, delimiter="|")
        engine.auto_create_table(table, filename=new_file_name)
        engine.insert_data_from_file(new_path)
def xml2csv_test(input_file, outputfile=None, header_values=None, row_tag="row"):
    """Convert XML to CSV.

    Used for testing only; the whole file is read into memory, so it is
    only suitable for small files.
    """
    file_output = open_fr(input_file, encoding=ENCODING)
    # set output file name and write header
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(input_file))[0] + ".csv"
    csv_out = open_fw(outputfile)
    if os.name == 'nt':
        csv_writer = csv.writer(csv_out,
                                dialect='excel',
                                escapechar='\\',
                                lineterminator='\n')
    else:
        csv_writer = csv.writer(csv_out, dialect='excel', escapechar='\\')

    v = file_output.read()
    csv_writer.writerow(header_values)
    tree = ET.parse(NewFile(v))
    root = tree.getroot()
    for rows in root.findall(row_tag):
        x = [element.text
             for name in header_values
             for element in rows.findall(name)]
        csv_writer.writerow(x)
    file_output.close()
    os.remove(input_file)  # portable replacement for `rm -r input_file`
    return outputfile
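# Example use of xml2csv_test above (a sketch; assumes xml2csv_test and the
# helpers it calls, including NewFile, are in scope, and that the input
# file is disposable since the function deletes it):
with open("sample.xml", "w") as sample:
    sample.write("<root><row><a>1</a><b>2</b></row>"
                 "<row><a>3</a><b>4</b></row></root>")

# Writes "sample.csv" with the header "a,b" and the rows "1,2" and "3,4".
xml2csv_test("sample.xml", header_values=["a", "b"])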
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    self.engine.download_file(
        self.urls["main"], "Succession_sampling_03-07_data_original.txt")
    data_path = self.engine.format_filename(
        "Succession_sampling_03-07_data.txt")
    old_data = open_fr(
        self.engine.find_file("Succession_sampling_03-07_data_original.txt"))
    new_data = open_fw(data_path)

    # The original file's header contains an end-of-line character in the
    # middle, splitting the header across two lines.
    # Read in the two lines and create the full header.
    line1 = old_data.readline().strip()
    line2 = old_data.readline()
    newline = line1 + "\t" + line2
    new_data.write(newline)
    for line in old_data:
        new_data.write(line)
    new_data.close()
    old_data.close()

    self.engine.auto_create_table(
        self.tables["main"], filename="Succession_sampling_03-07_data.txt")
    self.engine.insert_data_from_file(data_path)
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    for key in self.urls:
        self.engine.download_file(self.urls[key],
                                  self.urls[key].rpartition('/')[-1])
        new_file_path = self.engine.format_filename("new" + key)
        old_data = open_fr(
            self.engine.find_file(self.urls[key].rpartition('/')[-1]))
        new_data = open_fw(new_file_path)
        with old_data as file_block:
            # after the metadata lines, set data to True
            data = False
            for lines in file_block.readlines():
                # metadata contains lines with no ";" and may have
                # "(;;;;)+" or empty lines
                if not data and (";" not in lines or ";;;;" in lines):
                    pass
                else:
                    data = True
                    new_data.write(lines)
        new_data.close()
        self.engine.auto_create_table(
            Table(key, cleanup=self.cleanup_func_table),
            filename=str("new" + key))
        self.engine.insert_data_from_file(new_file_path)
def read_json(json_file, debug=False):
    """Read JSON dataset package files"""
    json_object = OrderedDict()
    json_file = str(json_file) + ".json"
    try:
        json_object = json.load(open_fr(json_file))
    except ValueError:
        pass

    if isinstance(json_object, dict) and "resources" in json_object.keys():
        # Note: formats described by frictionless data may need to change
        tabular_formats = ["csv", "tab"]
        vector_formats = ["shp", "kmz"]
        raster_formats = ["tif", "tiff", "bil", "hdr", "h5", "hdf5", "hr",
                          "image"]
        for resource_item in json_object["resources"]:
            if "format" in resource_item:
                if resource_item["format"] in tabular_formats:
                    resource_item["format"] = "tabular"
                elif resource_item["format"] in vector_formats:
                    resource_item["format"] = "vector"
                elif resource_item["format"] in raster_formats:
                    resource_item["format"] = "raster"
            else:
                resource_item["format"] = "tabular"

            # Check for required resource fields
            spec_list = ["name", "url"]
            rspec = set(spec_list)
            if not rspec.issubset(resource_item.keys()):
                raise ValueError(
                    "One of the required attributes is missing from the {} "
                    "dataset script. Make sure it has all of the following "
                    "attributes: {}".format(json_file, rspec))
            for spec in spec_list:
                if not resource_item[spec]:
                    raise ValueError(
                        "Check {} for missing values.\n"
                        "Package {}".format(rspec, json_file))

        json_object["tables"] = {}
        temp_tables = {}
        table_names = [item["name"] for item in json_object["resources"]]
        temp_tables["tables"] = dict(zip(table_names, json_object["resources"]))
        for table_name, table_spec in temp_tables["tables"].items():
            json_object["tables"][table_name] = myTables[
                temp_tables["tables"][table_name]["format"]](**table_spec)
        json_object.pop("resources", None)
        json_object["urls"] = {
            table: json_object["tables"][table].url
            for table in json_object["tables"]
        }
        return TEMPLATES["default"](**json_object)
    return None
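# A minimal dataset package that read_json above would accept (a sketch;
# the package name and URL are placeholders). read_json("example") appends
# ".json" itself before loading the file:
import json

package = {
    "name": "example",
    "resources": [
        {"name": "main",
         "url": "http://example.com/main.csv",
         "format": "csv"},  # normalized to "tabular" by read_json
    ],
}
with open("example.json", "w") as package_file:
    json.dump(package, package_file)

# script = read_json("example")  # needs myTables/TEMPLATES in scope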
def load_data(self, filename):
    """Generator returning lists of values from lines in a data file.

    1. Works on both delimited (csv module) and fixed width data
       (extract_fixed_width)
    2. Identifies the delimiter if not known
    3. Removes extra line endings
    """
    if not self.table.delimiter:
        self.set_table_delimiter(filename)
    if os.name == "nt":
        dataset_file = open_fr(filename)
    else:
        dataset_file = open_fr(filename, encoding=self.encoding)
    if self.table.fixed_width:
        for row in dataset_file:
            yield self.extract_fixed_width(row)
    else:
        reg = re.compile("\\r\\n|\n|\r")
        for row in csv.reader(dataset_file, delimiter=self.table.delimiter):
            yield [reg.sub(" ", values) for values in row]
def sort_csv(filename, encoding=ENCODING):
    """Sort the rows of a CSV, excluding the header, and return the file.

    Used for testing only; the rows are read into memory, so it is only
    suitable for small files.
    """
    filename = os.path.normpath(filename)
    input_file = open_fr(filename, encoding)
    csv_reader_infile = csv.reader(input_file, escapechar="\\")
    # write the data to a temporary file and sort it
    temp_path = os.path.normpath("tempfile")
    temp_file = open_fw(temp_path, encoding)
    csv_writer = open_csvw(temp_file)
    infields = None
    for i, row in enumerate(csv_reader_infile):
        if i == 0:
            # The first entry is the header line
            infields = row
        else:
            csv_writer.writerow(row)
    input_file.close()
    temp_file.close()

    # sort the temp file
    sorted_txt = sort_file(temp_path, encoding)
    tmp = open_fr(sorted_txt, encoding)
    in_txt = csv.reader(tmp, delimiter=',', escapechar="\\")
    csv_file = open_fw(filename, encoding)
    csv_writer = open_csvw(csv_file)
    csv_writer.writerow(infields)
    csv_writer.writerows(in_txt)
    tmp.close()
    csv_file.close()
    os.remove(os.path.normpath(temp_path))
    return filename
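# Example use of sort_csv above (a sketch; assumes sort_csv and the helper
# functions it calls are in scope): the header stays in place and the
# remaining rows are sorted lexicographically.
with open("data.csv", "w") as data:
    data.write("name,value\nzebra,1\napple,2\n")

sort_csv("data.csv")
# data.csv now contains: "name,value", then "apple,2", then "zebra,1"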
def sort_file(file_path, encoding=ENCODING):
    """Sort a file by line and return the file.

    Used for testing only; the lines are read into memory, so it is only
    suitable for small files.
    """
    file_path = os.path.normpath(file_path)
    input_file = open_fr(file_path, encoding)
    lines = [line.strip() for line in input_file]
    input_file.close()
    outfile = open_fw(file_path, encoding)
    lines.sort()
    for line in lines:
        outfile.write(line + "\n")
    outfile.close()
    return file_path
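# Example use of sort_file above (a sketch): a plain whole-file,
# lexicographic line sort, rewriting the file in place.
with open("lines.txt", "w") as lines_file:
    lines_file.write("b\na\nc\n")

sort_file("lines.txt")
# lines.txt now contains "a\nb\nc\n"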
def disconnect(self):
    """Close out the XML files.

    Close all the file objects that have been created.
    Re-write the files, stripping off the last comma, and then close
    with a closing tag.
    """
    if self.table_names:
        for output_file_i, file_name in self.table_names:
            output_file_i.close()
            current_input_file = open_fr(file_name)
            file_contents = current_input_file.readlines()
            current_input_file.close()
            file_contents[-1] = file_contents[-1].strip(',')
            current_output_file = open_fw(file_name)
            current_output_file.writelines(file_contents)
            current_output_file.write(u'\n</root>')
            current_output_file.close()
        self.table_names = []
def disconnect(self):
    """Close out the JSON with a `\\n]` and close the file.

    Close all the file objects that have been created.
    Re-write the files, stripping off the last comma, and then close
    with a `\\n]`.
    """
    if self.table_names:
        for output_file_i, file_name in self.table_names:
            output_file_i.close()
            current_input_file = open_fr(file_name)
            file_contents = current_input_file.readlines()
            current_input_file.close()
            file_contents[-1] = file_contents[-1].strip(',\n')
            current_output_file = open_fw(file_name)
            current_output_file.writelines(file_contents)
            current_output_file.writelines(['\n]'])
            current_output_file.close()
        self.table_names = []
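# What disconnect above does to each file, in miniature: a streaming writer
# leaves a trailing comma after the last record, so the file is rewritten
# without it and closed with "\n]" to form valid JSON.
import json

file_contents = ['[{"a": 1},\n', '{"a": 2},\n']
file_contents[-1] = file_contents[-1].strip(',\n')
closed = "".join(file_contents) + "\n]"
assert json.loads(closed) == [{"a": 1}, {"a": 2}]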
def convert_to_csv(dir_name):
    """Convert the caret-delimited ".txt" files in a directory to comma-delimited ".csv" files"""
    for file_name in os.listdir(dir_name):
        file_path = os.path.join(dir_name, file_name)
        if file_path.endswith(".txt"):
            csv_file_name = file_name.replace(".txt", ".csv").lower()
            output_file = os.path.join(dir_name, csv_file_name)
            with open_fr(file_path, encoding="latin-1") as read_object, \
                    open_fw(output_file) as outputfw:
                fr = csv.reader(read_object, delimiter="^", quotechar="~")
                fw = csv.writer(outputfw, delimiter=",",
                                quoting=csv.QUOTE_MINIMAL)
                for line in fr:
                    if line:
                        fw.writerow(line)
            # delete the text files
            os.remove(file_path)
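# Example use of convert_to_csv above (a sketch; the directory name is a
# placeholder). Input files are caret-delimited with "~" as the quote
# character, matching the reader configuration above:
import os

os.makedirs("plants_data", exist_ok=True)
with open(os.path.join("plants_data", "SAMPLE.txt"), "w") as txt:
    txt.write("~A~^~B~\n~1~^~2~\n")

convert_to_csv("plants_data")
# plants_data/ now holds sample.csv with "A,B" / "1,2"; SAMPLE.txt is gone.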
def load_data(self, filename):
    """Generator returning lists of values from lines in a data file.

    1. Works on both delimited (csv module) and fixed width data
       (extract_fixed_width)
    2. Identifies the delimiter if not known
    3. Removes extra line endings
    """
    if not self.table.delimiter:
        self.set_table_delimiter(filename)
    dataset_file = open_fr(filename)
    if self.table.fixed_width:
        for row in dataset_file:
            yield self.extract_fixed_width(row)
    else:
        reg = re.compile("\\r\\n|\n|\r")
        for row in csv.reader(dataset_file, delimiter=self.table.delimiter):
            yield [reg.sub(" ", values) for values in row]
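# The line-ending normalization used by load_data above, standalone: any
# \r\n, \n, or \r embedded inside a field is replaced with a space so each
# yielded row is a flat list of single-line values.
import re

reg = re.compile("\\r\\n|\n|\r")
assert reg.sub(" ", "value with\r\nan embedded newline") == \
    "value with an embedded newline"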
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    original_data = "Succession_sampling_03-07_data_original.txt"
    new_data_name = "Succession_sampling_03-07_data.txt"
    self.engine.download_file(self.urls["main"], original_data)
    data_path = self.engine.format_filename(new_data_name)
    old_data = open_fr(self.engine.find_file(original_data))
    new_data = open_fw(data_path)

    # The original file's header contains an end-of-line character
    # in the middle, splitting the header across two lines.
    # Read in the two lines and create the full header.
    line1 = old_data.readline().strip()
    line2 = old_data.readline()
    newline = line1 + "\t" + line2
    new_data.write(newline)
    for line in old_data:
        new_data.write(line)
    new_data.close()
    old_data.close()
    self.engine.auto_create_table(self.tables["main"],
                                  filename=new_data_name)
    self.engine.insert_data_from_file(data_path)
def json2csv(input_file, output_file=None, header_values=None,
             encoding=ENCODING, row_key=None):
    """Convert a JSON file to CSV."""
    file_out = open_fr(input_file, encoding=encoding)
    # set output file name and write header
    if output_file is None:
        output_file = os.path.splitext(os.path.basename(input_file))[0] + ".csv"
    csv_out = open_fw(output_file, encoding=encoding)
    if os.name == 'nt':
        outfile = csv.writer(csv_out,
                             dialect='excel',
                             escapechar="\\",
                             lineterminator='\n')
    else:
        outfile = csv.writer(csv_out, dialect='excel', escapechar="\\")

    raw_data = json.loads(file_out.read(), object_pairs_hook=OrderedDict)
    raw_data, header_values = walker(raw_data,
                                     row_key=row_key,
                                     header_values=header_values,
                                     rows=[],
                                     normalize=False)
    if isinstance(raw_data[0], dict):
        # row values are in a list of dictionaries
        raw_data = [list(row.values()) for row in raw_data]
    else:
        raw_data = [row.tolist() for row in raw_data]
    if header_values:
        outfile.writerow(header_values)
    outfile.writerows(raw_data)
    file_out.close()
    os.remove(input_file)  # portable replacement for `rm -r input_file`
    return output_file
def set_table_delimiter(self, file_path):
    """Get the delimiter from the data file and set it."""
    dataset_file = open_fr(file_path)
    self.auto_get_delimiter(dataset_file.readline())
    dataset_file.close()
def download(self, engine=None, debug=False):
    try:
        Script.download(self, engine, debug)
        engine = self.engine

        # Species table
        table = Table("species",
                      cleanup=Cleanup(),
                      contains_pk=True,
                      header_rows=9)
        table.columns = [
            ("species_id", ("pk-int", )),
            ("AOU", ("int", )),
            ("english_common_name", ("char", 50)),
            ("french_common_name", ("char", 50)),
            ("spanish_common_name", ("char", 50)),
            ("sporder", ("char", 30)),
            ("family", ("char", 30)),
            ("genus", ("char", 30)),
            ("species", ("char", 50)),
        ]
        table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["species"])

        # Routes table
        engine.download_files_from_archive(self.urls["routes"],
                                           ["routes.csv"])
        engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                 filename="routes.csv")
        engine.insert_data_from_file(engine.format_filename("routes.csv"))

        # Weather table
        if not os.path.isfile(engine.format_filename("weather_new.csv")):
            engine.download_files_from_archive(self.urls["weather"],
                                               ["weather.csv"])
            read = open_fr(engine.format_filename("weather.csv"))
            write = open_fw(engine.format_filename("weather_new.csv"))
            print("Cleaning weather data...")
            for line in read:
                values = line.split(',')
                newvalues = []
                for value in values:
                    if ':' in value:
                        newvalues.append(value.replace(':', ''))
                    elif value == "N":
                        newvalues.append(None)
                    else:
                        newvalues.append(value)
                write.write(','.join(str(value) for value in newvalues))
            write.close()
            read.close()
        engine.auto_create_table(Table("weather",
                                       pk="RouteDataId",
                                       cleanup=self.cleanup_func_table),
                                 filename="weather_new.csv")
        engine.insert_data_from_file(
            engine.format_filename("weather_new.csv"))

        # Region_codes table
        table = Table("region_codes",
                      pk=False,
                      header_rows=11,
                      fixed_width=[11, 11, 30])

        def regioncodes_cleanup(value, engine):
            replace = {
                chr(225): "a",
                chr(233): "e",
                chr(237): "i",
                chr(243): "o"
            }
            newvalue = str(value)
            for key in list(replace.keys()):
                if key in newvalue:
                    newvalue = newvalue.replace(key, replace[key])
            return newvalue

        table.cleanup = Cleanup(regioncodes_cleanup)
        table.columns = [("countrynum", ("int", )),
                         ("regioncode", ("int", )),
                         ("regionname", ("char", 30))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["region_codes"])

        # Counts table
        table = Table("counts", delimiter=',')
        table.columns = [("record_id", ("pk-auto", )),
                         ("countrynum", ("int", )),
                         ("statenum", ("int", )),
                         ("Route", ("int", )),
                         ("RPID", ("int", )),
                         ("Year", ("int", )),
                         ("Aou", ("int", )),
                         ("Count10", ("int", )),
                         ("Count20", ("int", )),
                         ("Count30", ("int", )),
                         ("Count40", ("int", )),
                         ("Count50", ("int", )),
                         ("StopTotal", ("int", )),
                         ("SpeciesTotal", ("int", ))]

        stateslist = [
            "Alabama", "Alaska", "Arizona", "Arkansas", "California",
            "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
            "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky",
            "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan",
            "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska",
            "Nevada", ["New Hampshire", "NHampsh"], ["New Jersey", "NJersey"],
            ["New Mexico", "NMexico"], ["New York", "NYork"],
            ["North Carolina", "NCaroli"], ["North Dakota", "NDakota"],
            "Ohio", "Oklahoma", "Oregon", "Pennsylvania",
            ["Rhode Island", "RhodeIs"], ["South Carolina", "SCaroli"],
            ["South Dakota", "SDakota"], "Tennessee", "Texas", "Utah",
            "Vermont", "Virginia", "Washington", ["West Virginia", "W_Virgi"],
            "Wisconsin", "Wyoming", "Alberta", ["British Columbia", "BritCol"],
            "Manitoba", ["New Brunswick", "NBrunsw"],
            ["Northwest Territories", "NWTerri"], "Newfoundland",
            ["Nova Scotia", "NovaSco"], "Nunavut", "Ontario",
            ["Prince Edward Island", "PEI"], "Quebec", "Saskatchewan", "Yukon"
        ]

        state = ""
        shortstate = ""
        engine.table = table
        engine.create_table()

        for state in stateslist:
            try:
                if len(state) > 2:
                    # a plain state name: the archive's short name is the
                    # first seven characters
                    shortstate = state[0:7]
                else:
                    # a [name, short name] pair
                    state, shortstate = state[0], state[1]
                print("Inserting data from " + state + "...")
                try:
                    engine.table.cleanup = Cleanup()
                    engine.insert_data_from_archive(
                        self.urls["counts"] + shortstate + ".zip",
                        [shortstate + ".csv"])
                except:
                    print("Failed bulk insert on " + state +
                          ", inserting manually.")
                    engine.connection.rollback()
                    engine.table.cleanup = self.cleanup_func_clean
                    engine.insert_data_from_archive(
                        self.urls["counts"] + shortstate + ".zip",
                        [shortstate + ".csv"])
            except:
                print("There was an error in " + state + ".")
                raise
    except zipfile.BadZipfile:
        print("There was an unexpected error in the Breeding Bird Survey "
              "archives.")
        raise
    return engine
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine

    # Complete Plants Checklist
    file_name = "complete_plant_checklist.csv"
    table_name = "complete_plant_checklist"
    complete_plant_url = "https://plants.sc.egov.usda.gov/java/downloadData?fileName=plantlst.txt&static=true"
    self.engine.download_file(complete_plant_url, filename=file_name)
    data_path = self.engine.format_filename(file_name)
    table = Table(table_name, delimiter=",")
    table.columns = [
        ("symbol", ("char", "7")),
        ("synonym_symbol", ("char", "7")),
        ("scientific_name_with_author", ("char", "183")),
        ("common_name", ("char", "42")),
        ("family", ("char", "30")),
    ]
    self.engine.auto_create_table(table, filename=file_name)
    self.engine.insert_data_from_file(data_path)

    # Symbols for Unknown Plants
    file_name = "symbols_unknown_plants.csv"
    table_name = "unknown_plants"
    unknown_plants_url = "https://plants.sc.egov.usda.gov/Data/unknown_plants.txt"
    self.engine.download_file(unknown_plants_url, filename=file_name)
    data_path = self.engine.format_filename(file_name)
    table = Table(table_name, delimiter=",")
    table.columns = [("symbol", ("char", "7")),
                     ("common_name", ("char", "56"))]
    self.engine.auto_create_table(table, filename=file_name)
    self.engine.insert_data_from_file(data_path)

    # State PLANTS Checklist
    base_url = "https://plants.sc.egov.usda.gov/"
    state_plant_checklist_base_url = "{base}java/stateDownload?statefips={id}"
    state_plant_checklist_file = "all_state_plant_checklist.csv"
    table_name = "state_plant_checklist"
    state_plant_checklist = [
        ("US01", "Alabama", "US"), ("US02", "Alaska", "US"),
        ("US05", "Arkansas", "US"), ("US04", "Arizona", "US"),
        ("US06", "California", "US"), ("US08", "Colorado", "US"),
        ("US09", "Connecticut", "US"), ("US10", "Delaware", "US"),
        ("US11", "District of Columbia", "US"), ("US12", "Florida", "US"),
        ("US13", "Georgia", "US"), ("US15", "Hawaii", "US"),
        ("US16", "Idaho", "US"), ("US17", "Illinois", "US"),
        ("US18", "Indiana", "US"), ("US19", "Iowa", "US"),
        ("US20", "Kansas", "US"), ("US21", "Kentucky", "US"),
        ("US22", "Louisiana", "US"), ("US23", "Maine", "US"),
        ("US24", "Maryland", "US"), ("US25", "Massachusetts", "US"),
        ("US26", "Michigan", "US"), ("US27", "Minnesota", "US"),
        ("US28", "Mississippi", "US"), ("US29", "Missouri", "US"),
        ("US30", "Montana", "US"), ("US31", "Nebraska", "US"),
        ("US32", "Nevada", "US"), ("US33", "New Hampshire", "US"),
        ("US34", "New Jersey", "US"), ("US35", "New Mexico", "US"),
        ("US36", "New York", "US"), ("US37", "North Carolina", "US"),
        ("US38", "North Dakota", "US"), ("US39", "Ohio", "US"),
        ("US40", "Oklahoma", "US"), ("US41", "Oregon", "US"),
        ("US42", "Pennsylvania", "US"), ("US44", "Rhode Island", "US"),
        ("US45", "South Carolina", "US"), ("US46", "South Dakota", "US"),
        ("US47", "Tennessee", "US"), ("US48", "Texas", "US"),
        ("US49", "Utah", "US"), ("US50", "Vermont", "US"),
        ("US51", "Virginia", "US"), ("US53", "Washington", "US"),
        ("US54", "West Virginia", "US"), ("US55", "Wisconsin", "US"),
        ("US56", "Wyoming", "US"), ("US72", "Puerto Rico", "US"),
        ("US78", "Virgin Islands", "US"), ("CA01", "Alberta", "Canada"),
        ("CA02", "British Columbia", "Canada"), ("CA03", "Manitoba", "Canada"),
        ("CA04", "New Brunswick", "Canada"), ("CALB", "Labrador", "Canada"),
        ("CANF", "Newfoundland", "Canada"),
        ("CA13", "Northwest Territories", "Canada"),
        ("CA07", "Nova Scotia", "Canada"), ("CA14", "Nunavut", "Canada"),
        ("CA08", "Ontario", "Canada"),
        ("CA09", "Prince Edward Island", "Canada"),
        ("CA10", "Québec", "Canada"), ("CA11", "Saskatchewan", "Canada"),
        ("CA12", "Yukon", "Canada"), ("GL", "Greenland", "Denmark"),
        ("SB", "St. Pierre and Miquelon", "France"),
    ]
    with open_fw(engine.format_filename(
            state_plant_checklist_file)) as write_object:
        csv_writer = open_csvw(write_object)
        for state_info in state_plant_checklist:
            file_name = state_info[1].replace(".", "").replace(
                " ", "_").lower() + ".csv"
            file_name = "old_state_plant_checklist_" + file_name
            state_url = state_plant_checklist_base_url.format(
                base=base_url, id=state_info[0])
            self.engine.download_file(state_url, filename=file_name)
            with open_fr(engine.format_filename(file_name)) as read_object:
                # Read the state file and write only the data,
                # skipping the header
                next(read_object)
                for row in csv.reader(read_object, delimiter=","):
                    csv_writer.writerow([state_info[2]] + [state_info[1]] +
                                        row)
    data_path = self.engine.format_filename(state_plant_checklist_file)
    table = Table(table_name, delimiter=",", header_rows=0)
    table.columns = [
        ("country", ("char", "7")),
        ("state", ("char", "23")),
        ("symbol", ("char", "7")),
        ("synonym_symbol", ("char", "7")),
        ("scientific_name_with_author", ("char", "183")),
        ("national_common_name", ("char", "42")),
        ("family", ("char", "17")),
    ]
    self.engine.auto_create_table(table, filename=state_plant_checklist_file)
    self.engine.insert_data_from_file(data_path)

    # NRCS State GSAT Lists
    base_url = "https://www.plants.usda.gov/"
    nrcs_state_gsat_base_url = "{base}java/gsatDownload?gsatid={id}"
    nrcs_state_gsat_file = "all_nrcs_state_gsat.csv"
    table_name = "nrcs_state_gsat"
    nrcs_state_gsat = [
        ("Alabama", "2"), ("Alaska", ""), ("Arkansas", ""), ("Arizona", "2"),
        ("California", ""), ("Colorado", ""), ("Connecticut", ""),
        ("Delaware", ""), ("Florida", ""), ("Georgia", ""), ("Hawaii", ""),
        ("Idaho", "9"), ("Illinois", ""), ("Indiana", ""), ("Iowa", ""),
        ("Kansas", "6"), ("Kentucky", ""), ("Louisiana", "16"), ("Maine", ""),
        ("Maryland", ""), ("Massachusetts", ""), ("Michigan", ""),
        ("Minnesota", "11"), ("Mississippi", ""), ("Missouri", "14"),
        ("Montana", ""), ("Nebraska", "17"), ("Nevada", "4"),
        ("New Hampshire", ""), ("New Jersey", ""), ("New Mexico", "1"),
        ("New York", ""), ("North Carolina", ""), ("North Dakota", "5"),
        ("Ohio", ""), ("Oklahoma", "12"), ("Oregon", "3"),
        ("Pennsylvania", "15"), ("Rhode Island", ""), ("South Carolina", ""),
        ("South Dakota", "7"), ("Tennessee", ""), ("Texas", "13"),
        ("Utah", ""), ("Vermont", ""), ("Virginia", ""), ("Washington", "8"),
        ("West Virginia", ""), ("Wisconsin", ""), ("Wyoming", "10"),
    ]
    with open_fw(engine.format_filename(nrcs_state_gsat_file)) as write_object:
        for state_info in nrcs_state_gsat:
            if state_info[1]:  # skip states with no data
                file_name = state_info[0].replace(" ", "_").replace(
                    ".", "").lower() + ".csv"
                file_name = "old_nrcs_state_gsat_" + file_name
                state_url = nrcs_state_gsat_base_url.format(
                    base=base_url, id=state_info[1])
                self.engine.download_file(state_url, filename=file_name)
                with open_fr(
                        engine.format_filename(file_name)) as read_object:
                    # Read the state file and write only the data,
                    # skipping the header
                    next(read_object)
                    state_quoted = '"{state}",'.format(state=state_info[0])
                    for line in read_object:
                        write_object.write(state_quoted + line)
    data_path = self.engine.format_filename(nrcs_state_gsat_file)
    table = Table(table_name, delimiter=",", header_rows=0)
    table.columns = [
        ("state", ("char", "12")),
        ("symbol", ("char", "7")),
        ("scientific_name_with_author", ("char", "183")),
        ("gsat_common_name", ("char", "93")),
    ]
    self.engine.auto_create_table(table, filename=nrcs_state_gsat_file)
    self.engine.insert_data_from_file(data_path)

    # NRCS State Plant Lists
    base_url = "https://plants.sc.egov.usda.gov/"
    nrcs_state_plant_lists_url = "{base}java/nrcsStateDownload?statefips={id}"
    nrcs_state_plant_file = "all_nrcs_state_plant.csv"
    table_name = "nrcs_state_plant"
    nrcs_state_plant_lists = [
        ("01", "Alabama"), ("02", "Alaska"), ("05", "Arkansas"),
        ("04", "Arizona"), ("06", "California"), ("08", "Colorado"),
        ("09", "Connecticut"), ("10", "Delaware"), ("12", "Florida"),
        ("13", "Georgia"), ("15", "Hawaii"), ("16", "Idaho"),
        ("17", "Illinois"), ("18", "Indiana"), ("19", "Iowa"),
        ("20", "Kansas"), ("21", "Kentucky"), ("22", "Louisiana"),
        ("23", "Maine"), ("24", "Maryland"), ("25", "Massachusetts"),
        ("26", "Michigan"), ("27", "Minnesota"), ("28", "Mississippi"),
        ("29", "Missouri"), ("30", "Montana"), ("31", "Nebraska"),
        ("32", "Nevada"), ("33", "New Hampshire"), ("34", "New Jersey"),
        ("35", "New Mexico"), ("36", "New York"), ("37", "North Carolina"),
        ("38", "North Dakota"), ("39", "Ohio"), ("40", "Oklahoma"),
        ("41", "Oregon"), ("42", "Pennsylvania"), ("44", "Rhode Island"),
        ("45", "South Carolina"), ("46", "South Dakota"), ("47", "Tennessee"),
        ("48", "Texas"), ("49", "Utah"), ("50", "Vermont"), ("51", "Virginia"),
        ("53", "Washington"), ("54", "West Virginia"), ("55", "Wisconsin"),
        ("56", "Wyoming"), ("72", "Puerto Rico"), ("78", "Virgin Islands"),
    ]
    with open_fw(
            engine.format_filename(nrcs_state_plant_file)) as write_object:
        for state_info in nrcs_state_plant_lists:
            file_name = state_info[1].replace(" ", "_").replace(
                ".", "").lower() + ".csv"
            file_name = "old_nrcs_state_plant_" + file_name
            state_url = nrcs_state_plant_lists_url.format(base=base_url,
                                                          id=state_info[0])
            self.engine.download_file(state_url, filename=file_name)
            with open_fr(engine.format_filename(file_name)) as read_object:
                # Read the state file and write only the data,
                # skipping the header
                next(read_object)
                state_quoted = '"{state}",'.format(state=state_info[1])
                for line in read_object:
                    write_object.write(state_quoted + line)
    data_path = self.engine.format_filename(nrcs_state_plant_file)
    table = Table(table_name, delimiter=",", header_rows=0)
    table.columns = [
        ("state", ("char", "17")),
        ("symbol", ("char", "7")),
        ("synonym_symbol", ("char", "7")),
        ("scientific_name_with_author", ("char", "183")),
        ("state_common_name", ("char", "42")),
        ("family", ("char", "17")),
    ]
    self.engine.auto_create_table(table, filename=nrcs_state_plant_file)
    self.engine.insert_data_from_file(data_path)
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine

    # download and create species table
    table = Table('species')
    self.engine.auto_create_table(table, url=self.urls['species'])
    self.engine.insert_data_from_url(self.urls['species'])

    # State abbreviations with the year annual inventory began for that state
    stateslist = [('AL', 2001), ('AK', 2004), ('AZ', 2001), ('AR', 2000),
                  ('CA', 2001), ('CO', 2002), ('CT', 2003), ('DE', 2004),
                  ('FL', 2003), ('GA', 1998), ('ID', 2004), ('IL', 2001),
                  ('IN', 1999), ('IA', 1999), ('KS', 2001), ('KY', 1999),
                  ('LA', 2001), ('ME', 1999), ('MD', 2004), ('MA', 2003),
                  ('MI', 2000), ('MN', 1999), ('MO', 1999), ('MS', 2006),
                  ('MT', 2003), ('NE', 2001), ('NV', 2004), ('NH', 2002),
                  ('NJ', 2004), ('NM', 1999), ('NY', 2002), ('NC', 2003),
                  ('ND', 2001), ('OH', 2001), ('OK', 2008), ('OR', 2001),
                  ('PA', 2000), ('RI', 2003), ('SC', 1999), ('SD', 2001),
                  ('TN', 2000), ('TX', 2001), ('UT', 2000), ('VT', 2003),
                  ('VA', 1998), ('WA', 2002), ('WV', 2004), ('WI', 2000),
                  ('WY', 2000), ('PR', 2001)]

    tablelist = ["SURVEY", "PLOT", "COND", "SUBPLOT", "SUBP_COND", "TREE",
                 "SEEDLING"]

    for table in tablelist:
        for state, year in stateslist:
            engine.download_files_from_archive(
                self.urls["main"] + state + "_" + table + ".ZIP",
                [state + "_" + table + ".csv"])

    for table in tablelist:
        print("Scanning data for table %s..." % table)
        prep_file_name = "%s.csv" % table
        prep_file = open_fw(engine.format_filename(prep_file_name))
        this_file = open_fr(
            engine.format_filename(stateslist[0][0] + "_" + table + ".csv"))
        col_names = this_file.readline()
        prep_file.write(col_names)
        column_names = [col.strip('"') for col in col_names.split(',')]
        year_column = column_names.index("INVYR")
        this_file.close()

        for state, year in stateslist:
            this_file = open_fr(
                engine.format_filename(state + "_" + table + ".csv"))
            this_file.readline()
            for line in this_file:
                values = line.split(',')
                this_year = values[year_column]
                if int(this_year) >= year:
                    prep_file.write(line)
        prep_file.close()
        engine.auto_create_table(Table(table), filename=prep_file_name)

        engine.insert_data_from_file(engine.format_filename(prep_file_name))

        try:
            os.remove(engine.format_filename(prep_file_name))
        except:
            pass

    return engine
def download(self, engine=None, debug=False):
    try:
        Script.download(self, engine, debug)
        engine = self.engine

        # Species table
        table = Table("species",
                      cleanup=Cleanup(),
                      contains_pk=True,
                      header_rows=9)
        table.columns = [
            ("species_id", ("pk-int", )),
            ("AOU", ("int", )),
            ("english_common_name", ("char", 50)),
            ("french_common_name", ("char", 50)),
            ("spanish_common_name", ("char", 50)),
            ("sporder", ("char", 30)),
            ("family", ("char", 30)),
            ("genus", ("char", 30)),
            ("species", ("char", 50)),
        ]
        table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["species"])

        # Routes table
        engine.download_files_from_archive(self.urls["routes"],
                                           ["routes.csv"])
        engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                 filename="routes.csv")
        engine.insert_data_from_file(engine.format_filename("routes.csv"))

        # Weather table
        if not os.path.isfile(engine.format_filename("weather_new.csv")):
            engine.download_files_from_archive(self.urls["weather"],
                                               ["weather.csv"])
            read = open_fr(engine.format_filename("weather.csv"))
            write = open_fw(engine.format_filename("weather_new.csv"))
            print("Cleaning weather data...")
            for line in read:
                values = line.split(',')
                newvalues = []
                for value in values:
                    if ':' in value:
                        newvalues.append(value.replace(':', ''))
                    elif value == "N":
                        newvalues.append(None)
                    else:
                        newvalues.append(value)
                write.write(','.join(str(value) for value in newvalues))
            write.close()
            read.close()
        engine.auto_create_table(Table("weather",
                                       pk="RouteDataId",
                                       cleanup=self.cleanup_func_table),
                                 filename="weather_new.csv")
        engine.insert_data_from_file(
            engine.format_filename("weather_new.csv"))

        # Region_codes table
        table = Table("region_codes",
                      pk=False,
                      header_rows=11,
                      fixed_width=[11, 11, 30])

        def regioncodes_cleanup(value, engine):
            replace = {
                chr(225): "a",
                chr(233): "e",
                chr(237): "i",
                chr(243): "o"
            }
            newvalue = str(value)
            for key in list(replace.keys()):
                if key in newvalue:
                    newvalue = newvalue.replace(key, replace[key])
            return newvalue

        table.cleanup = Cleanup(regioncodes_cleanup)
        table.columns = [("countrynum", ("int", )),
                         ("regioncode", ("int", )),
                         ("regionname", ("char", 30))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["region_codes"])

        # Counts table
        table = Table("counts", pk=False, delimiter=',')
        table.columns = [("RouteDataID", ("int", )),
                         ("countrynum", ("int", )),
                         ("statenum", ("int", )),
                         ("Route", ("int", )),
                         ("RPID", ("int", )),
                         ("year", ("int", )),
                         ("AOU", ("int", )),
                         ("Stop1", ("int", )), ("Stop2", ("int", )),
                         ("Stop3", ("int", )), ("Stop4", ("int", )),
                         ("Stop5", ("int", )), ("Stop6", ("int", )),
                         ("Stop7", ("int", )), ("Stop8", ("int", )),
                         ("Stop9", ("int", )), ("Stop10", ("int", )),
                         ("Stop11", ("int", )), ("Stop12", ("int", )),
                         ("Stop13", ("int", )), ("Stop14", ("int", )),
                         ("Stop15", ("int", )), ("Stop16", ("int", )),
                         ("Stop17", ("int", )), ("Stop18", ("int", )),
                         ("Stop19", ("int", )), ("Stop20", ("int", )),
                         ("Stop21", ("int", )), ("Stop22", ("int", )),
                         ("Stop23", ("int", )), ("Stop24", ("int", )),
                         ("Stop25", ("int", )), ("Stop26", ("int", )),
                         ("Stop27", ("int", )), ("Stop28", ("int", )),
                         ("Stop29", ("int", )), ("Stop30", ("int", )),
                         ("Stop31", ("int", )), ("Stop32", ("int", )),
                         ("Stop33", ("int", )), ("Stop34", ("int", )),
                         ("Stop35", ("int", )), ("Stop36", ("int", )),
                         ("Stop37", ("int", )), ("Stop38", ("int", )),
                         ("Stop39", ("int", )), ("Stop40", ("int", )),
                         ("Stop41", ("int", )), ("Stop42", ("int", )),
                         ("Stop43", ("int", )), ("Stop44", ("int", )),
                         ("Stop45", ("int", )), ("Stop46", ("int", )),
                         ("Stop47", ("int", )), ("Stop48", ("int", )),
                         ("Stop49", ("int", )), ("Stop50", ("int", ))]

        part = ""
        engine.table = table
        engine.create_table()

        for part in range(1, 11):
            part = str(part)
            try:
                print("Inserting data from part " + part + "...")
                try:
                    engine.table.cleanup = Cleanup()
                    engine.insert_data_from_archive(
                        self.urls["counts"] + "Fifty" + part + ".zip",
                        ["fifty" + part + ".csv"])
                except:
                    print("Failed bulk insert on " + part +
                          ", inserting manually.")
                    engine.connection.rollback()
                    engine.table.cleanup = self.cleanup_func_clean
                    engine.insert_data_from_archive(
                        self.urls["counts"] + "Fifty" + part + ".zip",
                        ["fifty" + part + ".csv"])
            except:
                print("There was an error in part " + part + ".")
                raise
    except zipfile.BadZipfile:
        print("There was an unexpected error in the Breeding Bird Survey "
              "archives.")
        raise
    return engine
def read_json(json_file):
    """Read JSON dataset package files.

    Load each json and get the appropriate encoding for the dataset.
    Reload the json using the encoding to ensure correct character sets.
    """
    json_object = OrderedDict()
    json_file_encoding = None
    json_file = str(json_file) + ".json"

    try:
        file_obj = open_fr(json_file)
        json_object = json.load(file_obj)
        if "encoding" in json_object:
            json_file_encoding = json_object['encoding']
        file_obj.close()
    except ValueError:
        return None

    # Reload json using encoding if available
    try:
        if json_file_encoding:
            file_obj = open_fr(json_file, encoding=json_file_encoding)
        else:
            file_obj = open_fr(json_file)
        json_object = json.load(file_obj)
        file_obj.close()
    except ValueError:
        return None

    if isinstance(json_object, dict) and "resources" in json_object.keys():
        # Note: formats described by frictionless data may need to change
        tabular_exts = {"csv", "tab", "geojson", "sqlite", "db", "json",
                        "xml"}
        vector_exts = {"shp", "kmz"}
        raster_exts = {"tif", "tiff", "bil", "hdr", "h5", "hdf5", "hr",
                       "image"}
        for resource_item in json_object["resources"]:
            if "format" not in resource_item:
                if "format" in json_object:
                    resource_item["format"] = json_object["format"]
                else:
                    resource_item["format"] = "tabular"
            if "extensions" in resource_item:
                exts = set(resource_item["extensions"])
                if exts <= tabular_exts:
                    resource_item["format"] = "tabular"
                elif exts <= vector_exts:
                    resource_item["format"] = "vector"
                elif exts <= raster_exts:
                    resource_item["format"] = "raster"
            if "url" in resource_item:
                if "urls" in json_object:
                    json_object["urls"][resource_item["name"]] = \
                        resource_item["url"]
        json_object["tables"] = OrderedDict()
        temp_tables = {}
        table_names = [item["name"] for item in json_object["resources"]]
        temp_tables["tables"] = OrderedDict(
            zip(table_names, json_object["resources"]))
        for table_name, table_spec in temp_tables["tables"].items():
            json_object["tables"][table_name] = myTables[
                temp_tables["tables"][table_name]["format"]](**table_spec)
        json_object.pop("resources", None)
        return TEMPLATES["default"](**json_object)
    return None
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    original_sql_file = "BioTIMESQL02_04_2018.sql"
    engine.download_file(self.urls["sql_file"], original_sql_file)
    sql_data = open_fr(self.engine.format_filename(original_sql_file))

    set_open = False
    csv_writer = None
    csv_file = None
    table_name = None
    # Bind NULL so eval() below can resolve SQL NULL tokens to None
    NULL = None

    for line in sql_data:
        table_indicator = "-- Table structure for table "
        if line.startswith(table_indicator):
            st = line[len(table_indicator):].replace("`", "")
            table_name = st.strip()
            current_file_process = table_name
            current_file_open = current_file_process
            if set_open and not current_file_process == current_file_open:
                csv_file.close()
                set_open = False
            else:
                out_file = "{name}.csv".format(name=table_name)
                csv_file = open_fw(engine.format_filename(out_file))
                csv_writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
                set_open = True

        if line.startswith("INSERT INTO `{table_name}`".format(
                table_name=table_name)):
            row_val = line[line.index("VALUES (") + 8:-3]
            table_rows = row_val.replace("\r\n", "").split("),(")
            for i_row in table_rows:
                v = eval('[' + str(i_row) + ']')
                csv_writer.writerows([v])
    if csv_file:
        csv_file.close()

    # Create abundance table
    table = Table("ID_ABUNDANCE",
                  delimiter=",",
                  header_rows=0,
                  contains_pk=False)
    table.columns = [
        ("ID_ABUNDANCE", ("int", )),
        ("ABUNDANCE_TYPE", ("char", "100")),
    ]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename("abundance.csv"))

    # Create allrawdata table
    table = Table("allrawdata",
                  delimiter=",",
                  header_rows=0,
                  contains_pk=False)
    table.columns = [
        ("ID_ALL_RAW_DATA", ("int", )),
        ("ABUNDANCE", ("double", )),
        ("BIOMASS", ("double", )),
        ("ID_SPECIES", ("int", )),
        ("SAMPLE_DESC", ("char", 200)),
        ("PLOT", ("char", 150)),
        ("LATITUDE", ("double", )),
        ("LONGITUDE", ("double", )),
        ("DEPTH", ("double", )),
        ("DAY", ("int", )),
        ("MONTH", ("int", )),
        ("YEAR", ("int", )),
        ("STUDY_ID", ("int", )),
    ]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename("allrawdata.csv"))

    # Create biomass table
    table = Table("biomass", delimiter=",", header_rows=0, contains_pk=False)
    table.columns = [("ID_BIOMASS", ("int", )),
                     ("BIOMASS_TYPE", ("char", "100"))]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename("biomass.csv"))

    # Create citation1 table
    table = Table("citation1",
                  delimiter=",",
                  header_rows=0,
                  contains_pk=False)
    table.columns = [
        ("ID_CITATION1", ("int", )),
        ("STUDY_ID", ("int", )),
        ("CITATION_LINE", ("char", )),
    ]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename("citation1.csv"))

    # Create contacts table
    table = Table("contacts", delimiter=",", header_rows=0, contains_pk=False)
    table.columns = [
        ("ID_CONTACTS", ("int", )),
        ("STUDY_ID", ("int", )),
        ("CONTACT_1", ("char", 500)),
        ("CONTACT_2", ("char", 500)),
        ("CONT_1_MAIL", ("char", 60)),
        ("CONT_2_MAIL", ("char", 60)),
        ("LICENSE", ("char", 200)),
        ("WEB_LINK", ("char", 200)),
        ("DATA_SOURCE", ("char", 250)),
    ]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename("contacts.csv"))

    # Create countries table
    table = Table("countries", delimiter=",", header_rows=0,
                  contains_pk=False)
    table.columns = [("COUNT_ID", ("int", )),
                     ("COUNTRY_NAME", ("char", 200))]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename("countries.csv"))

    # Create curation table
    table = Table("curation", delimiter=",", header_rows=0,
                  contains_pk=False)
    table.columns = [
        ("ID_CURATION", ("int", )),
        ("STUDY_ID", ("int", )),
        ("LINK_ID", ("int", )),
        ("COMMENTS", ("char", )),
        ("DATE_STUDY_ADDED", ("char", 50)),
    ]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename("curation.csv"))

    # Create datasets table
    table = Table("datasets", delimiter=",", header_rows=0,
                  contains_pk=False)
    table.columns = [
        ("ID_DATASETS", ("int", )),
        ("STUDY_ID", ("int", )),
        ("TAXA", ("char", 50)),
        ("ORGANISMS", ("char", 200)),
        ("TITLE", ("char", 800)),
        ("AB_BIO", ("char", 2)),
        ("HAS_PLOT", ("char", 10)),
        ("DATA_POINTS", ("char", )),
        ("START_YEAR", ("char", )),
        ("END_YEAR", ("char", )),
        ("CENT_LAT", ("double", )),
        ("CENT_LONG", ("double", )),
        ("NUMBER_OF_SPECIES", ("char", )),
        ("NUMBER_OF_SAMPLES", ("char", )),
        ("NUMBER_LAT_LONG", ("char", )),
        ("TOTAL", ("char", )),
        ("GRAIN_SIZE_TEXT", ("char", )),
        ("GRAIN_SQ_KM", ("double", )),
        ("AREA_SQ_KM", ("double", )),
        ("AB_TYPE", ("char", )),
        ("BIO_TYPE", ("char", )),
        ("SAMPLE_TYPE", ("char", )),
    ]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename("datasets.csv"))

    # Create downloads table
    table = Table("downloads",
                  delimiter=",",
                  header_rows=0,
                  contains_pk=False)
    table.columns = [
        ("D_ID", ("int", )),
        ("STUDY", ("char", 25)),
        ("NAME", ("char", 150)),
        ("EMAIL", ("char", 150)),
        ("COUNTRY", ("char", 200)),
        ("ROLE", ("char", 150)),
        ("PURPOSE", ("char", 500)),
        ("LOCATION", ("char", 250)),
        ("DATE_STAMP", ("char", )),
    ]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename("downloads.csv"))

    # Create methods table
    table = Table("methods", delimiter=",", header_rows=0, contains_pk=False)
    table.columns = [
        ("ID_METHODS", ("int", )),
        ("STUDY_ID", ("int", )),
        ("METHODS", ("char", )),
        ("SUMMARY_METHODS", ("char", 500)),
    ]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename("methods.csv"))

    # Create sample table
    table = Table("sample", delimiter=",", header_rows=0, contains_pk=False)
    table.columns = [
        ("ID_SAMPLE", ("int", )),
        ("ID_TREAT", ("int", )),
        ("SAMPLE_DESC_NAME", ("char", 200)),
    ]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename("sample.csv"))

    # Create site table
    table = Table("site", delimiter=",", header_rows=0, contains_pk=False)
    table.columns = [("ID_SITE", ("int", )),
                     ("STUDY_ID", ("int", )),
                     ("REALM", ("char", 11)),
                     ("CLIMATE", ("char", 20)),
                     ("GENERAL_TREAT", ("char", 200)),
                     ("TREATMENT", ("char", 200)),
                     ("TREAT_COMMENTS", ("char", 250)),
                     ("TREAT_DATE", ("char", 100)),
                     ("CEN_LATITUDE", ("double", )),
                     ("CEN_LONGITUDE", ("double", )),
                     ("HABITAT", ("char", 100)),
                     ("PROTECTED_AREA", ("char", 50)),
                     ("AREA", ("double", )),
                     ("BIOME_MAP", ("char", 500))]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename("site.csv"))

    # Create species table
    table = Table("species", delimiter=",", header_rows=0, contains_pk=False)
    table.columns = [("ID_SPECIES", ("int", )),
                     ("GENUS", ("char", 100)),
                     ("SPECIES", ("char", 100)),
                     ("GENUS_SPECIES", ("char", 100))]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename("species.csv"))
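# How the INSERT parser above turns one line of the SQL dump into rows,
# standalone (NULL must be bound so eval() can resolve SQL NULL tokens):
NULL = None
line = "INSERT INTO `biomass` VALUES (1,'wet weight'),(2,NULL);\n"
row_val = line[line.index("VALUES (") + 8:-3]
table_rows = row_val.replace("\r\n", "").split("),(")
parsed = [eval("[" + row + "]") for row in table_rows]
assert parsed == [[1, "wet weight"], [2, None]]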
def download(self, engine=None, debug=False):
    try:
        Script.download(self, engine, debug)
        engine = self.engine

        # Species table
        table = Table("species",
                      cleanup=Cleanup(),
                      contains_pk=True,
                      header_rows=9)
        table.columns = [
            ("species_id", ("pk-int", )),
            ("AOU", ("int", )),
            ("english_common_name", ("char", 50)),
            ("french_common_name", ("char", 50)),
            ("spanish_common_name", ("char", 50)),
            ("sporder", ("char", 30)),
            ("family", ("char", 30)),
            ("genus", ("char", 30)),
            ("species", ("char", 50)),
        ]
        table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["species"])

        # Routes table
        engine.download_files_from_archive(self.urls["routes"],
                                           ["routes.csv"])
        engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                 filename="routes.csv")
        engine.insert_data_from_file(engine.format_filename("routes.csv"))

        # Weather table
        if not os.path.isfile(engine.format_filename("weather_new.csv")):
            engine.download_files_from_archive(self.urls["weather"],
                                               ["weather.csv"])
            read = open_fr(engine.format_filename("weather.csv"))
            write = open_fw(engine.format_filename("weather_new.csv"))
            print("Cleaning weather data...")
            for line in read:
                values = line.split(',')
                newvalues = []
                for value in values:
                    if ':' in value:
                        newvalues.append(value.replace(':', ''))
                    elif value == "N":
                        newvalues.append(None)
                    else:
                        newvalues.append(value)
                write.write(','.join(str(value) for value in newvalues))
            write.close()
            read.close()
        engine.auto_create_table(Table("weather",
                                       pk="RouteDataId",
                                       cleanup=self.cleanup_func_table),
                                 filename="weather_new.csv")
        engine.insert_data_from_file(
            engine.format_filename("weather_new.csv"))

        # Region_codes table
        table = Table("region_codes",
                      pk=False,
                      header_rows=11,
                      fixed_width=[11, 11, 30])

        def regioncodes_cleanup(value, engine):
            replace = {
                chr(225): "a",
                chr(233): "e",
                chr(237): "i",
                chr(243): "o"
            }
            newvalue = str(value)
            for key in list(replace.keys()):
                if key in newvalue:
                    newvalue = newvalue.replace(key, replace[key])
            return newvalue

        table.cleanup = Cleanup(regioncodes_cleanup)
        table.columns = [("countrynum", ("int", )),
                         ("regioncode", ("int", )),
                         ("regionname", ("char", 30))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["region_codes"])

        # Counts table
        table = Table("counts", delimiter=',')
        table.columns = [("record_id", ("pk-auto", )),
                         ("RouteDataID", ("int", )),
                         ("countrynum", ("int", )),
                         ("statenum", ("int", )),
                         ("Route", ("int", )),
                         ("RPID", ("int", )),
                         ("Year", ("int", )),
                         ("Aou", ("int", )),
                         ("Count10", ("int", )),
                         ("Count20", ("int", )),
                         ("Count30", ("int", )),
                         ("Count40", ("int", )),
                         ("Count50", ("int", )),
                         ("StopTotal", ("int", )),
                         ("SpeciesTotal", ("int", ))]

        stateslist = [
            "Alabama", "Alaska", "Arizona", "Arkansas", "California",
            "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
            "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky",
            "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan",
            "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska",
            "Nevada", ["New Hampshire", "NHampsh"], ["New Jersey", "NJersey"],
            ["New Mexico", "NMexico"], ["New York", "NYork"],
            ["North Carolina", "NCaroli"], ["North Dakota", "NDakota"],
            "Ohio", "Oklahoma", "Oregon", "Pennsylvania",
            ["Rhode Island", "RhodeIs"], ["South Carolina", "SCaroli"],
            ["South Dakota", "SDakota"], "Tennessee", "Texas", "Utah",
            "Vermont", "Virginia", "Washington", ["West Virginia", "W_Virgi"],
            "Wisconsin", "Wyoming", "Alberta", ["British Columbia", "BritCol"],
            "Manitoba", ["New Brunswick", "NBrunsw"],
            ["Northwest Territories", "NWTerri"], "Newfoundland",
            ["Nova Scotia", "NovaSco"], "Nunavut", "Ontario",
            ["Prince Edward Island", "PEI"], "Quebec", "Saskatchewan", "Yukon"
        ]

        state = ""
        shortstate = ""
        engine.table = table
        engine.create_table()

        for state in stateslist:
            try:
                if isinstance(state, (list, )):
                    state, shortstate = state[0], state[1]
                else:
                    shortstate = state[0:7]
                print("Inserting data from " + state + "...")
                try:
                    engine.table.cleanup = Cleanup()
                    engine.insert_data_from_archive(
                        self.urls["counts"] + shortstate + ".zip",
                        [shortstate + ".csv"])
                except:
                    print("Failed bulk insert on " + state +
                          ", inserting manually.")
                    engine.connection.rollback()
                    engine.table.cleanup = self.cleanup_func_clean
                    engine.insert_data_from_archive(
                        self.urls["counts"] + shortstate + ".zip",
                        [shortstate + ".csv"])
            except:
                print("There was an error in " + state + ".")
                raise
    except zipfile.BadZipfile:
        print("There was an unexpected error in the Breeding Bird Survey "
              "archives.")
        raise
    return engine
def read_json(json_file, debug=False):
    """Read JSON dataset package files.

    Load each json and get the appropriate encoding for the dataset.
    Reload the json using the encoding to ensure correct character sets.
    """
    json_object = OrderedDict()
    json_file_encoding = None
    json_file = str(json_file) + ".json"

    try:
        file_obj = open_fr(json_file)
        json_object = json.load(file_obj)
        if "encoding" in json_object:
            json_file_encoding = json_object['encoding']
        file_obj.close()
    except ValueError:
        pass

    # Reload json using encoding if available
    try:
        if json_file_encoding:
            file_obj = open_fr(json_file, encoding=json_file_encoding)
        else:
            file_obj = open_fr(json_file)
        json_object = json.load(file_obj)
        file_obj.close()
    except ValueError:
        pass

    if isinstance(json_object, dict) and "resources" in json_object.keys():
        # Note: formats described by frictionless data may need to change
        tabular_exts = {"csv", "tab"}
        vector_exts = {"shp", "kmz"}
        raster_exts = {"tif", "tiff", "bil", "hdr", "h5", "hdf5", "hr",
                       "image"}
        for resource_item in json_object["resources"]:
            if "format" not in resource_item:
                if "format" in json_object:
                    resource_item["format"] = json_object["format"]
                else:
                    resource_item["format"] = "tabular"
            if "extensions" in resource_item:
                exts = set(resource_item["extensions"])
                if exts <= tabular_exts:
                    resource_item["format"] = "tabular"
                elif exts <= vector_exts:
                    resource_item["format"] = "vector"
                elif exts <= raster_exts:
                    resource_item["format"] = "raster"
            if "url" in resource_item:
                if "urls" in json_object:
                    json_object["urls"][resource_item["name"]] = \
                        resource_item["url"]
        json_object["tables"] = OrderedDict()
        temp_tables = {}
        table_names = [item["name"] for item in json_object["resources"]]
        temp_tables["tables"] = OrderedDict(
            zip(table_names, json_object["resources"]))
        for table_name, table_spec in temp_tables["tables"].items():
            json_object["tables"][table_name] = myTables[
                temp_tables["tables"][table_name]["format"]](**table_spec)
        json_object.pop("resources", None)
        return TEMPLATES["default"](**json_object)
    return None
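# The extension-based format resolution used by read_json above, standalone:
# a resource's extensions must be a subset of one of the known groups.
tabular_exts = {"csv", "tab"}
vector_exts = {"shp", "kmz"}
raster_exts = {"tif", "tiff", "bil", "hdr", "h5", "hdf5", "hr", "image"}

resource = {"name": "main", "extensions": ["shp"], "format": "tabular"}
exts = set(resource["extensions"])
if exts <= tabular_exts:
    resource["format"] = "tabular"
elif exts <= vector_exts:
    resource["format"] = "vector"
elif exts <= raster_exts:
    resource["format"] = "raster"
assert resource["format"] == "vector"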