def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    for key in self.urls:
        original_file_name = "trade_prdct_{}.txt".format(key)
        new_file_name = "trade_prdct_{}.csv".format(key)
        engine.download_file(self.urls[key], original_file_name)
        old_path = self.engine.format_filename(original_file_name)
        new_path = self.engine.format_filename(new_file_name)

        # Re-write the file with a single delimiter
        old_data = open_fr(old_path)
        new_data = open_fw(new_path)

        # Read the header line and convert "," to "|"
        line1 = old_data.readline().strip().replace(",", "|")
        new_data.write(line1 + "\n")
        for line in old_data:
            # Remove the leading "|" from the data
            new_data.write(line.strip("|"))
        new_data.close()
        old_data.close()
        table = Table(key, delimiter="|")
        engine.auto_create_table(table, filename=new_file_name)
        engine.insert_data_from_file(new_path)

def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    for key in self.urls:
        self.engine.download_file(self.urls[key],
                                  self.urls[key].rpartition('/')[-1])
        new_file_path = self.engine.format_filename("new" + key)
        old_data = open_fr(
            self.engine.find_file(self.urls[key].rpartition('/')[-1]))
        new_data = open_fw(new_file_path)
        with old_data as file_block:
            # After the metadata lines, data is set to True
            data = False
            for lines in file_block.readlines():
                # Metadata contains lines with no ";" and may have
                # "(;;;;)+" or empty lines
                if not data and (";" not in lines or ";;;;" in lines):
                    pass
                else:
                    data = True
                    new_data.write(lines)
        new_data.close()
        self.engine.auto_create_table(
            Table(key, cleanup=self.cleanup_func_table),
            filename=str("new" + key))
        self.engine.insert_data_from_file(new_file_path)

def json2csv(input_file, output_file=None, header_values=None, encoding=ENCODING):
    """Convert a JSON file to CSV.

    This function is used only for testing and can only handle small files.
    """
    file_out = open_fr(input_file, encoding=encoding)
    # Set the output file name and write the header
    if output_file is None:
        output_file = os.path.splitext(os.path.basename(input_file))[0] + ".csv"
    csv_out = open_fw(output_file, encoding=encoding)
    if os.name == 'nt':
        outfile = csv.DictWriter(csv_out,
                                 dialect='excel',
                                 escapechar="\\",
                                 lineterminator='\n',
                                 fieldnames=header_values)
    else:
        outfile = csv.DictWriter(csv_out,
                                 dialect='excel',
                                 escapechar="\\",
                                 fieldnames=header_values)
    raw_data = json.loads(file_out.read())
    outfile.writeheader()

    for item in raw_data:
        outfile.writerow(item)
    file_out.close()
    subprocess.call(['rm', '-r', input_file])
    return output_file

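# A minimal usage sketch for json2csv above; the file name and records are
# hypothetical, and json/os/csv plus the open_fr/open_fw helpers are assumed
# to be in scope, as in the function itself. Note that the helper deletes
# the input file afterwards by shelling out to rm, so this is POSIX-only.
import json

records = [{"id": 1, "name": "aves"}, {"id": 2, "name": "mammalia"}]
with open("records.json", "w") as fp:
    json.dump(records, fp)
# Writes records.csv with an "id,name" header row, then removes records.json
json2csv("records.json", header_values=["id", "name"])
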
def create_table(self):
    """Create the table by creating an empty XML file."""
    self.output_file = open_fw(self.table_name())
    self.output_file.write(u'<?xml version="1.0" encoding="UTF-8"?>')
    self.output_file.write(u'\n<root>')
    self.table_names.append((self.output_file, self.table_name()))
    self.auto_column_number = 1

def xml2csv_test(input_file, outputfile=None, header_values=None, row_tag="row"):
    """Convert XML to CSV.

    This function is used only for testing and can only handle small files.
    """
    file_output = open_fr(input_file, encoding=ENCODING)
    # Set the output file name and write the header
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(input_file))[0] + ".csv"
    csv_out = open_fw(outputfile)
    if os.name == 'nt':
        csv_writer = csv.writer(csv_out,
                                dialect='excel',
                                escapechar='\\',
                                lineterminator='\n')
    else:
        csv_writer = csv.writer(csv_out, dialect='excel', escapechar='\\')

    v = file_output.read()
    csv_writer.writerow(header_values)
    tree = ET.parse(NewFile(v))
    root = tree.getroot()
    for rows in root.findall(row_tag):
        x = [name.text for name in header_values for name in rows.findall(name)]
        csv_writer.writerow(x)
    file_output.close()
    subprocess.call(['rm', '-r', input_file])
    return outputfile

def write_out_scripts(script_dict, path, out_path):
    """Write scripts out to a given path."""
    names = os.path.basename(path)
    file_name = names.lower().replace("-", "_") + ".json"
    path_dir = get_directory(os.path.expanduser(path))
    if out_path is not None:
        path_dir = os.path.expanduser(out_path)
        if not os.path.exists(path_dir):
            os.mkdir(path_dir)
    write_path = os.path.join(path_dir, file_name)

    if not (script_dict and "resources" in script_dict):
        print(write_path + " creation skipped because resources were empty.")
        return
    if os.path.exists(write_path):
        choice = clean_input(write_path +
                             " already exists. Overwrite the script? [y/n]")
        if choice == "n":
            print(write_path + " creation skipped.")
            return
    try:
        with open_fw(write_path) as output_path:
            sorted_dict = collections.OrderedDict(script_dict.items())
            json_str = json.dumps(sorted_dict, sort_keys=True, indent=4)
            output_path.write(json_str)
            print("Successfully wrote scripts to " +
                  os.path.abspath(write_path))
    except Exception as error:
        print(write_path + " could not be created. {}".format(error))

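# A small sketch of calling write_out_scripts, assuming script_dict is shaped
# like the retriever's JSON dataset definitions; the dataset name and URL
# here are illustrative, not a real dataset.
example_script = {
    "name": "example-dataset",
    "resources": [{"name": "main", "url": "http://example.com/data.csv"}],
}
# Writes example_dataset.json under ./scripts (creating the directory if
# needed) and prompts before overwriting an existing file.
write_out_scripts(example_script, "example-dataset", "scripts")
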
def to_csv(self, sort=True, path=None):
    # Due to cyclic imports we cannot move this import to the top
    from retriever.lib.engine_tools import sort_csv

    for table_name in self.script_table_registry[self.script.name]:
        csv_file_output = os.path.normpath(
            os.path.join(path if path else '', table_name[0] + '.csv'))
        csv_file = open_fw(csv_file_output)
        csv_writer = open_csvw(csv_file)
        self.get_cursor()
        self.set_engine_encoding()
        self.cursor.execute("SELECT * FROM {};".format(table_name[0]))
        row = self.cursor.fetchone()
        column_names = [u'{}'.format(tuple_i[0])
                        for tuple_i in self.cursor.description]
        csv_writer.writerow(column_names)
        while row is not None:
            csv_writer.writerow(row)
            row = self.cursor.fetchone()
        csv_file.close()
        if sort:
            sort_csv(csv_file_output)
    self.disconnect()

def create_table(self):
    """Create the table by creating an empty csv file"""
    self.auto_column_number = 1
    self.file = open_fw(self.table_name())
    self.output_file = open_csvw(self.file)
    self.output_file.writerow([
        u'{}'.format(val)
        for val in self.table.get_insert_columns(join=False, create=True)
    ])
    self.table_names.append((self.file, self.table_name()))

def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    self.engine.download_file(self.urls["main"],
                              "Succession_sampling_03-07_data_original.txt")
    data_path = self.engine.format_filename(
        "Succession_sampling_03-07_data.txt")
    old_data = open_fr(
        self.engine.find_file("Succession_sampling_03-07_data_original.txt"))
    new_data = open_fw(data_path)

    # The original file's header contains an end-of-line character in the
    # middle, creating two lines.
    # Read in the two lines and create the full header.
    line1 = old_data.readline().strip()
    line2 = old_data.readline()
    newline = line1 + "\t" + line2
    new_data.write(newline)
    for line in old_data:
        new_data.write(line)
    new_data.close()
    old_data.close()
    self.engine.auto_create_table(
        self.tables["main"], filename="Succession_sampling_03-07_data.txt")
    self.engine.insert_data_from_file(data_path)

def create_table(self):
    """Create the table by creating an empty csv file"""
    self.auto_column_number = 1
    table_path = os.path.join(self.opts["data_dir"], self.table_name())
    self.file = open_fw(table_path)
    self.output_file = open_csvw(self.file)
    column_list = self.table.get_insert_columns(join=False, create=True)
    self.output_file.writerow([u'{}'.format(val) for val in column_list])
    self.table_names.append((self.file, table_path))

    # Register all created tables to enable testing
    # of scripts with a custom download function
    Engine.register_tables(self)

def create_table(self):
    """Create the table by creating an empty json file"""
    self.output_file = open_fw(self.table_name())
    self.output_file.write("[")
    self.table_names.append((self.output_file, self.table_name()))
    self.auto_column_number = 1

    # Register all created tables to enable testing
    # of scripts with a custom download function
    if self.script.name not in self.script_table_registry:
        self.script_table_registry[self.script.name] = []
    self.script_table_registry[self.script.name].append(
        (self.table_name(), self.table))

def create_table(self):
    """Create the table by creating an empty json file"""
    table_path = os.path.join(self.opts["data_dir"], self.table_name())
    self.output_file = open_fw(table_path, encoding=self.encoding)
    self.output_file.write("[")
    self.table_names.append((self.output_file, table_path))
    self.auto_column_number = 1

    # Register all created tables to enable testing
    # of scripts with a custom download function
    if self.script.name not in self.script_table_registry:
        self.script_table_registry[self.script.name] = []
    self.script_table_registry[self.script.name].append(
        (self.table_name(), self.table))

def create_table(self):
    """Create the table by creating an empty XML file."""
    self.output_file = open_fw(self.table_name())
    self.output_file.write(u'<?xml version="1.0" encoding="UTF-8"?>')
    self.output_file.write(u'\n<root>')
    self.table_names.append((self.output_file, self.table_name()))
    self.auto_column_number = 1

    # Register all created tables to enable testing
    # of scripts with a custom download function
    if self.script.name not in self.script_table_registry:
        self.script_table_registry[self.script.name] = []
    self.script_table_registry[self.script.name].append(
        (self.table_name(), self.table))

def sort_csv(filename, encoding=ENCODING):
    """Sort CSV rows, excluding the header, and return the file.

    This function is used only for testing and can only handle small files.
    """
    filename = os.path.normpath(filename)
    input_file = open_fr(filename, encoding)
    csv_reader_infile = csv.reader(input_file, escapechar="\\")
    # Write the data to a temporary file and sort it
    temp_path = os.path.normpath("tempfile")
    temp_file = open_fw(temp_path, encoding)
    csv_writer = open_csvw(temp_file)
    i = 0
    infields = None
    for row in csv_reader_infile:
        if i == 0:
            # The first entry is the header line
            infields = row
            i += 1
        else:
            csv_writer.writerow(row)
    input_file.close()
    temp_file.close()

    # Sort the temp file
    sorted_txt = sort_file(temp_path, encoding)
    tmp = open_fr(sorted_txt, encoding)
    in_txt = csv.reader(tmp, delimiter=',', escapechar="\\")
    csv_file = open_fw(filename, encoding)
    csv_writer = open_csvw(csv_file)
    csv_writer.writerow(infields)
    csv_writer.writerows(in_txt)
    tmp.close()
    csv_file.close()
    os.remove(os.path.normpath(temp_path))
    return filename

def sort_file(file_path, encoding=ENCODING):
    """Sort a file by line and return the file.

    This function is used only for testing and can only handle small files.
    """
    file_path = os.path.normpath(file_path)
    input_file = open_fr(file_path, encoding)
    lines = [line.strip() for line in input_file]
    input_file.close()
    outfile = open_fw(file_path, encoding)
    lines.sort()
    for line in lines:
        outfile.write(line + "\n")
    outfile.close()
    return file_path

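# Usage sketch for the two sorting helpers above; the paths are hypothetical.
# sort_file sorts raw lines in place, while sort_csv keeps the header row
# first and sorts only the data rows.
sorted_lines_path = sort_file("downloads/species.txt")
sorted_table_path = sort_csv("downloads/species.csv")
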
def create_table(self):
    """Create the table by creating an empty csv file"""
    self.auto_column_number = 1
    self.file = open_fw(self.table_name())
    self.output_file = open_csvw(self.file)
    column_list = self.table.get_insert_columns(join=False, create=True)
    self.output_file.writerow([u'{}'.format(val) for val in column_list])
    self.table_names.append((self.file, self.table_name()))

    # Register all created tables to enable testing
    # of scripts with a custom download function
    if self.script.name not in self.script_table_registry:
        self.script_table_registry[self.script.name] = []
    self.script_table_registry[self.script.name].append(
        (self.table_name(), self.table))

def create_table(self):
    """Create the table by creating an empty json file"""
    table_path = os.path.join(self.opts["data_dir"], self.table_name())
    self.output_file = open_fw(table_path)
    self.output_file.write("[")
    self.table_names.append((self.output_file, table_path))
    self.auto_column_number = 1

    # Register all created tables to enable testing
    # of scripts with a custom download function
    if self.script.name not in self.script_table_registry:
        self.script_table_registry[self.script.name] = []
    self.script_table_registry[self.script.name].append(
        (self.table_name(), self.table))

def disconnect(self):
    """Close out the JSON with a `\\n]` and close the file.

    Close all the file objects that have been created.
    Re-write the files, stripping off the last comma, and then close
    them with a `\\n]`.
    """
    if self.table_names:
        for output_file_i, file_name in self.table_names:
            output_file_i.close()
            current_input_file = open_fr(file_name)
            file_contents = current_input_file.readlines()
            current_input_file.close()
            file_contents[-1] = file_contents[-1].strip(',\n')
            current_output_file = open_fw(file_name)
            current_output_file.writelines(file_contents)
            current_output_file.writelines(['\n]'])
            current_output_file.close()
        self.table_names = []

def disconnect(self):
    """Close out the XML files.

    Close all the file objects that have been created.
    Re-write the files, stripping off the last comma, and then close
    them with a closing tag.
    """
    if self.table_names:
        for output_file_i, file_name in self.table_names:
            output_file_i.close()
            current_input_file = open_fr(file_name)
            file_contents = current_input_file.readlines()
            current_input_file.close()
            file_contents[-1] = file_contents[-1].strip(',')
            current_output_file = open_fw(file_name)
            current_output_file.writelines(file_contents)
            current_output_file.write(u'\n</root>')
            current_output_file.close()
        self.table_names = []

def to_csv(self, sort=True, path=None, select_columns=None, select_table=None):
    """Create a CSV file from a data store.

    `sort` creates a sorted file; `path` sets where the file is written,
    otherwise it is written to the current working directory;
    `select_columns` is used for large files to select specific columns
    of data, and limits the query with LIMIT 3.
    """
    # Due to cyclic imports we cannot move this import to the top
    from retriever.lib.engine_tools import sort_csv

    for table_name in self.script_table_registry[self.script.name]:
        csv_file_output = os.path.normpath(
            os.path.join(path if path else '', table_name[0] + '.csv'))
        self.get_cursor()
        self.set_engine_encoding()
        csv_file = open_fw(csv_file_output, encoding=self.encoding)
        csv_writer = open_csvw(csv_file)
        limit = ""
        cols = "*"
        if select_columns:
            limit = "LIMIT 3"
            cols = ",".join(select_columns)
        sql_query = "SELECT {cols} FROM {tab} {limit};"
        self.cursor.execute(
            sql_query.format(cols=cols, tab=table_name[0], limit=limit))
        row = self.cursor.fetchone()
        column_names = [u'{}'.format(tuple_i[0])
                        for tuple_i in self.cursor.description]
        csv_writer.writerow(column_names)
        while row is not None:
            csv_writer.writerow(row)
            row = self.cursor.fetchone()
        csv_file.close()
        if sort:
            sort_csv(csv_file_output)
    self.disconnect()

def convert_to_csv(dir_name):
    """Change the file delimiter to a comma delimiter"""
    for file_name in os.listdir(dir_name):
        file_path = os.path.join(dir_name, file_name)
        if file_path.endswith(".txt"):
            csv_file_name = file_name.replace(".txt", ".csv").lower()
            output_file = os.path.join(dir_name, csv_file_name)
            with open_fr(file_path, encoding="latin-1") as read_object, \
                    open_fw(output_file) as outputfw:
                fr = csv.reader(read_object, delimiter="^", quotechar="~")
                fw = csv.writer(outputfw, delimiter=",",
                                quoting=csv.QUOTE_MINIMAL)
                for line in fr:
                    if line:
                        fw.writerow(line)
            # Delete the text file
            os.remove(file_path)

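# Usage sketch for convert_to_csv; the directory is hypothetical. Every .txt
# file in it that is "^"-delimited and "~"-quoted is rewritten as a
# lowercase-named, comma-delimited .csv, and the original .txt is removed.
convert_to_csv("raw_data")
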
def to_csv(self):
    # Due to cyclic imports we cannot move this import to the top
    from retriever.lib.engine_tools import sort_csv

    for _ in list(self.script.urls.keys()):
        table_name = self.table_name()
        csv_file_output = os.path.normpath(table_name + '.csv')
        csv_file = open_fw(csv_file_output)
        csv_writer = open_csvw(csv_file)
        self.get_cursor()
        self.set_engine_encoding()
        self.cursor.execute("SELECT * FROM {};".format(table_name))
        row = self.cursor.fetchone()
        colnames = [u'{}'.format(tuple_i[0])
                    for tuple_i in self.cursor.description]
        csv_writer.writerow(colnames)
        while row is not None:
            csv_writer.writerow(row)
            row = self.cursor.fetchone()
        csv_file.close()
        sort_csv(csv_file_output)
    self.disconnect()

def to_csv(self, sort=True):
    # Due to cyclic imports we cannot move this import to the top
    from retriever.lib.engine_tools import sort_csv

    for table_n in list(self.script.tables.keys()):
        table_name = self.table_name(name=table_n)
        csv_file_output = os.path.normpath(table_name + '.csv')
        csv_file = open_fw(csv_file_output)
        csv_writer = open_csvw(csv_file)
        self.get_cursor()
        self.set_engine_encoding()
        self.cursor.execute("SELECT * FROM {};".format(table_name))
        row = self.cursor.fetchone()
        colnames = [u'{}'.format(tuple_i[0])
                    for tuple_i in self.cursor.description]
        csv_writer.writerow(colnames)
        while row is not None:
            csv_writer.writerow(row)
            row = self.cursor.fetchone()
        csv_file.close()
        if sort:
            sort_csv(csv_file_output)
    self.disconnect()

def json2csv(input_file, output_file=None, header_values=None,
             encoding=ENCODING, row_key=None):
    """Convert a JSON file to CSV."""
    file_out = open_fr(input_file, encoding=encoding)
    # Set the output file name and write the header
    if output_file is None:
        output_file = os.path.splitext(os.path.basename(input_file))[0] + ".csv"
    csv_out = open_fw(output_file, encoding=encoding)
    if os.name == 'nt':
        outfile = csv.writer(csv_out,
                             dialect='excel',
                             escapechar="\\",
                             lineterminator='\n')
    else:
        outfile = csv.writer(csv_out, dialect='excel', escapechar="\\")

    raw_data = json.loads(file_out.read(), object_pairs_hook=OrderedDict)
    raw_data, header_values = walker(raw_data,
                                     row_key=row_key,
                                     header_values=header_values,
                                     rows=[],
                                     normalize=False)
    if isinstance(raw_data[0], dict):
        # Row values are in a list of dictionaries
        raw_data = [list(row.values()) for row in raw_data]
    else:
        raw_data = [row.tolist() for row in raw_data]
    if header_values:
        outfile.writerow(header_values)
    outfile.writerows(raw_data)
    file_out.close()
    subprocess.call(['rm', '-r', input_file])
    return output_file

def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    original_data = "Succession_sampling_03-07_data_original.txt"
    new_data_name = "Succession_sampling_03-07_data.txt"
    self.engine.download_file(self.urls["main"], original_data)
    data_path = self.engine.format_filename(new_data_name)
    old_data = open_fr(self.engine.find_file(original_data))
    new_data = open_fw(data_path)

    # The original file's header contains an end-of-line character
    # in the middle, creating two lines.
    # Read in the two lines and create the full header.
    line1 = old_data.readline().strip()
    line2 = old_data.readline()
    newline = line1 + "\t" + line2
    new_data.write(newline)
    for line in old_data:
        new_data.write(line)
    new_data.close()
    old_data.close()
    self.engine.auto_create_table(self.tables["main"],
                                  filename=new_data_name)
    self.engine.insert_data_from_file(data_path)

def to_csv(self, sort=True, path=None, select_columns=None):
    """Create a CSV file from a data store.

    `sort` creates a sorted file; `path` sets where the file is written,
    otherwise it is written to the current working directory;
    `select_columns` is used for large files to select specific columns
    of data, and limits the query with LIMIT 3.
    """
    # Due to cyclic imports we cannot move this import to the top
    from retriever.lib.engine_tools import sort_csv

    for table_name in self.script_table_registry[self.script.name]:
        csv_file_output = os.path.normpath(
            os.path.join(path if path else '', table_name[0] + '.csv'))
        csv_file = open_fw(csv_file_output)
        csv_writer = open_csvw(csv_file)
        self.get_cursor()
        self.set_engine_encoding()
        limit = ""
        cols = "*"
        if select_columns:
            limit = "LIMIT 3"
            cols = ",".join(select_columns)
        sql_query = "SELECT {cols} FROM {tab} {limit};"
        self.cursor.execute(
            sql_query.format(cols=cols, tab=table_name[0], limit=limit))
        row = self.cursor.fetchone()
        column_names = [u'{}'.format(tuple_i[0])
                        for tuple_i in self.cursor.description]
        csv_writer.writerow(column_names)
        while row is not None:
            csv_writer.writerow(row)
            row = self.cursor.fetchone()
        csv_file.close()
        if sort:
            sort_csv(csv_file_output)
    self.disconnect()

def create_table(self):
    """Create the table by creating an empty json file"""
    self.output_file = open_fw(self.table_name())
    self.output_file.write("[")
    self.table_names.append((self.output_file, self.table_name()))
    self.auto_column_number = 1

def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    engine.download_files_from_archive(self.urls["capture"],
                                       archive_type="zip")

    # Convert xlsx to csv
    xlsx_file = self.engine.format_filename("DSD_FI_CAPTURE.xlsx")
    file_path = self.engine.format_filename("DSD_CAPTURE.csv")
    book = xlrd.open_workbook(xlsx_file)
    sh = book.sheet_by_index(0)
    rows = sh.nrows

    # Creating data files
    new_data = open_fw(file_path)
    csv_writer = open_csvw(new_data)
    csv_writer.writerow(["Order", "Concept_id", "Role_Type", "Codelist_id",
                         "Codelist_Code_id", "Description"])
    for index in range(2, rows):
        row = sh.row(index)
        # Get each row and format the cell value.
        # Data starts at index 2
        row_as_list = [to_str(column_value.value) for column_value in row]
        csv_writer.writerow(row_as_list)
    new_data.close()

    file_names = [
        ('CL_FI_UNIT.csv', 'unit_data'),
        ('CL_FI_WATERAREA_GROUPS.csv', 'waterarea_groups'),
        ('DSD_CAPTURE.csv', 'dsd_capture_data'),
        ('CL_FI_SPECIES_GROUPS.csv', 'species_group'),
    ]
    for (filename, tablename) in file_names:
        data_path = self.engine.format_filename(filename)
        table = Table(tablename, delimiter=',',
                      cleanup=self.cleanup_func_table)
        self.engine.auto_create_table(table, filename=filename)
        self.engine.insert_data_from_file(data_path)

    # The file CL_FI_COUNTRY_GROUPS.csv has mixed encodings
    file_names_encoded = [
        ('CL_FI_COUNTRY_GROUPS.csv', 'country_groups'),
    ]
    for (filename, tablename) in file_names_encoded:
        data_path = self.engine.format_filename(filename)
        table = Table(tablename, delimiter=',',
                      cleanup=self.cleanup_func_table)
        table.columns = [('UN_Code', ('int', )),
                         ('Identifier', ('int', )),
                         ('ISO2_Code', ('char', '5')),
                         ('ISO3_Code', ('char', '5')),
                         ('Name_En', ('char', '50')),
                         ('Name_Fr', ('char', '50')),
                         ('Name_Es', ('char', '50')),
                         ('Name_Ar', ('char', '120')),
                         ('Name_Cn', ('char', '90')),
                         ('Name_Ru', ('char', '150')),
                         ('Official_Name_En', ('char', '70')),
                         ('Official_Name_Fr', ('char', '70')),
                         ('Official_Name_Es', ('char', '70')),
                         ('Official_Name_Ar', ('char', '1100')),
                         ('Official_Name_Cn', ('char', '70')),
                         ('Official_Name_Ru', ('char', '130')),
                         ('Continent_Group', ('char', '15')),
                         ('EcoClass_Group', ('char', '50')),
                         ('GeoRegion_Group', ('char', '30'))]
        self.engine.auto_create_table(table, filename=filename)
        self.engine.insert_data_from_file(data_path)

    # TS_FI_CAPTURE also needs its column types set explicitly
    file_names_encoded = [
        ('TS_FI_CAPTURE.csv', 'ts_capture_data'),
    ]
    for (filename, tablename) in file_names_encoded:
        data_path = self.engine.format_filename(filename)
        table = Table(tablename, delimiter=',',
                      cleanup=self.cleanup_func_table)
        table.columns = [('COUNTRY', ('int', )),
                         ('FISHING_AREA', ('int', )),
                         ('SPECIES', ('char', '10')),
                         ('YEAR', ('int', )),
                         ('UNIT', ('char', '5')),
                         ('QUANTITY', ('double', )),
                         ('SYMBOL', ('char', '4'))]
        self.engine.auto_create_table(table, filename=filename)
        self.engine.insert_data_from_file(data_path)

from retriever.lib.defaults import ENCODING

encoding = ENCODING.lower()

from retriever.lib.defaults import VERSION, COPYRIGHT
from retriever.lib.scripts import SCRIPT_LIST, reload_scripts
from retriever.lib.tools import open_fw
from retriever.lib.repository import check_for_updates


def to_str(object, object_encoding=encoding):
    return str(object).encode('UTF-8').decode(encoding)


# Create the .rst file for the available datasets
datasetfile = open_fw("datasets_list.rst")
datasetfile_title = """==================
Datasets Available
==================

"""
check_for_updates()
reload_scripts()
script_list = SCRIPT_LIST()

# Write the title of the dataset rst file
# ref: http://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html
datasetfile.write(datasetfile_title)

# Get info from the scripts using the specified encoding

def download(self, engine=None, debug=False):
    try:
        Script.download(self, engine, debug)
        engine = self.engine

        # Species table
        table = Table("species", cleanup=Cleanup(), contains_pk=True,
                      header_rows=9)
        table.columns = [
            ("species_id", ("pk-int", )),
            ("AOU", ("int", )),
            ("english_common_name", ("char", 50)),
            ("french_common_name", ("char", 50)),
            ("spanish_common_name", ("char", 50)),
            ("sporder", ("char", 30)),
            ("family", ("char", 30)),
            ("genus", ("char", 30)),
            ("species", ("char", 50)),
        ]
        table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["species"])

        # Routes table
        engine.download_files_from_archive(self.urls["routes"],
                                           ["routes.csv"])
        engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                 filename="routes.csv")
        engine.insert_data_from_file(engine.format_filename("routes.csv"))

        # Weather table
        if not os.path.isfile(engine.format_filename("weather_new.csv")):
            engine.download_files_from_archive(self.urls["weather"],
                                               ["weather.csv"])
            read = open_fr(engine.format_filename("weather.csv"))
            write = open_fw(engine.format_filename("weather_new.csv"))
            print("Cleaning weather data...")
            for line in read:
                values = line.split(',')
                newvalues = []
                for value in values:
                    if ':' in value:
                        newvalues.append(value.replace(':', ''))
                    elif value == "N":
                        newvalues.append(None)
                    else:
                        newvalues.append(value)
                write.write(','.join(str(value) for value in newvalues))
            write.close()
            read.close()
        engine.auto_create_table(Table("weather", pk="RouteDataId",
                                       cleanup=self.cleanup_func_table),
                                 filename="weather_new.csv")
        engine.insert_data_from_file(
            engine.format_filename("weather_new.csv"))

        # Region_codes table
        table = Table("region_codes", pk=False, header_rows=11,
                      fixed_width=[11, 11, 30])

        def regioncodes_cleanup(value, engine):
            replace = {
                chr(225): "a",
                chr(233): "e",
                chr(237): "i",
                chr(243): "o"
            }
            newvalue = str(value)
            for key in list(replace.keys()):
                if key in newvalue:
                    newvalue = newvalue.replace(key, replace[key])
            return newvalue

        table.cleanup = Cleanup(regioncodes_cleanup)
        table.columns = [("countrynum", ("int", )),
                         ("regioncode", ("int", )),
                         ("regionname", ("char", 30))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["region_codes"])

        # Counts table
        table = Table("counts", delimiter=',')
        table.columns = [("record_id", ("pk-auto", )),
                         ("countrynum", ("int", )),
                         ("statenum", ("int", )),
                         ("Route", ("int", )),
                         ("RPID", ("int", )),
                         ("Year", ("int", )),
                         ("Aou", ("int", )),
                         ("Count10", ("int", )),
                         ("Count20", ("int", )),
                         ("Count30", ("int", )),
                         ("Count40", ("int", )),
                         ("Count50", ("int", )),
                         ("StopTotal", ("int", )),
                         ("SpeciesTotal", ("int", ))]

        stateslist = [
            "Alabama", "Alaska", "Arizona", "Arkansas", "California",
            "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
            "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky",
            "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan",
            "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska",
            "Nevada", ["New Hampshire", "NHampsh"], ["New Jersey", "NJersey"],
            ["New Mexico", "NMexico"], ["New York", "NYork"],
            ["North Carolina", "NCaroli"], ["North Dakota", "NDakota"],
            "Ohio", "Oklahoma", "Oregon", "Pennsylvania",
            ["Rhode Island", "RhodeIs"], ["South Carolina", "SCaroli"],
            ["South Dakota", "SDakota"], "Tennessee", "Texas", "Utah",
            "Vermont", "Virginia", "Washington", ["West Virginia", "W_Virgi"],
            "Wisconsin", "Wyoming", "Alberta", ["British Columbia", "BritCol"],
            "Manitoba", ["New Brunswick", "NBrunsw"],
            ["Northwest Territories", "NWTerri"], "Newfoundland",
            ["Nova Scotia", "NovaSco"], "Nunavut", "Ontario",
            ["Prince Edward Island", "PEI"], "Quebec", "Saskatchewan", "Yukon"
        ]

        engine.table = table
        engine.create_table()

        for state in stateslist:
            try:
                if isinstance(state, list):
                    state, shortstate = state[0], state[1]
                else:
                    shortstate = state[0:7]
                print("Inserting data from " + state + "...")
                try:
                    engine.table.cleanup = Cleanup()
                    engine.insert_data_from_archive(
                        self.urls["counts"] + shortstate + ".zip",
                        [shortstate + ".csv"])
                except:
                    print("Failed bulk insert on " + state +
                          ", inserting manually.")
                    engine.connection.rollback()
                    engine.table.cleanup = self.cleanup_func_clean
                    engine.insert_data_from_archive(
                        self.urls["counts"] + shortstate + ".zip",
                        [shortstate + ".csv"])
            except:
                print("There was an error in " + state + ".")
                raise
    except zipfile.BadZipfile:
        print("There was an unexpected error in the Breeding Bird Survey "
              "archives.")
        raise
    return engine

def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    csv_files = []
    request_src = "http://www.data-retriever.org/"
    base_url = ("http://www.usanpn.org/npn_portal/observations/"
                "getObservations.xml?start_date={startYear}"
                "&end_date={endYear_date}&request_src={request_src}")
    header_values = [
        "observation_id", "update_datetime", "site_id", "latitude",
        "longitude", "elevation_in_meters", "state", "species_id",
        "genus", "species", "common_name", "kingdom", "individual_id",
        "phenophase_id", "phenophase_description", "observation_date",
        "day_of_year", "phenophase_status", "intensity_category_id",
        "intensity_value", "abundance_value"
    ]
    columns = [
        ("record_id", ("pk-auto",)),
        ("observation_id", ("int",)),  # subsequently referred to as "status record"
        ("update_datetime", ("char",)),
        ("site_id", ("int",)),
        ("latitude", ("double",)),
        ("longitude", ("double",)),
        ("elevation_in_meters", ("char",)),
        ("state", ("char",)),
        ("species_id", ("int",)),
        ("genus", ("char",)),
        ("species", ("char",)),
        ("common_name", ("char",)),
        ("kingdom", ("char",)),  # skip kingdom
        ("individual_id", ("char",)),
        ("phenophase_id", ("int",)),
        ("phenophase_description", ("char",)),
        ("observation_date", ("char",)),
        ("day_of_year", ("char",)),
        ("phenophase_status", ("char",)),
        ("intensity_category_id", ("char",)),
        ("intensity_value", ("char",)),
        ("abundance_value", ("char",)),
    ]

    start_date = datetime.date(2009, 1, 1)
    end_date = datetime.date.today()

    while start_date < end_date:
        to_date = start_date + datetime.timedelta(90)
        if to_date >= end_date:
            data_url = base_url.format(startYear=str(start_date),
                                       endYear_date=str(end_date),
                                       request_src=request_src)
        else:
            data_url = base_url.format(startYear=str(start_date),
                                       endYear_date=str(to_date),
                                       request_src=request_src)

        xml_file_name = '{}'.format(start_date) + ".xml"
        engine.download_file(data_url, xml_file_name)

        # Create csv files for 3 months
        csv_observation = '{}'.format(start_date) + ".csv"
        csv_files.append(csv_observation)
        csv_buff = open_fw(engine.format_filename(csv_observation))
        csv_writer = open_csvw(csv_buff)
        csv_writer.writerow(header_values)

        # Parse xml to read data
        file_read = ""
        fname = DATA_WRITE_PATH.strip('{dataset}') + 'NPN/' + xml_file_name
        with open(fname, 'r') as fp1:
            file_read = fp1.read()

        root = ET.fromstring(file_read)

        for elements in root:
            index_map = {val: i for i, val in enumerate(header_values)}
            diction = sorted(elements.attrib.items(),
                             key=lambda pair: index_map[pair[0]])
            csv_writer.writerow([x[1] for x in diction])
        csv_buff.close()
        start_date = to_date + datetime.timedelta(1)

    # Create table
    table = Table('observations', delimiter=',', pk='record_id',
                  contains_pk=True)
    table.columns = columns
    engine.table = table
    engine.create_table()
    for data_file in csv_files:
        engine.insert_data_from_file(engine.find_file(data_file))
    return engine

def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine

    # Complete Plants Checklist
    file_name = "complete_plant_checklist.csv"
    table_name = "complete_plant_checklist"
    complete_plant_url = ("https://plants.sc.egov.usda.gov/java/"
                          "downloadData?fileName=plantlst.txt&static=true")
    self.engine.download_file(complete_plant_url, filename=file_name)
    data_path = self.engine.format_filename(file_name)
    table = Table(table_name, delimiter=",")
    table.columns = [
        ("symbol", ("char", "7")),
        ("synonym_symbol", ("char", "7")),
        ("scientific_name_with_author", ("char", "183")),
        ("common_name", ("char", "42")),
        ("family", ("char", "30")),
    ]
    self.engine.auto_create_table(table, filename=file_name)
    self.engine.insert_data_from_file(data_path)

    # Symbols for Unknown Plants
    file_name = "symbols_unknown_plants.csv"
    table_name = "unknown_plants"
    unknown_plants_url = "https://plants.sc.egov.usda.gov/Data/unknown_plants.txt"
    self.engine.download_file(unknown_plants_url, filename=file_name)
    data_path = self.engine.format_filename(file_name)
    table = Table(table_name, delimiter=",")
    table.columns = [("symbol", ("char", "7")),
                     ("common_name", ("char", "56"))]
    self.engine.auto_create_table(table, filename=file_name)
    self.engine.insert_data_from_file(data_path)

    # State PLANTS Checklist
    base_url = "https://plants.sc.egov.usda.gov/"
    state_plant_checklist_base_url = "{base}java/stateDownload?statefips={id}"
    state_plant_checklist_file = "all_state_plant_checklist.csv"
    table_name = "state_plant_checklist"
    state_plant_checklist = [
        ("US01", "Alabama", "US"), ("US02", "Alaska", "US"),
        ("US05", "Arkansas", "US"), ("US04", "Arizona", "US"),
        ("US06", "California", "US"), ("US08", "Colorado", "US"),
        ("US09", "Connecticut", "US"), ("US10", "Delaware", "US"),
        ("US11", "District of Columbia", "US"), ("US12", "Florida", "US"),
        ("US13", "Georgia", "US"), ("US15", "Hawaii", "US"),
        ("US16", "Idaho", "US"), ("US17", "Illinois", "US"),
        ("US18", "Indiana", "US"), ("US19", "Iowa", "US"),
        ("US20", "Kansas", "US"), ("US21", "Kentucky", "US"),
        ("US22", "Louisiana", "US"), ("US23", "Maine", "US"),
        ("US24", "Maryland", "US"), ("US25", "Massachusetts", "US"),
        ("US26", "Michigan", "US"), ("US27", "Minnesota", "US"),
        ("US28", "Mississippi", "US"), ("US29", "Missouri", "US"),
        ("US30", "Montana", "US"), ("US31", "Nebraska", "US"),
        ("US32", "Nevada", "US"), ("US33", "New Hampshire", "US"),
        ("US34", "New Jersey", "US"), ("US35", "New Mexico", "US"),
        ("US36", "New York", "US"), ("US37", "North Carolina", "US"),
        ("US38", "North Dakota", "US"), ("US39", "Ohio", "US"),
        ("US40", "Oklahoma", "US"), ("US41", "Oregon", "US"),
        ("US42", "Pennsylvania", "US"), ("US44", "Rhode Island", "US"),
        ("US45", "South Carolina", "US"), ("US46", "South Dakota", "US"),
        ("US47", "Tennessee", "US"), ("US48", "Texas", "US"),
        ("US49", "Utah", "US"), ("US50", "Vermont", "US"),
        ("US51", "Virginia", "US"), ("US53", "Washington", "US"),
        ("US54", "West Virginia", "US"), ("US55", "Wisconsin", "US"),
        ("US56", "Wyoming", "US"), ("US72", "Puerto Rico", "US"),
        ("US78", "Virgin Islands", "US"), ("CA01", "Alberta", "Canada"),
        ("CA02", "British Columbia", "Canada"), ("CA03", "Manitoba", "Canada"),
        ("CA04", "New Brunswick", "Canada"), ("CALB", "Labrador", "Canada"),
        ("CANF", "Newfoundland", "Canada"),
        ("CA13", "Northwest Territories", "Canada"),
        ("CA07", "Nova Scotia", "Canada"), ("CA14", "Nunavut", "Canada"),
        ("CA08", "Ontario", "Canada"),
        ("CA09", "Prince Edward Island", "Canada"),
        ("CA10", "Québec", "Canada"), ("CA11", "Saskatchewan", "Canada"),
        ("CA12", "Yukon", "Canada"), ("GL", "Greenland", "Denmark"),
        ("SB", "St. Pierre and Miquelon", "France"),
    ]
    with open_fw(engine.format_filename(
            state_plant_checklist_file)) as write_object:
        csv_writer = open_csvw(write_object)
        for state_info in state_plant_checklist:
            file_name = state_info[1].replace(".", "").replace(
                " ", "_").lower() + ".csv"
            file_name = "old_state_plant_checklist_" + file_name
            state_url = state_plant_checklist_base_url.format(
                base=base_url, id=state_info[0])
            self.engine.download_file(state_url, filename=file_name)
            with open_fr(engine.format_filename(file_name)) as read_object:
                # Read the state file and write only the data, minus the header
                next(read_object)
                for row in csv.reader(read_object, delimiter=","):
                    csv_writer.writerow([state_info[2]] + [state_info[1]] + row)
    data_path = self.engine.format_filename(state_plant_checklist_file)
    table = Table(table_name, delimiter=",", header_rows=0)
    table.columns = [
        ("country", ("char", "7")),
        ("state", ("char", "23")),
        ("symbol", ("char", "7")),
        ("synonym_symbol", ("char", "7")),
        ("scientific_name_with_author", ("char", "183")),
        ("national_common_name", ("char", "42")),
        ("family", ("char", "17")),
    ]
    self.engine.auto_create_table(table, filename=state_plant_checklist_file)
    self.engine.insert_data_from_file(data_path)

    # NRCS State GSAT Lists
    base_url = "https://www.plants.usda.gov/"
    nrcs_state_gsat_base_url = "{base}java/gsatDownload?gsatid={id}"
    nrcs_state_gsat_file = "all_nrcs_state_gsat.csv"
    table_name = "nrcs_state_gsat"
    nrcs_state_gsat = [
        ("Alabama", "2"), ("Alaska", ""), ("Arkansas", ""),
        ("Arizona", "2"), ("California", ""), ("Colorado", ""),
        ("Connecticut", ""), ("Delaware", ""), ("Florida", ""),
        ("Georgia", ""), ("Hawaii", ""), ("Idaho", "9"),
        ("Illinois", ""), ("Indiana", ""), ("Iowa", ""),
        ("Kansas", "6"), ("Kentucky", ""), ("Louisiana", "16"),
        ("Maine", ""), ("Maryland", ""), ("Massachusetts", ""),
        ("Michigan", ""), ("Minnesota", "11"), ("Mississippi", ""),
        ("Missouri", "14"), ("Montana", ""), ("Nebraska", "17"),
        ("Nevada", "4"), ("New Hampshire", ""), ("New Jersey", ""),
        ("New Mexico", "1"), ("New York", ""), ("North Carolina", ""),
        ("North Dakota", "5"), ("Ohio", ""), ("Oklahoma", "12"),
        ("Oregon", "3"), ("Pennsylvania", "15"), ("Rhode Island", ""),
        ("South Carolina", ""), ("South Dakota", "7"), ("Tennessee", ""),
        ("Texas", "13"), ("Utah", ""), ("Vermont", ""),
        ("Virginia", ""), ("Washington", "8"), ("West Virginia", ""),
        ("Wisconsin", ""), ("Wyoming", "10"),
    ]
    with open_fw(engine.format_filename(nrcs_state_gsat_file)) as write_object:
        for state_info in nrcs_state_gsat:
            if state_info[1]:
                # Skip states with no data, i.e. entries like ("state", "")
                file_name = state_info[0].replace(" ", "_").replace(
                    ".", "").lower() + ".csv"
                file_name = "old_nrcs_state_gsat_" + file_name
                state_url = nrcs_state_gsat_base_url.format(
                    base=base_url, id=state_info[1])
                self.engine.download_file(state_url, filename=file_name)
                with open_fr(
                        engine.format_filename(file_name)) as read_object:
                    # Read the state file and write only the data, minus the header
                    next(read_object)
                    state_quoted = '"{state}",'.format(state=state_info[0])
                    for line in read_object:
                        write_object.write(state_quoted + line)
    data_path = self.engine.format_filename(nrcs_state_gsat_file)
    table = Table(table_name, delimiter=",", header_rows=0)
    table.columns = [
        ("state", ("char", "12")),
        ("symbol", ("char", "7")),
        ("scientific_name_with_author", ("char", "183")),
        ("gsat_common_name", ("char", "93")),
    ]
    self.engine.auto_create_table(table, filename=nrcs_state_gsat_file)
    self.engine.insert_data_from_file(data_path)

    # NRCS State Plant Lists
    base_url = "https://plants.sc.egov.usda.gov/"
    nrcs_state_plant_lists_url = "{base}java/nrcsStateDownload?statefips={id}"
    nrcs_state_plant_file = "all_nrcs_state_plant.csv"
    table_name = "nrcs_state_plant"
    nrcs_state_plant_lists = [
        ("01", "Alabama"), ("02", "Alaska"), ("05", "Arkansas"),
        ("04", "Arizona"), ("06", "California"), ("08", "Colorado"),
        ("09", "Connecticut"), ("10", "Delaware"), ("12", "Florida"),
        ("13", "Georgia"), ("15", "Hawaii"), ("16", "Idaho"),
        ("17", "Illinois"), ("18", "Indiana"), ("19", "Iowa"),
        ("20", "Kansas"), ("21", "Kentucky"), ("22", "Louisiana"),
        ("23", "Maine"), ("24", "Maryland"), ("25", "Massachusetts"),
        ("26", "Michigan"), ("27", "Minnesota"), ("28", "Mississippi"),
        ("29", "Missouri"), ("30", "Montana"), ("31", "Nebraska"),
        ("32", "Nevada"), ("33", "New Hampshire"), ("34", "New Jersey"),
        ("35", "New Mexico"), ("36", "New York"), ("37", "North Carolina"),
        ("38", "North Dakota"), ("39", "Ohio"), ("40", "Oklahoma"),
        ("41", "Oregon"), ("42", "Pennsylvania"), ("44", "Rhode Island"),
        ("45", "South Carolina"), ("46", "South Dakota"), ("47", "Tennessee"),
        ("48", "Texas"), ("49", "Utah"), ("50", "Vermont"),
        ("51", "Virginia"), ("53", "Washington"), ("54", "West Virginia"),
        ("55", "Wisconsin"), ("56", "Wyoming"), ("72", "Puerto Rico"),
        ("78", "Virgin Islands"),
    ]
    with open_fw(engine.format_filename(nrcs_state_plant_file)) as write_object:
        for state_info in nrcs_state_plant_lists:
            file_name = state_info[1].replace(" ", "_").replace(
                ".", "").lower() + ".csv"
            file_name = "old_nrcs_state_plant_" + file_name
            state_url = nrcs_state_plant_lists_url.format(base=base_url,
                                                          id=state_info[0])
            self.engine.download_file(state_url, filename=file_name)
            with open_fr(engine.format_filename(file_name)) as read_object:
                # Read the state file and write only the data, minus the header
                next(read_object)
                state_quoted = '"{state}",'.format(state=state_info[1])
                for line in read_object:
                    write_object.write(state_quoted + line)
    data_path = self.engine.format_filename(nrcs_state_plant_file)
    table = Table(table_name, delimiter=",", header_rows=0)
    table.columns = [
        ("state", ("char", "17")),
        ("symbol", ("char", "7")),
        ("synonym_symbol", ("char", "7")),
        ("scientific_name_with_author", ("char", "183")),
        ("state_common_name", ("char", "42")),
        ("family", ("char", "17")),
    ]
    self.engine.auto_create_table(table, filename=nrcs_state_plant_file)
    self.engine.insert_data_from_file(data_path)

def download(self, engine=None, debug=False):
    try:
        Script.download(self, engine, debug)
        engine = self.engine

        # Species table
        table = Table("species", cleanup=Cleanup(), contains_pk=True,
                      header_rows=9)
        table.columns = [
            ("species_id", ("pk-int", )),
            ("AOU", ("int", )),
            ("english_common_name", ("char", 50)),
            ("french_common_name", ("char", 50)),
            ("spanish_common_name", ("char", 50)),
            ("sporder", ("char", 30)),
            ("family", ("char", 30)),
            ("genus", ("char", 30)),
            ("species", ("char", 50)),
        ]
        table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["species"])

        # Routes table
        engine.download_files_from_archive(self.urls["routes"],
                                           ["routes.csv"])
        engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                 filename="routes.csv")
        engine.insert_data_from_file(engine.format_filename("routes.csv"))

        # Weather table
        if not os.path.isfile(engine.format_filename("weather_new.csv")):
            engine.download_files_from_archive(self.urls["weather"],
                                               ["weather.csv"])
            read = open_fr(engine.format_filename("weather.csv"))
            write = open_fw(engine.format_filename("weather_new.csv"))
            print("Cleaning weather data...")
            for line in read:
                values = line.split(',')
                newvalues = []
                for value in values:
                    if ':' in value:
                        newvalues.append(value.replace(':', ''))
                    elif value == "N":
                        newvalues.append(None)
                    else:
                        newvalues.append(value)
                write.write(','.join(str(value) for value in newvalues))
            write.close()
            read.close()
        engine.auto_create_table(Table("weather", pk="RouteDataId",
                                       cleanup=self.cleanup_func_table),
                                 filename="weather_new.csv")
        engine.insert_data_from_file(
            engine.format_filename("weather_new.csv"))

        # Region_codes table
        table = Table("region_codes", pk=False, header_rows=11,
                      fixed_width=[11, 11, 30])

        def regioncodes_cleanup(value, engine):
            replace = {
                chr(225): "a",
                chr(233): "e",
                chr(237): "i",
                chr(243): "o"
            }
            newvalue = str(value)
            for key in list(replace.keys()):
                if key in newvalue:
                    newvalue = newvalue.replace(key, replace[key])
            return newvalue

        table.cleanup = Cleanup(regioncodes_cleanup)
        table.columns = [("countrynum", ("int", )),
                         ("regioncode", ("int", )),
                         ("regionname", ("char", 30))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["region_codes"])

        # Counts table
        table = Table("counts", pk=False, delimiter=',')
        table.columns = [("RouteDataID", ("int", )),
                         ("countrynum", ("int", )),
                         ("statenum", ("int", )),
                         ("Route", ("int", )),
                         ("RPID", ("int", )),
                         ("year", ("int", )),
                         ("AOU", ("int", ))]
        # Fifty-stop data: one count column per stop
        table.columns += [("Stop" + str(stop), ("int", ))
                          for stop in range(1, 51)]

        engine.table = table
        engine.create_table()
        for part in range(1, 11):
            part = str(part)
            try:
                print("Inserting data from part " + part + "...")
                try:
                    engine.table.cleanup = Cleanup()
                    engine.insert_data_from_archive(
                        self.urls["counts"] + "Fifty" + part + ".zip",
                        ["fifty" + part + ".csv"])
                except:
                    print("Failed bulk insert on " + part +
                          ", inserting manually.")
                    engine.connection.rollback()
                    engine.table.cleanup = self.cleanup_func_clean
                    engine.insert_data_from_archive(
                        self.urls["counts"] + "Fifty" + part + ".zip",
                        ["fifty" + part + ".csv"])
            except:
                print("There was an error in part " + part + ".")
                raise
    except zipfile.BadZipfile:
        print("There was an unexpected error in the Breeding Bird Survey "
              "archives.")
        raise
    return engine

def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    reload(sys)
    if hasattr(sys, 'setdefaultencoding'):
        sys.setdefaultencoding("utf-8")
    self.engine.download_file(self.urls["GWDD"],
                              "GlobalWoodDensityDatabase.xls")
    filename = os.path.basename("GlobalWoodDensityDatabase.xls")
    book = xlrd.open_workbook(self.engine.format_filename(filename))
    sh = book.sheet_by_index(1)
    rows = sh.nrows

    # Creating the data file
    file_path = self.engine.format_filename("gwdd_data.csv")
    gwdd_data = open_fw(file_path)
    csv_writer = open_csvw(gwdd_data)
    csv_writer.writerow(["Number", "Family", "Binomial", "Wood_Density",
                         "Region", "Reference_Number"])
    for index in range(1, rows):
        row = sh.row(index)
        # Get each row and format the cell value
        row_as_list = [to_str(column_value.value) for column_value in row]
        csv_writer.writerow(row_as_list)
    gwdd_data.close()

    table = Table("data", delimiter=",")
    table.columns = [("Number", ("pk-int",)),
                     ("Family", ("char",)),
                     ("Binomial", ("char",)),
                     ("Wood_Density", ("double",)),
                     ("Region", ("char",)),
                     ("Reference_Number", ("int",))]
    table.pk = 'Number'
    table.contains_pk = True
    self.engine.table = table
    self.engine.create_table()
    self.engine.insert_data_from_file(engine.format_filename(file_path))

    # Creating the reference table file
    file_path = self.engine.format_filename("gwdd_ref.csv")
    ref_file = open_fw(file_path)
    csv_writerd = open_csvw(ref_file)
    csv_writerd.writerow(["Reference_Number", "Reference"])
    sh = book.sheet_by_index(2)
    rows = sh.nrows
    for index in range(1, rows):
        row = sh.row(index)
        # Get each row and format the cell value
        row_as_list = [to_str(column_value.value, object_encoding=sys.stdout)
                       for column_value in row]
        csv_writerd.writerow(row_as_list)
    ref_file.close()

    table = Table("reference", delimiter=",")
    table.columns = [("Reference_Number", ("pk-int",)),
                     ("Reference", ("char",))]
    table.pk = 'Reference_Number'
    table.contains_pk = True
    self.engine.table = table
    self.engine.create_table()
    self.engine.insert_data_from_file(engine.format_filename(file_path))
    return self.engine

def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine

    # Download and create the species table
    table = Table('species')
    self.engine.auto_create_table(table, url=self.urls['species'])
    self.engine.insert_data_from_url(self.urls['species'])

    # State abbreviations with the year the annual inventory began for
    # that state
    stateslist = [('AL', 2001), ('AK', 2004), ('AZ', 2001), ('AR', 2000),
                  ('CA', 2001), ('CO', 2002), ('CT', 2003), ('DE', 2004),
                  ('FL', 2003), ('GA', 1998), ('ID', 2004), ('IL', 2001),
                  ('IN', 1999), ('IA', 1999), ('KS', 2001), ('KY', 1999),
                  ('LA', 2001), ('ME', 1999), ('MD', 2004), ('MA', 2003),
                  ('MI', 2000), ('MN', 1999), ('MO', 1999), ('MS', 2006),
                  ('MT', 2003), ('NE', 2001), ('NV', 2004), ('NH', 2002),
                  ('NJ', 2004), ('NM', 1999), ('NY', 2002), ('NC', 2003),
                  ('ND', 2001), ('OH', 2001), ('OK', 2008), ('OR', 2001),
                  ('PA', 2000), ('RI', 2003), ('SC', 1999), ('SD', 2001),
                  ('TN', 2000), ('TX', 2001), ('UT', 2000), ('VT', 2003),
                  ('VA', 1998), ('WA', 2002), ('WV', 2004), ('WI', 2000),
                  ('WY', 2000), ('PR', 2001)]

    tablelist = ["SURVEY", "PLOT", "COND", "SUBPLOT", "SUBP_COND",
                 "TREE", "SEEDLING"]

    for table in tablelist:
        for state, year in stateslist:
            engine.download_files_from_archive(
                self.urls["main"] + state + "_" + table + ".ZIP",
                [state + "_" + table + ".csv"])

    for table in tablelist:
        print("Scanning data for table %s..." % table)
        prep_file_name = "%s.csv" % table
        prep_file = open_fw(engine.format_filename(prep_file_name))
        this_file = open_fr(
            engine.format_filename(stateslist[0][0] + "_" + table + ".csv"))
        col_names = this_file.readline()
        prep_file.write(col_names)
        column_names = [col.strip('"') for col in col_names.split(',')]
        year_column = column_names.index("INVYR")
        this_file.close()

        for state, year in stateslist:
            this_file = open_fr(
                engine.format_filename(state + "_" + table + ".csv"))
            this_file.readline()
            for line in this_file:
                values = line.split(',')
                this_year = values[year_column]
                if int(this_year) >= year:
                    prep_file.write(line)
            this_file.close()
        prep_file.close()
        engine.auto_create_table(Table(table), filename=prep_file_name)
        engine.insert_data_from_file(engine.format_filename(prep_file_name))
        try:
            os.remove(engine.format_filename(prep_file_name))
        except:
            pass
    return engine

def download(self, engine=None, debug=False):
    try:
        Script.download(self, engine, debug)
        engine = self.engine

        # Species table
        table = Table("species", cleanup=Cleanup(), contains_pk=True,
                      header_rows=9)
        table.columns = [("species_id", ("pk-int",)),
                         ("AOU", ("int",)),
                         ("english_common_name", ("char", 50)),
                         ("french_common_name", ("char", 50)),
                         ("spanish_common_name", ("char", 50)),
                         ("sporder", ("char", 30)),
                         ("family", ("char", 30)),
                         ("genus", ("char", 30)),
                         ("species", ("char", 50)),
                         ]
        table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["species"])

        # Routes table
        engine.download_files_from_archive(self.urls["routes"],
                                           ["routes.csv"])
        engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                 filename="routes.csv")
        engine.insert_data_from_file(engine.format_filename("routes.csv"))

        # Weather table
        if not os.path.isfile(engine.format_filename("weather_new.csv")):
            engine.download_files_from_archive(self.urls["weather"],
                                               ["weather.csv"])
            read = open_fr(engine.format_filename("weather.csv"))
            write = open_fw(engine.format_filename("weather_new.csv"))
            print("Cleaning weather data...")
            for line in read:
                values = line.split(',')
                newvalues = []
                for value in values:
                    if ':' in value:
                        newvalues.append(value.replace(':', ''))
                    elif value == "N":
                        newvalues.append(None)
                    else:
                        newvalues.append(value)
                write.write(','.join(str(value) for value in newvalues))
            write.close()
            read.close()
        engine.auto_create_table(Table("weather", pk="RouteDataId",
                                       cleanup=self.cleanup_func_table),
                                 filename="weather_new.csv")
        engine.insert_data_from_file(
            engine.format_filename("weather_new.csv"))

        # Region_codes table
        table = Table("region_codes", pk=False, header_rows=11,
                      fixed_width=[11, 11, 30])

        def regioncodes_cleanup(value, engine):
            # Replace accented vowels with their ASCII equivalents
            replace = {chr(225): "a", chr(233): "e",
                       chr(237): "i", chr(243): "o"}
            newvalue = str(value)
            for key in list(replace.keys()):
                if key in newvalue:
                    newvalue = newvalue.replace(key, replace[key])
            return newvalue

        table.cleanup = Cleanup(regioncodes_cleanup)
        table.columns = [("countrynum", ("int",)),
                         ("regioncode", ("int",)),
                         ("regionname", ("char", 30))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["region_codes"])

        # Counts table
        table = Table("counts", delimiter=',')
        table.columns = [("record_id", ("pk-auto",)),
                         ("RouteDataID", ("int",)),
                         ("countrynum", ("int",)),
                         ("statenum", ("int",)),
                         ("Route", ("int",)),
                         ("RPID", ("int",)),
                         ("Year", ("int",)),
                         ("Aou", ("int",)),
                         ("Count10", ("int",)),
                         ("Count20", ("int",)),
                         ("Count30", ("int",)),
                         ("Count40", ("int",)),
                         ("Count50", ("int",)),
                         ("StopTotal", ("int",)),
                         ("SpeciesTotal", ("int",))]

        # Entries are either a state/province name or a
        # [name, archive-file-prefix] pair when the prefix is irregular;
        # the loop below normalizes both forms (see the sketch after
        # this function)
        stateslist = ["Alabama", "Alaska", "Arizona", "Arkansas",
                      "California", "Colorado", "Connecticut", "Delaware",
                      "Florida", "Georgia", "Idaho", "Illinois", "Indiana",
                      "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine",
                      "Maryland", "Massachusetts", "Michigan", "Minnesota",
                      "Mississippi", "Missouri", "Montana", "Nebraska",
                      "Nevada", ["New Hampshire", "NHampsh"],
                      ["New Jersey", "NJersey"], ["New Mexico", "NMexico"],
                      ["New York", "NYork"], ["North Carolina", "NCaroli"],
                      ["North Dakota", "NDakota"], "Ohio", "Oklahoma",
                      "Oregon", "Pennsylvania", ["Rhode Island", "RhodeIs"],
                      ["South Carolina", "SCaroli"],
                      ["South Dakota", "SDakota"], "Tennessee", "Texas",
                      "Utah", "Vermont", "Virginia", "Washington",
                      ["West Virginia", "W_Virgi"], "Wisconsin", "Wyoming",
                      "Alberta", ["British Columbia", "BritCol"],
                      "Manitoba", ["New Brunswick", "NBrunsw"],
                      ["Northwest Territories", "NWTerri"], "Newfoundland",
                      ["Nova Scotia", "NovaSco"], "Nunavut", "Ontario",
                      ["Prince Edward Island", "PEI"], "Quebec",
                      "Saskatchewan", "Yukon"]

        engine.table = table
        engine.create_table()

        for state in stateslist:
            try:
                if isinstance(state, list):
                    state, shortstate = state[0], state[1]
                else:
                    shortstate = state[0:7]
                print("Inserting data from " + state + "...")
                try:
                    engine.table.cleanup = Cleanup()
                    engine.insert_data_from_archive(
                        self.urls["counts"] + shortstate + ".zip",
                        [shortstate + ".csv"])
                except:
                    print("Failed bulk insert on " + state +
                          ", inserting manually.")
                    engine.connection.rollback()
                    engine.table.cleanup = self.cleanup_func_clean
                    engine.insert_data_from_archive(
                        self.urls["counts"] + shortstate + ".zip",
                        [shortstate + ".csv"])
            except:
                print("There was an error in " + state + ".")
                raise
    except zipfile.BadZipfile:
        print("There was an unexpected error in the Breeding Bird Survey "
              "archives.")
        raise
    return engine
# sys removes the setdefaultencoding method at startup; reload to get it back
reload(sys)
if hasattr(sys, 'setdefaultencoding'):
    # Set the default encoding to latin-1 to decode the source text
    sys.setdefaultencoding('latin-1')


def to_str(object, object_encoding=encoding):
    """Return a decoded string representation of `object` on Python 3."""
    # Note: object_encoding is accepted but not used when decoding;
    # the module-level `encoding` is used instead
    if sys.version_info >= (3, 0, 0):
        return str(object).encode('UTF-8').decode(encoding)
    return object


# Create the .rst file for the available datasets
datasetfile = open_fw("datasets_list.rst")
datasetfile_title = """==================
Datasets Available
==================


"""
check_for_updates()
reload_scripts()
script_list = SCRIPT_LIST()

# Write the title of the dataset rst file
# ref: http://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html
datasetfile.write(datasetfile_title)

# Get info from the scripts using the specified encoding
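A quick illustration of what to_str does with a numeric cell value, assuming the module-level encoding is 'latin-1' as set above; xlrd returns floats for numeric cells, so this yields a uniform string form:

value = 0.58  # an illustrative numeric cell value
print(to_str(value))  # '0.58' on Python 3; the object itself on Python 2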
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    csv_files = []
    request_src = "http://www.data-retriever.org/"
    base_url = ("http://www.usanpn.org/npn_portal/observations/"
                "getObservations.xml?start_date={startYear}"
                "&end_date={endYear_date}&request_src={request_src}")
    header_values = ["observation_id",
                     "update_datetime",
                     "site_id",
                     "latitude",
                     "longitude",
                     "elevation_in_meters",
                     "state",
                     "species_id",
                     "genus",
                     "species",
                     "common_name",
                     "kingdom",
                     "individual_id",
                     "phenophase_id",
                     "phenophase_description",
                     "observation_date",
                     "day_of_year",
                     "phenophase_status",
                     "intensity_category_id",
                     "intensity_value",
                     "abundance_value"]
    columns = [("record_id", ("pk-auto",)),
               # each observation is subsequently referred to as a
               # "status record"
               ("observation_id", ("int",)),
               ("update_datetime", ("char",)),
               ("site_id", ("int",)),
               ("latitude", ("double",)),
               ("longitude", ("double",)),
               ("elevation_in_meters", ("char",)),
               ("state", ("char",)),
               ("species_id", ("int",)),
               ("genus", ("char",)),
               ("species", ("char",)),
               ("common_name", ("char",)),
               ("kingdom", ("char",)),
               ("individual_id", ("char",)),
               ("phenophase_id", ("int",)),
               ("phenophase_description", ("char",)),
               ("observation_date", ("char",)),
               ("day_of_year", ("char",)),
               ("phenophase_status", ("char",)),
               ("intensity_category_id", ("char",)),
               ("intensity_value", ("char",)),
               ("abundance_value", ("char",))]

    # Request the observations in 90-day windows, starting in 2009
    start_date = datetime.date(2009, 1, 1)
    end_date = datetime.date.today()
    while start_date < end_date:
        to_date = start_date + datetime.timedelta(90)
        if to_date >= end_date:
            data_url = base_url.format(startYear=str(start_date),
                                       endYear_date=str(end_date),
                                       request_src=request_src)
        else:
            data_url = base_url.format(startYear=str(start_date),
                                       endYear_date=str(to_date),
                                       request_src=request_src)
        xml_file_name = '{}'.format(start_date) + ".xml"
        engine.download_file(data_url, xml_file_name)

        # Create a csv file for each 90-day window
        csv_observation = '{}'.format(start_date) + ".csv"
        csv_files.append(csv_observation)
        csv_buff = open_fw(engine.format_filename(csv_observation))
        csv_writer = open_csvw(csv_buff)
        csv_writer.writerow(header_values)

        # Parse the xml to read the data
        fname = DATA_WRITE_PATH.strip('{dataset}') + 'NPN/' + xml_file_name
        with open(fname, 'r') as fp1:
            file_read = fp1.read()
        root = ET.fromstring(file_read)

        # Order each element's attributes to match the csv header
        index_map = {val: i for i, val in enumerate(header_values)}
        for elements in root:
            diction = sorted(elements.attrib.items(),
                             key=lambda pair: index_map[pair[0]])
            csv_writer.writerow([x[1] for x in diction])
        csv_buff.close()

        start_date = to_date + datetime.timedelta(1)

    # Create the table and load each csv file
    table = Table('observations', delimiter=',', pk='record_id',
                  contains_pk=True)
    table.columns = columns
    engine.table = table
    engine.create_table()
    for data_file in csv_files:
        engine.insert_data_from_file(engine.find_file(data_file))
    return engine
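The 90-day request windows above can be factored out and tested on their own. Here is a minimal sketch under the same stepping rules; the function name and dates are illustrative:

import datetime

def date_windows(start, end, days=90):
    """Yield (window_start, window_end) pairs that cover [start, end]."""
    while start < end:
        stop = start + datetime.timedelta(days)
        yield start, min(stop, end)
        start = stop + datetime.timedelta(1)

for first, last in date_windows(datetime.date(2009, 1, 1),
                                datetime.date(2009, 7, 1)):
    print(first, last)  # two windows: Jan-Apr and Apr-Jul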
def create_table(self):
    """Create the table by creating an empty JSON file."""
    self.output_file = open_fw(self.table_name())
    self.output_file.write("[")
    self.table_names.append((self.output_file, self.table_name()))
    self.auto_column_number = 1
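The opening "[" written here is only half of a valid JSON document; rows and a closing "]" have to follow before the file parses. A minimal standalone sketch of the overall file shape this engine appears to produce; the file name and layout are assumptions, not the engine's actual write path:

import json

# Hypothetical illustration: opening bracket, comma-separated row
# objects, then a closing bracket, yielding a JSON array of rows.
with open("example_table.json", "w") as output:
    output.write("[")
    rows = [{"id": 1, "value": "a"}, {"id": 2, "value": "b"}]
    output.write(",\n".join(json.dumps(row) for row in rows))
    output.write("]")

with open("example_table.json") as check:
    print(json.load(check))  # round-trips as a list of dicts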