def sort_csv(filename):
    """Sort CSV rows minus the header and return the file.

    The function is used only for testing and is intended for files of limited size.
    """
    filename = os.path.normpath(filename)
    input_file = open_fr(filename)
    csv_reader_infile = csv.reader(input_file, escapechar="\\")
    # write the data to a temporary file and sort it
    temp_path = os.path.normpath("tempfile")
    temp_file = open_fw(temp_path)
    csv_writer = open_csvw(temp_file)
    i = 0
    for row in csv_reader_infile:
        if i == 0:
            # The first entry is the header line
            infields = row
            i += 1
        else:
            csv_writer.writerow(row)
    input_file.close()
    temp_file.close()

    # sort the temp file
    sorted_txt = sort_file(temp_path)
    tmp = open_fr(sorted_txt)
    in_txt = csv.reader(tmp, delimiter=',', escapechar="\\")
    csv_file = open_fw(filename)
    csv_writer = open_csvw(csv_file)
    csv_writer.writerow(infields)
    csv_writer.writerows(in_txt)
    tmp.close()
    csv_file.close()
    os.remove(os.path.normpath(temp_path))
    return filename
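# A minimal usage sketch for sort_csv (not part of the original module). It
# assumes sort_csv is importable from retriever.lib.tools, the module that
# to_csv() below imports it from; "plots.csv" is an illustrative file name
# for a CSV with a header row.
from retriever.lib.tools import sort_csv

sorted_path = sort_csv("plots.csv")  # header kept in place, remaining rows sorted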
def json2csv(input_file, output_file=None, header_values=None):
    """Convert a JSON file to CSV.

    The function is used only for testing and is intended for files of limited size.
    """
    file_out = open_fr(input_file, encode=False)
    # set output file name and write header
    if output_file is None:
        output_file = os.path.splitext(os.path.basename(input_file))[0] + ".csv"
    csv_out = open_fw(output_file, encode=False)
    if os.name == 'nt':
        outfile = csv.DictWriter(csv_out, dialect='excel', escapechar="\\",
                                 lineterminator='\n', fieldnames=header_values)
    else:
        outfile = csv.DictWriter(csv_out, dialect='excel', escapechar="\\",
                                 fieldnames=header_values)
    raw_data = json.loads(file_out.read())
    outfile.writeheader()

    for item in raw_data:
        outfile.writerow(item)
    file_out.close()
    os.system("rm -r {}".format(input_file))
    return output_file
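# A minimal usage sketch for json2csv (not part of the original module). The
# import path, file name, and header list are illustrative assumptions;
# json2csv is assumed to live alongside sort_csv in retriever.lib.tools.
# Note that the helper removes the input file after conversion.
from retriever.lib.tools import json2csv

csv_path = json2csv("records.json",
                    header_values=["site_id", "species", "count"])
print(csv_path)  # "records.csv" when no output_file is given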
def create_table(self):
    """Create the table by creating an empty XML file"""
    self.output_file = open_fw(self.table_name())
    self.output_file.write(u'<?xml version="1.0" encoding="UTF-8"?>')
    self.output_file.write(u'\n<root>')
    self.table_names.append((self.output_file, self.table_name()))
    self.auto_column_number = 1
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    self.engine.download_file(self.urls["main"],
                              "Succession_sampling_03-07_data_original.txt")
    data_path = self.engine.format_filename("Succession_sampling_03-07_data.txt")
    old_data = open_fr(
        self.engine.find_file("Succession_sampling_03-07_data_original.txt"))
    new_data = open_fw(data_path)

    # The original file's header contains an end-of-line character in the
    # middle, which splits it across two lines.
    # Read in the two lines and create the full header.
    line1 = old_data.readline().strip()
    line2 = old_data.readline()
    newline = line1 + "\t" + line2
    new_data.write(newline)

    for line in old_data:
        new_data.write(line)
    new_data.close()
    old_data.close()
    self.engine.auto_create_table(self.tables["main"],
                                  filename="Succession_sampling_03-07_data.txt")
    self.engine.insert_data_from_file(data_path)
def create_table(self):
    """Create the table by creating an empty csv file"""
    self.auto_column_number = 1
    self.file = open_fw(self.table_name())
    self.output_file = open_csvw(self.file)
    self.output_file.writerow([u'{}'.format(val) for val in
                               self.table.get_insert_columns(join=False, create=True)])
    self.table_names.append((self.file, self.table_name()))
def xml2csv(input_file, outputfile=None, header_values=None, row_tag="row"):
    """Convert an XML file to CSV.

    The function is used only for testing and is intended for files of limited size.
    """
    file_output = open_fr(input_file, encode=False)
    # set output file name and write header
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(input_file))[0] + ".csv"
    csv_out = open_fw(outputfile)
    if os.name == 'nt':
        csv_writer = csv.writer(csv_out, dialect='excel', escapechar='\\',
                                lineterminator='\n')
    else:
        csv_writer = csv.writer(csv_out, dialect='excel', escapechar='\\')

    v = file_output.read()
    csv_writer.writerow(header_values)
    tree = ET.parse(newfile(v))
    root = tree.getroot()
    for rows in root.findall(row_tag):
        x = [name.text for name in header_values for name in rows.findall(name)]
        csv_writer.writerow(x)
    file_output.close()
    os.system("rm -r {}".format(input_file))
    return outputfile
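# A minimal usage sketch for xml2csv (not part of the original module). The
# import path, file name, header list, and row tag are illustrative
# assumptions; xml2csv is assumed to live alongside json2csv in
# retriever.lib.tools. Like json2csv, it removes the input file when done.
from retriever.lib.tools import xml2csv

csv_path = xml2csv("records.xml",
                   header_values=["site_id", "species", "count"],
                   row_tag="row")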
def sort_file(file_path):
    """Sort file by line and return the file.

    The function is used only for testing and is intended for files of limited size.
    """
    file_path = os.path.normpath(file_path)
    input_file = open_fr(file_path)
    lines = [line.strip().replace('\x00', '') for line in input_file]
    input_file.close()
    outfile = open_fw(file_path)
    lines.sort()
    for line in lines:
        outfile.write(line + "\n")
    outfile.close()
    return file_path
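# A minimal usage sketch for sort_file (not part of the original module). The
# import path and file name are illustrative assumptions; sort_csv above uses
# this helper to sort its temporary file in place.
from retriever.lib.tools import sort_file

sorted_path = sort_file("tempfile")  # sorts the file line by line, in place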
def disconnect(self):
    """Close out the XML files.

    Close all the file objects that have been created.
    Re-write the files, stripping off the last comma, and then close with a closing tag.
    """
    if self.table_names:
        for output_file_i, file_name in self.table_names:
            output_file_i.close()
            current_input_file = open_fr(file_name)
            file_contents = current_input_file.readlines()
            current_input_file.close()
            file_contents[-1] = file_contents[-1].strip(',')
            current_output_file = open_fw(file_name)
            current_output_file.writelines(file_contents)
            current_output_file.write(u'\n</root>')
            current_output_file.close()
        self.table_names = []
def disconnect(self):
    """Close out the JSON with a `\\n]` and close the file.

    Close all the file objects that have been created.
    Re-write the files, stripping off the last comma, and then close with a `\\n]`.
    """
    if self.table_names:
        for output_file_i, file_name in self.table_names:
            output_file_i.close()
            current_input_file = open_fr(file_name)
            file_contents = current_input_file.readlines()
            current_input_file.close()
            file_contents[-1] = file_contents[-1].strip(',\n')
            current_output_file = open_fw(file_name)
            current_output_file.writelines(file_contents)
            current_output_file.writelines(['\n]'])
            current_output_file.close()
        self.table_names = []
def disconnect(self):
    """Close out the XML files.

    Close all the file objects that have been created.
    Re-write the files, stripping off the last comma, and then close with a closing tag.
    """
    if self.table_names:
        for output_file_i, file_name in self.table_names:
            output_file_i.close()
            current_input_file = open_fr(file_name, encode=False)
            file_contents = current_input_file.readlines()
            current_input_file.close()
            file_contents[-1] = file_contents[-1].strip(',')
            current_output_file = open_fw(file_name)
            current_output_file.writelines(file_contents)
            current_output_file.write(u'\n</root>')
            current_output_file.close()
        self.table_names = []
def to_csv(self):
    # due to cyclic imports we cannot move this import to the top
    from retriever.lib.tools import sort_csv

    for item in list(self.script.urls.keys()):
        table_name = self.table_name()
        csv_file_output = os.path.normpath(table_name + '.csv')
        csv_file = open_fw(csv_file_output)
        csv_writer = open_csvw(csv_file)
        self.get_cursor()
        self.set_engine_encoding()
        self.cursor.execute("SELECT * FROM {};".format(table_name))
        row = self.cursor.fetchone()
        colnames = [u'{}'.format(tuple_i[0]) for tuple_i in self.cursor.description]
        csv_writer.writerow(colnames)
        while row is not None:
            csv_writer.writerow(row)
            row = self.cursor.fetchone()
        csv_file.close()
        sort_csv(csv_file_output)
    self.disconnect()
def json2csv(input_file, output_file=None, header_values=None):
    """Convert a JSON file to CSV.

    The function is used only for testing and is intended for files of limited size.
    """
    file_out = open_fr(input_file)
    # set output file name and write header
    if output_file is None:
        output_file = os.path.splitext(os.path.basename(input_file))[0] + ".csv"
    csv_out = open_fw(output_file, encode=False)
    if os.name == 'nt':
        outfile = csv.DictWriter(csv_out, dialect='excel', escapechar="\\",
                                 lineterminator='\n', fieldnames=header_values)
    else:
        outfile = csv.DictWriter(csv_out, dialect='excel', escapechar="\\",
                                 fieldnames=header_values)
    raw_data = json.loads(file_out.read())
    outfile.writeheader()

    for item in raw_data:
        outfile.writerow(item)
    file_out.close()
    os.system("rm -r {}".format(input_file))
    return output_file
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    for key in self.urls:
        self.engine.download_file(self.urls[key],
                                  self.urls[key].rpartition('/')[-1])
        new_file_path = self.engine.format_filename("new" + key)
        old_data = open_fr(
            self.engine.find_file(self.urls[key].rpartition('/')[-1]))
        new_data = open_fw(new_file_path)
        with old_data as file_block:
            # after the metadata lines, set data to True
            data = False
            for lines in file_block.readlines():
                # metadata lines contain no ";" and may have "(;;;;)+" or be empty
                if not data and (";" not in lines or ";;;;" in lines):
                    pass
                else:
                    data = True
                    new_data.write(lines)
        file_block.close()
        new_data.close()
        self.engine.auto_create_table(Table(key, cleanup=cleanup_func_table),
                                      filename=str("new" + key))
        self.engine.insert_data_from_file(new_file_path)
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    for key in self.urls:
        self.engine.download_file(self.urls[key],
                                  self.urls[key].rpartition('/')[-1])
        new_file_path = self.engine.format_filename("new" + key)
        old_data = open_fr(
            self.engine.find_file(self.urls[key].rpartition('/')[-1]))
        new_data = open_fw(new_file_path)
        with old_data as file_block:
            # after the metadata lines, set data to True
            data = False
            for lines in file_block.readlines():
                # metadata lines contain no ";" and may have "(;;;;)+" or be empty
                if not data and (";" not in lines or ";;;;" in lines):
                    pass
                else:
                    data = True
                    new_data.write(lines)
        file_block.close()
        new_data.close()
        self.engine.auto_create_table(
            Table(key, cleanup=Cleanup(correct_invalid_value, nulls=[-999.9])),
            filename=str("new" + key))
        self.engine.insert_data_from_file(new_file_path)
def download(self, engine=None, debug=False):
    try:
        Script.download(self, engine, debug)

        engine = self.engine

        # Species table
        table = Table("species", cleanup=Cleanup(), contains_pk=True,
                      header_rows=9)
        table.columns = [("species_id", ("pk-int",)),
                         ("AOU", ("int",)),
                         ("english_common_name", ("char", 50)),
                         ("french_common_name", ("char", 50)),
                         ("spanish_common_name", ("char", 50)),
                         ("sporder", ("char", 30)),
                         ("family", ("char", 30)),
                         ("genus", ("char", 30)),
                         ("species", ("char", 50)),
                         ]
        table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]

        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["species"])

        # Routes table
        engine.download_files_from_archive(self.urls["routes"], ["routes.csv"])
        engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                 filename="routes.csv")
        engine.insert_data_from_file(engine.format_filename("routes.csv"))

        # Weather table
        if not os.path.isfile(engine.format_filename("weather_new.csv")):
            engine.download_files_from_archive(self.urls["weather"],
                                               ["weather.csv"])
            read = open_fr(engine.format_filename("weather.csv"))
            write = open_fw(engine.format_filename("weather_new.csv"))
            print("Cleaning weather data...")
            for line in read:
                values = line.split(',')
                newvalues = []
                for value in values:
                    if ':' in value:
                        newvalues.append(value.replace(':', ''))
                    elif value == "N":
                        newvalues.append(None)
                    else:
                        newvalues.append(value)
                write.write(','.join(str(value) for value in newvalues))
            write.close()
            read.close()

        engine.auto_create_table(Table("weather", pk="RouteDataId",
                                       cleanup=Cleanup(correct_invalid_value,
                                                       nulls=['NULL'])),
                                 filename="weather_new.csv")
        engine.insert_data_from_file(engine.format_filename("weather_new.csv"))

        # Region_codes table
        table = Table("region_codes", pk=False, header_rows=11,
                      fixed_width=[11, 11, 30])

        def regioncodes_cleanup(value, engine):
            replace = {chr(225): "a", chr(233): "e", chr(237): "i", chr(243): "o"}
            newvalue = str(value)
            for key in list(replace.keys()):
                if key in newvalue:
                    newvalue = newvalue.replace(key, replace[key])
            return newvalue

        table.cleanup = Cleanup(regioncodes_cleanup)
        table.columns = [("countrynum", ("int",)),
                         ("regioncode", ("int",)),
                         ("regionname", ("char", 30))]

        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["region_codes"])

        # Counts table
        table = Table("counts", pk=False, delimiter=',')
        table.columns = [("RouteDataID", ("int",)), ("countrynum", ("int",)),
                         ("statenum", ("int",)), ("Route", ("int",)),
                         ("RPID", ("int",)), ("year", ("int",)),
                         ("AOU", ("int",)),
                         ("Stop1", ("int",)), ("Stop2", ("int",)),
                         ("Stop3", ("int",)), ("Stop4", ("int",)),
                         ("Stop5", ("int",)), ("Stop6", ("int",)),
                         ("Stop7", ("int",)), ("Stop8", ("int",)),
                         ("Stop9", ("int",)), ("Stop10", ("int",)),
                         ("Stop11", ("int",)), ("Stop12", ("int",)),
                         ("Stop13", ("int",)), ("Stop14", ("int",)),
                         ("Stop15", ("int",)), ("Stop16", ("int",)),
                         ("Stop17", ("int",)), ("Stop18", ("int",)),
                         ("Stop19", ("int",)), ("Stop20", ("int",)),
                         ("Stop21", ("int",)), ("Stop22", ("int",)),
                         ("Stop23", ("int",)), ("Stop24", ("int",)),
                         ("Stop25", ("int",)), ("Stop26", ("int",)),
                         ("Stop27", ("int",)), ("Stop28", ("int",)),
                         ("Stop29", ("int",)), ("Stop30", ("int",)),
                         ("Stop31", ("int",)), ("Stop32", ("int",)),
                         ("Stop33", ("int",)), ("Stop34", ("int",)),
                         ("Stop35", ("int",)), ("Stop36", ("int",)),
                         ("Stop37", ("int",)), ("Stop38", ("int",)),
                         ("Stop39", ("int",)), ("Stop40", ("int",)),
                         ("Stop41", ("int",)), ("Stop42", ("int",)),
                         ("Stop43", ("int",)), ("Stop44", ("int",)),
                         ("Stop45", ("int",)), ("Stop46", ("int",)),
                         ("Stop47", ("int",)), ("Stop48", ("int",)),
                         ("Stop49", ("int",)), ("Stop50", ("int",))]

        part = ""
        engine.table = table
        engine.create_table()

        for part in range(1, 11):
            part = str(part)
            try:
                print("Inserting data from part " + part + "...")
                try:
                    engine.table.cleanup = Cleanup()
                    engine.insert_data_from_archive(
                        self.urls["counts"] + "Fifty" + part + ".zip",
                        ["fifty" + part + ".csv"])
                except:
                    print("Failed bulk insert on " + part + ", inserting manually.")
                    engine.connection.rollback()
                    engine.table.cleanup = Cleanup(correct_invalid_value, nulls=['*'])
                    engine.insert_data_from_archive(
                        self.urls["counts"] + "Fifty" + part + ".zip",
                        ["fifty" + part + ".csv"])
            except:
                print("There was an error in part " + part + ".")
                raise

    except zipfile.BadZipfile:
        print("There was an unexpected error in the Breeding Bird Survey archives.")
        raise

    return engine
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)

    engine = self.engine

    # download and create species table
    table = Table('species')
    self.engine.auto_create_table(table, url=self.urls['species'])
    self.engine.insert_data_from_url(self.urls['species'])

    # State abbreviations with the year annual inventory began for that state
    stateslist = [('AL', 2001), ('AK', 2004), ('AZ', 2001), ('AR', 2000),
                  ('CA', 2001), ('CO', 2002), ('CT', 2003), ('DE', 2004),
                  ('FL', 2003), ('GA', 1998), ('ID', 2004), ('IL', 2001),
                  ('IN', 1999), ('IA', 1999), ('KS', 2001), ('KY', 1999),
                  ('LA', 2001), ('ME', 1999), ('MD', 2004), ('MA', 2003),
                  ('MI', 2000), ('MN', 1999), ('MO', 1999), ('MS', 2006),
                  ('MT', 2003), ('NE', 2001), ('NV', 2004), ('NH', 2002),
                  ('NJ', 2004), ('NM', 1999), ('NY', 2002), ('NC', 2003),
                  ('ND', 2001), ('OH', 2001), ('OK', 2008), ('OR', 2001),
                  ('PA', 2000), ('RI', 2003), ('SC', 1999), ('SD', 2001),
                  ('TN', 2000), ('TX', 2001), ('UT', 2000), ('VT', 2003),
                  ('VA', 1998), ('WA', 2002), ('WV', 2004), ('WI', 2000),
                  ('WY', 2000), ('PR', 2001)]

    tablelist = ["SURVEY", "PLOT", "COND", "SUBPLOT", "SUBP_COND", "TREE", "SEEDLING"]

    for table in tablelist:
        for state, year in stateslist:
            engine.download_files_from_archive(
                self.urls["main"] + state + "_" + table + ".ZIP",
                [state + "_" + table + ".csv"])

    for table in tablelist:
        print("Scanning data for table %s..." % table)
        prep_file_name = "%s.csv" % table
        prep_file = open_fw(engine.format_filename(prep_file_name))
        this_file = open_fr(
            engine.format_filename(stateslist[0][0] + "_" + table + ".csv"))
        col_names = this_file.readline()
        prep_file.write(col_names)
        column_names = [col.strip('"') for col in col_names.split(',')]
        year_column = column_names.index("INVYR")
        this_file.close()

        for state, year in stateslist:
            this_file = open_fr(
                engine.format_filename(state + "_" + table + ".csv"))
            this_file.readline()
            for line in this_file:
                values = line.split(',')
                this_year = values[year_column]
                if int(this_year) >= year:
                    prep_file.write(line)
        prep_file.close()
        engine.auto_create_table(Table(table), filename=prep_file_name)
        engine.insert_data_from_file(engine.format_filename(prep_file_name))
        try:
            os.remove(engine.format_filename(prep_file_name))
        except:
            pass

    return engine
def download(self, engine=None, debug=False):
    try:
        Script.download(self, engine, debug)

        engine = self.engine

        # Species table
        table = Table("species", cleanup=Cleanup(), contains_pk=True,
                      header_rows=9)
        table.columns = [("species_id", ("pk-int",)),
                         ("AOU", ("int",)),
                         ("english_common_name", ("char", 50)),
                         ("french_common_name", ("char", 50)),
                         ("spanish_common_name", ("char", 50)),
                         ("sporder", ("char", 30)),
                         ("family", ("char", 30)),
                         ("genus", ("char", 30)),
                         ("species", ("char", 50)),
                         ]
        table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]

        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["species"])

        # Routes table
        engine.download_files_from_archive(self.urls["routes"], ["routes.csv"])
        engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                 filename="routes.csv")
        engine.insert_data_from_file(engine.format_filename("routes.csv"))

        # Weather table
        if not os.path.isfile(engine.format_filename("weather_new.csv")):
            engine.download_files_from_archive(self.urls["weather"],
                                               ["weather.csv"])
            read = open_fr(engine.format_filename("weather.csv"))
            write = open_fw(engine.format_filename("weather_new.csv"))
            print("Cleaning weather data...")
            for line in read:
                values = line.split(',')
                newvalues = []
                for value in values:
                    if ':' in value:
                        newvalues.append(value.replace(':', ''))
                    elif value == "N":
                        newvalues.append(None)
                    else:
                        newvalues.append(value)
                write.write(','.join(str(value) for value in newvalues))
            write.close()
            read.close()

        engine.auto_create_table(Table("weather", pk="RouteDataId",
                                       cleanup=Cleanup(correct_invalid_value,
                                                       nulls=['NULL'])),
                                 filename="weather_new.csv")
        engine.insert_data_from_file(engine.format_filename("weather_new.csv"))

        # Region_codes table
        table = Table("region_codes", pk=False, header_rows=11,
                      fixed_width=[11, 11, 30])

        def regioncodes_cleanup(value, engine):
            replace = {chr(225): "a", chr(233): "e", chr(237): "i", chr(243): "o"}
            newvalue = str(value)
            for key in list(replace.keys()):
                if key in newvalue:
                    newvalue = newvalue.replace(key, replace[key])
            return newvalue

        table.cleanup = Cleanup(regioncodes_cleanup)
        table.columns = [("countrynum", ("int",)),
                         ("regioncode", ("int",)),
                         ("regionname", ("char", 30))]

        engine.table = table
        engine.create_table()
        engine.insert_data_from_url(self.urls["region_codes"])

        # Counts table
        table = Table("counts", delimiter=',')
        table.columns = [("record_id", ("pk-auto",)),
                         ("countrynum", ("int",)),
                         ("statenum", ("int",)),
                         ("Route", ("int",)),
                         ("RPID", ("int",)),
                         ("Year", ("int",)),
                         ("Aou", ("int",)),
                         ("Count10", ("int",)),
                         ("Count20", ("int",)),
                         ("Count30", ("int",)),
                         ("Count40", ("int",)),
                         ("Count50", ("int",)),
                         ("StopTotal", ("int",)),
                         ("SpeciesTotal", ("int",))]

        stateslist = ["Alabama", "Alaska", "Arizona", "Arkansas", "California",
                      "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
                      "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky",
                      "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan",
                      "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska",
                      "Nevada", ["New Hampshire", "NHampsh"], ["New Jersey", "NJersey"],
                      ["New Mexico", "NMexico"], ["New York", "NYork"],
                      ["North Carolina", "NCaroli"], ["North Dakota", "NDakota"],
                      "Ohio", "Oklahoma", "Oregon", "Pennsylvania",
                      ["Rhode Island", "RhodeIs"], ["South Carolina", "SCaroli"],
                      ["South Dakota", "SDakota"], "Tennessee", "Texas", "Utah",
                      "Vermont", "Virginia", "Washington", ["West Virginia", "W_Virgi"],
                      "Wisconsin", "Wyoming", "Alberta", ["British Columbia", "BritCol"],
                      "Manitoba", ["New Brunswick", "NBrunsw"],
                      ["Northwest Territories", "NWTerri"], "Newfoundland",
                      ["Nova Scotia", "NovaSco"], "Nunavut", "Ontario",
                      ["Prince Edward Island", "PEI"], "Quebec", "Saskatchewan",
                      "Yukon"]

        state = ""
        shortstate = ""

        engine.table = table
        engine.create_table()

        for state in stateslist:
            try:
                if len(state) > 2:
                    shortstate = state[0:7]
                else:
                    state, shortstate = state[0], state[1]

                print("Inserting data from " + state + "...")
                try:
                    engine.table.cleanup = Cleanup()
                    engine.insert_data_from_archive(
                        self.urls["counts"] + shortstate + ".zip",
                        [shortstate + ".csv"])
                except:
                    print("Failed bulk insert on " + state + ", inserting manually.")
                    engine.connection.rollback()
                    engine.table.cleanup = Cleanup(correct_invalid_value, nulls=['*'])
                    engine.insert_data_from_archive(
                        self.urls["counts"] + shortstate + ".zip",
                        [shortstate + ".csv"])
            except:
                print("There was an error in " + state + ".")
                raise

    except zipfile.BadZipfile:
        print("There was an unexpected error in the Breeding Bird Survey archives.")
        raise

    return engine
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)

    reload(sys)
    if hasattr(sys, 'setdefaultencoding'):
        sys.setdefaultencoding("utf-8")

    self.engine.download_file(self.urls["GWDD"], "GlobalWoodDensityDatabase.xls")
    filename = os.path.basename("GlobalWoodDensityDatabase.xls")
    book = xlrd.open_workbook(self.engine.format_filename(filename))
    sh = book.sheet_by_index(1)
    rows = sh.nrows

    # Creating the data file
    file_path = self.engine.format_filename("gwdd_data.csv")
    gwdd_data = open_fw(file_path)
    csv_writer = open_csvw(gwdd_data)
    csv_writer.writerow(["Number", "Family", "Binomial", "Wood_Density",
                         "Region", "Reference_Number"])

    for index in range(1, rows):
        row = sh.row(index)
        # get each row and format the cell value
        row_as_list = [to_str(column_value.value) for column_value in row]
        csv_writer.writerow(row_as_list)
    gwdd_data.close()

    table = Table("data", delimiter=",")
    table.columns = [("Number", ("pk-int",)),
                     ("Family", ("char",)),
                     ("Binomial", ("char",)),
                     ("Wood_Density", ("double",)),
                     ("Region", ("char",)),
                     ("Reference_Number", ("int",))]
    table.pk = 'Number'
    table.contains_pk = True

    self.engine.table = table
    self.engine.create_table()
    self.engine.insert_data_from_file(engine.format_filename(file_path))

    # Creating the reference table file
    file_path = self.engine.format_filename("gwdd_ref.csv")
    ref_file = open_fw(file_path)
    csv_writerd = open_csvw(ref_file)
    csv_writerd.writerow(["Reference_Number", "Reference"])

    sh = book.sheet_by_index(2)
    rows = sh.nrows
    for index in range(1, rows):
        row = sh.row(index)
        # get each row and format the cell value
        row_as_list = [to_str(column_value.value, object_encoding=sys.stdout)
                       for column_value in row]
        csv_writerd.writerow(row_as_list)
    ref_file.close()

    table = Table("reference", delimiter=",")
    table.columns = [("Reference_Number", ("pk-int",)), ("Reference", ("char",))]
    table.pk = 'Reference_Number'
    table.contains_pk = True

    self.engine.table = table
    self.engine.create_table()
    self.engine.insert_data_from_file(engine.format_filename(file_path))

    return self.engine
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    csv_files = []
    request_src = "http://www.data-retriever.org/"
    base_url = "http://www.usanpn.org/npn_portal/observations/getObservations.xml?start_date={startYear}&end_date={endYear_date}&request_src={request_src}"
    header_values = ["observation_id", "update_datetime", "site_id", "latitude",
                     "longitude", "elevation_in_meters", "state", "species_id",
                     "genus", "species", "common_name", "kingdom", "individual_id",
                     "phenophase_id", "phenophase_description", "observation_date",
                     "day_of_year", "phenophase_status", "intensity_category_id",
                     "intensity_value", "abundance_value"]

    columns = [("record_id", ("pk-auto",)),
               ("observation_id", ("int",)),  # subsequently referred to as "status record"
               ("update_datetime", ("char",)),
               ("site_id", ("int",)),
               ("latitude", ("double",)),
               ("longitude", ("double",)),
               ("elevation_in_meters", ("char",)),
               ("state", ("char",)),
               ("species_id", ("int",)),
               ("genus", ("char",)),
               ("species", ("char",)),
               ("common_name", ("char",)),
               ("kingdom", ("char",)),  # skip kingdom
               ("individual_id", ("char",)),
               ("phenophase_id", ("int",)),
               ("phenophase_description", ("char",)),
               ("observation_date", ("char",)),
               ("day_of_year", ("char",)),
               ("phenophase_status", ("char",)),
               ("intensity_category_id", ("char",)),
               ("intensity_value", ("char",)),
               ("abundance_value", ("char",))]

    start_date = datetime.date(2009, 1, 1)
    end_date = datetime.date.today()

    while start_date < end_date:
        to_date = start_date + datetime.timedelta(90)
        if to_date >= end_date:
            data_url = base_url.format(startYear=str(start_date),
                                       endYear_date=str(end_date),
                                       request_src=request_src)
        else:
            data_url = base_url.format(startYear=str(start_date),
                                       endYear_date=str(to_date),
                                       request_src=request_src)

        xml_file_name = '{}'.format(start_date) + ".xml"
        engine.download_file(data_url, xml_file_name)

        # Create csv files for 3 months
        csv_observation = '{}'.format(start_date) + ".csv"
        csv_files.append(csv_observation)
        csv_buff = open_fw(engine.format_filename(csv_observation))
        csv_writer = open_csvw(csv_buff)
        csv_writer.writerow(header_values)

        # Parse xml to read data
        file_read = ""
        fname = DATA_WRITE_PATH.strip('{dataset}') + 'NPN/' + xml_file_name
        with open(fname, 'r') as fp1:
            file_read = fp1.read()

        root = ET.fromstring(file_read)

        for elements in root:
            index_map = {val: i for i, val in enumerate(header_values)}
            diction = sorted(elements.attrib.items(),
                             key=lambda pair: index_map[pair[0]])
            csv_writer.writerow([x[1] for x in diction])
        csv_buff.close()

        start_date = to_date + datetime.timedelta(1)

    # Create table
    table = Table('obsercations', delimiter=',', pk='record_id', contains_pk=True)
    table.columns = columns
    engine.table = table
    engine.create_table()

    for data_file in csv_files:
        engine.insert_data_from_file(engine.find_file(data_file))

    return engine
def create_table(self):
    """Create the table by creating an empty json file"""
    self.output_file = open_fw(self.table_name())
    self.output_file.write("[")
    self.table_names.append((self.output_file, self.table_name()))
    self.auto_column_number = 1