def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine

    # download and create species table
    table = Table('species')
    self.engine.auto_create_table(table, url=self.urls['species'])
    self.engine.insert_data_from_url(self.urls['species'])

    # State abbreviations with the year the annual inventory began for that state
    stateslist = [('AL', 2001), ('AK', 2004), ('AZ', 2001), ('AR', 2000),
                  ('CA', 2001), ('CO', 2002), ('CT', 2003), ('DE', 2004),
                  ('FL', 2003), ('GA', 1998), ('ID', 2004), ('IL', 2001),
                  ('IN', 1999), ('IA', 1999), ('KS', 2001), ('KY', 1999),
                  ('LA', 2001), ('ME', 1999), ('MD', 2004), ('MA', 2003),
                  ('MI', 2000), ('MN', 1999), ('MO', 1999), ('MT', 2003),
                  ('NE', 2001), ('NV', 2004), ('NH', 2002), ('NJ', 2004),
                  ('NY', 2002), ('NC', 2003), ('ND', 2001), ('OH', 2001),
                  ('OK', 2008), ('OR', 2001), ('PA', 2000), ('RI', 2003),
                  ('SC', 1999), ('SD', 2001), ('TN', 2000), ('TX', 2001),
                  ('UT', 2000), ('VT', 2003), ('VA', 1998), ('WA', 2002),
                  ('WV', 2004), ('WI', 2000), ('PR', 2001)]

    tablelist = ["SURVEY", "PLOT", "COND", "SUBPLOT", "SUBP_COND",
                 "TREE", "SEEDLING"]

    for table in tablelist:
        for state, year in stateslist:
            engine.download_files_from_archive(
                self.urls["main"] + state + "_" + table + ".ZIP",
                [state + "_" + table + ".CSV"])

    for table in tablelist:
        print("Scanning data for table %s..." % table)
        prep_file_name = "%s.csv" % table
        prep_file = open(engine.format_filename(prep_file_name), "w")
        this_file = open(engine.format_filename(stateslist[0][0] + "_" + table + ".CSV"), "r")
        col_names = this_file.readline()
        prep_file.write(col_names)
        column_names = [col.strip('"') for col in col_names.split(',')]
        year_column = column_names.index("INVYR")
        this_file.close()

        for state, year in stateslist:
            this_file = open(engine.format_filename(state + "_" + table + ".CSV"), "r")
            this_file.readline()
            # keep only rows from the year the annual inventory began onwards
            for line in this_file:
                values = line.split(',')
                this_year = values[year_column]
                if int(this_year) >= year:
                    prep_file.write(line)
            this_file.close()
        prep_file.close()

        engine.auto_create_table(Table(table), filename=prep_file_name)
        engine.insert_data_from_file(engine.format_filename(prep_file_name))
        try:
            os.remove(engine.format_filename(prep_file_name))
        except OSError:
            pass

    return engine
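# The INVYR filter above is the crux of the FIA prep step. A minimal
# standalone sketch of the same idea, outside the retriever (hypothetical
# file names; assumes a comma-delimited file with a quoted INVYR header):
def filter_rows_by_inventory_year(src_path, dest_path, start_year):
    """Copy the header plus only the rows whose INVYR >= start_year."""
    with open(src_path) as src, open(dest_path, "w") as dest:
        header = src.readline()
        dest.write(header)
        year_col = [c.strip().strip('"') for c in header.split(',')].index("INVYR")
        for line in src:
            if int(line.split(',')[year_col]) >= start_year:
                dest.write(line)

# e.g. filter_rows_by_inventory_year("AL_TREE.CSV", "TREE.csv", 2001)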
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    self.engine.download_file(self.urls["trees"], "LS_trees_1983_2000.txt")
    data_path = self.engine.format_filename("LS_trees_1983_2000.txt")
    self.engine.auto_create_table(self.tables["trees"],
                                  filename="LS_trees_1983_2000.txt")
    self.engine.insert_data_from_file(data_path)
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    self.engine.download_file(self.urls["main"],
                              "Succession_sampling_03-07_data_original.txt")
    data_path = self.engine.format_filename("Succession_sampling_03-07_data.txt")
    old_data = open(self.engine.find_file("Succession_sampling_03-07_data_original.txt"), 'rb')
    new_data = open(data_path, 'wb')

    # the header is split across the first two lines; join them into one
    line1 = old_data.readline()
    line2 = old_data.readline()
    newline = line1.replace(b"\n", b"\t") + line2
    new_data.write(newline)
    for line in old_data:
        new_data.write(line)
    new_data.close()
    old_data.close()

    self.engine.auto_create_table(self.tables["main"],
                                  filename="Succession_sampling_03-07_data.txt")
    self.engine.insert_data_from_file(data_path)
def download(self, engine=None, debug=False):
    if engine.name != "Download Only":
        raise Exception(
            "The PRISM dataset contains only non-tabular data files, and can "
            "only be used with the 'download only' engine.")
    Script.download(self, engine, debug)

    clim_vars = ['ppt', 'tmax', 'tmean', 'tmin']
    years = list(range(1981, 2015))
    months = ["{:02d}".format(i) for i in range(1, 13)]
    for clim_var in clim_vars:
        mval = "M3" if clim_var == 'ppt' else "M2"
        for year in years:
            for month in months:
                file_names = self.get_file_names(clim_var, mval, year, month)
                file_url = urllib.parse.urljoin(
                    self.urls["climate"],
                    "{}/{}{}".format(clim_var, year, month))
                archivename = "PRISM_{}_stable_4km{}_{}{}_bil.zip".format(
                    clim_var, mval, year, month)
                self.engine.download_files_from_archive(file_url, file_names,
                                                        archivename=archivename,
                                                        keep_in_dir=True)
                self.engine.register_files(file_names)
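# get_file_names() is defined elsewhere in this script. PRISM publishes each
# month as a zip of sidecar files sharing one stem, so a plausible sketch is
# below (the exact extension list is an assumption, not confirmed here):
def get_file_names(self, clim_var, mval, year, month):
    """Return the expected member names inside one monthly PRISM archive."""
    stem = "PRISM_{}_stable_4km{}_{}{}_bil".format(clim_var, mval, year, month)
    return ["{}.{}".format(stem, ext) for ext in ("bil", "hdr", "prj", "stx")]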
def download(self, engine=None, debug=False):
    if engine.name != "Download Only":
        raise Exception("The Bioclim dataset contains only non-tabular data "
                        "files, and can only be used with the 'download only' "
                        "engine.")
    Script.download(self, engine, debug)
    file_names = ["bio%s.bil" % file_num for file_num in range(1, 20)]
    self.engine.download_files_from_archive(self.urls["climate"], file_names)
    self.engine.register_files(file_names)
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    for key in self.urls:
        original_file_name = "trade_prdct_{}.txt".format(key)
        new_file_name = "trade_prdct_{}.csv".format(key)
        engine.download_file(self.urls[key], original_file_name)
        old_path = self.engine.format_filename(original_file_name)
        new_path = self.engine.format_filename(new_file_name)

        # Re-write the file with a single delimiter
        old_data = open_fr(old_path)
        new_data = open_fw(new_path)

        # Read the header line and convert "," to "|"
        line1 = old_data.readline().strip().replace(",", "|")
        new_data.write(line1 + "\n")
        for line in old_data:
            # Remove the leading "|" from each data line
            new_data.write(line.strip("|"))
        new_data.close()
        old_data.close()

        table = Table(key, delimiter="|")
        engine.auto_create_table(table, filename=new_file_name)
        engine.insert_data_from_file(new_path)
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine

    # files are nested in a baad_data folder inside the archive
    # files of interest: baad_data.csv and baad_methods.csv
    # additional files can be added in the same manner
    file_names = ["baad_data/baad_data.csv", "baad_data/baad_methods.csv"]
    engine.download_files_from_archive(self.urls["BAAD"], file_names)

    # create the data table from baad_data.csv
    engine.auto_create_table(Table("data",
                                   cleanup=Cleanup(correct_invalid_value, nulls=['NA'])),
                             filename="baad_data.csv")
    engine.insert_data_from_file(engine.format_filename("baad_data.csv"))

    # create the methods table from baad_methods.csv
    engine.auto_create_table(Table("methods",
                                   cleanup=Cleanup(correct_invalid_value, nulls=['NA'])),
                             filename="baad_methods.csv")
    engine.insert_data_from_file(engine.format_filename("baad_methods.csv"))
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    for key in self.urls:
        self.engine.download_file(self.urls[key],
                                  self.urls[key].rpartition('/')[-1])
        new_file_path = self.engine.format_filename("new" + key)
        old_data = open_fr(self.engine.find_file(self.urls[key].rpartition('/')[-1]))
        new_data = open_fw(new_file_path)
        with old_data as file_block:
            # once past the metadata lines, set data to True
            data = False
            for lines in file_block.readlines():
                # metadata contains lines with no ";" and may have "(;;;;)+" or empty lines
                if not data and (";" not in lines or ";;;;" in lines):
                    pass
                else:
                    data = True
                    new_data.write(lines)
        new_data.close()
        self.engine.auto_create_table(Table(key, cleanup=self.cleanup_func_table),
                                      filename=str("new" + key))
        self.engine.insert_data_from_file(new_file_path)
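# A standalone sketch of the metadata-skip filter used above (hypothetical
# paths; assumes data lines contain ";" separators while metadata lines
# either lack ";" or are runs of ";;;;"):
def strip_metadata(src_path, dest_path):
    """Copy only the data portion of a file whose header is free-form metadata."""
    with open(src_path) as src, open(dest_path, "w") as dest:
        in_data = False
        for line in src:
            if in_data or (";" in line and ";;;;" not in line):
                in_data = True
                dest.write(line)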
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine

    filenames = ['Aquatic_animal_excretion_data.csv',
                 'Aquatic_animal_excretion_variable_descriptions.csv']
    for file_paths in filenames:
        if not os.path.isfile(engine.format_filename(file_paths)):
            url = self.urls["aquatic_animals"]
            engine.download_files_from_archive(url, filenames, "zip")

    # processing Aquatic_animal_excretion_data.csv
    filename = 'Aquatic_animal_excretion_data.csv'
    tablename = 'aquatic_animals'
    table = Table(str(tablename), delimiter=',')
    table.columns = [
        ("index", ("pk-int",)), ("sourcenumber", ("int",)),
        ("sourcename", ("char",)), ("speciesname", ("char",)),
        ("speciescode", ("char",)), ("invert/vert", ("char",)),
        ("phylum", ("char",)), ("class", ("char",)),
        ("order", ("char",)), ("family", ("char",)),
        ("trophicgild", ("char",)), ("drymass", ("double",)),
        ("logdrymass", ("double",)), ("ecosystemtype", ("char",)),
        ("energysource", ("char",)), ("habitat", ("char",)),
        ("residentecosystem", ("char",)), ("temperature", ("double",)),
        ("nexcretionrate", ("double",)), ("pexcretionrate", ("double",)),
        ("lognexcretionrate", ("double",)), ("logpexcretionrate", ("double",)),
        ("incubationtime", ("double",)), ("nform", ("char",)),
        ("pform", ("char",)), ("bodyc", ("double",)),
        ("bodyn", ("double",)), ("bodyp", ("double",)),
        ("bodyc:n", ("double",)), ("bodyc:p", ("double",)),
        ("bodyn:p", ("double",)), ("bodydatasource", ("char",)),
        ("datasource", ("char",)), ("dataproviders", ("char",))]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename(str(filename)))

    # processing Aquatic_animal_excretion_variable_descriptions.csv
    filename = 'Aquatic_animal_excretion_variable_descriptions.csv'
    tablename = 'variable_descriptions'
    table = Table(str(tablename), delimiter=',')
    table.columns = [("Column", ("char",)), ("Variable", ("char",)),
                     ("Description", ("char",)), ("Data Class", ("char",)),
                     ("Units", ("char",)), ("Minimum_value", ("char",)),
                     ("Maximum_value", ("char",)), ("Possible_values", ("char",)),
                     ("Missing_data_symbol", ("char",)), ("Notes", ("char",))]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename(str(filename)))
def download(self, engine=None, debug=False):
    data_file_name = "eBird_Observation_Dataset_2013.csv"
    Script.download(self, engine, debug)
    engine = self.engine
    engine.download_files_from_archive(self.urls["main"],
                                       [data_file_name],
                                       filetype='gz')

    table = Table("main", delimiter=",")
    table.columns = [("BASISOFRECORD", ("char",)), ("INSTITUTIONCODE", ("char",)),
                     ("COLLECTIONCODE", ("char",)), ("CATALOGNUMBER", ("char",)),
                     ("OCCURRENCEID", ("char",)), ("RECORDEDBY", ("char",)),
                     ("YEAR", ("int",)), ("MONTH", ("int",)), ("DAY", ("int",)),
                     ("COUNTRY", ("char",)), ("STATEPROVINCE", ("char",)),
                     ("COUNTY", ("char",)), ("DECIMALLATITUDE", ("double",)),
                     ("DECIMALLONGITUDE", ("double",)), ("LOCALITY", ("char",)),
                     ("KINGDOM", ("char",)), ("PHYLUM", ("char",)),
                     ("CLASS", ("char",)), ("SPORDER", ("char",)),
                     ("FAMILY", ("char",)), ("GENUS", ("char",)),
                     ("SPECIFICEPITHET", ("char",)), ("SCIENTIFICNAME", ("char",)),
                     ("VERNACULARNAME", ("char",)), ("INDIVIDUALCOUNT", ("int",))]
    engine.table = table
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename(data_file_name))
    return engine
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    self.engine.download_file(self.urls["main"],
                              "Succession_sampling_03-07_data_original.txt")
    data_path = self.engine.format_filename("Succession_sampling_03-07_data.txt")
    old_data = open_fr(self.engine.find_file("Succession_sampling_03-07_data_original.txt"))
    new_data = open_fw(data_path)

    # The original file's header contains an end-of-line character in the
    # middle, splitting it across two lines.
    # Read in the two lines and create the full header.
    line1 = old_data.readline().strip()
    line2 = old_data.readline()
    newline = line1 + "\t" + line2
    new_data.write(newline)
    for line in old_data:
        new_data.write(line)
    new_data.close()
    old_data.close()

    self.engine.auto_create_table(self.tables["main"],
                                  filename="Succession_sampling_03-07_data.txt")
    self.engine.insert_data_from_file(data_path)
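# The split-header repair above is a recurring pattern in these scripts; a
# minimal standalone sketch with plain built-in file handling (hypothetical
# file names):
def merge_split_header(src_path, dest_path):
    """Join a header broken across the first two lines, then copy the rest."""
    with open(src_path) as src, open(dest_path, "w") as dest:
        dest.write(src.readline().strip() + "\t" + src.readline())
        for line in src:
            dest.write(line)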
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)

    # structure_plot_year table
    self.engine.auto_create_table(Table("structure_plot_year"),
                                  url=self.urls["structure_plot_year"])
    self.engine.insert_data_from_url(self.urls["structure_plot_year"])

    # plots table
    self.engine.auto_create_table(Table("plots"), url=self.urls["plots"])
    self.engine.insert_data_from_url(self.urls["plots"])

    # species table
    self.engine.download_file(self.urls["species"],
                              "original_MSH_SPECIES_DESCRIPTORS.csv")
    data_path = self.engine.format_filename("MSH_SPECIES_DESCRIPTORS.csv")
    old_data = os.path.normpath(self.engine.find_file("original_MSH_SPECIES_DESCRIPTORS.csv"))
    # copy the file through text mode to normalize line endings
    with open(old_data, 'r') as infile, open(data_path, 'w') as new_data:
        for line in infile:
            new_data.write(line)
    self.engine.auto_create_table(Table("species"),
                                  filename="MSH_SPECIES_DESCRIPTORS.csv")
    self.engine.insert_data_from_file(data_path)

    # species_plot_year table
    table = Table("species_plot_year")
    table.delimiter = ','
    table.columns = [('record_id', ('pk-auto',)),
                     ('plot_id_year', ('char',)),
                     ('plot_name', ('char',)),
                     ('plot_number', ('int',)),
                     ('year', ('int',)),
                     ('species', ('ct_column',)),
                     ('count', ('ct-double',))]
    table.ct_column = 'species'
    table.ct_names = ['Abilas', 'Abipro', 'Achmil', 'Achocc', 'Agoaur',
                      'Agrexa', 'Agrpal', 'Agrsca', 'Alnvir', 'Anamar',
                      'Antmic', 'Antros', 'Aqifor', 'Arcnev', 'Arnlat',
                      'Astled', 'Athdis', 'Blespi', 'Brocar', 'Brosit',
                      'Carmer', 'Carmic', 'Carpac', 'Carpay', 'Carpha',
                      'Carros', 'Carspe', 'Casmin', 'Chaang', 'Cirarv',
                      'Cisumb', 'Crycas', 'Danint', 'Descae', 'Elyely',
                      'Epiana', 'Eriova', 'Eripyr', 'Fesocc', 'Fravir',
                      'Gencal', 'Hiealb', 'Hiegra', 'Hyprad', 'Junmer',
                      'Junpar', 'Juncom', 'Leppun', 'Lommar', 'Luepec',
                      'Luihyp', 'Luplat', 'Luplep', 'Luzpar', 'Maiste',
                      'Pencar', 'Pencon', 'Penser', 'Phahas', 'Phlalp',
                      'Phldif', 'Phyemp', 'Pincon', 'Poasec', 'Poldav',
                      'Polmin', 'Pollon', 'Poljun', 'Popbal', 'Potarg',
                      'Psemen', 'Raccan', 'Rumace', 'Salsit', 'Saxfer',
                      'Senspp', 'Sibpro', 'Sorsit', 'Spiden', 'Trispi',
                      'Tsumer', 'Vacmem', 'Vervir', 'Vioadu', 'Xerten']
    self.engine.table = table
    self.engine.create_table()
    self.engine.insert_data_from_url(self.urls["species_plot_year"])
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    engine.download_files_from_archive(self.urls["data"],
                                       ["PanTHERIA_1-0_WR05_Aug2008.txt"],
                                       filetype="zip")

    # Create table species
    engine.auto_create_table(Table('species', cleanup=self.cleanup_func_table),
                             filename="PanTHERIA_1-0_WR05_Aug2008.txt")
    engine.insert_data_from_file(engine.format_filename("PanTHERIA_1-0_WR05_Aug2008.txt"))
def download(self, engine=None, debug=False):
    if engine.name != "Download Only":
        raise Exception("The Bioclim dataset contains only non-tabular data "
                        "files, and can only be used with the 'download only' "
                        "engine.")
    Script.download(self, engine, debug)
    file_names = []
    for file_num in range(1, 20):
        for ext in ['bil', 'hdr']:
            file_names += ["bio{0}.{1}".format(file_num, ext)]
    self.engine.download_files_from_archive(self.urls["climate"], file_names)
    self.engine.register_files(file_names)
def download(self, engine=None, debug=False):
    if engine.name != "Download Only":
        raise Exception(
            "The mammal-super-tree dataset contains only non-tabular data "
            "files, and can only be used with the 'download only' engine.")
    Script.download(self, engine, debug)
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    file_name = "PanTHERIA_1-0_WR05_Aug2008.txt"
    engine.download_files_from_archive(self.urls["data"], [file_name], "zip")

    # Create table species
    engine.auto_create_table(Table('species', cleanup=self.cleanup_func_table),
                             filename=file_name)
    engine.insert_data_from_file(engine.format_filename(file_name))
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    engine.download_files_from_archive(
        self.urls["data"],
        ["Predator_and_prey_body_sizes_in_marine_food_webs_vsn4.txt"],
        filetype="zip")

    # Create table main
    engine.auto_create_table(
        Table('main', cleanup=self.cleanup_func_table),
        filename="Predator_and_prey_body_sizes_in_marine_food_webs_vsn4.txt")
    engine.insert_data_from_file(
        engine.format_filename("Predator_and_prey_body_sizes_in_marine_food_webs_vsn4.txt"))
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    filename = "Predator_and_prey_body_sizes_in_marine_food_webs_vsn4.txt"
    engine.download_files_from_archive(self.urls["data"], [filename],
                                       filetype="zip")

    # Create table main
    engine.auto_create_table(Table('main', cleanup=self.cleanup_func_table),
                             filename=filename)
    engine.insert_data_from_file(engine.format_filename(filename))
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    file_names = [('Flensburg_Data_Links.csv', 'links'),
                  ('Flensburg_Data_Nodes.csv', 'nodes')]
    engine.download_files_from_archive(self.urls["zip"],
                                       [i[0] for i in file_names],
                                       filetype="zip",
                                       archivename="ECOL_92_174")
    for (filename, tablename) in file_names:
        data_path = self.engine.format_filename(filename)
        self.engine.auto_create_table(Table(str(tablename), cleanup=self.cleanup_func_table),
                                      filename=filename)
        self.engine.insert_data_from_file(data_path)
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    engine.download_files_from_archive(self.urls["data"],
                                       ["PanTHERIA_1-0_WR05_Aug2008.txt"],
                                       filetype="zip")

    # Create table species
    engine.auto_create_table(Table('species',
                                   cleanup=Cleanup(correct_invalid_value, nulls=['NA'])),
                             filename="PanTHERIA_1-0_WR05_Aug2008.txt")
    engine.insert_data_from_file(engine.format_filename("PanTHERIA_1-0_WR05_Aug2008.txt"))
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    filename = 'Aquatic_animal_excretion_data.csv'
    tablename = 'aquatic_animals'
    table = Table(str(tablename), delimiter=',')
    table.columns = [
        ("index", ("pk-int",)), ("sourcenumber", ("int",)),
        ("sourcename", ("char",)), ("speciesname", ("char",)),
        ("speciescode", ("char",)), ("invert/vert", ("char",)),
        ("phylum", ("char",)), ("class", ("char",)),
        ("order", ("char",)), ("family", ("char",)),
        ("trophicgild", ("char",)), ("drymass", ("double",)),
        ("logdrymass", ("double",)), ("ecosystemtype", ("char",)),
        ("energysource", ("char",)), ("habitat", ("char",)),
        ("residentecosystem", ("char",)), ("temperature", ("double",)),
        ("nexcretionrate", ("double",)), ("pexcretionrate", ("double",)),
        ("lognexcretionrate", ("double",)), ("logpexcretionrate", ("double",)),
        ("incubationtime", ("double",)), ("nform", ("char",)),
        ("pform", ("char",)), ("bodyc", ("double",)),
        ("bodyn", ("double",)), ("bodyp", ("double",)),
        ("bodyc:n", ("double",)), ("bodyc:p", ("double",)),
        ("bodyn:p", ("double",)), ("bodydatasource", ("char",)),
        ("datasource", ("char",)), ("dataproviders", ("char",))]
    engine.table = table
    if not os.path.isfile(engine.format_filename(filename)):
        engine.download_files_from_archive(self.urls[tablename], [filename],
                                           filetype="zip")
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename(str(filename)))
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    file_names = [('isotopes.csv', 'isotopes'),
                  ('sources.csv', 'sources'),
                  ('diet.csv', 'diet')]
    engine.download_files_from_archive(self.urls["zip"],
                                       [i[0] for i in file_names],
                                       filetype="zip",
                                       archivename="ECOL_92_97")
    for (filename, tablename) in file_names:
        data_path = self.engine.format_filename(filename)
        self.engine.auto_create_table(Table(str(tablename), cleanup=self.cleanup_func_table),
                                      filename=filename)
        self.engine.insert_data_from_file(data_path)
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    engine.download_files_from_archive(self.urls["data"],
                                       ["UPSP_Demo_data.txt", "UPSP_Species_list2.txt"],
                                       filetype="zip")

    # Create table sp_list (species)
    engine.auto_create_table(Table('sp_list', cleanup=self.cleanup_func_table),
                             filename="UPSP_Species_list2.txt")
    engine.insert_data_from_file(engine.format_filename("UPSP_Species_list2.txt"))

    # Create table ind_loc_girth
    engine.auto_create_table(Table('ind_loc_girth', cleanup=self.cleanup_func_table),
                             filename="UPSP_Demo_data.txt")
    engine.insert_data_from_file(engine.format_filename("UPSP_Demo_data.txt"))
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    file_names = [('Flensburg_Data_Links.csv', 'links'),
                  ('Flensburg_Data_Nodes.csv', 'nodes')]
    engine.download_files_from_archive(self.urls["zip"],
                                       [i[0] for i in file_names],
                                       "zip", False, "ECOL_92_174")
    for (filename, tablename) in file_names:
        data_path = self.engine.format_filename(filename)
        self.engine.auto_create_table(Table(str(tablename), cleanup=self.cleanup_func_table),
                                      filename=filename)
        self.engine.insert_data_from_file(data_path)
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    files = ["Macroplot_data_Rev.txt", "Microplot_data.txt",
             "Site_variables.txt", "Species_list.txt"]
    engine.download_files_from_archive(self.urls["data"], files, filetype="zip")

    # Create table species
    engine.auto_create_table(Table('species', cleanup=self.cleanup_func_table),
                             filename="Species_list.txt")
    engine.insert_data_from_file(engine.format_filename("Species_list.txt"))

    # Create table sites
    engine.auto_create_table(Table('sites', cleanup=self.cleanup_func_table),
                             filename="Site_variables.txt")
    engine.insert_data_from_file(engine.format_filename("Site_variables.txt"))

    # Create table microplots
    table = Table('microplots')
    table.columns = [('record_id', ('pk-auto',)),
                     ('SpCode', ('char', '30')),
                     ('Count', ('ct-int',))]
    table.ct_names = ['BSP1', 'BSP2', 'BSP3', 'BSP4', 'BSP5', 'BSP6', 'BSP7',
                      'BSP8', 'BSP9', 'BSP10', 'BSP11', 'BSP12', 'BSP13',
                      'BSP14', 'BSP15', 'BSP16', 'BSP17', 'BSP18', 'BSP20',
                      'BSP21', 'BSP22', 'BSP23', 'BSP24', 'BSP25', 'BSP26',
                      'BSP27', 'BSP28', 'BSP29', 'BSP30', 'BSP31', 'BSP33',
                      'BSP34', 'BSP35', 'BSP36', 'BSP37', 'BSP41', 'BSP42',
                      'BSP43', 'BSP44', 'BSP45', 'BSP46', 'BSP47', 'BSP48',
                      'BSP49', 'BSP50', 'BSP51', 'BSP52', 'BSP53', 'BSP54',
                      'BSP55', 'BSP56', 'BSP57', 'BSP58', 'BSP59', 'BSP60',
                      'BSP61', 'BSP62', 'BSP63', 'BSP64', 'BSP65', 'BSP66',
                      'BSP67', 'BSP68', 'BSP69', 'BSP70', 'BSP71', 'BSP72',
                      'BSP73', 'BSP74', 'BSP75', 'BSP76', 'BSP78', 'BSP79',
                      'BSP80', 'BSP82', 'BSP83', 'BSP84', 'BSP85', 'BSP86',
                      'BSP87', 'BSP88', 'BSP89', 'BSP90', 'BSP91', 'BSP92',
                      'BSP93', 'BSP94', 'BSP95', 'BSP96', 'BSP97', 'BSP98',
                      'BSP99', 'BSP100', 'BSP101', 'BSP102', 'BSP104']
    table.ct_column = 'PlotID'
    engine.auto_create_table(table, filename="Microplot_data.txt")
    engine.insert_data_from_file(engine.format_filename("Microplot_data.txt"))

    # Create table macroplots
    table = Table('macroplots')
    table.ct_names = ['TreeGirth1', 'TreeGirth2', 'TreeGirth3', 'TreeGirth4',
                      'TreeGirth5']
    table.ct_column = 'Tree'
    table.columns = [('record_id', ('pk-auto',)),
                     ('PlotID', ('char', '20')),
                     ('SpCode', ('char', '30')),
                     ('Girth', ('ct-int',))]
    engine.auto_create_table(table, filename="Macroplot_data_Rev.txt")
    engine.insert_data_from_file(engine.format_filename("Macroplot_data_Rev.txt"))
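# The ct_column/ct_names tables above rely on the retriever's cross-tab
# handling: wide per-plot count columns are melted into long (id, name, value)
# rows. A minimal sketch of that reshaping, independent of the retriever (the
# real work happens inside the engine; this is illustrative only):
def melt_crosstab(header, row, n_id_cols):
    """Turn one wide cross-tab row into long (id..., ct_name, value) rows."""
    ids = tuple(row[:n_id_cols])
    for name, value in zip(header[n_id_cols:], row[n_id_cols:]):
        yield ids + (name, value)

# e.g. list(melt_crosstab(["SpCode", "BSP1", "BSP2"], ["ACRU", 3, 0], 1))
# -> [("ACRU", "BSP1", 3), ("ACRU", "BSP2", 0)]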
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    self.engine.download_file(self.urls["trees"], "LS_trees_1983_2000_original.txt")
    data_path = self.engine.format_filename("LS_trees_1983_2000.txt")
    old_data = open(self.engine.find_file("LS_trees_1983_2000_original.txt"), 'rb')
    new_data = open(data_path, 'wb')

    # write each line lagged by one, which drops the file's final line
    last_line = None
    for line in old_data:
        if last_line:
            new_data.write(last_line)
        last_line = line
    new_data.close()
    old_data.close()

    self.engine.auto_create_table(self.tables["trees"],
                                  filename="LS_trees_1983_2000.txt")
    self.engine.insert_data_from_file(data_path)
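# The lag-by-one loop above drops a file's final line without loading the
# whole file; an equivalent in-memory version, fine for files this small
# (hypothetical paths):
def copy_without_last_line(src_path, dest_path):
    with open(src_path, 'rb') as src, open(dest_path, 'wb') as dest:
        dest.writelines(src.readlines()[:-1])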
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine

    taxa = ('Plant', 'Animal')
    for tax in taxa:
        table = Table(tax.lower() + 's', delimiter=',', header_rows=3,
                      pk='record_id', contains_pk=True)
        columns = [("record_id", ("pk-int",)),
                   ("station_id", ("int",)),
                   ("obs_date", ("char",)),
                   ("ind_id", ("int",)),
                   ("sci_name", ("char",)),
                   ("com_name", ("char",)),
                   ("kingdom", ("char",)),
                   ("pheno_cat", ("char",)),
                   ("pheno_name", ("char",)),
                   ("pheno_status", ("char",)),
                   ("lat", ("double",)),
                   ("lon", ("double",)),
                   ("elevation", ("int",)),
                   ("network_name", ("char",))]
        table.columns = columns
        engine.table = table
        engine.create_table()

        base_url = 'http://www.usanpn.org/getObs/observations/'
        years = range(2009, 2013)
        for year in years:
            if year == 2009 and tax == 'Animal':
                continue
            url = base_url + 'get%s%sDataNoDefinitions' % (year, tax)
            filename = '%s_%s.csv' % (tax, year)
            engine.download_file(url, filename)
            engine.insert_data_from_file(engine.find_file(filename))

    return engine
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    files = ["Macroplot_data_Rev.txt", "Microplot_data.txt",
             "Site_variables.txt", "Species_list.txt"]
    engine.download_files_from_archive(self.urls["data"], files, filetype="zip")

    # Create table species
    engine.auto_create_table(Table('species',
                                   cleanup=Cleanup(correct_invalid_value, nulls=['NA'])),
                             filename="Species_list.txt")
    engine.insert_data_from_file(engine.format_filename("Species_list.txt"))

    # Create table sites
    engine.auto_create_table(Table('sites',
                                   cleanup=Cleanup(correct_invalid_value, nulls=['NA'])),
                             filename="Site_variables.txt")
    engine.insert_data_from_file(engine.format_filename("Site_variables.txt"))

    # Create table microplots
    table = Table('microplots')
    table.columns = [('record_id', ('pk-auto',)),
                     ('SpCode', ('char', '30')),
                     ('Count', ('ct-int',))]
    table.ct_names = ['BSP1', 'BSP2', 'BSP3', 'BSP4', 'BSP5', 'BSP6', 'BSP7',
                      'BSP8', 'BSP9', 'BSP10', 'BSP11', 'BSP12', 'BSP13',
                      'BSP14', 'BSP15', 'BSP16', 'BSP17', 'BSP18', 'BSP20',
                      'BSP21', 'BSP22', 'BSP23', 'BSP24', 'BSP25', 'BSP26',
                      'BSP27', 'BSP28', 'BSP29', 'BSP30', 'BSP31', 'BSP33',
                      'BSP34', 'BSP35', 'BSP36', 'BSP37', 'BSP41', 'BSP42',
                      'BSP43', 'BSP44', 'BSP45', 'BSP46', 'BSP47', 'BSP48',
                      'BSP49', 'BSP50', 'BSP51', 'BSP52', 'BSP53', 'BSP54',
                      'BSP55', 'BSP56', 'BSP57', 'BSP58', 'BSP59', 'BSP60',
                      'BSP61', 'BSP62', 'BSP63', 'BSP64', 'BSP65', 'BSP66',
                      'BSP67', 'BSP68', 'BSP69', 'BSP70', 'BSP71', 'BSP72',
                      'BSP73', 'BSP74', 'BSP75', 'BSP76', 'BSP78', 'BSP79',
                      'BSP80', 'BSP82', 'BSP83', 'BSP84', 'BSP85', 'BSP86',
                      'BSP87', 'BSP88', 'BSP89', 'BSP90', 'BSP91', 'BSP92',
                      'BSP93', 'BSP94', 'BSP95', 'BSP96', 'BSP97', 'BSP98',
                      'BSP99', 'BSP100', 'BSP101', 'BSP102', 'BSP104']
    table.ct_column = 'PlotID'
    engine.auto_create_table(table, filename="Microplot_data.txt")
    engine.insert_data_from_file(engine.format_filename("Microplot_data.txt"))

    # Create table macroplots
    table = Table('macroplots')
    table.ct_names = ['TreeGirth1', 'TreeGirth2', 'TreeGirth3', 'TreeGirth4',
                      'TreeGirth5']
    table.ct_column = 'Tree'
    table.columns = [('record_id', ('pk-auto',)),
                     ('PlotID', ('char', '20')),
                     ('SpCode', ('char', '30')),
                     ('Girth', ('ct-int',))]
    engine.auto_create_table(table, filename="Macroplot_data_Rev.txt")
    engine.insert_data_from_file(engine.format_filename("Macroplot_data_Rev.txt"))
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    engine.download_files_from_archive(self.urls["data"],
                                       ["UPSP_Demo_data.txt", "UPSP_Species_list2.txt"],
                                       archive_type="zip")

    # Create table sp_list (species)
    filename = "UPSP_Species_list2.txt"
    engine.auto_create_table(Table('sp_list', cleanup=self.cleanup_func_table),
                             filename=filename)
    engine.insert_data_from_file(engine.format_filename(filename))

    # Create table ind_loc_girth
    filename = "UPSP_Demo_data.txt"
    engine.auto_create_table(Table('ind_loc_girth', cleanup=self.cleanup_func_table),
                             filename=filename)
    engine.insert_data_from_file(engine.format_filename(filename))
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    # bind the configured engine before using its options
    engine = self.engine

    # IMG
    request_query = "https://viewer.nationalmap.gov/tnmaccess/api/products?&bbox={}&q=&start=&end=&dateType=&datasets=National+Elevation+Dataset+(NED)+1/3+arc-second&prodFormats=IMG&prodExtents=1+x+1+degree&polyCode=&polyType=&max=40&offset=0&_=1519665242114".format(
        ",".join(str(i) for i in engine.opts["bbox"] if i))
    res = requests.get(request_query).text
    data_url = json.loads(res)

    from retriever.lib.table import RasterDataset
    for item in data_url["items"]:
        engine.download_files_from_archive(item["downloadURL"])
    for raster_files in engine.supported_raster(engine.format_data_dir(), [".img"]):
        base_name = os.path.basename(raster_files)
        filename, file_extension = os.path.splitext(base_name)
        table = RasterDataset(name=filename)
        engine.table = table
        engine.auto_create_table(table, filename=base_name)
        engine.insert_raster(raster_files)
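# A hedged usage note: the query above assumes engine.opts["bbox"] holds the
# bounding-box corners as strings or numbers, e.g. (hypothetical values)
#     engine.opts["bbox"] = ["-94.9", "39.1", "-94.6", "39.3"]
# which the join renders as "-94.9,39.1,-94.6,39.3" for the TNM Access API.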
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine

    # files are nested in a baad_data folder inside the archive
    # files of interest: baad_data.csv and baad_methods.csv
    file_names = ["baad_data/baad_data.csv", "baad_data/baad_methods.csv"]
    engine.download_files_from_archive(self.urls["BAAD"], file_names)

    # From retriever 2.1 onwards the extracted files keep their folder
    # prefix, so the filename passed to the engine depends on the installed
    # version. Compare parse_version objects directly; comparing their string
    # forms would be a lexicographic (and therefore unreliable) comparison.
    if parse_version(VERSION) >= parse_version("2.1.dev"):
        data_filename = "baad_data/baad_data.csv"
        methods_filename = "baad_data/baad_methods.csv"
    else:
        data_filename = "baad_data.csv"
        methods_filename = "baad_methods.csv"

    # creating data from baad_data.csv
    engine.auto_create_table(Table("data", cleanup=self.cleanup_func_table),
                             filename=data_filename)
    engine.insert_data_from_file(engine.format_filename(data_filename))

    # creating methods from baad_methods.csv
    engine.auto_create_table(Table("methods", cleanup=self.cleanup_func_table),
                             filename=methods_filename)
    engine.insert_data_from_file(engine.format_filename(methods_filename))
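# Why the comparison above uses parse_version objects rather than their
# string forms: version ordering is not lexicographic. For example:
#     parse_version("2.10") > parse_version("2.9")            # True
#     str(parse_version("2.10")) > str(parse_version("2.9"))  # False; "2.10" sorts before "2.9" as text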
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine

    # Download both the full and abbreviated versions and extract the data files
    abbrev_version = ["ABBREV.txt"]
    full_version = ["DERIV_CD.txt", "FOOTNOTE.txt", "NUTR_DEF.txt",
                    "WEIGHT.txt", "DATA_SRC.txt", "FD_GROUP.txt",
                    "LANGDESC.txt", "NUT_DATA.txt", "DATSRCLN.txt",
                    "FOOD_DES.txt", "LANGUAL.txt", "SRC_CD.txt"]
    self.engine.download_files_from_archive(self.urls["full_version"],
                                            archive_type="zip",
                                            file_names=full_version)
    self.engine.download_files_from_archive(self.urls["abbreviated_version"],
                                            archive_type="zip",
                                            file_names=abbrev_version)

    # Convert the original txt files to csv
    convert_to_csv(self.engine.format_data_dir())

    # FOOD_DES table
    new_file_name = "food_des.csv"
    table = Table("food_des", delimiter=",", header_rows=0)
    table.columns = [("ndb_no", ("int",)),
                     ("fdgrp_cd", ("int",)),
                     ("long_desc", ("char", "205")),
                     ("shrt_desc", ("char", "65")),
                     ("comname", ("char", "105")),
                     ("manufacname", ("char", "70")),
                     ("survey", ("char", "1")),
                     ("ref_desc", ("char", "140")),
                     ("refuse", ("double",)),
                     ("sciname", ("char", "67")),
                     ("n_factor", ("double",)),
                     ("pro_factor", ("double",)),
                     ("fat_factor", ("double",)),
                     ("cho_factor", ("double",))]
    self.create_and_install(new_file_name, table)

    # FD_GROUP table
    new_file_name = "fd_group.csv"
    table = Table("fd_group", delimiter=",", header_rows=0)
    table.columns = [("fdgrp_cd", ("int",)),
                     ("fdgrp_desc", ("char", "65"))]
    self.create_and_install(new_file_name, table)

    # LANGUAL table
    new_file_name = "langual.csv"
    table = Table("langual", delimiter=",", header_rows=0)
    table.columns = [("ndb_no", ("int",)),
                     ("factor_code", ("char", "5"))]
    self.create_and_install(new_file_name, table)

    # LANGDESC table
    new_file_name = "langdesc.csv"
    table = Table("langdesc", delimiter=",", header_rows=0)
    table.columns = [("factor_code", ("char", "5")),
                     ("description", ("char", "145"))]
    self.create_and_install(new_file_name, table)

    # NUT_DATA table
    new_file_name = "nut_data.csv"
    missingValues = ["Unnamed: 6", "Unnamed: 7", "Unnamed: 8", "Unnamed: 9",
                     "Unnamed: 10", "Unnamed: 11", "Unnamed: 12",
                     "Unnamed: 13", "Unnamed: 14", "Unnamed: 15",
                     "Unnamed: 17"]
    table = Table("nut_data", delimiter=",", header_rows=0,
                  missingValues=missingValues, do_not_bulk_insert=True)
    table.columns = [("ndb_no", ("int",)),
                     ("nutr_no", ("int",)),
                     ("nutr_val", ("double",)),
                     ("num_data_pts", ("int",)),
                     ("std_error", ("double",)),
                     ("src_cd", ("int",)),
                     ("deriv_cd", ("char", "12")),
                     ("ref_ndb_no", ("double",)),
                     ("add_nutr_mark", ("char", "12")),
                     ("num_studies", ("double",)),
                     ("min", ("double",)),
                     ("max", ("double",)),
                     ("df", ("double",)),
                     ("low_eb", ("double",)),
                     ("up_eb", ("double",)),
                     ("stat_cmt", ("char", "12")),
                     ("addmod_date", ("char", "12")),
                     ("cc", ("char", "12"))]
    self.create_and_install(new_file_name, table)

    # NUTR_DEF table
    new_file_name = "nutr_def.csv"
    table = Table("nutr_def", delimiter=",", header_rows=0)
    table.columns = [("nutr_no", ("int",)),
                     ("units", ("char", "10")),
                     ("tagname", ("char", "25")),
                     ("nutrdesc", ("char", "60")),
                     ("num_dec", ("int",)),
                     ("sr_order", ("int",))]
    self.create_and_install(new_file_name, table)

    # SRC_CD table
    new_file_name = "src_cd.csv"
    table = Table("src_cd", delimiter=",", header_rows=0)
    table.columns = [("src_cd", ("int",)),
                     ("srccd_desc", ("char", "65"))]
    self.create_and_install(new_file_name, table)

    # DERIV_CD table
    new_file_name = "deriv_cd.csv"
    table = Table("deriv_cd", delimiter=",", header_rows=0)
    table.columns = [("deriv_cd", ("char", "5")),
                     ("deriv_desc", ("char", "130"))]
    self.create_and_install(new_file_name, table)

    # WEIGHT table
    new_file_name = "weight.csv"
    table = Table("weight", delimiter=",", header_rows=0,
                  missingValues=["Unnamed: 5", "Unnamed: 6"])
    table.columns = [("ndb_no", ("int",)),
                     ("seq", ("int",)),
                     ("amount", ("double",)),
                     ("msre_desc", ("char", "130")),
                     ("gm_wgt", ("double",)),
                     ("num_data_pts", ("double",)),
                     ("std_dev", ("double",))]
    self.create_and_install(new_file_name, table)

    # FOOTNOTE table
    new_file_name = "footnote.csv"
    table = Table("footnote", delimiter=",", header_rows=0,
                  missingValues=["Unnamed: 3"])
    table.columns = [("ndb_no", ("int",)),
                     ("footnt_no", ("int",)),
                     ("footnt_typ", ("char", "2")),
                     ("nutr_no", ("double",)),
                     ("footnt_txt", ("char", "200"))]
    self.create_and_install(new_file_name, table)

    # DATSRCLN table
    new_file_name = "datsrcln.csv"
    table = Table("datsrcln", delimiter=",", header_rows=0)
    table.columns = [("ndb_no", ("int",)),
                     ("nutr_no", ("int",)),
                     ("datasrc_id", ("char", "7"))]
    self.create_and_install(new_file_name, table)

    # DATA_SRC table
    new_file_name = "data_src.csv"
    table = Table("data_src", delimiter=",", header_rows=0)
    table.columns = [("datasrc_id", ("char", "7")),
                     ("authors", ("char", "257")),
                     ("title", ("char", "257")),
                     ("year", ("char", "5")),
                     ("journal", ("char", "137")),
                     ("vol_city", ("char", "17")),
                     ("issue_state", ("char", "5")),
                     ("start_page", ("char", "5")),
                     ("end_page", ("char", "5"))]
    self.create_and_install(new_file_name, table)

    # ABBREV table
    new_file_name = "abbrev.csv"
    table = Table("abbrev", delimiter=",", header_rows=0)
    table.columns = [("ndb_no", ("char", "7")),
                     ("shrt_desc", ("char", "60")),
                     ("water", ("double",)), ("energ_kcal", ("int",)),
                     ("protein", ("double",)), ("lipid_tot", ("double",)),
                     ("ash", ("double",)), ("carbohydrt", ("double",)),
                     ("fiber_td", ("double",)), ("sugar_tot", ("char", "6")),
                     ("calcium", ("int",)), ("iron", ("double",)),
                     ("magnesium", ("int",)), ("phosphorus", ("int",)),
                     ("potassium", ("int",)), ("sodium", ("int",)),
                     ("zinc", ("double",)), ("copper", ("double",)),
                     ("manganese", ("double",)), ("selenium", ("double",)),
                     ("vit_c", ("double",)), ("thiamin", ("double",)),
                     ("riboflavin", ("double",)), ("niacin", ("double",)),
                     ("panto_acid", ("double",)), ("vit_b6", ("double",)),
                     ("folate_tot", ("int",)), ("folic_acid", ("int",)),
                     ("food_folate", ("int",)), ("folate_dfe", ("int",)),
                     ("choline_tot", ("double",)), ("vit_b12", ("double",)),
                     ("vit_a_iu", ("int",)), ("vit_a_rae", ("int",)),
                     ("retinol", ("int",)), ("alpha_carot", ("int",)),
                     ("beta_carot", ("int",)), ("beta_crypt", ("int",)),
                     ("lycopene", ("int",)), ("lut_zea", ("int",)),
                     ("vit_e", ("double",)), ("vit_d_mcg", ("double",)),
                     ("vit_d_iu", ("int",)), ("vit_k", ("double",)),
                     ("fa_sat", ("double",)), ("fa_mono", ("double",)),
                     ("fa_poly", ("double",)), ("cholestrl", ("int",)),
                     ("gmwt_1", ("double",)), ("gmwt_desc1", ("char", "80")),
                     ("gmwt_2", ("double",)), ("gmwt_desc2", ("char", "80")),
                     ("refuse_pct", ("int",))]
    self.create_and_install(new_file_name, table)
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine

    table = self.tables["mass"]
    # Database column names and their data types. Use data type "skip" to
    # skip the value, and "combine" to merge a string value into the
    # previous column.
    table.columns = [("record_id", ("pk-auto",)),
                     ("family", ("char", 20)),
                     ("genus", ("char", 20)),
                     ("species", ("char", 20)),
                     ("subspecies", ("char", 20)),
                     ("common_name", ("char", 50)),
                     ("sex", ("char", 20)),
                     ("N", ("double",)),
                     ("mean", ("double",)),
                     ("std_dev", ("double",)),
                     ("min", ("double",)),
                     ("max", ("double",)),
                     ("season", ("char", 2)),
                     ("location", ("char", 50)),
                     ("source_num", ("char", 50))]
    engine.table = table
    engine.create_table()

    file_list = ["broadbills - tapaculos", "cotingas - NZ wrens",
                 "HA honeycreepers - icterids", "honeyeaters - corvids",
                 "jacanas - doves", "larks - accentors",
                 "muscicapids - babblers", "ostrich - waterfowl",
                 "parrotbills - sugarbirds", "parrots - nightjars",
                 "starlings - finches", "swifts - woodpeckers",
                 "thrushes - gnatcatchers", "vultures - bustards"]

    lines = []
    for file in file_list:
        filename = file + ".xls"
        full_filename = engine.format_filename(filename)
        # Make sure the file exists
        if not os.path.isfile(full_filename):
            raise Exception("Missing raw data file: " + full_filename)

        # Open the Excel file with xlrd
        book = xlrd.open_workbook(full_filename)
        sh = book.sheet_by_index(0)
        print("Inserting data from " + filename + " . . .")
        rows = sh.nrows
        cols = 11
        lines = []
        lastrow = None
        lastvalues = None
        family = ""
        for n in range(rows):
            row = sh.row(n)
            if len(row) == 0:
                continue
            empty_cols = len([cell for cell in row[0:11] if Excel.empty_cell(cell)])
            # Skip this row if all cells or all cells but one are empty,
            # or if it's the legend row
            if ((empty_cols == cols)
                    or Excel.cell_value(row[0]) == "Scientific Name"
                    or Excel.cell_value(row[0])[0:7] == "Species"):
                pass
            elif empty_cols == cols - 1:
                if "Family" in Excel.cell_value(row[0]):
                    family = Excel.cell_value(row[0]).lstrip("Family ").title()
                    continue
                else:
                    if not Excel.empty_cell(row[0]):
                        lastvalues[3] = Excel.cell_value(row[0])
            else:
                # Values: 0=family 1=genus 2=species 3=subspecies
                # 4=common_name 5=sex 6=N 7=mean 8=std_dev 9=min 10=max
                # 11=season 12=location 13=source_num
                values = []
                values.append(family)
                # If the first two columns are empty, but not all of them are,
                # use the first two columns from the previous row
                if Excel.empty_cell(row[0]) and Excel.empty_cell(row[1]):
                    for value in sci_name(Excel.cell_value(lastrow[0])):
                        values.append(value)
                    values.append(Excel.cell_value(lastrow[1]))
                else:
                    if len(Excel.cell_value(row[0]).split()) == 1:
                        # If the scientific name is missing genus/species,
                        # fill it in from the previous row
                        values.append(lastvalues[1])
                        values.append(lastvalues[2])
                        values.append(lastvalues[3])
                        for i in range(0, 3):
                            if not values[3 - i]:
                                values[3 - i] = Excel.cell_value(row[0])
                                break
                        # Add new information to the previous scientific name
                        if lastvalues:
                            lastvalues[1:4] = values[1:4]
                    else:
                        for value in sci_name(Excel.cell_value(row[0])):
                            values.append(value)
                    values.append(Excel.cell_value(row[1]))
                if Excel.cell_value(row[2]) == "M":
                    values.append("Male")
                elif Excel.cell_value(row[2]) == "F":
                    values.append("Female")
                elif Excel.cell_value(row[2]) == "B":
                    values.append("Both")
                elif Excel.cell_value(row[2]) == "U":
                    values.append("Unknown")
                else:
                    values.append(Excel.cell_value(row[2]))
                # Enter remaining values from cells
                for i in range(3, cols):
                    values.append(Excel.cell_value(row[i]))
                # If there isn't a common name or location, get it from
                # the previous row
                if not values[4]:
                    values[4] = lastvalues[4]
                if not values[12]:
                    if lastvalues:
                        if lastvalues[5]:
                            if lastvalues[5] == "Male" and values[5] == "Female":
                                values[12] = lastvalues[12]
                # Insert the previous row into the database
                if lastvalues:
                    lines.append('~'.join(lastvalues))
                lastrow = row
                lastvalues = values
        if lines:
            lines.append('~'.join(lastvalues))
            engine.add_to_table(lines)

    return engine
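# sci_name() and Excel are helpers defined elsewhere in the package. From the
# way sci_name is used above, it plausibly splits a scientific name into
# [genus, species, subspecies], padding with empty strings; a hypothetical
# sketch:
def sci_name(name):
    parts = name.split()
    return (parts + ["", "", ""])[:3]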
def download(self, engine=None, debug=False):
    Script.download(self, engine, debug)
    engine = self.engine
    filename = 'vertnet_latest_reptiles.csv'
    tablename = 'reptiles'
    table = Table(str(tablename), delimiter=',')
    table.columns = [
        ("record_id", ("pk-auto",)), ("beginrecord", ("char",)),
        ("icode", ("char",)), ("title", ("char",)),
        ("citation", ("char",)), ("contact", ("char",)),
        ("email", ("char",)), ("emlrights", ("char",)),
        ("gbifdatasetid", ("char",)), ("gbifpublisherid", ("char",)),
        ("doi", ("char",)), ("migrator", ("char",)),
        ("networks", ("char",)), ("orgcountry", ("char",)),
        ("orgname", ("char",)), ("orgstateprovince", ("char",)),
        ("pubdate", ("char",)), ("source_url", ("char",)),
        ("iptrecordid", ("char",)), ("associatedmedia", ("char",)),
        ("associatedoccurrences", ("char",)), ("associatedorganisms", ("char",)),
        ("associatedreferences", ("char",)), ("associatedsequences", ("char",)),
        ("associatedtaxa", ("char",)), ("bed", ("char",)),
        ("behavior", ("char",)), ("catalognumber", ("char",)),
        ("continent", ("char",)), ("coordinateprecision", ("char",)),
        ("coordinateuncertaintyinmeters", ("char",)), ("country", ("char",)),
        ("countrycode", ("char",)), ("county", ("char",)),
        ("dateidentified", ("char",)), ("day", ("char",)),
        ("decimallatitude", ("char",)), ("decimallongitude", ("char",)),
        ("disposition", ("char",)), ("earliestageorloweststage", ("char",)),
        ("earliesteonorlowesteonothem", ("char",)), ("earliestepochorlowestseries", ("char",)),
        ("earliesteraorlowesterathem", ("char",)), ("earliestperiodorlowestsystem", ("char",)),
        ("enddayofyear", ("char",)), ("establishmentmeans", ("char",)),
        ("eventdate", ("char",)), ("eventid", ("char",)),
        ("eventremarks", ("char",)), ("eventtime", ("char",)),
        ("fieldnotes", ("char",)), ("fieldnumber", ("char",)),
        ("footprintspatialfit", ("char",)), ("footprintsrs", ("char",)),
        ("footprintwkt", ("char",)), ("formation", ("char",)),
        ("geodeticdatum", ("char",)), ("geologicalcontextid", ("char",)),
        ("georeferencedby", ("char",)), ("georeferenceddate", ("char",)),
        ("georeferenceprotocol", ("char",)), ("georeferenceremarks", ("char",)),
        ("georeferencesources", ("char",)), ("georeferenceverificationstatus", ("char",)),
        ("group", ("char",)), ("habitat", ("char",)),
        ("highergeography", ("char",)), ("highergeographyid", ("char",)),
        ("highestbiostratigraphiczone", ("char",)), ("identificationid", ("char",)),
        ("identificationqualifier", ("char",)), ("identificationreferences", ("char",)),
        ("identificationremarks", ("char",)), ("identificationverificationstatus", ("char",)),
        ("identifiedby", ("char",)), ("individualcount", ("char",)),
        ("island", ("char",)), ("islandgroup", ("char",)),
        ("latestageorhigheststage", ("char",)), ("latesteonorhighesteonothem", ("char",)),
        ("latestepochorhighestseries", ("char",)), ("latesteraorhighesterathem", ("char",)),
        ("latestperiodorhighestsystem", ("char",)), ("lifestage", ("char",)),
        ("lithostratigraphicterms", ("char",)), ("locality", ("char",)),
        ("locationaccordingto", ("char",)), ("locationid", ("char",)),
        ("locationremarks", ("char",)), ("lowestbiostratigraphiczone", ("char",)),
        ("materialsampleid", ("char",)), ("maximumdepthinmeters", ("char",)),
        ("maximumdistanceabovesurfaceinmeters", ("char",)), ("maximumelevationinmeters", ("char",)),
        ("member", ("char",)), ("minimumdepthinmeters", ("char",)),
        ("minimumdistanceabovesurfaceinmeters", ("char",)), ("minimumelevationinmeters", ("char",)),
        ("month", ("char",)), ("municipality", ("char",)),
        ("occurrenceid", ("char",)), ("occurrenceremarks", ("char",)),
        ("occurrencestatus", ("char",)), ("organismid", ("char",)),
        ("organismname", ("char",)), ("organismremarks", ("char",)),
        ("organismscope", ("char",)), ("othercatalognumbers", ("char",)),
        ("pointradiusspatialfit", ("char",)), ("preparations", ("char",)),
        ("previousidentifications", ("char",)), ("recordedby", ("char",)),
        ("recordnumber", ("char",)), ("reproductivecondition", ("char",)),
        ("samplingeffort", ("char",)), ("samplingprotocol", ("char",)),
        ("sex", ("char",)), ("startdayofyear", ("char",)),
        ("stateprovince", ("char",)), ("typestatus", ("char",)),
        ("verbatimcoordinates", ("char",)), ("verbatimcoordinatesystem", ("char",)),
        ("verbatimdepth", ("char",)), ("verbatimelevation", ("char",)),
        ("verbatimeventdate", ("char",)), ("verbatimlatitude", ("char",)),
        ("verbatimlocality", ("char",)), ("verbatimlongitude", ("char",)),
        ("verbatimsrs", ("char",)), ("waterbody", ("char",)),
        ("year", ("char",)), ("dctype", ("char",)),
        ("modified", ("char",)), ("language", ("char",)),
        ("license", ("char",)), ("rightsholder", ("char",)),
        ("accessrights", ("char",)), ("bibliographiccitation", ("char",)),
        ("dc_references", ("char",)), ("institutionid", ("char",)),
        ("collectionid", ("char",)), ("datasetid", ("char",)),
        ("institutioncode", ("char",)), ("collectioncode", ("char",)),
        ("datasetname", ("char",)), ("ownerinstitutioncode", ("char",)),
        ("basisofrecord", ("char",)), ("informationwithheld", ("char",)),
        ("datageneralizations", ("char",)), ("dynamicproperties", ("char",)),
        ("scientificnameid", ("char",)), ("namepublishedinid", ("char",)),
        ("scientificname", ("char",)), ("acceptednameusage", ("char",)),
        ("originalnameusage", ("char",)), ("namepublishedin", ("char",)),
        ("namepublishedinyear", ("char",)), ("higherclassification", ("char",)),
        ("kingdom", ("char",)), ("phylum", ("char",)),
        ("class", ("char",)), ("order", ("char",)),
        ("family", ("char",)), ("genus", ("char",)),
        ("subgenus", ("char",)), ("specificepithet", ("char",)),
        ("infraspecificepithet", ("char",)), ("taxonrank", ("char",)),
        ("verbatimtaxonrank", ("char",)), ("scientificnameauthorship", ("char",)),
        ("vernacularname", ("char",)), ("nomenclaturalcode", ("char",)),
        ("taxonomicstatus", ("char",)), ("keyname", ("char",)),
        ("haslicense", ("int",)), ("vntype", ("char",)),
        ("rank", ("int",)), ("mappable", ("int",)),
        ("hashid", ("char",)), ("hastypestatus", ("int",)),
        ("wascaptive", ("int",)), ("wasinvasive", ("int",)),
        ("hastissue", ("int",)), ("hasmedia", ("int",)),
        ("isfossil", ("int",)), ("haslength", ("int",)),
        ("haslifestage", ("int",)), ("hasmass", ("int",)),
        ("hassex", ("int",)), ("lengthinmm", ("double",)),
        ("massing", ("double",)), ("lengthunitsinferred", ("char",)),
        ("massunitsinferred", ("char",)), ("underivedlifestage", ("char",)),
        ("underivedsex", ("char",))]
    engine.table = table
    if not os.path.isfile(engine.format_filename(filename)):
        engine.download_files_from_archive(self.urls[tablename], [filename],
                                           "zip", False,
                                           "vertnet_latest_" + str(tablename))
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename(str(filename)))
def download(self, engine=None, debug=False): Script.download(self, engine, debug) engine = self.engine filename = 'vertnet_latest_mammals.csv' tablename = 'mammals' table = Table(str(tablename), delimiter=',') table.columns = [ ("record_id", ("pk-auto",)), ("beginrecord", ("char",)), ("icode", ("char",)), ("title", ("char",)), ("citation", ("char",)), ("contact", ("char",)), ("email", ("char",)), ("emlrights", ("char",)), ("gbifdatasetid", ("char",)), ("gbifpublisherid", ("char",)), ("doi", ("char",)), ("migrator", ("char",)), ("networks", ("char",)), ("orgcountry", ("char",)), ("orgname", ("char",)), ("orgstateprovince", ("char",)), ("pubdate", ("char",)), ("source_url", ("char",)), ("iptrecordid", ("char",)), ("associatedmedia", ("char",)), ("associatedoccurrences", ("char",)), ("associatedorganisms", ("char",)), ("associatedreferences", ("char",)), ("associatedsequences", ("char",)), ("associatedtaxa", ("char",)), ("bed", ("char",)), ("behavior", ("char",)), ("catalognumber", ("char",)), ("continent", ("char",)), ("coordinateprecision", ("char",)), ("coordinateuncertaintyinmeters", ("char",)), ("country", ("char",)), ("countrycode", ("char",)), ("county", ("char",)), ("dateidentified", ("char",)), ("day", ("char",)), ("decimallatitude", ("char",)), ("decimallongitude", ("char",)), ("disposition", ("char",)), ("earliestageorloweststage", ("char",)), ("earliesteonorlowesteonothem", ("char",)), ("earliestepochorlowestseries", ("char",)), ("earliesteraorlowesterathem", ("char",)), ("earliestperiodorlowestsystem", ("char",)), ("enddayofyear", ("char",)), ("establishmentmeans", ("char",)), ("eventdate", ("char",)), ("eventid", ("char",)), ("eventremarks", ("char",)), ("eventtime", ("char",)), ("fieldnotes", ("char",)), ("fieldnumber", ("char",)), ("footprintspatialfit", ("char",)), ("footprintsrs", ("char",)), ("footprintwkt", ("char",)), ("formation", ("char",)), ("geodeticdatum", ("char",)), ("geologicalcontextid", ("char",)), ("georeferencedby", ("char",)), ("georeferenceddate", ("char",)), ("georeferenceprotocol", ("char",)), ("georeferenceremarks", ("char",)), ("georeferencesources", ("char",)), ("georeferenceverificationstatus", ("char",)), ("group", ("char",)), ("habitat", ("char",)), ("highergeography", ("char",)), ("highergeographyid", ("char",)), ("highestbiostratigraphiczone", ("char",)), ("identificationid", ("char",)), ("identificationqualifier", ("char",)), ("identificationreferences", ("char",)), ("identificationremarks", ("char",)), ("identificationverificationstatus", ("char",)), ("identifiedby", ("char",)), ("individualcount", ("char",)), ("island", ("char",)), ("islandgroup", ("char",)), ("latestageorhigheststage", ("char",)), ("latesteonorhighesteonothem", ("char",)), ("latestepochorhighestseries", ("char",)), ("latesteraorhighesterathem", ("char",)), ("latestperiodorhighestsystem", ("char",)), ("lifestage", ("char",)), ("lithostratigraphicterms", ("char",)), ("locality", ("char",)), ("locationaccordingto", ("char",)), ("locationid", ("char",)), ("locationremarks", ("char",)), ("lowestbiostratigraphiczone", ("char",)), ("materialsampleid", ("char",)), ("maximumdepthinmeters", ("char",)), ("maximumdistanceabovesurfaceinmeters", ("char",)), ("maximumelevationinmeters", ("char",)), ("member", ("char",)), ("minimumdepthinmeters", ("char",)), ("minimumdistanceabovesurfaceinmeters", ("char",)), ("minimumelevationinmeters", ("char",)), ("month", ("char",)), ("municipality", ("char",)), ("occurrenceid", ("char",)), ("occurrenceremarks", ("char",)), ("occurrencestatus", 
("char",)), ("organismid", ("char",)), ("organismname", ("char",)), ("organismremarks", ("char",)), ("organismscope", ("char",)), ("othercatalognumbers", ("char",)), ("pointradiusspatialfit", ("char",)), ("preparations", ("char",)), ("previousidentifications", ("char",)), ("recordedby", ("char",)), ("recordnumber", ("char",)), ("reproductivecondition", ("char",)), ("samplingeffort", ("char",)), ("samplingprotocol", ("char",)), ("sex", ("char",)), ("startdayofyear", ("char",)), ("stateprovince", ("char",)), ("typestatus", ("char",)), ("verbatimcoordinates", ("char",)), ("verbatimcoordinatesystem", ("char",)), ("verbatimdepth", ("char",)), ("verbatimelevation", ("char",)), ("verbatimeventdate", ("char",)), ("verbatimlatitude", ("char",)), ("verbatimlocality", ("char",)), ("verbatimlongitude", ("char",)), ("verbatimsrs", ("char",)), ("waterbody", ("char",)), ("year", ("char",)), ("dctype", ("char",)), ("modified", ("char",)), ("language", ("char",)), ("license", ("char",)), ("rightsholder", ("char",)), ("accessrights", ("char",)), ("bibliographiccitation", ("char",)), ("dc_references", ("char",)), ("institutionid", ("char",)), ("collectionid", ("char",)), ("datasetid", ("char",)), ("institutioncode", ("char",)), ("collectioncode", ("char",)), ("datasetname", ("char",)), ("ownerinstitutioncode", ("char",)), ("basisofrecord", ("char",)), ("informationwithheld", ("char",)), ("datageneralizations", ("char",)), ("dynamicproperties", ("char",)), ("scientificnameid", ("char",)), ("namepublishedinid", ("char",)), ("scientificname", ("char",)), ("acceptednameusage", ("char",)), ("originalnameusage", ("char",)), ("namepublishedin", ("char",)), ("namepublishedinyear", ("char",)), ("higherclassification", ("char",)), ("kingdom", ("char",)), ("phylum", ("char",)), ("class", ("char",)), ("order", ("char",)), ("family", ("char",)), ("genus", ("char",)), ("subgenus", ("char",)), ("specificepithet", ("char",)), ("infraspecificepithet", ("char",)), ("taxonrank", ("char",)), ("verbatimtaxonrank", ("char",)), ("scientificnameauthorship", ("char",)), ("vernacularname", ("char",)), ("nomenclaturalcode", ("char",)), ("taxonomicstatus", ("char",)), ("keyname", ("char",)), ("haslicense", ("int",)), ("vntype", ("char",)), ("rank", ("int",)), ("mappable", ("int",)), ("hashid", ("char",)), ("hastypestatus", ("int",)), ("wascaptive", ("int",)), ("wasinvasive", ("int",)), ("hastissue", ("int",)), ("hasmedia", ("int",)), ("isfossil", ("int",)), ("haslength", ("int",)), ("haslifestage", ("int",)), ("hasmass", ("int",)), ("hassex", ("int",)), ("lengthinmm", ("double",)), ("massing", ("double",)), ("lengthunitsinferred", ("char",)), ("massunitsinferred", ("char",)), ("underivedlifestage", ("char",)), ("underivedsex", ("char",))] engine.table = table if not os.path.isfile(engine.format_filename(filename)): engine.download_files_from_archive(self.urls[tablename], [filename], filetype="zip", archivename="vertnet_latest_" + str(tablename)) engine.create_table() engine.insert_data_from_file(engine.format_filename(str(filename)))
def download(self, engine=None, debug=False): Script.download(self, engine, debug) engine = self.engine csv_files = [] request_src = "http://www.data-retriever.org/" base_url = "http://www.usanpn.org/npn_portal/observations/getObservations.xml?start_date={startYear}&end_date={endYear_date}&request_src={request_src}" header_values = ["observation_id", "update_datetime", "site_id", "latitude", "longitude", "elevation_in_meters", "state", "species_id", "genus", "species", "common_name", "kingdom", "individual_id", "phenophase_id", "phenophase_description", "observation_date", "day_of_year", "phenophase_status", "intensity_category_id", "intensity_value", "abundance_value" ] columns = [("record_id", ("pk-auto",)), ("observation_id", ("int",)), # subsequently referred to as "status record" ("update_datetime", ("char",)), ("site_id", ("int",)), ("latitude", ("double",)), ("longitude", ("double",)), ("elevation_in_meters", ("char",)), ("state", ("char",)), ("species_id", ("int",)), ("genus", ("char",)), ("species", ("char",)), ("common_name", ("char",)), ("kingdom", ("char",)), ("individual_id", ("char",)), ("phenophase_id", ("int",)), ("phenophase_description", ("char",)), ("observation_date", ("char",)), ("day_of_year", ("char",)), ("phenophase_status", ("char",)), ("intensity_category_id", ("char",)), ("intensity_value", ("char",)), ("abundance_value", ("char",)) ] start_date = datetime.date(2009, 1, 1) end_date = datetime.date.today() while start_date < end_date: to_date = start_date + datetime.timedelta(90) if to_date >= end_date: data_url = base_url.format(startYear=str(start_date), endYear_date=str(end_date), request_src=request_src) else: data_url = base_url.format(startYear=str(start_date), endYear_date=str(to_date), request_src=request_src) xml_file_name = '{}'.format(start_date) + ".xml" engine.download_file(data_url, xml_file_name) # Create a csv file for each ~90-day window csv_observation = '{}'.format(start_date) + ".csv" csv_files.append(csv_observation) csv_buff = open_fw(engine.format_filename(csv_observation)) csv_writer = open_csvw(csv_buff) csv_writer.writerow(header_values) # Parse xml to read data fname = DATA_WRITE_PATH.replace('{dataset}', '') + 'NPN/' + xml_file_name with open(fname, 'r') as fp1: file_read = fp1.read() root = ET.fromstring(file_read) for elements in root: index_map = {val: i for i, val in enumerate(header_values)} diction = sorted(elements.attrib.items(), key=lambda pair: index_map[pair[0]]) csv_writer.writerow([x[1] for x in diction]) csv_buff.close() start_date = to_date + datetime.timedelta(1) # Create table table = Table('observations', delimiter=',', pk='record_id', contains_pk=True) table.columns = columns engine.table = table engine.create_table() for data_file in csv_files: engine.insert_data_from_file(engine.find_file(data_file)) return engine
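The request windowing above is easy to get subtly wrong at the window boundaries, so here is a standalone sketch of the same ~90-day chunking that can be checked in isolation; it uses only the standard library:

import datetime

def date_windows(start, end, days=90):
    """Yield consecutive (window_start, window_end) pairs covering [start, end]."""
    while start < end:
        stop = min(start + datetime.timedelta(days), end)
        yield start, stop
        start = stop + datetime.timedelta(1)

# Windows abut without overlapping: each starts the day after the last ended.
for s, e in date_windows(datetime.date(2009, 1, 1), datetime.date(2009, 12, 31)):
    print(s, e)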
def download(self, engine=None, debug=False): try: Script.download(self, engine, debug) engine = self.engine # Species table table = Table("species", cleanup=Cleanup(), contains_pk=True, header_rows=11) table.columns = [ ("species_id", ("pk-int", )), ("AOU", ("int", )), ("english_common_name", ("char", 50)), ("french_common_name", ("char", 50)), ("spanish_common_name", ("char", 50)), ("sporder", ("char", 30)), ("family", ("char", 30)), ("genus", ("char", 30)), ("species", ("char", 50)), ] table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50] engine.table = table engine.create_table() engine.insert_data_from_url(self.urls["species"]) # Routes table engine.download_files_from_archive(self.urls["routes"], ["routes.csv"], archive_name="routes.zip") engine.auto_create_table(Table("routes", cleanup=Cleanup()), filename="routes.csv") engine.insert_data_from_file(engine.format_filename("routes.csv")) # Weather table engine.download_files_from_archive(self.urls["weather"], ["weather.csv"], archive_name="weather.zip") engine.auto_create_table(Table("weather", pk="RouteDataId", cleanup=self.cleanup_func_table), filename="weather.csv") engine.insert_data_from_file(engine.format_filename("weather.csv")) # Migrations data engine.download_files_from_archive( self.urls["migrants"], archive_name="MigrantNonBreeder.zip") engine.extract_zip( engine.format_filename("MigrantNonBreeder/Migrants.zip"), engine.format_filename("Migrant"), ) engine.extract_zip( engine.format_filename("MigrantNonBreeder/MigrantSummary.zip"), engine.format_filename("MigrantSummary"), ) table = Table("migrants", cleanup=Cleanup()) table.columns = [('routedataid', ('int', )), ('countrynum', ('int', )), ('statenum', ('int', )), ('route', ('int', )), ('rpid', ('int', )), ('year', ('int', )), ('aou', ('int', )), ('stop1', ('int', )), ('stop2', ('int', )), ('stop3', ('int', )), ('stop4', ('int', )), ('stop5', ('int', )), ('stop6', ('int', )), ('stop7', ('int', )), ('stop8', ('int', )), ('stop9', ('int', )), ('stop10', ('int', )), ('stop11', ('int', )), ('stop12', ('int', )), ('stop13', ('int', )), ('stop14', ('int', )), ('stop15', ('int', )), ('stop16', ('int', )), ('stop17', ('int', )), ('stop18', ('int', )), ('stop19', ('int', )), ('stop20', ('int', )), ('stop21', ('int', )), ('stop22', ('int', )), ('stop23', ('int', )), ('stop24', ('int', )), ('stop25', ('int', )), ('stop26', ('int', )), ('stop27', ('int', )), ('stop28', ('int', )), ('stop29', ('int', )), ('stop30', ('int', )), ('stop31', ('int', )), ('stop32', ('int', )), ('stop33', ('int', )), ('stop34', ('int', )), ('stop35', ('int', )), ('stop36', ('int', )), ('stop37', ('int', )), ('stop38', ('int', )), ('stop39', ('int', )), ('stop40', ('int', )), ('stop41', ('int', )), ('stop42', ('int', )), ('stop43', ('int', )), ('stop44', ('int', )), ('stop45', ('int', )), ('stop46', ('int', )), ('stop47', ('int', )), ('stop48', ('int', )), ('stop49', ('int', )), ('stop50', ('int', ))] engine.table = table engine.create_table() engine.insert_data_from_file( engine.format_filename("Migrant/Migrants.csv")) table = Table("migrantsummary", cleanup=Cleanup()) table.columns = [('routedataid', ('int', )), ('countrynum', ('int', )), ('statenum', ('int', )), ('route', ('int', )), ('rpid', ('int', )), ('year', ('int', )), ('aou', ('int', )), ('count10', ('int', )), ('count20', ('int', )), ('count30', ('int', )), ('count40', ('int', )), ('count50', ('int', )), ('stoptotal', ('int', )), ('speciestotal', ('int', ))] engine.table = table engine.create_table() engine.insert_data_from_file( 
engine.format_filename("MigrantSummary/MigrantSummary.csv")) table = Table("vehicledata", cleanup=Cleanup()) table.columns = [('routedataid', ('int', )), ('countrynum', ('int', )), ('statenum', ('int', )), ('route', ('int', )), ('rpid', ('int', )), ('year', ('int', )), ('recordedcar', ('char', )), ('car1', ('int', )), ('car2', ('int', )), ('car3', ('int', )), ('car4', ('int', )), ('car5', ('int', )), ('car6', ('int', )), ('car7', ('int', )), ('car8', ('int', )), ('car9', ('int', )), ('car10', ('int', )), ('car11', ('int', )), ('car12', ('int', )), ('car13', ('int', )), ('car14', ('int', )), ('car15', ('int', )), ('car16', ('int', )), ('car17', ('int', )), ('car18', ('int', )), ('car19', ('int', )), ('car20', ('int', )), ('car21', ('int', )), ('car22', ('int', )), ('car23', ('int', )), ('car24', ('int', )), ('car25', ('int', )), ('car26', ('int', )), ('car27', ('int', )), ('car28', ('int', )), ('car29', ('int', )), ('car30', ('int', )), ('car31', ('int', )), ('car32', ('int', )), ('car33', ('int', )), ('car34', ('int', )), ('car35', ('int', )), ('car36', ('int', )), ('car37', ('int', )), ('car38', ('int', )), ('car39', ('int', )), ('car40', ('int', )), ('car41', ('int', )), ('car42', ('int', )), ('car43', ('int', )), ('car44', ('int', )), ('car45', ('int', )), ('car46', ('int', )), ('car47', ('int', )), ('car48', ('int', )), ('car49', ('int', )), ('car50', ('int', )), ('noise1', ('int', )), ('noise2', ('int', )), ('noise3', ('int', )), ('noise4', ('int', )), ('noise5', ('int', )), ('noise6', ('int', )), ('noise7', ('int', )), ('noise8', ('int', )), ('noise9', ('int', )), ('noise10', ('int', )), ('noise11', ('int', )), ('noise12', ('int', )), ('noise13', ('int', )), ('noise14', ('int', )), ('noise15', ('int', )), ('noise16', ('int', )), ('noise17', ('int', )), ('noise18', ('int', )), ('noise19', ('int', )), ('noise20', ('int', )), ('noise21', ('int', )), ('noise22', ('int', )), ('noise23', ('int', )), ('noise24', ('int', )), ('noise25', ('int', )), ('noise26', ('int', )), ('noise27', ('int', )), ('noise28', ('int', )), ('noise29', ('int', )), ('noise30', ('int', )), ('noise31', ('int', )), ('noise32', ('int', )), ('noise33', ('int', )), ('noise34', ('int', )), ('noise35', ('int', )), ('noise36', ('int', )), ('noise37', ('int', )), ('noise38', ('int', )), ('noise39', ('int', )), ('noise40', ('int', )), ('noise41', ('int', )), ('noise42', ('int', )), ('noise43', ('int', )), ('noise44', ('int', )), ('noise45', ('int', )), ('noise46', ('int', )), ('noise47', ('int', )), ('noise48', ('int', )), ('noise49', ('int', )), ('noise50', ('int', ))] engine.table = table engine.create_table() engine.download_files_from_archive(self.urls["Vehicledata"], archive_name="VehicleData.zip") engine.extract_zip( engine.format_filename("VehicleData/VehicleData.zip"), engine.format_filename("VehicleData"), ) engine.insert_data_from_file( engine.format_filename("VehicleData/VehicleData.csv")) # Counts table table = Table("counts", delimiter=",") engine.download_files_from_archive(self.urls["counts"], archive_name="States.zip") table.columns = [("record_id", ("pk-auto", )), ("RouteDataID", ("int", )), ("countrynum", ("int", )), ("statenum", ("int", )), ("Route", ("int", )), ("RPID", ("int", )), ("Year", ("int", )), ("Aou", ("int", )), ("Count10", ("int", )), ("Count20", ("int", )), ("Count30", ("int", )), ("Count40", ("int", )), ("Count50", ("int", )), ("StopTotal", ("int", )), ("SpeciesTotal", ("int", ))] stateslist = [ "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", 
"Florida", "Georgia", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", ["New Hampshire", "NHampsh"], ["New Jersey", "NJersey"], ["New Mexico", "NMexico"], ["New York", "NYork"], ["North Carolina", "NCaroli"], ["North Dakota", "NDakota"], "Ohio", "Oklahoma", "Oregon", "Pennsylvania", ["Rhode Island", "RhodeIs"], ["South Carolina", "SCaroli"], ["South Dakota", "SDakota"], "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", ["West Virginia", "W_Virgi"], "Wisconsin", "Wyoming", "Alberta", ["British Columbia", "BritCol"], "Manitoba", ["New Brunswick", "NBrunsw"], ["Northwest Territories", "NWTerri"], "Newfoundland", ["Nova Scotia", "NovaSco"], "Nunavut", "Ontario", ["Prince Edward Island", "PEI"], "Quebec", "Saskatchewan", "Yukon" ] state = "" shortstate = "" engine.table = table engine.create_table() for state in stateslist: try: if isinstance(state, (list, )): state, shortstate = state[0], state[1] else: shortstate = state[0:7] print("Inserting data from " + state + "...") try: engine.table.cleanup = Cleanup() engine.extract_zip( engine.format_filename("States/" + shortstate + ".zip"), engine.format_filename("States/" + shortstate), ) file_path = "{states}/{shortstate}/{shortstate}.csv".format( states="States", shortstate=shortstate) engine.insert_data_from_file( engine.format_filename(file_path)) except: print(state, ": Failed bulk insert on, inserting manually.") engine.connection.rollback() engine.table.cleanup = self.cleanup_func_clean engine.insert_data_from_file( engine.format_filename(file_path)) except: print("There was an error in " + state + ".") raise except zipfile.BadZipfile: print( "There was an unexpected error in the Breeding Bird Survey archives." ) raise return engine
def download(self, engine=None, debug=False): Script.download(self, engine, debug) engine = self.engine filenames = ['Aquatic_animal_excretion_data.csv', 'Aquatic_animal_excretion_variable_descriptions.csv'] # download the archive once if any of its files are missing, rather than re-downloading it per missing file if not all(os.path.isfile(engine.format_filename(filename)) for filename in filenames): engine.download_files_from_archive(self.urls["aquatic_animals"], filenames, "zip") # processing Aquatic_animal_excretion_data.csv filename = 'Aquatic_animal_excretion_data.csv' tablename = 'aquatic_animals' table = Table(str(tablename), delimiter=',') table.columns = [ ("index", ("pk-int",)), ("sourcenumber", ("int",)), ("sourcename", ("char",)), ("speciesname", ("char",)), ("speciescode", ("char",)), ("invert/vert", ("char",)), ("phylum", ("char",)), ("class", ("char",)), ("order", ("char",)), ("family", ("char",)), ("trophicgild", ("char",)), ("drymass", ("double",)), ("logdrymass", ("double",)), ("ecosystemtype", ("char",)), ("energysource", ("char",)), ("habitat", ("char",)), ("residentecosystem", ("char",)), ("temperature", ("double",)), ("nexcretionrate", ("double",)), ("pexcretionrate", ("double",)), ("lognexcretionrate", ("double",)), ("logpexcretionrate", ("double",)), ("incubationtime", ("double",)), ("nform", ("char",)), ("pform", ("char",)), ("bodyc", ("double",)), ("bodyn", ("double",)), ("bodyp", ("double",)), ("bodyc:n", ("double",)), ("bodyc:p", ("double",)), ("bodyn:p", ("double",)), ("bodydatasource", ("char",)), ("datasource", ("char",)), ("dataproviders", ("char",))] engine.table = table engine.create_table() engine.insert_data_from_file(engine.format_filename(str(filename))) # processing Aquatic_animal_excretion_variable_descriptions.csv filename = 'Aquatic_animal_excretion_variable_descriptions.csv' tablename = 'variable_descriptions' table = Table(str(tablename), delimiter=',') table.columns = [ ("Column", ("char",)), ("Variable", ("char",)), ("Description", ("char",)), ("Data Class", ("char",)), ("Units", ("char",)), ("Minimum_value", ("char",)), ("Maximum_value", ("char",)), ("Possible_values", ("char",)), ("Missing_data_symbol", ("char",)), ("Notes", ("char",))] engine.table = table engine.create_table() engine.insert_data_from_file(engine.format_filename(str(filename)))
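The excretion table above keeps raw column names such as "invert/vert" and "bodyc:n"; slashes and colons are rejected as bare identifiers by several SQL backends, so downstream engines typically need them quoted or renamed. A hedged sketch of a renaming helper (hypothetical, not part of the script):

import re

def sanitize_column(name):
    # Replace any character that is unsafe in a SQL identifier with '_'.
    return re.sub(r"[^0-9a-zA-Z_]", "_", name)

assert sanitize_column("invert/vert") == "invert_vert"
assert sanitize_column("bodyc:n") == "bodyc_n"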
def download(self, engine=None, debug=False): Script.download(self, engine, debug) engine = self.engine filename = "database.csv" tablename = "predicts_main" table = Table(str(tablename), delimiter=',') table.columns = [ ("Source_ID", ("char", )), ("Reference", ("char", )), ("Study_number", ("int", )), ("Study_name", ("char", )), ("SS", ("char", )), ("Diversity_metric", ("char", )), ("Diversity_metric_unit", ("char", )), ("Diversity_metric_type", ("char", )), ("Diversity_metric_is_effort_sensitive", ("char", )), ("Diversity_metric_is_suitable_for_Chao", ("char", )), ("Sampling_method", ("char", )), ("Sampling_effort_unit", ("char", )), ("Study_common_taxon", ("char", )), ("Rank_of_study_common_taxon", ("char", )), ("Site_number", ("int", )), ("Site_name", ("char", )), ("Block", ("char", )), ("SSS", ("char", )), ("SSB", ("char", )), ("SSBS", ("char", )), ("Sample_start_earliest", ("char", )), ("Sample_end_latest", ("char", )), ("Sample_midpoint", ("char", )), ("Sample_date_resolution", ("char", )), ("Max_linear_extent_metres", ("double", )), ("Habitat_patch_area_square_metres", ("double", )), ("Sampling_effort", ("double", )), ("Rescaled_sampling_effort", ("double", )), ("Habitat_as_described", ("char", )), ("Predominant_land_use", ("char", )), ("Source_for_predominant_land_use", ("char", )), ("Use_intensity", ("char", )), ("Km_to_nearest_edge_of_habitat", ("double", )), ("Years_since_fragmentation_or_conversion", ("double", )), ("Transect_details", ("char", )), ("Coordinates_method", ("char", )), ("Longitude", ("double", )), ("Latitude", ("double", )), ("Country_distance_metres", ("double", )), ("Country", ("char", )), ("UN_subregion", ("char", )), ("UN_region", ("char", )), ("Ecoregion_distance_metres", ("double", )), ("Ecoregion", ("char", )), ("Biome", ("char", )), ("Realm", ("char", )), ("Hotspot", ("char", )), ("Wilderness_area", ("char", )), ("N_samples", ("double", )), ("Taxon_number", ("double", )), ("Taxon_name_entered", ("char", )), ("Indication", ("char", )), ("Parsed_name", ("char", )), ("Taxon", ("char", )), ("COL_ID", ("double", )), ("Name_status", ("char", )), ("Rank", ("char", )), ("Kingdom", ("char", )), ("Phylum", ("char", )), ("Class", ("char", )), ("Order", ("char", )), ("Family", ("char", )), ("Genus", ("char", )), ("Species", ("char", )), ("Best_guess_binomial", ("char", )), ("Higher_taxa", ("char", )), ("Higher_taxon", ("char", )), ("Measurement", ("double", )), ("Effort_corrected_measurement", ("double", )) ] engine.table = table if not os.path.isfile(engine.format_filename(filename)): engine.download_files_from_archive(self.urls["PREDICTS"], [filename], "zip", False, "download.zip") engine.create_table() engine.insert_data_from_file(engine.format_filename(str(filename)))
def download(self, engine=None, debug=False): try: Script.download(self, engine, debug) engine = self.engine # Species table table = Table("species", cleanup=Cleanup(), contains_pk=True, header_rows=9) table.columns=[("species_id", ("pk-int",) ), ("AOU", ("int",) ), ("english_common_name", ("char",50) ), ("french_common_name", ("char",50) ), ("spanish_common_name", ("char",50) ), ("sporder", ("char",30) ), ("family", ("char",30) ), ("genus", ("char",30) ), ("species", ("char",50) ), ] table.fixed_width = [7,6,51,51,51,51,51,51,50] engine.table = table engine.create_table() engine.insert_data_from_url(self.urls["species"]) # Routes table engine.download_files_from_archive(self.urls["routes"], ["routes.csv"]) engine.auto_create_table(Table("routes", cleanup=Cleanup()), filename="routes.csv") engine.insert_data_from_file(engine.format_filename("routes.csv")) # Weather table if not os.path.isfile(engine.format_filename("weather_new.csv")): engine.download_files_from_archive(self.urls["weather"], ["weather.csv"]) read = open_fr(engine.format_filename("weather.csv")) write = open_fw(engine.format_filename("weather_new.csv")) print("Cleaning weather data...") for line in read: values = line.split(',') newvalues = [] for value in values: if ':' in value: newvalues.append(value.replace(':', '')) elif value == "N": newvalues.append(None) else: newvalues.append(value) write.write(','.join(str(value) for value in newvalues)) write.close() read.close() engine.auto_create_table(Table("weather", pk="RouteDataId", cleanup=Cleanup(correct_invalid_value, nulls=['NULL'])), filename="weather_new.csv") engine.insert_data_from_file(engine.format_filename("weather_new.csv")) # Region_codes table table = Table("region_codes", pk=False, header_rows=11, fixed_width=[11, 11, 30]) def regioncodes_cleanup(value, engine): replace = {chr(225):"a", chr(233):"e", chr(237):"i", chr(243):"o"} newvalue = str(value) for key in list(replace.keys()): if key in newvalue: newvalue = newvalue.replace(key, replace[key]) return newvalue table.cleanup = Cleanup(regioncodes_cleanup) table.columns=[("countrynum" , ("int",) ), ("regioncode" , ("int",) ), ("regionname" , ("char",30) )] engine.table = table engine.create_table() engine.insert_data_from_url(self.urls["region_codes"]) # Counts table table = Table("counts", delimiter=',') table.columns=[("record_id" , ("pk-auto",) ), ("countrynum" , ("int",) ), ("statenum" , ("int",) ), ("Route" , ("int",) ), ("RPID" , ("int",) ), ("Year" , ("int",) ), ("Aou" , ("int",) ), ("Count10" , ("int",) ), ("Count20" , ("int",) ), ("Count30" , ("int",) ), ("Count40" , ("int",) ), ("Count50" , ("int",) ), ("StopTotal" , ("int",) ), ("SpeciesTotal" , ("int",) )] stateslist = ["Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "Florida", "Georgia", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", ["New Hampshire", "NHampsh"], ["New Jersey", "NJersey"], ["New Mexico", "NMexico"], ["New York", "NYork"], ["North Carolina", "NCaroli"], ["North Dakota", "NDakota"], "Ohio", "Oklahoma", "Oregon", "Pennsylvania", ["Rhode Island", "RhodeIs"], ["South Carolina", "SCaroli"], ["South Dakota", "SDakota"], "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", ["West Virginia", "W_Virgi"], "Wisconsin", "Wyoming", "Alberta", ["British Columbia", "BritCol"], "Manitoba", ["New Brunswick", "NBrunsw"], ["Northwest 
Territories", "NWTerri"], "Newfoundland", ["Nova Scotia", "NovaSco"], "Nunavut", "Ontario", ["Prince Edward Island", "PEI"], "Quebec", "Saskatchewan", "Yukon"] state = "" shortstate = "" engine.table = table engine.create_table() for state in stateslist: try: if len(state) > 2: shortstate = state[0:7] else: state, shortstate = state[0], state[1] print("Inserting data from " + state + "...") try: engine.table.cleanup = Cleanup() engine.insert_data_from_archive(self.urls["counts"] + shortstate + ".zip", [shortstate + ".csv"]) except: print("Failed bulk insert on " + state + ", inserting manually.") engine.connection.rollback() engine.table.cleanup = Cleanup(correct_invalid_value, nulls=['*']) engine.insert_data_from_archive(self.urls["counts"] + shortstate + ".zip", [shortstate + ".csv"]) except: print("There was an error in " + state + ".") raise except zipfile.BadZipfile: print("There was an unexpected error in the Breeding Bird Survey archives.") raise return engine
def download(self, engine=None, debug=False): if engine.name != "Download Only": raise Exception( "The mammal-super-tree dataset contains only non-tabular data files, and can only be used with the 'download only' engine.") Script.download(self, engine, debug)
def download(self, engine=None, debug=False): Script.download(self, engine, debug) self.engine.auto_create_table(Table("sites"), url=self.urls["sites"], filename='gentry_sites.csv') self.engine.insert_data_from_url(self.urls["sites"]) self.engine.download_file(self.urls["stems"], "all_Excel.zip") local_zip = zipfile.ZipFile( self.engine.format_filename("all_Excel.zip")) filelist = local_zip.namelist() local_zip.close() self.engine.download_files_from_archive(self.urls["stems"], filelist) filelist = [os.path.basename(filename) for filename in filelist] # Currently all_Excel.zip is missing CURUYUQU.xls # Download it separately and add it to the file list if not self.engine.find_file('CURUYUQU.xls'): self.engine.download_file( "http://www.mobot.org/mobot/gentry/123/samerica/CURUYUQU.xls", "CURUYUQU.xls") filelist.append('CURUYUQU.xls') lines = [] tax = [] for filename in filelist: print("Extracting data from " + filename + "...") book = xlrd.open_workbook(self.engine.format_filename(filename)) sh = book.sheet_by_index(0) rows = sh.nrows cn = {'stems': []} n = 0 for colnum, c in enumerate(sh.row(0)): if not Excel.empty_cell(c): cid = c.value.lower().strip() # line number column is sometimes named differently if cid in ["sub", "number"]: cid = "line" # the "number of individuals" column is named in various # different ways; they always at least contain "nd" if "nd" in cid: cid = "count" # in QUIAPACA.xls the "number of individuals" column is # misnamed "STEMDBH" just like the stems columns, so weep # for the state of scientific data and then fix manually if filename == "QUIAPACA.xls" and colnum == 13: cid = "count" # if column is a stem, add it to the list of stems; # otherwise, make note of the column name/number if "stem" in cid or "dbh" in cid: cn["stems"].append(n) else: cn[cid] = n n += 1 # sometimes, a data file does not contain a liana or count column if not "liana" in list(cn.keys()): cn["liana"] = -1 if not "count" in list(cn.keys()): cn["count"] = -1 for i in range(1, rows): row = sh.row(i) cellcount = len(row) # make sure the row is real, not just empty cells if not all(Excel.empty_cell(cell) for cell in row): try: this_line = {} # get the following information from the appropriate columns for i in [ "line", "family", "genus", "species", "liana", "count" ]: if cn[i] > -1: if row[cn[i]].ctype != 2: # if the cell type(ctype) is not a number this_line[i] = row[ cn[i]].value.lower().strip().replace( "\\", "/").replace('"', '') else: this_line[i] = row[cn[i]].value if this_line[i] == '`': this_line[i] = 1 this_line["stems"] = [ row[c] for c in cn["stems"] if not Excel.empty_cell(row[c]) ] this_line["site"] = filename[0:-4] # Manually correct CEDRAL data, which has a single line # that is shifted by one to the left starting at Liana if this_line["site"] == "CEDRAL" and type( this_line["liana"]) == float: this_line["liana"] = "" this_line["count"] = 3 this_line["stems"] = [2.5, 2.5, 30, 18, 25] lines.append(this_line) # Check how far the species is identified full_id = 0 if len(this_line["species"]) < 3: if len(this_line["genus"]) < 3: id_level = "family" else: id_level = "genus" else: id_level = "species" full_id = 1 tax.append( (this_line["family"], this_line["genus"], this_line["species"], id_level, str(full_id))) except: raise pass tax = sorted( tax, key=lambda group: group[0] + " " + group[1] + " " + group[2]) unique_tax = [] tax_dict = {} tax_count = 0 # Get all unique families/genera/species print("\n") for group in tax: if not (group in unique_tax): unique_tax.append(group) 
tax_count += 1 tax_dict[group[0:3]] = tax_count if tax_count % 10 == 0: msg = "Generating taxonomic groups: " + str( tax_count) + " / " + str(TAX_GROUPS) sys.stdout.flush() sys.stdout.write(msg + "\b" * len(msg)) print("\n") # Create species table table = Table("species", delimiter=",") table.columns = [("species_id", ("pk-int", )), ("family", ("char", )), ("genus", ("char", )), ("species", ("char", )), ("id_level", ("char", 10)), ("full_id", ("int", ))] data = [[str(tax_dict[group[:3]])] + ['"%s"' % g for g in group] for group in unique_tax] table.pk = 'species_id' table.contains_pk = True self.engine.table = table self.engine.create_table() self.engine.add_to_table(data) # Create stems table table = Table("stems", delimiter=",") table.columns = [("stem_id", ("pk-auto", )), ("line", ("int", )), ("species_id", ("int", )), ("site_code", ("char", 12)), ("liana", ("char", 10)), ("stem", ("double", ))] stems = [] counts = [] for line in lines: try: liana = line["liana"] except KeyError: liana = "" species_info = [ line["line"], tax_dict[(line["family"], line["genus"], line["species"])], line["site"], liana ] try: counts.append( [value for value in species_info + [line["count"]]]) except KeyError: pass for i in line["stems"]: stem = species_info + [str(i)] stems.append(stem) self.engine.table = table self.engine.create_table() self.engine.add_to_table(stems) # Create counts table table = Table("counts", delimiter=",", contains_pk=False) table.columns = [("count_id", ("pk-auto", )), ("line", ("int", )), ("species_id", ("int", )), ("site_code", ("char", 12)), ("liana", ("char", 10)), ("count", ("double", ))] self.engine.table = table self.engine.create_table() self.engine.add_to_table(counts) return self.engine
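The taxonomy pass above tests `group in unique_tax` against a growing list, which is quadratic in the number of rows. A sketch of the same id assignment driven by a dict alone; note this is a cleaner variant that ignores the id_level/full_id bookkeeping kept in unique_tax, and keys directly on the (family, genus, species) triple as the original tax_dict does:

def assign_taxon_ids(tax):
    """Map each unique (family, genus, species) triple to a 1-based id."""
    tax_dict = {}
    for group in tax:
        key = group[0:3]
        if key not in tax_dict:
            tax_dict[key] = len(tax_dict) + 1
    return tax_dict

ids = assign_taxon_ids([("Fabaceae", "Inga", "edulis"),
                        ("Fabaceae", "Inga", "edulis"),
                        ("Moraceae", "Ficus", "insipida")])
assert ids == {("Fabaceae", "Inga", "edulis"): 1, ("Moraceae", "Ficus", "insipida"): 2}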
def download(self, engine=None, debug=False): try: Script.download(self, engine, debug) engine = self.engine # Species table table = Table("species", cleanup=Cleanup(), contains_pk=True, header_rows=6) table.columns=[("species_id", ("pk-int",) ), ("AOU", ("int",) ), ("english_common_name", ("char",50) ), ("french_common_name", ("char",50) ), ("spanish_common_name", ("char",50) ), ("sporder", ("char",30) ), ("family", ("char",30) ), ("genus", ("char",30) ), ("species", ("char",50) ), ] table.fixed_width = [7,6,51,51,51,51,51,51,50] engine.table = table engine.create_table() engine.insert_data_from_url(self.urls["species"]) # Routes table if not os.path.isfile(engine.format_filename("routes_new.csv")): engine.download_files_from_archive(self.urls["routes"], ["routes.csv"]) read = open(engine.format_filename("routes.csv"), "rb") write = open(engine.format_filename("routes_new.csv"), "wb") print "Cleaning routes data..." write.write(read.readline()) for line in read: values = line.split(',') v = Decimal(values[5]) if v > 0: values[5] = str(v * Decimal("-1")) write.write(','.join(str(value) for value in values)) write.close() read.close() engine.auto_create_table(Table("routes", cleanup=Cleanup()), filename="routes_new.csv") engine.insert_data_from_file(engine.format_filename("routes_new.csv")) # Weather table if not os.path.isfile(engine.format_filename("weather_new.csv")): engine.download_files_from_archive(self.urls["weather"], ["weather.csv"]) read = open(engine.format_filename("weather.csv"), "rb") write = open(engine.format_filename("weather_new.csv"), "wb") print "Cleaning weather data..." for line in read: values = line.split(',') newvalues = [] for value in values: if ':' in value: newvalues.append(value.replace(':', '')) elif value == "N": newvalues.append(None) else: newvalues.append(value) write.write(','.join(str(value) for value in newvalues)) write.close() read.close() engine.auto_create_table(Table("weather", pk="RouteDataId", cleanup=Cleanup()), filename="weather_new.csv") engine.insert_data_from_file(engine.format_filename("weather_new.csv")) # Region_codes table table = Table("region_codes", pk=False, header_rows=11, fixed_width=[11, 11, 30]) def regioncodes_cleanup(value, engine): replace = {chr(225):"a", chr(233):"e", chr(237):"i", chr(243):"o"} newvalue = str(value) for key in replace.keys(): if key in newvalue: newvalue = newvalue.replace(key, replace[key]) return newvalue table.cleanup = Cleanup(regioncodes_cleanup) table.columns=[("countrynum" , ("int",) ), ("regioncode" , ("int",) ), ("regionname" , ("char",30) )] engine.table = table engine.create_table() engine.insert_data_from_url(self.urls["region_codes"]) # Counts table table = Table("counts", pk=False, delimiter=',') table.columns=[("RouteDataID" , ("int",) ), ("countrynum" , ("int",) ), ("statenum" , ("int",) ), ("Route" , ("int",) ), ("RPID" , ("int",) ), ("year" , ("int",) ), ("AOU" , ("int",) ), ("Stop1" , ("int",) ), ("Stop2" , ("int",) ), ("Stop3" , ("int",) ), ("Stop4" , ("int",) ), ("Stop5" , ("int",) ), ("Stop6" , ("int",) ), ("Stop7" , ("int",) ), ("Stop8" , ("int",) ), ("Stop9" , ("int",) ), ("Stop10" , ("int",) ), ("Stop11" , ("int",) ), ("Stop12" , ("int",) ), ("Stop13" , ("int",) ), ("Stop14" , ("int",) ), ("Stop15" , ("int",) ), ("Stop16" , ("int",) ), ("Stop17" , ("int",) ), ("Stop18" , ("int",) ), ("Stop19" , ("int",) ), ("Stop20" , ("int",) ), ("Stop21" , ("int",) ), ("Stop22" , ("int",) ), ("Stop23" , ("int",) ), ("Stop24" , ("int",) ), ("Stop25" , ("int",) ), ("Stop26" , ("int",) ), ("Stop27" 
, ("int",) ), ("Stop28" , ("int",) ), ("Stop29" , ("int",) ), ("Stop30" , ("int",) ), ("Stop31" , ("int",) ), ("Stop32" , ("int",) ), ("Stop33" , ("int",) ), ("Stop34" , ("int",) ), ("Stop35" , ("int",) ), ("Stop36" , ("int",) ), ("Stop37" , ("int",) ), ("Stop38" , ("int",) ), ("Stop39" , ("int",) ), ("Stop40" , ("int",) ), ("Stop41" , ("int",) ), ("Stop42" , ("int",) ), ("Stop43" , ("int",) ), ("Stop44" , ("int",) ), ("Stop45" , ("int",) ), ("Stop46" , ("int",) ), ("Stop47" , ("int",) ), ("Stop48" , ("int",) ), ("Stop49" , ("int",) ), ("Stop50" , ("int",) )] part = "" engine.table = table engine.create_table() for part in range(1,11): part = str(part) try: print "Inserting data from part " + part + "..." try: engine.table.cleanup = Cleanup() engine.insert_data_from_archive(self.urls["counts"] + "Fifty" + part + ".zip", ["fifty" + part + ".csv"]) except: print "Failed bulk insert on " + part + ", inserting manually." engine.connection.rollback() engine.table.cleanup = Cleanup(correct_invalid_value, nulls=['*']) engine.insert_data_from_archive(self.urls["counts"] + "Fifty" + part + ".zip", ["fifty" + part + ".csv"]) except: print "There was an error in part " + part + "." raise except zipfile.BadZipfile: print "There was an unexpected error in the Breeding Bird Survey archives." raise return engine
def download(self, engine=None, debug=False): try: Script.download(self, engine, debug) engine = self.engine # Species table table = Table("species", cleanup=Cleanup(), contains_pk=True, header_rows=9) table.columns = [ ("species_id", ("pk-int", )), ("AOU", ("int", )), ("english_common_name", ("char", 50)), ("french_common_name", ("char", 50)), ("spanish_common_name", ("char", 50)), ("sporder", ("char", 30)), ("family", ("char", 30)), ("genus", ("char", 30)), ("species", ("char", 50)), ] table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50] engine.table = table engine.create_table() engine.insert_data_from_url(self.urls["species"]) # Routes table engine.download_files_from_archive(self.urls["routes"], ["routes.csv"]) engine.auto_create_table(Table("routes", cleanup=Cleanup()), filename="routes.csv") engine.insert_data_from_file(engine.format_filename("routes.csv")) # Weather table if not os.path.isfile(engine.format_filename("weather_new.csv")): engine.download_files_from_archive(self.urls["weather"], ["weather.csv"]) read = open_fr(engine.format_filename("weather.csv")) write = open_fw(engine.format_filename("weather_new.csv")) print("Cleaning weather data...") for line in read: values = line.split(',') newvalues = [] for value in values: if ':' in value: newvalues.append(value.replace(':', '')) elif value == "N": newvalues.append(None) else: newvalues.append(value) write.write(','.join(str(value) for value in newvalues)) write.close() read.close() engine.auto_create_table(Table("weather", pk="RouteDataId", cleanup=self.cleanup_func_table), filename="weather_new.csv") engine.insert_data_from_file( engine.format_filename("weather_new.csv")) # Region_codes table table = Table("region_codes", pk=False, header_rows=11, fixed_width=[11, 11, 30]) def regioncodes_cleanup(value, engine): replace = { chr(225): "a", chr(233): "e", chr(237): "i", chr(243): "o" } newvalue = str(value) for key in list(replace.keys()): if key in newvalue: newvalue = newvalue.replace(key, replace[key]) return newvalue table.cleanup = Cleanup(regioncodes_cleanup) table.columns = [("countrynum", ("int", )), ("regioncode", ("int", )), ("regionname", ("char", 30))] engine.table = table engine.create_table() engine.insert_data_from_url(self.urls["region_codes"]) # Counts table table = Table("counts", delimiter=',') table.columns = [("record_id", ("pk-auto", )), ("countrynum", ("int", )), ("statenum", ("int", )), ("Route", ("int", )), ("RPID", ("int", )), ("Year", ("int", )), ("Aou", ("int", )), ("Count10", ("int", )), ("Count20", ("int", )), ("Count30", ("int", )), ("Count40", ("int", )), ("Count50", ("int", )), ("StopTotal", ("int", )), ("SpeciesTotal", ("int", ))] stateslist = [ "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "Florida", "Georgia", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", ["New Hampshire", "NHampsh"], ["New Jersey", "NJersey"], ["New Mexico", "NMexico"], ["New York", "NYork"], ["North Carolina", "NCaroli"], ["North Dakota", "NDakota"], "Ohio", "Oklahoma", "Oregon", "Pennsylvania", ["Rhode Island", "RhodeIs"], ["South Carolina", "SCaroli"], ["South Dakota", "SDakota"], "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", ["West Virginia", "W_Virgi"], "Wisconsin", "Wyoming", "Alberta", ["British Columbia", "BritCol"], "Manitoba", ["New Brunswick", "NBrunsw"], ["Northwest Territories", 
"NWTerri"], "Newfoundland", ["Nova Scotia", "NovaSco"], "Nunavut", "Ontario", ["Prince Edward Island", "PEI"], "Quebec", "Saskatchewan", "Yukon" ] state = "" shortstate = "" engine.table = table engine.create_table() for state in stateslist: try: if len(state) > 2: shortstate = state[0:7] else: state, shortstate = state[0], state[1] print("Inserting data from " + state + "...") try: engine.table.cleanup = Cleanup() engine.insert_data_from_archive( self.urls["counts"] + shortstate + ".zip", [shortstate + ".csv"]) except: print("Failed bulk insert on " + state + ", inserting manually.") engine.connection.rollback() engine.table.cleanup = self.cleanup_func_clean engine.insert_data_from_archive( self.urls["counts"] + shortstate + ".zip", [shortstate + ".csv"]) except: print("There was an error in " + state + ".") raise except zipfile.BadZipfile: print( "There was an unexpected error in the Breeding Bird Survey archives." ) raise return engine
def download(self, engine=None, debug=False): Script.download(self, engine, debug) try: reload(sys) except NameError: pass # reload is a builtin on Python 2 only if hasattr(sys, 'setdefaultencoding'): sys.setdefaultencoding("utf-8") self.engine.download_file(self.urls["GWDD"], "GlobalWoodDensityDatabase.xls") filename = "GlobalWoodDensityDatabase.xls" book = xlrd.open_workbook(self.engine.format_filename(filename)) sh = book.sheet_by_index(1) rows = sh.nrows # Creating data file file_path = self.engine.format_filename("gwdd_data.csv") gwdd_data = open_fw(file_path) csv_writer = open_csvw(gwdd_data) csv_writer.writerow(["Number", "Family", "Binomial", "Wood_Density", "Region", "Reference_Number"]) for index in range(1, rows): row = sh.row(index) # get each row and format the cell value. row_as_list = [to_str(column_value.value) for column_value in row] csv_writer.writerow(row_as_list) gwdd_data.close() table = Table("data", delimiter=",") table.columns = [("Number", ("pk-int",)), ("Family", ("char",)), ("Binomial", ("char",)), ("Wood_Density", ("double",)), ("Region", ("char",)), ("Reference_Number", ("int",))] table.pk = 'Number' table.contains_pk = True self.engine.table = table self.engine.create_table() self.engine.insert_data_from_file(file_path) # Creating reference table file file_path = self.engine.format_filename("gwdd_ref.csv") ref_file = open_fw(file_path) csv_writer_ref = open_csvw(ref_file) csv_writer_ref.writerow(["Reference_Number", "Reference"]) sh = book.sheet_by_index(2) rows = sh.nrows for index in range(1, rows): row = sh.row(index) # get each row and format the cell value. row_as_list = [to_str(column_value.value, object_encoding=sys.stdout) for column_value in row] csv_writer_ref.writerow(row_as_list) ref_file.close() table = Table("reference", delimiter=",") table.columns = [("Reference_Number", ("pk-int",)), ("Reference", ("char",))] table.pk = 'Reference_Number' table.contains_pk = True self.engine.table = table self.engine.create_table() self.engine.insert_data_from_file(file_path) return self.engine
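The wood density conversion above is a specific case of dumping an xlrd worksheet to csv. A generic sketch using only the csv module and the same xlrd calls already relied on above (open_workbook, sheet_by_index, nrows, row):

import csv
import xlrd

def sheet_to_csv(xls_path, sheet_index, csv_path, header):
    """Write one worksheet to csv, replacing the sheet's own first row with header."""
    book = xlrd.open_workbook(xls_path)
    sh = book.sheet_by_index(sheet_index)
    with open(csv_path, "w", newline="") as out:
        writer = csv.writer(out)
        writer.writerow(header)
        for i in range(1, sh.nrows):
            writer.writerow([cell.value for cell in sh.row(i)])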
def download(self, engine=None, debug=False): Script.download(self, engine, debug) self.engine.auto_create_table(Table("sites"), url=self.urls["sites"]) self.engine.insert_data_from_url(self.urls["sites"]) self.engine.download_file(self.urls["stems"], "all_Excel.zip") local_zip = zipfile.ZipFile(self.engine.format_filename("all_Excel.zip")) filelist = local_zip.namelist() local_zip.close() self.engine.download_files_from_archive(self.urls["stems"], filelist) filelist = [os.path.basename(filename) for filename in filelist] lines = [] tax = [] for filename in filelist: print "Extracting data from " + filename + "..." book = xlrd.open_workbook(self.engine.format_filename(filename)) sh = book.sheet_by_index(0) rows = sh.nrows cn = {'stems': []} n = 0 for c in sh.row(0): if not Excel.empty_cell(c): cid = Excel.cell_value(c).lower() # line number column is sometimes named differently if cid in ["sub", "number"]: cid = "line" # the "number of individuals" column is named in various # different ways; they always at least contain "nd" if "nd" in cid: cid = "count" # if column is a stem, add it to the list of stems; # otherwise, make note of the column name/number if "stem" in cid: cn["stems"].append(n) else: cn[cid] = n n += 1 # sometimes, a data file does not contain a liana or count column if not "liana" in cn.keys(): cn["liana"] = -1 if not "count" in cn.keys(): cn["count"] = -1 for i in range(1, rows): row = sh.row(i) cellcount = len(row) # make sure the row is real, not just empty cells if cellcount > 4 and not Excel.empty_cell(row[0]): try: this_line = {} def format_value(s): s = Excel.cell_value(s) return str(s).title().replace("\\", "/").replace('"', '') # get the following information from the appropriate columns for i in ["line", "family", "genus", "species", "liana", "count"]: if cn[i] > -1: this_line[i] = format_value(row[cn[i]]) if this_line[i] == '`': this_line[i] = 1 this_line["stems"] = [Excel.cell_value(row[c]) for c in cn["stems"] if not Excel.empty_cell(row[c])] this_line["site"] = filename[0:-4] lines.append(this_line) # Check how far the species is identified full_id = 0 if len(this_line["species"]) < 3: if len(this_line["genus"]) < 3: id_level = "family" else: id_level = "genus" else: id_level = "species" full_id = 1 tax.append((this_line["family"], this_line["genus"], this_line["species"].lower().replace('\\', '').replace('"', ''), id_level, str(full_id))) except: raise pass tax = sorted(tax, key=lambda group: group[0] + " " + group[1] + " " + group[2]) unique_tax = [] tax_dict = dict() tax_count = 0 # Get all unique families/genera/species for group in tax: if not (group in unique_tax): unique_tax.append(group) tax_count += 1 tax_dict[group[0:3]] = tax_count if tax_count % 10 == 0: msg = "Generating taxonomic groups: " + str(tax_count) + " / " + str(TAX_GROUPS) sys.stdout.write(msg + "\b" * len(msg)) print "Generating taxonomic groups: " + str(TAX_GROUPS) + " / " + str(TAX_GROUPS) # Create species table table = Table("species", delimiter=",") table.columns=[("species_id" , ("pk-int",) ), ("family" , ("char", ) ), ("genus" , ("char", ) ), ("species" , ("char", ) ), ("id_level" , ("char", 10) ), ("full_id" , ("bool",) )] data = [','.join([str(tax_dict[group[:3]])] + ['"%s"' % g for g in group]) for group in unique_tax] table.pk = 'species_id' table.contains_pk = True self.engine.table = table self.engine.create_table() self.engine.add_to_table(data) # Create stems table table = Table("stems", delimiter=",", contains_pk=False) table.columns=[("stem_id" , ("pk-auto",) ), ("line" 
, ("int",) ), ("species_id" , ("int",) ), ("site_code" , ("char", 12) ), ("liana" , ("char", 10) ), ("stem" , ("double",) )] stems = [] counts = [] for line in lines: try: liana = line["liana"] except KeyError: liana = "" species_info = [line["line"], tax_dict[(line["family"], line["genus"], line["species"].lower())], line["site"], liana ] try: counts.append([str(value) for value in species_info + [line["count"]]]) except KeyError: pass for i in line["stems"]: stem = species_info + [i] stems.append([str(value) for value in stem]) data = [','.join(stem) for stem in stems] self.engine.table = table self.engine.create_table() self.engine.add_to_table(data) # Create counts table table = Table("counts", delimiter=",", contains_pk=False) table.columns=[("count_id" , ("pk-auto",) ), ("line" , ("int",) ), ("species_id" , ("int",) ), ("site_code" , ("char", 12) ), ("liana" , ("char", 10) ), ("count" , ("double",) )] data = [','.join(count) for count in counts] self.engine.table = table self.engine.create_table() self.engine.add_to_table(data) return self.engine