Example #1
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)        
        engine = self.engine
        
        # download and create species table
        table = Table('species')
        self.engine.auto_create_table(table, url=self.urls['species'])
        self.engine.insert_data_from_url(self.urls['species'])
        
        # State abbreviations with the year annual inventory began for that state
        stateslist = [('AL', 2001), ('AK', 2004), ('AZ', 2001), ('AR', 2000), 
                      ('CA', 2001), ('CO', 2002), ('CT', 2003), ('DE', 2004), 
                      ('FL', 2003), ('GA', 1998), ('ID', 2004), ('IL', 2001), 
                      ('IN', 1999), ('IA', 1999), ('KS', 2001), ('KY', 1999), 
                      ('LA', 2001), ('ME', 1999), ('MD', 2004), ('MA', 2003), 
                      ('MI', 2000), ('MN', 1999), ('MO', 1999), ('MT', 2003), 
                      ('NE', 2001), ('NV', 2004), ('NH', 2002), ('NJ', 2004), 
                      ('NY', 2002), ('NC', 2003), ('ND', 2001), ('OH', 2001), 
                      ('OK', 2008), ('OR', 2001), ('PA', 2000), ('RI', 2003), 
                      ('SC', 1999), ('SD', 2001), ('TN', 2000), ('TX', 2001), 
                      ('UT', 2000), ('VT', 2003), ('VA', 1998), ('WA', 2002), 
                      ('WV', 2004), ('WI', 2000), ('PR', 2001)]
        
        tablelist = ["SURVEY", "PLOT", "COND", "SUBPLOT", "SUBP_COND", "TREE", "SEEDLING"]
        
        for table in tablelist:
            for state, year in stateslist:
                engine.download_files_from_archive(self.urls["main"] + state + "_" + table + ".ZIP", 
                                                   [state + "_" + table + ".CSV"])
        
        for table in tablelist:
            print "Scanning data for table %s..." % table
            prep_file_name = "%s.csv" % table
            prep_file = open(engine.format_filename(prep_file_name), "wb")
            this_file = open(engine.format_filename(stateslist[0][0] + "_" + table + ".CSV"), "rb")
            col_names = this_file.readline()
            prep_file.write(col_names)
            column_names = [col.strip('"') for col in col_names.split(',')]
            year_column = column_names.index("INVYR")            
            this_file.close()
            
            for state, year in stateslist:
                this_file = open(engine.format_filename(state + "_" + table + ".CSV"), "rb")
                this_file.readline()
                for line in this_file:
                    values = line.split(',')
                    this_year = values[year_column]
                    if int(this_year) >= year:
                        prep_file.write(line)
            prep_file.close()
            engine.auto_create_table(Table(table), filename=prep_file_name)

            engine.insert_data_from_file(engine.format_filename(prep_file_name))
            
            try:
                os.remove(engine.format_filename(prep_file_name))
            except OSError:
                pass
        
        return engine
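
The per-table scan above keeps only rows recorded on or after the year each state's annual inventory began. A minimal standalone sketch of that filter (hypothetical helper; naive comma split, mirroring the original):

    def filter_rows_by_year(src_path, dest_file, year_column, min_year):
        # Skip the per-state header; the combined file already has one
        with open(src_path) as src:
            next(src)
            for line in src:
                values = line.split(',')
                if int(values[year_column]) >= min_year:
                    dest_file.write(line)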
Example #2
 def download(self, engine=None, debug=False):
     Script.download(self, engine, debug)
     self.engine.download_file(self.urls["trees"], "LS_trees_1983_2000.txt")
     data_path = self.engine.format_filename("LS_trees_1983_2000.txt")
     self.engine.auto_create_table(self.tables["trees"],
                                   filename="LS_trees_1983_2000.txt")
     self.engine.insert_data_from_file(data_path)
Example #3
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        self.engine.download_file(
            self.urls["main"], "Succession_sampling_03-07_data_original.txt")
        data_path = self.engine.format_filename(
            "Succession_sampling_03-07_data.txt")
        old_data = open(
            self.engine.find_file(
                "Succession_sampling_03-07_data_original.txt"), 'rb')
        new_data = open(data_path, 'wb')

        line1 = old_data.readline()
        line2 = old_data.readline()
        newline = line1.replace("\n", "\t") + line2
        new_data.write(newline)

        for line in old_data:
            new_data.write(line)

        new_data.close()
        old_data.close()

        self.engine.auto_create_table(
            self.tables["main"], filename="Succession_sampling_03-07_data.txt")
        self.engine.insert_data_from_file(data_path)
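
The two readline calls work around a header that is split across two physical lines. A small sketch of the same repair in isolation (hypothetical file names):

    def merge_split_header(src_path, dest_path):
        # The first two physical lines together form one logical header row
        with open(src_path) as src, open(dest_path, 'w') as dest:
            dest.write(src.readline().rstrip('\n') + '\t' + src.readline())
            for line in src:
                dest.write(line)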
Example #4
    def download(self, engine=None, debug=False):
        if engine.name != "Download Only":
            raise Exception(
                "The PRISM dataset contains only non-tabular data files, and can only be used with the 'download only' engine."
            )
        Script.download(self, engine, debug)

        clim_vars = ['ppt', 'tmax', 'tmean', 'tmin']
        years = list(range(1981, 2015))
        months = ["{:02d}".format(i) for i in range(1, 13)]
        for clim_var in clim_vars:
            mval = "M3" if clim_var == 'ppt' else "M2"
            for year in years:
                for month in months:
                    file_names = self.get_file_names(clim_var, mval, year,
                                                     month)
                    file_url = urllib.parse.urljoin(
                        self.urls["climate"],
                        "{}/{}{}".format(clim_var, year, month))
                    archivename = "PRISM_{}_stable_4km{}_{}{}_bil.zip".format(
                        clim_var, mval, year, month)
                    self.engine.download_files_from_archive(
                        file_url,
                        file_names,
                        archivename=archivename,
                        keep_in_dir=True)
                    self.engine.register_files(file_names)
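
For orientation, the URL and archive name produced for one (variable, year, month) triple; the base URL here is an assumption, and urljoin only appends the relative part when the base ends with a slash:

    import urllib.parse

    base = "http://services.nacse.org/prism/data/public/4km/"  # assumed base URL
    clim_var, mval, year, month = 'ppt', 'M3', 1981, '01'
    file_url = urllib.parse.urljoin(base, "{}/{}{}".format(clim_var, year, month))
    # -> http://services.nacse.org/prism/data/public/4km/ppt/198101
    archivename = "PRISM_{}_stable_4km{}_{}{}_bil.zip".format(clim_var, mval, year, month)
    # -> PRISM_ppt_stable_4kmM3_198101_bil.zip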
Example #5
 def download(self, engine=None, debug=False):
     if engine.name != "Download Only":
         raise Exception("The Bioclim dataset contains only non-tabular data files, and can only be used with the 'download only' engine.")
     Script.download(self, engine, debug)
     file_names = ["bio%s.bil" % file_num for file_num in range(1, 20)]
     self.engine.download_files_from_archive(self.urls["climate"], file_names)
     self.engine.register_files(file_names)
Example #6
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        for key in self.urls:
            original_file_name = "trade_prdct_{}.txt".format(key)
            new_file_name = "trade_prdct_{}.csv".format(key)

            engine.download_file(self.urls[key], original_file_name)

            old_path = self.engine.format_filename(original_file_name)
            new_path = self.engine.format_filename(new_file_name)

            # Rewrite the file with a single delimiter
            old_data = open_fr(old_path)
            new_data = open_fw(new_path)

            # Read header line and convert "," to "|"
            line1 = old_data.readline().strip().replace(",", "|")
            new_data.write(line1 + "\n")
            for line in old_data:
                # Remove leading "|" from the data
                new_data.write(line.strip("|"))
            new_data.close()
            old_data.close()
            table = Table(key, delimiter="|")
            engine.auto_create_table(table, filename=new_file_name)
            engine.insert_data_from_file(new_path)
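
Worth noting about the strip("|") call above: because each line still ends with "\n", strip can only remove the leading pipe; a trailing pipe before the newline survives. A quick illustration with a hypothetical row:

    line = '|usa|can|1999|12.5|\n'
    line.strip("|")  # -> 'usa|can|1999|12.5|\n'
    # The trailing '|' stays because '\n', not '|', is the last character.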
Example #7
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        # files are nested in another baad_data folder
        # important files considered (baad_data.csv,baad_methods.csv)
        # relevant files can be added in the same manner

        file_names = ["baad_data/baad_data.csv", "baad_data/baad_methods.csv"]
        engine.download_files_from_archive(self.urls["BAAD"], file_names)

        # creating data from baad_data.csv
        engine.auto_create_table(Table("data",
                                       cleanup=Cleanup(correct_invalid_value,
                                                       nulls=['NA'])),
                                 filename="baad_data.csv")
        engine.insert_data_from_file(engine.format_filename("baad_data.csv"))

        # creating methods from baad_methods.csv
        engine.auto_create_table(Table("methods",
                                       cleanup=Cleanup(correct_invalid_value,
                                                       nulls=['NA'])),
                                 filename="baad_methods.csv")
        engine.insert_data_from_file(
            engine.format_filename("baad_methods.csv"))
Example #8
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        for key in self.urls:
            self.engine.download_file(self.urls[key],
                                      self.urls[key].rpartition('/')[-1])
            new_file_path = self.engine.format_filename("new" + key)
            old_data = open_fr(
                self.engine.find_file(self.urls[key].rpartition('/')[-1]))
            new_data = open_fw(new_file_path)
            with old_data as file_block:

                # after the metadata lines, set data to True
                data = False
                for lines in file_block.readlines():
                    # metadata contains lines with no ";" and may have "(;;;;)+" or empty lines
                    if not data and (";" not in lines or ";;;;" in lines):
                        pass
                    else:
                        data = True
                        new_data.write(lines)
            new_data.close()
            self.engine.auto_create_table(Table(
                key, cleanup=self.cleanup_func_table),
                                          filename=str("new" + key))
            self.engine.insert_data_from_file(new_file_path)
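
The data flag above latches: after the first line that looks like data (contains ";" and is not a ";;;;" filler), every subsequent line is copied unconditionally. The same filter as a compact generator (a sketch, not part of the script):

    def strip_metadata(lines):
        data = False
        for line in lines:
            # Skip metadata until the first real data line appears
            if not data and (";" not in line or ";;;;" in line):
                continue
            data = True
            yield line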
Example #9
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        filenames = [
            'Aquatic_animal_excretion_data.csv',
            'Aquatic_animal_excretion_variable_descriptions.csv'
        ]
        for file_paths in filenames:
            if not os.path.isfile(engine.format_filename(file_paths)):
                url = self.urls["aquatic_animals"]
                engine.download_files_from_archive(url, filenames, "zip")

        # processing Aquatic_animal_excretion_data.csv
        filename = 'Aquatic_animal_excretion_data.csv'
        tablename = 'aquatic_animals'
        table = Table(str(tablename), delimiter=',')
        table.columns = [("index", ("pk-int", )), ("sourcenumber", ("int", )),
                         ("sourcename", ("char", )),
                         ("speciesname", ("char", )),
                         ("speciescode", ("char", )),
                         ("invert/vert", ("char", )), ("phylum", ("char", )),
                         ("class", ("char", )), ("order", ("char", )),
                         ("family", ("char", )), ("trophicgild", ("char", )),
                         ("drymass", ("double", )),
                         ("logdrymass", ("double", )),
                         ("ecosystemtype", ("char", )),
                         ("energysource", ("char", )), ("habitat", ("char", )),
                         ("residentecosystem", ("char", )),
                         ("temperature", ("double", )),
                         ("nexcretionrate", ("double", )),
                         ("pexcretionrate", ("double", )),
                         ("lognexcretionrate", ("double", )),
                         ("logpexcretionrate", ("double", )),
                         ("incubationtime", ("double", )),
                         ("nform", ("char", )), ("pform", ("char", )),
                         ("bodyc", ("double", )), ("bodyn", ("double", )),
                         ("bodyp", ("double", )), ("bodyc:n", ("double", )),
                         ("bodyc:p", ("double", )), ("bodyn:p", ("double", )),
                         ("bodydatasource", ("char", )),
                         ("datasource", ("char", )),
                         ("dataproviders", ("char", ))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename(str(filename)))

        # processing Aquatic_animal_excretion_variable_descriptions.csv
        filename = 'Aquatic_animal_excretion_variable_descriptions.csv'
        tablename = 'variable_descriptions'
        table = Table(str(tablename), delimiter=',')
        table.columns = [("Column", ("char", )), ("Variable", ("char", )),
                         ("Description", ("char", )),
                         ("Data Class", ("char", )), ("Units", ("char", )),
                         ("Minimum_value", ("char", )),
                         ("Maximum_value", ("char", )),
                         ("Possible_values", ("char", )),
                         ("Missing_data_symbol", ("char", )),
                         ("Notes", ("char", ))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename(str(filename)))
Example #10
 def download(self, engine=None, debug=False):
     data_file_name = "eBird_Observation_Dataset_2013.csv"
     Script.download(self, engine, debug)
     self.engine.download_files_from_archive(self.urls["main"],
                                             [data_file_name],
                                             filetype='gz')
     table = Table("main", delimiter=",")
     table.columns = [("BASISOFRECORD", ("char", )),
                      ("INSTITUTIONCODE", ("char", )),
                      ("COLLECTIONCODE", ("char", )),
                      ("CATALOGNUMBER", ("char", )),
                      ("OCCURRENCEID", ("char", )),
                      ("RECORDEDBY", ("char", )), ("YEAR", ("int", )),
                      ("MONTH", ("int", )), ("DAY", ("int", )),
                      ("COUNTRY", ("char", )),
                      ("STATEPROVINCE", ("char", )), ("COUNTY", ("char", )),
                      ("DECIMALLATITUDE", ("double", )),
                      ("DECIMALLONGITUDE", ("double", )),
                      ("LOCALITY", ("char", )), ("KINGDOM", ("char", )),
                      ("PHYLUM", ("char", )), ("CLASS", ("char", )),
                      ("SPORDER", ("char", )), ("FAMILY", ("char", )),
                      ("GENUS", ("char", )),
                      ("SPECIFICEPITHET", ("char", )),
                      ("SCIENTIFICNAME", ("char", )),
                      ("VERNACULARNAME", ("char", )),
                      ("INDIVIDUALCOUNT", ("int", ))]
     engine.table = table
     engine.create_table()
     engine.insert_data_from_file(engine.format_filename(data_file_name))
     return engine
Example #11
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        for key in self.urls:
            original_file_name = "trade_prdct_{}.txt".format(key)
            new_file_name = "trade_prdct_{}.csv".format(key)

            engine.download_file(self.urls[key], original_file_name)

            old_path = self.engine.format_filename(original_file_name)
            new_path = self.engine.format_filename(new_file_name)

            # Rewrite the file with a single delimiter
            old_data = open_fr(old_path)
            new_data = open_fw(new_path)

            # Read header line and convert "," to "|"
            line1 = old_data.readline().strip().replace(",", "|")
            new_data.write(line1 + "\n")
            for line in old_data:
                # Remove leading "|" from the data
                new_data.write(line.strip("|"))
            new_data.close()
            old_data.close()
            table = Table(key, delimiter="|")
            engine.auto_create_table(table, filename=new_file_name)
            engine.insert_data_from_file(new_path)
Example #12
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        self.engine.download_file(
            self.urls["main"], "Succession_sampling_03-07_data_original.txt")
        data_path = self.engine.format_filename(
            "Succession_sampling_03-07_data.txt")
        old_data = open_fr(
            self.engine.find_file(
                "Succession_sampling_03-07_data_original.txt"))
        new_data = open_fw(data_path)
        # original file's header contains an end-of-line character in the middle, creating two lines
        # Read in the two lines and create the full header
        line1 = old_data.readline().strip()
        line2 = old_data.readline()
        newline = line1 + "\t" + line2
        new_data.write(newline)
        for line in old_data:
            new_data.write(line)
        new_data.close()
        old_data.close()

        self.engine.auto_create_table(
            self.tables["main"], filename="Succession_sampling_03-07_data.txt")
        self.engine.insert_data_from_file(data_path)
Example #13
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        # structure_plot_year table
        self.engine.auto_create_table(Table("structure_plot_year"), url=self.urls["structure_plot_year"])
        self.engine.insert_data_from_url(self.urls["structure_plot_year"])

        # plots table
        self.engine.auto_create_table(Table("plots"), url=self.urls["plots"])
        self.engine.insert_data_from_url(self.urls["plots"])

        # species table
        self.engine.download_file(self.urls["species"], "original_MSH_SPECIES_DESCRIPTORS.csv")
        data_path = self.engine.format_filename("MSH_SPECIES_DESCRIPTORS.csv")

        old_data = os.path.normpath(self.engine.find_file("original_MSH_SPECIES_DESCRIPTORS.csv"))

        # Re-encode the file as UTF-8 while copying to the new path
        with open(old_data, 'r') as infile, \
                open(data_path, 'w', encoding='utf-8') as new_data:
            for line in infile:
                new_data.write(line)

        self.engine.auto_create_table(Table("species"),
                                      filename="MSH_SPECIES_DESCRIPTORS.csv")
        self.engine.insert_data_from_file(data_path)

        # species_plot_year tables
        table = Table("species_plot_year")
        table.delimiter = ','
        table.columns = [
            ('record_id', ('pk-auto',)),
            ('plot_id_year', ('char',)),
            ('plot_name', ('char',)),
            ('plot_number', ('int',)),
            ('year', ('int',)),
            ('species', ('ct_column',)),
            ('count', ('ct-double',))
        ]

        table.ct_column = 'species'
        table.ct_names = ['Abilas', 'Abipro', 'Achmil', 'Achocc', 'Agoaur', 'Agrexa', 'Agrpal', 'Agrsca', 'Alnvir',
                          'Anamar', 'Antmic', 'Antros', 'Aqifor', 'Arcnev', 'Arnlat', 'Astled', 'Athdis', 'Blespi',
                          'Brocar', 'Brosit', 'Carmer', 'Carmic', 'Carpac', 'Carpay', 'Carpha', 'Carros', 'Carspe',
                          'Casmin', 'Chaang', 'Cirarv', 'Cisumb', 'Crycas', 'Danint', 'Descae', 'Elyely', 'Epiana',
                          'Eriova', 'Eripyr', 'Fesocc', 'Fravir', 'Gencal', 'Hiealb', 'Hiegra', 'Hyprad', 'Junmer',
                          'Junpar', 'Juncom', 'Leppun', 'Lommar', 'Luepec', 'Luihyp', 'Luplat', 'Luplep', 'Luzpar',
                          'Maiste', 'Pencar', 'Pencon', 'Penser', 'Phahas', 'Phlalp', 'Phldif', 'Phyemp', 'Pincon',
                          'Poasec', 'Poldav', 'Polmin', 'Pollon', 'Poljun', 'Popbal', 'Potarg', 'Psemen', 'Raccan',
                          'Rumace', 'Salsit', 'Saxfer', 'Senspp', 'Sibpro', 'Sorsit', 'Spiden', 'Trispi', 'Tsumer',
                          'Vacmem', 'Vervir', 'Vioadu', 'Xerten']

        self.engine.table = table
        self.engine.create_table()
        self.engine.insert_data_from_url(self.urls["species_plot_year"])
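
Here ct_column/ct_names describe a cross-tab layout: the source file carries one column per species code, and the retriever unpivots them into (species, count) pairs per plot-year. A rough sketch of that reshaping, assuming the cross-tab convention used above (not the retriever's actual implementation):

    def unpivot(row, id_fields, ct_names):
        # row is one wide record, e.g. {'plot_name': ..., 'Abilas': 3, ...}
        base = [row[f] for f in id_fields]
        for species in ct_names:
            yield base + [species, row[species]]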
Example #14
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        engine.download_files_from_archive(self.urls["data"], ["PanTHERIA_1-0_WR05_Aug2008.txt"],
                                           filetype="zip")

        # Create table Species
        engine.auto_create_table(Table('species', cleanup=self.cleanup_func_table),
                                 filename="PanTHERIA_1-0_WR05_Aug2008.txt")
        engine.insert_data_from_file(engine.format_filename("PanTHERIA_1-0_WR05_Aug2008.txt"))
Example #15
 def download(self, engine=None, debug=False):
     if engine.name != "Download Only":
         raise Exception("The Bioclim dataset contains only non-tabular data files, and can only be used with the 'download only' engine.")
     Script.download(self, engine, debug)
     file_names = []
     for file_num in range(1, 20):
         for ext in (['bil', 'hdr']):
             file_names += ["bio{0}.{1}".format(file_num, ext)]
     self.engine.download_files_from_archive(self.urls["climate"], file_names)
     self.engine.register_files(file_names)
Example #16
 def download(self, engine=None, debug=False):
     if engine.name != "Download Only":
         raise Exception(
             "The mammal-super-tree dataset contains only non-tabular data files, and can only be used with the 'download only' engine."
         )
     Script.download(self, engine, debug)
Example #17
 def download(self, engine=None, debug=False):
     Script.download(self, engine, debug)
     engine = self.engine
     file_name = "PanTHERIA_1-0_WR05_Aug2008.txt"
     engine.download_files_from_archive(self.urls["data"], [file_name],
                                        "zip")
     # Create table Species
     engine.auto_create_table(Table('species',
                                    cleanup=self.cleanup_func_table),
                              filename=file_name)
     engine.insert_data_from_file(engine.format_filename(file_name))
Example #18
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        engine.download_files_from_archive(self.urls["data"],
                                           ["Predator_and_prey_body_sizes_in_marine_food_webs_vsn4.txt"],
                                           filetype="zip")

        # Create table Species
        engine.auto_create_table(Table('main', cleanup=self.cleanup_func_table),
                                 filename="Predator_and_prey_body_sizes_in_marine_food_webs_vsn4.txt")
        engine.insert_data_from_file(
            engine.format_filename("Predator_and_prey_body_sizes_in_marine_food_webs_vsn4.txt"))
Example #19
 def download(self, engine=None, debug=False):
     if engine.name != "Download Only":
         raise Exception("The Bioclim dataset contains only "
                         "non-tabular data files, and can only "
                         "be used with the 'download only' engine.")
     Script.download(self, engine, debug)
     file_names = []
     for file_num in range(1, 20):
         for ext in (['bil', 'hdr']):
             file_names += ["bio{0}.{1}".format(file_num, ext)]
     self.engine.download_files_from_archive(self.urls["climate"], file_names)
     self.engine.register_files(file_names)
Example #20
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        filename = "Predator_and_prey_body_sizes_in_marine_food_webs_vsn4.txt"
        engine.download_files_from_archive(self.urls["data"], [filename],
                                           filetype="zip")

        # Create table Species

        engine.auto_create_table(Table('main',
                                       cleanup=self.cleanup_func_table),
                                 filename=filename)
        engine.insert_data_from_file(engine.format_filename(filename))
Example #21
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        file_names = [('Flensburg_Data_Links.csv', 'links'),
                      ('Flensburg_Data_Nodes.csv', 'nodes')]

        engine.download_files_from_archive(self.urls["zip"], [i[0] for i in file_names], filetype="zip", archivename="ECOL_92_174")
        
        for (filename, tablename) in file_names:
            data_path = self.engine.format_filename(filename)
            self.engine.auto_create_table(
                Table(str(tablename), cleanup=self.cleanup_func_table),
                filename=filename)
            self.engine.insert_data_from_file(data_path)
Example #22
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        engine.download_files_from_archive(self.urls["data"],
                                           ["PanTHERIA_1-0_WR05_Aug2008.txt"],
                                           filetype="zip")

        # Create table Species
        engine.auto_create_table(Table('species',
                                       cleanup=Cleanup(correct_invalid_value,
                                                       nulls=['NA'])),
                                 filename="PanTHERIA_1-0_WR05_Aug2008.txt")
        engine.insert_data_from_file(
            engine.format_filename("PanTHERIA_1-0_WR05_Aug2008.txt"))
Example #23
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        filename = 'Aquatic_animal_excretion_data.csv'
        tablename = 'aquatic_animals'

        table = Table(str(tablename), delimiter=',')
        table.columns = [
            ("index", ("pk-int",)),
            ("sourcenumber", ("int",)),
            ("sourcename", ("char",)),
            ("speciesname", ("char",)),
            ("speciescode", ("char",)),
            ("invert/vert", ("char",)),
            ("phylum", ("char",)),
            ("class", ("char",)),
            ("order", ("char",)),
            ("family", ("char",)),
            ("trophicgild", ("char",)),
            ("drymass", ("double",)),
            ("logdrymass", ("double",)),
            ("ecosystemtype", ("char",)),
            ("energysource", ("char",)),
            ("habitat", ("char",)),
            ("residentecosystem", ("char",)),
            ("temperature", ("double",)),
            ("nexcretionrate", ("double",)),
            ("pexcretionrate", ("double",)),
            ("lognexcretionrate", ("double",)),
            ("logpexcretionrate", ("double",)),
            ("incubationtime", ("double",)),
            ("nform", ("char",)),
            ("pform", ("char",)),
            ("bodyc", ("double",)),
            ("bodyn", ("double",)),
            ("bodyp", ("double",)),
            ("bodyc:n", ("double",)),
            ("bodyc:p", ("double",)),
            ("bodyn:p", ("double",)),
            ("bodydatasource", ("char",)),
            ("datasource", ("char",)),
            ("dataproviders", ("char",))]

        engine.table = table
        if not os.path.isfile(engine.format_filename(filename)):
            engine.download_files_from_archive(self.urls[tablename], [filename], filetype="zip")

        engine.create_table()
        engine.insert_data_from_file(engine.format_filename(str(filename)))
Example #24
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        file_names = [('isotopes.csv', 'isotopes'),
                      ('sources.csv', 'sources'),
                      ('diet.csv', 'diet')]

        engine.download_files_from_archive(self.urls["zip"], [i[0] for i in file_names], filetype="zip", archivename="ECOL_92_97")
        
        for (filename, tablename) in file_names:
            data_path = self.engine.format_filename(filename)
            self.engine.auto_create_table(
                Table(str(tablename), cleanup=self.cleanup_func_table),
                filename=filename)
            self.engine.insert_data_from_file(data_path)
Example #25
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        engine.download_files_from_archive(self.urls["data"], ["UPSP_Demo_data.txt", "UPSP_Species_list2.txt"],
                                           filetype="zip")

        # Create table sp_list(Species)
        engine.auto_create_table(Table('sp_list', cleanup=self.cleanup_func_table),
                                 filename="UPSP_Species_list2.txt")
        engine.insert_data_from_file(engine.format_filename("UPSP_Species_list2.txt"))

        # Create table ind_loc_girth
        engine.auto_create_table(Table('ind_loc_girth', cleanup=self.cleanup_func_table),
                                 filename="UPSP_Demo_data.txt")
        engine.insert_data_from_file(engine.format_filename("UPSP_Demo_data.txt"))
Example #26
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        file_names = [('Flensburg_Data_Links.csv', 'links'),
                      ('Flensburg_Data_Nodes.csv', 'nodes')]

        engine.download_files_from_archive(self.urls["zip"],
                                           [i[0] for i in file_names], "zip",
                                           False, "ECOL_92_174")

        for (filename, tablename) in file_names:
            data_path = self.engine.format_filename(filename)
            self.engine.auto_create_table(Table(
                str(tablename), cleanup=self.cleanup_func_table),
                                          filename=filename)
            self.engine.insert_data_from_file(data_path)
Example #27
    def download(self, engine=None, debug=False):
        if engine.name != "Download Only":
            raise Exception("The PRISM dataset contains only non-tabular data files, and can only be used with the 'download only' engine.")
        Script.download(self, engine, debug)

        clim_vars = ['ppt', 'tmax', 'tmean', 'tmin']
        years = list(range(1981, 2015))
        months = ["{:02d}".format(i) for i in range(1,13)]
        for clim_var in clim_vars:
            mval = "M3" if clim_var == 'ppt' else "M2"
            for year in years:
                for month in months:
                    file_names = self.get_file_names(clim_var, mval, year, month)
                    file_url = urllib.parse.urljoin(self.urls["climate"], "{}/{}{}".format(clim_var, year, month))
                    archivename = "PRISM_{}_stable_4km{}_{}{}_bil.zip".format(clim_var, mval, year, month)
                    self.engine.download_files_from_archive(file_url, file_names, archivename=archivename, keep_in_dir=True)
                    self.engine.register_files(file_names)
Example #28
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        files = ["Macroplot_data_Rev.txt", "Microplot_data.txt", "Site_variables.txt", "Species_list.txt"]
        engine.download_files_from_archive(self.urls["data"], files, filetype="zip")

        # Create table species
        engine.auto_create_table(Table('species', cleanup=self.cleanup_func_table),
                                 filename="Species_list.txt")
        engine.insert_data_from_file(engine.format_filename("Species_list.txt"))

        # Create table sites
        engine.auto_create_table(Table('sites', cleanup=self.cleanup_func_table),
                                 filename="Site_variables.txt")
        engine.insert_data_from_file(engine.format_filename("Site_variables.txt"))

        # Create table microplots
        table = Table('microplots')
        table.columns = [('record_id', ('pk-auto',)), ('SpCode', ('char', '30')), ('Count', ('ct-int',))]
        table.ct_names = ['BSP1', 'BSP2', 'BSP3', 'BSP4', 'BSP5', 'BSP6', 'BSP7', 'BSP8', 'BSP9',
                          'BSP10', 'BSP11', 'BSP12', 'BSP13', 'BSP14', 'BSP15', 'BSP16', 'BSP17',
                          'BSP18', 'BSP20', 'BSP21', 'BSP22', 'BSP23', 'BSP24', 'BSP25', 'BSP26',
                          'BSP27', 'BSP28', 'BSP29', 'BSP30', 'BSP31', 'BSP33', 'BSP34', 'BSP35',
                          'BSP36', 'BSP37', 'BSP41', 'BSP42', 'BSP43', 'BSP44', 'BSP45', 'BSP46',
                          'BSP47', 'BSP48', 'BSP49', 'BSP50', 'BSP51', 'BSP52', 'BSP53', 'BSP54',
                          'BSP55', 'BSP56', 'BSP57', 'BSP58', 'BSP59', 'BSP60', 'BSP61', 'BSP62',
                          'BSP63', 'BSP64', 'BSP65', 'BSP66', 'BSP67', 'BSP68', 'BSP69', 'BSP70',
                          'BSP71', 'BSP72', 'BSP73', 'BSP74', 'BSP75', 'BSP76', 'BSP78', 'BSP79',
                          'BSP80', 'BSP82', 'BSP83', 'BSP84', 'BSP85', 'BSP86', 'BSP87', 'BSP88',
                          'BSP89', 'BSP90', 'BSP91', 'BSP92', 'BSP93', 'BSP94', 'BSP95', 'BSP96',
                          'BSP97', 'BSP98', 'BSP99', 'BSP100', 'BSP101', 'BSP102', 'BSP104']
        table.ct_column = 'PlotID'
        engine.auto_create_table(table, filename="Microplot_data.txt")
        engine.insert_data_from_file(engine.format_filename("Microplot_data.txt"))

        # Create table macroplots
        table = Table('macroplots')
        table.ct_names = ['TreeGirth1', 'TreeGirth2', 'TreeGirth3', 'TreeGirth4', 'TreeGirth5']
        table.ct_column = 'Tree'
        table.columns = [('record_id', ('pk-auto',)), ('PlotID', ('char', '20')), ('SpCode', ('char', '30')),
                         ('Girth', ('ct-int',))]
        engine.auto_create_table(table, filename="Macroplot_data_Rev.txt")
        engine.insert_data_from_file(engine.format_filename("Macroplot_data_Rev.txt"))
Example #29
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        self.engine.download_file(self.urls["trees"], "LS_trees_1983_2000_original.txt")
        data_path = self.engine.format_filename("LS_trees_1983_2000.txt")
        old_data = open(self.engine.find_file("LS_trees_1983_2000_original.txt"), 'rb')
        new_data = open(data_path, 'wb')

        last_line = None
        for line in old_data:
            if last_line:
                new_data.write(last_line)
            last_line = line

        new_data.close()
        old_data.close()

        self.engine.auto_create_table(self.tables["trees"],
                                      filename="LS_trees_1983_2000.txt")
        self.engine.insert_data_from_file(data_path)
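
The one-line buffer above writes each line only after the next one has been read, so exactly the file's final line is dropped. As a standalone sketch (using an explicit None check rather than truthiness):

    def drop_last_line(lines):
        last = None
        for line in lines:
            if last is not None:
                yield last
            last = line
        # `last` now holds the final line and is intentionally discarded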
Example #30
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        engine = self.engine

        taxa = ('Plant', 'Animal')

        for tax in taxa:
            table = Table(tax.lower() + 's', delimiter=',', header_rows=3,
                          pk='record_id', contains_pk=True)

            columns = [("record_id", ("pk-int",)),
                       ("station_id", ("int",)),
                       ("obs_date", ("char",)),
                       ("ind_id", ("int",)),
                       ("sci_name", ("char",)),
                       ("com_name", ("char",)),
                       ("kingdom", ("char",)),
                       ("pheno_cat", ("char",)),
                       ("pheno_name", ("char",)),
                       ("pheno_status", ("char",)),
                       ("lat", ("double",)),
                       ("lon", ("double",)),
                       ("elevation", ("int",)),
                       ("network_name", ("char",))]
            table.columns = columns

            engine.table = table
            engine.create_table()

            base_url = 'http://www.usanpn.org/getObs/observations/'
            years = range(2009, 2013)

            for year in years:
                if year == 2009 and tax == 'Animal':
                    continue

                url = base_url + 'get%s%sDataNoDefinitions' % (year, tax)

                filename = '%s_%s.csv' % (tax, year)
                engine.download_file(url, filename)

                engine.insert_data_from_file(engine.find_file(filename))

        return engine
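
The taxon/year loop resolves to one CSV endpoint per combination, skipping 2009 animal data just as the continue above does. Enumerating them makes the URL pattern explicit:

    base_url = 'http://www.usanpn.org/getObs/observations/'
    for tax in ('Plant', 'Animal'):
        for year in range(2009, 2013):
            if year == 2009 and tax == 'Animal':
                continue
            print(base_url + 'get%s%sDataNoDefinitions' % (year, tax))
    # e.g. http://www.usanpn.org/getObs/observations/get2009PlantDataNoDefinitions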
Example #31
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        self.engine.download_file(self.urls["trees"], "LS_trees_1983_2000_original.txt")
        data_path = self.engine.format_filename("LS_trees_1983_2000.txt")
        old_data = open(self.engine.find_file("LS_trees_1983_2000_original.txt"), 'rb')
        new_data = open(data_path, 'wb')

        last_line = None
        for line in old_data:
            if last_line:
                new_data.write(last_line)
            last_line = line

        new_data.close()
        old_data.close()

        self.engine.auto_create_table(self.tables["trees"],
                                      filename="LS_trees_1983_2000.txt")
        self.engine.insert_data_from_file(data_path)
Example #32
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        files = ["Macroplot_data_Rev.txt", "Microplot_data.txt", "Site_variables.txt", "Species_list.txt"]
        engine.download_files_from_archive(self.urls["data"], files, filetype="zip")

        # Create table species
        engine.auto_create_table(Table('species', cleanup=Cleanup(correct_invalid_value, nulls=['NA'])),
                                 filename="Species_list.txt")
        engine.insert_data_from_file(engine.format_filename("Species_list.txt"))

        # Create table sites
        engine.auto_create_table(Table('sites', cleanup=Cleanup(correct_invalid_value, nulls=['NA'])),
                                 filename="Site_variables.txt")
        engine.insert_data_from_file(engine.format_filename("Site_variables.txt"))

        # Create table microplots
        table = Table('microplots')
        table.columns = [('record_id', ('pk-auto',)), ('SpCode', ('char', '30')), ('Count', ('ct-int',))]
        table.ct_names = ['BSP1', 'BSP2', 'BSP3', 'BSP4', 'BSP5', 'BSP6', 'BSP7', 'BSP8', 'BSP9',
                          'BSP10', 'BSP11', 'BSP12', 'BSP13', 'BSP14', 'BSP15', 'BSP16', 'BSP17',
                          'BSP18', 'BSP20', 'BSP21', 'BSP22', 'BSP23', 'BSP24', 'BSP25', 'BSP26',
                          'BSP27', 'BSP28', 'BSP29', 'BSP30', 'BSP31', 'BSP33', 'BSP34', 'BSP35',
                          'BSP36', 'BSP37', 'BSP41', 'BSP42', 'BSP43', 'BSP44', 'BSP45', 'BSP46',
                          'BSP47', 'BSP48', 'BSP49', 'BSP50', 'BSP51', 'BSP52', 'BSP53', 'BSP54',
                          'BSP55', 'BSP56', 'BSP57', 'BSP58', 'BSP59', 'BSP60', 'BSP61', 'BSP62',
                          'BSP63', 'BSP64', 'BSP65', 'BSP66', 'BSP67', 'BSP68', 'BSP69', 'BSP70',
                          'BSP71', 'BSP72', 'BSP73', 'BSP74', 'BSP75', 'BSP76', 'BSP78', 'BSP79',
                          'BSP80', 'BSP82', 'BSP83', 'BSP84', 'BSP85', 'BSP86', 'BSP87', 'BSP88',
                          'BSP89', 'BSP90', 'BSP91', 'BSP92', 'BSP93', 'BSP94', 'BSP95', 'BSP96',
                          'BSP97', 'BSP98', 'BSP99', 'BSP100', 'BSP101', 'BSP102', 'BSP104']
        table.ct_column = 'PlotID'
        engine.auto_create_table(table, filename="Microplot_data.txt")
        engine.insert_data_from_file(engine.format_filename("Microplot_data.txt"))

        # Create table macroplots
        table = Table('macroplots')
        table.ct_names = ['TreeGirth1', 'TreeGirth2', 'TreeGirth3', 'TreeGirth4', 'TreeGirth5']
        table.ct_column = 'Tree'
        table.columns = [('record_id', ('pk-auto',)), ('PlotID', ('char', '20')), ('SpCode', ('char', '30')),
                         ('Girth', ('ct-int',))]
        engine.auto_create_table(table, filename="Macroplot_data_Rev.txt")
        engine.insert_data_from_file(engine.format_filename("Macroplot_data_Rev.txt"))
Example #33
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        filename = 'Aquatic_animal_excretion_data.csv'
        tablename = 'aquatic_animals'

        table = Table(str(tablename), delimiter=',')
        table.columns = [("index", ("pk-int", )), ("sourcenumber", ("int", )),
                         ("sourcename", ("char", )),
                         ("speciesname", ("char", )),
                         ("speciescode", ("char", )),
                         ("invert/vert", ("char", )), ("phylum", ("char", )),
                         ("class", ("char", )), ("order", ("char", )),
                         ("family", ("char", )), ("trophicgild", ("char", )),
                         ("drymass", ("double", )),
                         ("logdrymass", ("double", )),
                         ("ecosystemtype", ("char", )),
                         ("energysource", ("char", )), ("habitat", ("char", )),
                         ("residentecosystem", ("char", )),
                         ("temperature", ("double", )),
                         ("nexcretionrate", ("double", )),
                         ("pexcretionrate", ("double", )),
                         ("lognexcretionrate", ("double", )),
                         ("logpexcretionrate", ("double", )),
                         ("incubationtime", ("double", )),
                         ("nform", ("char", )), ("pform", ("char", )),
                         ("bodyc", ("double", )), ("bodyn", ("double", )),
                         ("bodyp", ("double", )), ("bodyc:n", ("double", )),
                         ("bodyc:p", ("double", )), ("bodyn:p", ("double", )),
                         ("bodydatasource", ("char", )),
                         ("datasource", ("char", )),
                         ("dataproviders", ("char", ))]

        engine.table = table
        if not os.path.isfile(engine.format_filename(filename)):
            engine.download_files_from_archive(self.urls[tablename],
                                               [filename],
                                               filetype="zip")

        engine.create_table()
        engine.insert_data_from_file(engine.format_filename(str(filename)))
Example #34
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        engine.download_files_from_archive(self.urls["data"],
                                           ["UPSP_Demo_data.txt",
                                            "UPSP_Species_list2.txt"],
                                           archive_type="zip")
        # Create table sp_list(Species)
        filename = "UPSP_Species_list2.txt"
        engine.auto_create_table(
            Table('sp_list', cleanup=self.cleanup_func_table),
            filename=filename)
        engine.insert_data_from_file(engine.format_filename(filename))

        # Create table ind_loc_girth
        filename = "UPSP_Demo_data.txt"
        engine.auto_create_table(
            Table('ind_loc_girth', cleanup=self.cleanup_func_table),
            filename=filename)
        engine.insert_data_from_file(engine.format_filename(filename))
Example #35
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        # IMG
        request_query = ("https://viewer.nationalmap.gov/tnmaccess/api/products?&bbox={}"
                         "&q=&start=&end=&dateType=&datasets=National+Elevation+Dataset+(NED)"
                         "+1/3+arc-second&prodFormats=IMG&prodExtents=1+x+1+degree"
                         "&polyCode=&polyType=&max=40&offset=0&_=1519665242114").format(
                             ",".join(str(i) for i in engine.opts["bbox"] if i))
        engine = self.engine
        res = requests.get(request_query).text
        data_url = json.loads(res)

        from retriever.lib.table import RasterDataset
        for item in data_url["items"]:
            engine.download_files_from_archive(item["downloadURL"])
        for raster_files in engine.supported_raster(engine.format_data_dir(),
                                                    [".img"]):
            base_name = os.path.basename(raster_files)
            filename, file_extension = os.path.splitext(base_name)
            table = RasterDataset(name=filename)
            engine.table = table
            engine.auto_create_table(table,
                                     filename=os.path.basename(raster_files))
            engine.insert_raster(raster_files)
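
One caveat in the bbox join above: the `if i` filter drops falsy entries, which silently removes a legitimate 0 or 0.0 coordinate; an explicit None check avoids that. A quick demonstration with a hypothetical bounding box:

    bbox = [-71.0, 42.0, 0.0, 43.0]                  # hypothetical bbox
    ",".join(str(i) for i in bbox if i)              # -> '-71.0,42.0,43.0' (0.0 lost)
    ",".join(str(i) for i in bbox if i is not None)  # keeps the zero coordinate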
Example #36
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        # files are nested in another baad_data folder
        # important files considered (baad_data.csv,baad_methods.csv)
        # relevant files can be added in the same manner

        file_names = ["baad_data/baad_data.csv", "baad_data/baad_methods.csv"]
        engine.download_files_from_archive(self.urls["BAAD"], file_names)

        # creating data from baad_data.csv
        engine.auto_create_table(Table("data", cleanup=Cleanup(correct_invalid_value, nulls=['NA'])),
                                 filename="baad_data.csv")
        engine.insert_data_from_file(engine.format_filename("baad_data.csv"))

        # creating methods from baad_methods.csv
        engine.auto_create_table(Table("methods", cleanup=Cleanup(correct_invalid_value, nulls=['NA'])),
                                 filename="baad_methods.csv")
        engine.insert_data_from_file(engine.format_filename("baad_methods.csv"))
Example #37
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        self.engine.download_file(self.urls["main"], "Succession_sampling_03-07_data_original.txt")
        data_path = self.engine.format_filename("Succession_sampling_03-07_data.txt")
        old_data = open_fr(self.engine.find_file("Succession_sampling_03-07_data_original.txt"))
        new_data = open_fw(data_path)
        # original file's header contains an end-of-line character in the middle, creating two lines
        # Read in the two lines and create the full header
        line1 = old_data.readline().strip()
        line2 = old_data.readline()
        newline = line1 + "\t" + line2
        new_data.write(newline)
        for line in old_data:
            new_data.write(line)
        new_data.close()
        old_data.close()

        self.engine.auto_create_table(self.tables["main"],
                                      filename="Succession_sampling_03-07_data.txt")
        self.engine.insert_data_from_file(data_path)
Example #38
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        # IMG
        request_query = "https://viewer.nationalmap.gov/tnmaccess/api/products?&bbox={}&q=&start=&end=&dateType=&datasets=National+Elevation+Dataset+(NED)+1/3+arc-second&prodFormats=IMG&prodExtents=1+x+1+degree&polyCode=&polyType=&max=40&offset=0&_=1519665242114".format(
            ",".join(str(i) for i in engine.opts["bbox"] if i))
        engine = self.engine
        res = requests.get(request_query).text
        data_url = json.loads(res)

        from retriever.lib.table import RasterDataset
        for item in data_url["items"]:
            engine.download_files_from_archive(item["downloadURL"])
        for raster_files in engine.supported_raster(engine.format_data_dir(),
                                                    [".img"]):
            base_name = os.path.basename(raster_files)
            filename, file_extension = os.path.splitext(base_name)
            table = RasterDataset(name=filename)
            engine.table = table
            engine.auto_create_table(table,
                                     filename=os.path.basename(raster_files))
            engine.insert_raster(raster_files)
Example #39
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        # files are nested in another baad_data folder
        # important files considered (baad_data.csv,baad_methods.csv)
        # relevant files can be added in the same manner

        file_names = ["baad_data/baad_data.csv", "baad_data/baad_methods.csv"]
        engine.download_files_from_archive(self.urls["BAAD"], file_names)

        # creating data from baad_data.csv
        if parse_version(VERSION) >= parse_version("2.1.dev"):
            filename = "baad_data/baad_data.csv"
            engine.auto_create_table(Table("data",
                                           cleanup=self.cleanup_func_table),
                                     filename=filename)
            engine.insert_data_from_file(engine.format_filename(filename))
        else:
            filename = "baad_data.csv"
            engine.auto_create_table(Table("data",
                                           cleanup=self.cleanup_func_table),
                                     filename=filename)
            engine.insert_data_from_file(engine.format_filename(filename))

        # creating methods from baad_methods.csv
        if parse_version(VERSION) >= parse_version("2.1.dev"):
            filename = "baad_data/baad_methods.csv"
            engine.auto_create_table(Table("methods",
                                           cleanup=self.cleanup_func_table),
                                     filename=filename)
            engine.insert_data_from_file(engine.format_filename(filename))
        else:
            filename = "baad_methods.csv"
            engine.auto_create_table(Table("methods",
                                           cleanup=self.cleanup_func_table),
                                     filename=filename)
            engine.insert_data_from_file(engine.format_filename(filename))
Example #40
 def download(self, engine=None, debug=False):
     Script.download(self, engine, debug)
     
     self.engine.download_file(self.urls["main"], "Succession_sampling_03-07_data_original.txt")
     data_path = self.engine.format_filename("Succession_sampling_03-07_data.txt")
     old_data = open(self.engine.find_file("Succession_sampling_03-07_data_original.txt"), 'rb')
     new_data = open(data_path, 'wb')
     
     line1 = old_data.readline()
     line2 = old_data.readline()
     newline = line1.replace("\n", "\t") + line2
     new_data.write(newline)
     
     for line in old_data:
         new_data.write(line)
     
     new_data.close()
     old_data.close()
     
     self.engine.auto_create_table(self.tables["main"], 
                                   filename="Succession_sampling_03-07_data.txt")
     self.engine.insert_data_from_file(data_path)
Example #41
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        for key in self.urls:
            self.engine.download_file(self.urls[key], self.urls[key].rpartition('/')[-1])
            new_file_path = self.engine.format_filename("new" + key)
            old_data = open_fr(self.engine.find_file(self.urls[key].rpartition('/')[-1]))
            new_data = open_fw(new_file_path)
            with old_data as file_block:

                # after the metadata lines, set data to True
                data = False
                for lines in file_block.readlines():
                    # metadata contains lines with no ";" and may have "(;;;;)+" or empty lines
                    if not data and (";" not in lines or ";;;;" in lines):
                        pass
                    else:
                        data = True
                        new_data.write(lines)
            new_data.close()
            self.engine.auto_create_table(Table(key,
                                                cleanup=self.cleanup_func_table), filename=str("new" + key))
            self.engine.insert_data_from_file(new_file_path)
Example #42
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        # files are nested in another baad_data folder
        # important files considered (baad_data.csv,baad_methods.csv)
        # relevant files can be added in the same manner

        file_names = ["baad_data/baad_data.csv",
                      "baad_data/baad_methods.csv"]
        engine.download_files_from_archive(self.urls["BAAD"], file_names)

        # creating data from baad_data.csv
        if parse_version(VERSION) >= parse_version("2.1.dev"):
            filename = "baad_data/baad_data.csv"
            engine.auto_create_table(Table("data",
                                           cleanup=self.cleanup_func_table),
                                     filename=filename)
            engine.insert_data_from_file(engine.format_filename(filename))
        else:
            filename = "baad_data.csv"
            engine.auto_create_table(Table("data",
                                           cleanup=self.cleanup_func_table),
                                     filename=filename)
            engine.insert_data_from_file(engine.format_filename(filename))

        # creating methods from baad_methods.csv
        if parse_version(VERSION) >= parse_version("2.1.dev"):
            filename = "baad_data/baad_methods.csv"
            engine.auto_create_table(Table("methods", cleanup=self.cleanup_func_table),
                                     filename=filename)
            engine.insert_data_from_file(engine.format_filename(filename))
        else:
            filename = "baad_methods.csv"
            engine.auto_create_table(Table("methods", cleanup=self.cleanup_func_table),
                                     filename=filename)
            engine.insert_data_from_file(engine.format_filename(filename))
Example #43
 def download(self, engine=None, debug=False):
     data_file_name = "eBird_Observation_Dataset_2013.csv"
     Script.download(self, engine, debug)
     self.engine.download_files_from_archive(self.urls["main"],
                                             [data_file_name],
                                             filetype='gz')
     table = Table("main", delimiter=",")
     table.columns = [("BASISOFRECORD", ("char", )),
                      ("INSTITUTIONCODE", ("char", )),
                      ("COLLECTIONCODE", ("char", )),
                      ("CATALOGNUMBER", ("char", )),
                      ("OCCURRENCEID", ("char", )),
                      ("RECORDEDBY", ("char", )),
                      ("YEAR", ("int", )),
                      ("MONTH", ("int", )),
                      ("DAY", ("int", )),
                      ("COUNTRY", ("char", )),
                      ("STATEPROVINCE", ("char", )),
                      ("COUNTY", ("char", )),
                      ("DECIMALLATITUDE", ("double", )),
                      ("DECIMALLONGITUDE", ("double", )),
                      ("LOCALITY", ("char", )),
                      ("KINGDOM", ("char", )),
                      ("PHYLUM", ("char", )),
                      ("CLASS", ("char", )),
                      ("SPORDER", ("char", )),
                      ("FAMILY", ("char", )),
                      ("GENUS", ("char", )),
                      ("SPECIFICEPITHET", ("char", )),
                      ("SCIENTIFICNAME", ("char", )),
                      ("VERNACULARNAME", ("char", )),
                      ("INDIVIDUALCOUNT", ("int", ))]
     engine.table = table
     engine.create_table()
     engine.insert_data_from_file(engine.format_filename(data_file_name))
     return engine
Example #44
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        # Download both full and abbreviated versions and extract the data files
        abbrev_version = ["ABBREV.txt"]
        full_version = [
            "DERIV_CD.txt", "FOOTNOTE.txt", "NUTR_DEF.txt", "WEIGHT.txt",
            "DATA_SRC.txt", "FD_GROUP.txt", "LANGDESC.txt", "NUT_DATA.txt",
            "DATSRCLN.txt", "FOOD_DES.txt", "LANGUAL.txt", "SRC_CD.txt"
        ]

        self.engine.download_files_from_archive(self.urls["full_version"],
                                                archive_type="zip",
                                                file_names=full_version)
        self.engine.download_files_from_archive(
            self.urls["abbreviated_version"],
            archive_type="zip",
            file_names=abbrev_version,
        )

        # Convert original txt to csv
        convert_to_csv(self.engine.format_data_dir())

        # FOOD_DES table
        new_file_name = "food_des.csv"
        table = Table("food_des", delimiter=",", header_rows=0)
        table.columns = [
            ("ndb_no", ("int", )),
            ("fdgrp_cd", ("int", )),
            ("long_desc", ("char", "205")),
            ("shrt_desc", ("char", "65")),
            ("comname", ("char", "105")),
            ("manufacname", ("char", "70")),
            ("survey", ("char", "1")),
            ("ref_desc", ("char", "140")),
            ("refuse", ("double", )),
            ("sciname", ("char", "67")),
            ("n_factor", ("double", )),
            ("pro_factor", ("double", )),
            ("fat_factor", ("double", )),
            ("cho_factor", ("double", )),
        ]
        self.create_and_install(new_file_name, table)

        # FdGrp_Cd table
        new_file_name = "fd_group.csv"
        table = Table("fd_group", delimiter=",", header_rows=0)
        table.columns = [("fdgrp_cd", ("int", )),
                         ("fdgrp_desc", ("char", "65"))]
        self.create_and_install(new_file_name, table)

        # LANGUAL table
        new_file_name = "langual.csv"
        table = Table("langual", delimiter=",", header_rows=0)
        table.columns = [("ndb_no", ("int", )), ("factor_code", ("char", "5"))]
        self.create_and_install(new_file_name, table)

        # LANGDESC Table
        new_file_name = "langdesc.csv"
        table = Table("langdesc", delimiter=",", header_rows=0)
        table.columns = [
            ("factor_code", ("char", "5")),
            ("description", ("char", "145")),
        ]
        self.create_and_install(new_file_name, table)

        # NUT_DATA table
        new_file_name = "nut_data.csv"
        missingValues = [
            "Unnamed: 6", "Unnamed: 7", "Unnamed: 8", "Unnamed: 9",
            "Unnamed: 10", "Unnamed: 11", "Unnamed: 12", "Unnamed: 13",
            "Unnamed: 14", "Unnamed: 15", "Unnamed: 17"
        ]
        table = Table(
            "nut_data",
            delimiter=",",
            header_rows=0,
            missingValues=missingValues,
            do_not_bulk_insert=True,
        )
        table.columns = [
            ("ndb_no", ("int", )),
            ("nutr_no", ("int", )),
            ("nutr_val", ("double", )),
            ("num_data_pts", ("int", )),
            ("std_error", ("double", )),
            ("src_cd", ("int", )),
            ("deriv_cd", ("char", "12")),
            ("ref_ndb_no", ("double", )),
            ("add_nutr_mark", ("char", "12")),
            ("num_studies", ("double", )),
            ("min", ("double", )),
            ("max", ("double", )),
            ("df", ("double", )),
            ("low_eb", ("double", )),
            ("up_eb", ("double", )),
            ("stat_cmt", ("char", "12")),
            ("addmod_date", ("char", "12")),
            ("cc", ("char", "12")),
        ]
        self.create_and_install(new_file_name, table)

        # NUTR_DEF table
        new_file_name = "nutr_def.csv"
        table = Table("nutr_def", delimiter=",", header_rows=0)
        table.columns = [
            ("nutr_no", ("int", )),
            ("units", ("char", "10")),
            ("tagname", ("char", "25")),
            ("nutrdesc", ("char", "60")),
            ("num_dec", ("int", )),
            ("sr_order", ("int", )),
        ]
        self.create_and_install(new_file_name, table)

        # SRC_CD table
        new_file_name = "src_cd.csv"
        table = Table("src_cd", delimiter=",", header_rows=0)
        table.columns = [("src_cd", ("int", )), ("srccd_desc", ("char", "65"))]
        self.create_and_install(new_file_name, table)

        # DERIV_CD table
        new_file_name = "deriv_cd.csv"
        table = Table("deriv_cd", delimiter=",", header_rows=0)
        table.columns = [("deriv_cd", ("char", "5")),
                         ("deriv_desc", ("char", "130"))]
        self.create_and_install(new_file_name, table)

        # WEIGHT table
        new_file_name = "weight.csv"
        table = Table(
            "weight",
            delimiter=",",
            header_rows=0,
            missingValues=["Unnamed: 5", "Unnamed: 6"],
        )
        table.columns = [
            ("ndb_no", ("int", )),
            ("seq", ("int", )),
            ("amount", ("double", )),
            ("msre_desc", ("char", "130")),
            ("gm_wgt", ("double", )),
            ("num_data_pts", ("double", )),
            ("std_dev", ("double", )),
        ]
        self.create_and_install(new_file_name, table)

        # FOOTNOTE table
        new_file_name = "footnote.csv"
        table = Table("footnote",
                      delimiter=",",
                      header_rows=0,
                      missingValues=["Unnamed: 3"])
        table.columns = [
            ("ndb_no", ("int", )),
            ("footnt_no", ("int", )),
            ("footnt_typ", ("char", "2")),
            ("nutr_no", ("double", )),
            ("footnt_txt", ("char", "200")),
        ]
        self.create_and_install(new_file_name, table)

        # DATSRCLN table
        new_file_name = "datsrcln.csv"
        table = Table("datsrcln", delimiter=",", header_rows=0)
        table.columns = [
            ("ndb_no", ("int", )),
            ("nutr_no", ("int", )),
            ("datasrc_id", ("char", "7")),
        ]

        self.create_and_install(new_file_name, table)

        # DATA_SRC table
        new_file_name = "data_src.csv"
        table = Table("data_src", delimiter=",", header_rows=0)
        table.columns = [
            ("datasrc_id", ("char", "7")),
            ("authors", ("char", "257")),
            ("title", ("char", "257")),
            ("year", ("char", "5")),
            ("journal", ("char", "137")),
            ("vol_city", ("char", "17")),
            ("issue_state", ("char", "5")),
            ("start_page", ("char", "5")),
            ("end_page", ("char", "5")),
        ]
        self.create_and_install(new_file_name, table)

        # ABBREV table
        new_file_name = "abbrev.csv"
        table = Table("abbrev", delimiter=",", header_rows=0)
        table.columns = [
            ("ndb_no", ("char", "7")),
            ("shrt_desc", ("char", "60")),
            ("water", ("double", )),
            ("energ_kcal", ("int", )),
            ("protein", ("double", )),
            ("lipid_tot", ("double", )),
            ("ash", ("double", )),
            ("carbohydrt", ("double", )),
            ("fiber_td", ("double", )),
            ("sugar_tot", ("char", "6")),
            ("calcium", ("int", )),
            ("iron", ("double", )),
            ("magnesium", ("int", )),
            ("phosphorus", ("int", )),
            ("potassium", ("int", )),
            ("sodium", ("int", )),
            ("zinc", ("double", )),
            ("copper", ("double", )),
            ("manganese", ("double", )),
            ("selenium", ("double", )),
            ("vit_c", ("double", )),
            ("thiamin", ("double", )),
            ("riboflavin", ("double", )),
            ("niacin", ("double", )),
            ("panto_acid", ("double", )),
            ("vit_b6", ("double", )),
            ("folate_tot", ("int", )),
            ("folic_acid", ("int", )),
            ("food_folate", ("int", )),
            ("folate_dfe", ("int", )),
            ("choline_tot", ("double", )),
            ("vit_b12", ("double", )),
            ("vit_a_iu", ("int", )),
            ("vit_a_rae", ("int", )),
            ("retinol", ("int", )),
            ("alpha_carot", ("int", )),
            ("beta_carot", ("int", )),
            ("beta_crypt", ("int", )),
            ("lycopene", ("int", )),
            ("lut_zea", ("int", )),
            ("vit_e", ("double", )),
            ("vit_d_mcg", ("double", )),
            ("vit_d_iu", ("int", )),
            ("vit_k", ("double", )),
            ("fa_sat", ("double", )),
            ("fa_mono", ("double", )),
            ("fa_poly", ("double", )),
            ("cholestrl", ("int", )),
            ("gmwt_1", ("double", )),
            ("gmwt_desc1", ("char", "80")),
            ("gmwt_2", ("double", )),
            ("gmwt_desc2", ("char", "80")),
            ("refuse_pct", ("int", )),
        ]
        self.create_and_install(new_file_name, table)
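The convert_to_csv call near the top of this example is defined elsewhere in the script; the USDA SR flat files it normalizes use '^' as the field separator and '~' as the text qualifier. A rough stdlib-only equivalent under that assumption:

# Rough sketch of the txt -> csv normalization, assuming the USDA SR
# convention of '^' field separators and '~' text qualifiers.
import csv

def txt_to_csv(src_path, dest_path):
    with open(src_path, encoding="latin-1") as src, \
         open(dest_path, "w", newline="") as dest:
        writer = csv.writer(dest)
        for line in src:
            fields = [f.strip("~") for f in line.rstrip("\r\n").split("^")]
            writer.writerow(fields)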
Example No. 45
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        engine = self.engine

        table = self.tables["mass"]

        # Database column names and their data types. Use data type "skip" to skip the value, and
        # "combine" to merge a string value into the previous column
        table.columns=[("record_id"             ,   ("pk-auto",)    ),
                       ("family"                ,   ("char", 20)    ),
                       ("genus"                 ,   ("char", 20)    ),
                       ("species"               ,   ("char", 20)    ),
                       ("subspecies"            ,   ("char", 20)    ),
                       ("common_name"           ,   ("char", 50)    ),
                       ("sex"                   ,   ("char", 20)    ),
                       ("N"                     ,   ("double",)     ),
                       ("mean"                  ,   ("double",)     ),
                       ("std_dev"               ,   ("double",)     ),
                       ("min"                   ,   ("double",)     ),
                       ("max"                   ,   ("double",)     ),
                       ("season"                ,   ("char",2)      ),
                       ("location"              ,   ("char",50)     ),
                       ("source_num"            ,   ("char",50)     )]
        engine.table = table
        engine.create_table()

        file_list = ["broadbills - tapaculos", "cotingas - NZ wrens",
                     "HA honeycreepers - icterids", "honeyeaters - corvids",
                     "jacanas - doves", "larks - accentors",
                     "muscicapids - babblers", "ostrich - waterfowl",
                     "parrotbills - sugarbirds", "parrots - nightjars",
                     "starlings - finches", "swifts - woodpeckers",
                     "thrushes - gnatcatchers", "vultures - bustards"]

        lines = []

        for file in file_list:
            filename = file + ".xls"
            full_filename = engine.format_filename(filename)

            # Make sure file exists
            if not os.path.isfile(full_filename):
                raise Exception("Missing raw data file: " + full_filename)

            # Open excel file with xlrd
            book = xlrd.open_workbook(full_filename)
            sh = book.sheet_by_index(0)

            print "Inserting data from " + filename + " . . ."
            rows = sh.nrows
            cols = 11
            lines = []
            lastrow = None
            lastvalues = None
            family = ""
            for n in range(rows):
                row = sh.row(n)
                if len(row) == 0:
                    continue

                empty_cols = len([cell for cell in row[0:11] if Excel.empty_cell(cell)])

                # Skip this row if all cells or all cells but one are empty
                # or if it's the legend row
                if ((empty_cols == cols)
                            or Excel.cell_value(row[0]) == "Scientific Name"
                            or Excel.cell_value(row[0])[0:7] == "Species"):
                    pass
                elif empty_cols == cols - 1:
                    if "Family" in Excel.cell_value(row[0]):
                        # .lstrip strips a character set, not a prefix
                        family = Excel.cell_value(row[0]).replace("Family ", "", 1).title()
                        continue
                    else:
                        if not Excel.empty_cell(row[0]):
                            lastvalues[3] = Excel.cell_value(row[0])
                else:
                    # Values: 0=Family 1=Genus 2=Species 3=Subspecies 4=common name 5=sex
                    # 6=N 7=Mean 8=std_dev 9=min 10=max 11=season 12=location 13=source_num
                    values = []
                    values.append(family)
                    # If the first two columns are empty, but not all of them are,
                    # use the first two columns from the previous row
                    if Excel.empty_cell(row[0]) and Excel.empty_cell(row[1]):
                        values.extend(sci_name(Excel.cell_value(lastrow[0])))
                        values.append(Excel.cell_value(lastrow[1]))
                    else:
                        if len(Excel.cell_value(row[0]).split()) == 1:
                            # If the scientific name is missing genus/species, fill it
                            # in from the previous row
                            values.append(lastvalues[1])
                            values.append(lastvalues[2])
                            values.append(lastvalues[3])
                            for i in range(0, 3):
                                if not values[3-i]:
                                    values[3-i] = Excel.cell_value(row[0])
                                    break
                            # Add new information to the previous scientific name
                            if lastvalues:
                                lastvalues[1:4] = values[1:4]
                        else:
                            values.extend(sci_name(Excel.cell_value(row[0])))
                        values.append(Excel.cell_value(row[1]))

                    if Excel.cell_value(row[2]) == "M":
                        values.append("Male")
                    elif Excel.cell_value(row[2]) == "F":
                        values.append("Female")
                    elif Excel.cell_value(row[2]) == "B":
                        values.append("Both")
                    elif Excel.cell_value(row[2]) == "U":
                        values.append("Unknown")
                    else:
                        values.append(Excel.cell_value(row[2]))

                    # Enter remaining values from cells
                    for i in range(3, cols):
                        values.append(Excel.cell_value(row[i]))

                    # If there isn't a common name or location, get it from
                    # the previous row
                    if not values[4] and lastvalues:
                        values[4] = lastvalues[4]
                    if not values[12] and lastvalues and lastvalues[5]:
                        if lastvalues[5] == "Male" and values[5] == "Female":
                            values[12] = lastvalues[12]

                    # Insert the previous row into the database
                    if lastvalues:
                        lines.append('~'.join(lastvalues))

                    lastrow = row
                    lastvalues = values

            if lastvalues:
                lines.append('~'.join(lastvalues))
                engine.add_to_table(lines)

        return engine
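sci_name, used above to fan a scientific name out into the genus/species/subspecies slots, is defined elsewhere; a plausible reconstruction (an assumption, not the script's actual helper) is simply a padded split:

# Plausible sketch of the sci_name helper: split a scientific name into
# exactly three slots (genus, species, subspecies), padding with "".
def sci_name(name):
    parts = name.split()
    return (parts + ["", "", ""])[:3]

assert sci_name("Struthio camelus") == ["Struthio", "camelus", ""]
assert sci_name("Passer domesticus balearoibericus") == [
    "Passer", "domesticus", "balearoibericus"]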
Example No. 46
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        filename = 'vertnet_latest_reptiles.csv'
        tablename = 'reptiles'

        table = Table(str(tablename), delimiter=',')
        table.columns = [
            ("record_id", ("pk-auto",)),
            ("beginrecord", ("char",)),
            ("icode", ("char",)),
            ("title", ("char",)),
            ("citation", ("char",)),
            ("contact", ("char",)),
            ("email", ("char",)),
            ("emlrights", ("char",)),
            ("gbifdatasetid", ("char",)),
            ("gbifpublisherid", ("char",)),
            ("doi", ("char",)),
            ("migrator", ("char",)),
            ("networks", ("char",)),
            ("orgcountry", ("char",)),
            ("orgname", ("char",)),
            ("orgstateprovince", ("char",)),
            ("pubdate", ("char",)),
            ("source_url", ("char",)),
            ("iptrecordid", ("char",)),
            ("associatedmedia", ("char",)),
            ("associatedoccurrences", ("char",)),
            ("associatedorganisms", ("char",)),
            ("associatedreferences", ("char",)),
            ("associatedsequences", ("char",)),
            ("associatedtaxa", ("char",)),
            ("bed", ("char",)),
            ("behavior", ("char",)),
            ("catalognumber", ("char",)),
            ("continent", ("char",)),
            ("coordinateprecision", ("char",)),
            ("coordinateuncertaintyinmeters", ("char",)),
            ("country", ("char",)),
            ("countrycode", ("char",)),
            ("county", ("char",)),
            ("dateidentified", ("char",)),
            ("day", ("char",)),
            ("decimallatitude", ("char",)),
            ("decimallongitude", ("char",)),
            ("disposition", ("char",)),
            ("earliestageorloweststage", ("char",)),
            ("earliesteonorlowesteonothem", ("char",)),
            ("earliestepochorlowestseries", ("char",)),
            ("earliesteraorlowesterathem", ("char",)),
            ("earliestperiodorlowestsystem", ("char",)),
            ("enddayofyear", ("char",)),
            ("establishmentmeans", ("char",)),
            ("eventdate", ("char",)),
            ("eventid", ("char",)),
            ("eventremarks", ("char",)),
            ("eventtime", ("char",)),
            ("fieldnotes", ("char",)),
            ("fieldnumber", ("char",)),
            ("footprintspatialfit", ("char",)),
            ("footprintsrs", ("char",)),
            ("footprintwkt", ("char",)),
            ("formation", ("char",)),
            ("geodeticdatum", ("char",)),
            ("geologicalcontextid", ("char",)),
            ("georeferencedby", ("char",)),
            ("georeferenceddate", ("char",)),
            ("georeferenceprotocol", ("char",)),
            ("georeferenceremarks", ("char",)),
            ("georeferencesources", ("char",)),
            ("georeferenceverificationstatus", ("char",)),
            ("group", ("char",)),
            ("habitat", ("char",)),
            ("highergeography", ("char",)),
            ("highergeographyid", ("char",)),
            ("highestbiostratigraphiczone", ("char",)),
            ("identificationid", ("char",)),
            ("identificationqualifier", ("char",)),
            ("identificationreferences", ("char",)),
            ("identificationremarks", ("char",)),
            ("identificationverificationstatus", ("char",)),
            ("identifiedby", ("char",)),
            ("individualcount", ("char",)),
            ("island", ("char",)),
            ("islandgroup", ("char",)),
            ("latestageorhigheststage", ("char",)),
            ("latesteonorhighesteonothem", ("char",)),
            ("latestepochorhighestseries", ("char",)),
            ("latesteraorhighesterathem", ("char",)),
            ("latestperiodorhighestsystem", ("char",)),
            ("lifestage", ("char",)),
            ("lithostratigraphicterms", ("char",)),
            ("locality", ("char",)),
            ("locationaccordingto", ("char",)),
            ("locationid", ("char",)),
            ("locationremarks", ("char",)),
            ("lowestbiostratigraphiczone", ("char",)),
            ("materialsampleid", ("char",)),
            ("maximumdepthinmeters", ("char",)),
            ("maximumdistanceabovesurfaceinmeters", ("char",)),
            ("maximumelevationinmeters", ("char",)),
            ("member", ("char",)),
            ("minimumdepthinmeters", ("char",)),
            ("minimumdistanceabovesurfaceinmeters", ("char",)),
            ("minimumelevationinmeters", ("char",)),
            ("month", ("char",)),
            ("municipality", ("char",)),
            ("occurrenceid", ("char",)),
            ("occurrenceremarks", ("char",)),
            ("occurrencestatus", ("char",)),
            ("organismid", ("char",)),
            ("organismname", ("char",)),
            ("organismremarks", ("char",)),
            ("organismscope", ("char",)),
            ("othercatalognumbers", ("char",)),
            ("pointradiusspatialfit", ("char",)),
            ("preparations", ("char",)),
            ("previousidentifications", ("char",)),
            ("recordedby", ("char",)),
            ("recordnumber", ("char",)),
            ("reproductivecondition", ("char",)),
            ("samplingeffort", ("char",)),
            ("samplingprotocol", ("char",)),
            ("sex", ("char",)),
            ("startdayofyear", ("char",)),
            ("stateprovince", ("char",)),
            ("typestatus", ("char",)),
            ("verbatimcoordinates", ("char",)),
            ("verbatimcoordinatesystem", ("char",)),
            ("verbatimdepth", ("char",)),
            ("verbatimelevation", ("char",)),
            ("verbatimeventdate", ("char",)),
            ("verbatimlatitude", ("char",)),
            ("verbatimlocality", ("char",)),
            ("verbatimlongitude", ("char",)),
            ("verbatimsrs", ("char",)),
            ("waterbody", ("char",)),
            ("year", ("char",)),
            ("dctype", ("char",)),
            ("modified", ("char",)),
            ("language", ("char",)),
            ("license", ("char",)),
            ("rightsholder", ("char",)),
            ("accessrights", ("char",)),
            ("bibliographiccitation", ("char",)),
            ("dc_references", ("char",)),
            ("institutionid", ("char",)),
            ("collectionid", ("char",)),
            ("datasetid", ("char",)),
            ("institutioncode", ("char",)),
            ("collectioncode", ("char",)),
            ("datasetname", ("char",)),
            ("ownerinstitutioncode", ("char",)),
            ("basisofrecord", ("char",)),
            ("informationwithheld", ("char",)),
            ("datageneralizations", ("char",)),
            ("dynamicproperties", ("char",)),
            ("scientificnameid", ("char",)),
            ("namepublishedinid", ("char",)),
            ("scientificname", ("char",)),
            ("acceptednameusage", ("char",)),
            ("originalnameusage", ("char",)),
            ("namepublishedin", ("char",)),
            ("namepublishedinyear", ("char",)),
            ("higherclassification", ("char",)),
            ("kingdom", ("char",)),
            ("phylum", ("char",)),
            ("class", ("char",)),
            ("order", ("char",)),
            ("family", ("char",)),
            ("genus", ("char",)),
            ("subgenus", ("char",)),
            ("specificepithet", ("char",)),
            ("infraspecificepithet", ("char",)),
            ("taxonrank", ("char",)),
            ("verbatimtaxonrank", ("char",)),
            ("scientificnameauthorship", ("char",)),
            ("vernacularname", ("char",)),
            ("nomenclaturalcode", ("char",)),
            ("taxonomicstatus", ("char",)),
            ("keyname", ("char",)),
            ("haslicense", ("int",)),
            ("vntype", ("char",)),
            ("rank", ("int",)),
            ("mappable", ("int",)),
            ("hashid", ("char",)),
            ("hastypestatus", ("int",)),
            ("wascaptive", ("int",)),
            ("wasinvasive", ("int",)),
            ("hastissue", ("int",)),
            ("hasmedia", ("int",)),
            ("isfossil", ("int",)),
            ("haslength", ("int",)),
            ("haslifestage", ("int",)),
            ("hasmass", ("int",)),
            ("hassex", ("int",)),
            ("lengthinmm", ("double",)),
            ("massing", ("double",)),
            ("lengthunitsinferred", ("char",)),
            ("massunitsinferred", ("char",)),
            ("underivedlifestage", ("char",)),
            ("underivedsex", ("char",))]

        engine.table = table
        if not os.path.isfile(engine.format_filename(filename)):
            engine.download_files_from_archive(self.urls[tablename],
                                               [filename],
                                               "zip",
                                               False,
                                               "vertnet_latest_" + str(tablename))
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename(str(filename)))
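The os.path.isfile guard before download_files_from_archive makes the script resumable: re-running it skips the large VertNet fetch when the extracted csv is already present. The same pattern in stdlib terms (the URL, paths, and fetch_once name here are placeholders):

# Generic sketch of the download-once guard with stdlib stand-ins.
import os
import zipfile
from urllib.request import urlretrieve

def fetch_once(url, archive_path, member, dest_dir):
    if os.path.isfile(os.path.join(dest_dir, member)):
        return  # already extracted on a previous run
    urlretrieve(url, archive_path)
    with zipfile.ZipFile(archive_path) as zf:
        zf.extract(member, dest_dir)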
Example No. 47
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        filename = 'vertnet_latest_mammals.csv'
        tablename = 'mammals'

        table = Table(str(tablename), delimiter=',')
        table.columns = [
            ("record_id", ("pk-auto",)),
            ("beginrecord", ("char",)),
            ("icode", ("char",)),
            ("title", ("char",)),
            ("citation", ("char",)),
            ("contact", ("char",)),
            ("email", ("char",)),
            ("emlrights", ("char",)),
            ("gbifdatasetid", ("char",)),
            ("gbifpublisherid", ("char",)),
            ("doi", ("char",)),
            ("migrator", ("char",)),
            ("networks", ("char",)),
            ("orgcountry", ("char",)),
            ("orgname", ("char",)),
            ("orgstateprovince", ("char",)),
            ("pubdate", ("char",)),
            ("source_url", ("char",)),
            ("iptrecordid", ("char",)),
            ("associatedmedia", ("char",)),
            ("associatedoccurrences", ("char",)),
            ("associatedorganisms", ("char",)),
            ("associatedreferences", ("char",)),
            ("associatedsequences", ("char",)),
            ("associatedtaxa", ("char",)),
            ("bed", ("char",)),
            ("behavior", ("char",)),
            ("catalognumber", ("char",)),
            ("continent", ("char",)),
            ("coordinateprecision", ("char",)),
            ("coordinateuncertaintyinmeters", ("char",)),
            ("country", ("char",)),
            ("countrycode", ("char",)),
            ("county", ("char",)),
            ("dateidentified", ("char",)),
            ("day", ("char",)),
            ("decimallatitude", ("char",)),
            ("decimallongitude", ("char",)),
            ("disposition", ("char",)),
            ("earliestageorloweststage", ("char",)),
            ("earliesteonorlowesteonothem", ("char",)),
            ("earliestepochorlowestseries", ("char",)),
            ("earliesteraorlowesterathem", ("char",)),
            ("earliestperiodorlowestsystem", ("char",)),
            ("enddayofyear", ("char",)),
            ("establishmentmeans", ("char",)),
            ("eventdate", ("char",)),
            ("eventid", ("char",)),
            ("eventremarks", ("char",)),
            ("eventtime", ("char",)),
            ("fieldnotes", ("char",)),
            ("fieldnumber", ("char",)),
            ("footprintspatialfit", ("char",)),
            ("footprintsrs", ("char",)),
            ("footprintwkt", ("char",)),
            ("formation", ("char",)),
            ("geodeticdatum", ("char",)),
            ("geologicalcontextid", ("char",)),
            ("georeferencedby", ("char",)),
            ("georeferenceddate", ("char",)),
            ("georeferenceprotocol", ("char",)),
            ("georeferenceremarks", ("char",)),
            ("georeferencesources", ("char",)),
            ("georeferenceverificationstatus", ("char",)),
            ("group", ("char",)),
            ("habitat", ("char",)),
            ("highergeography", ("char",)),
            ("highergeographyid", ("char",)),
            ("highestbiostratigraphiczone", ("char",)),
            ("identificationid", ("char",)),
            ("identificationqualifier", ("char",)),
            ("identificationreferences", ("char",)),
            ("identificationremarks", ("char",)),
            ("identificationverificationstatus", ("char",)),
            ("identifiedby", ("char",)),
            ("individualcount", ("char",)),
            ("island", ("char",)),
            ("islandgroup", ("char",)),
            ("latestageorhigheststage", ("char",)),
            ("latesteonorhighesteonothem", ("char",)),
            ("latestepochorhighestseries", ("char",)),
            ("latesteraorhighesterathem", ("char",)),
            ("latestperiodorhighestsystem", ("char",)),
            ("lifestage", ("char",)),
            ("lithostratigraphicterms", ("char",)),
            ("locality", ("char",)),
            ("locationaccordingto", ("char",)),
            ("locationid", ("char",)),
            ("locationremarks", ("char",)),
            ("lowestbiostratigraphiczone", ("char",)),
            ("materialsampleid", ("char",)),
            ("maximumdepthinmeters", ("char",)),
            ("maximumdistanceabovesurfaceinmeters", ("char",)),
            ("maximumelevationinmeters", ("char",)),
            ("member", ("char",)),
            ("minimumdepthinmeters", ("char",)),
            ("minimumdistanceabovesurfaceinmeters", ("char",)),
            ("minimumelevationinmeters", ("char",)),
            ("month", ("char",)),
            ("municipality", ("char",)),
            ("occurrenceid", ("char",)),
            ("occurrenceremarks", ("char",)),
            ("occurrencestatus", ("char",)),
            ("organismid", ("char",)),
            ("organismname", ("char",)),
            ("organismremarks", ("char",)),
            ("organismscope", ("char",)),
            ("othercatalognumbers", ("char",)),
            ("pointradiusspatialfit", ("char",)),
            ("preparations", ("char",)),
            ("previousidentifications", ("char",)),
            ("recordedby", ("char",)),
            ("recordnumber", ("char",)),
            ("reproductivecondition", ("char",)),
            ("samplingeffort", ("char",)),
            ("samplingprotocol", ("char",)),
            ("sex", ("char",)),
            ("startdayofyear", ("char",)),
            ("stateprovince", ("char",)),
            ("typestatus", ("char",)),
            ("verbatimcoordinates", ("char",)),
            ("verbatimcoordinatesystem", ("char",)),
            ("verbatimdepth", ("char",)),
            ("verbatimelevation", ("char",)),
            ("verbatimeventdate", ("char",)),
            ("verbatimlatitude", ("char",)),
            ("verbatimlocality", ("char",)),
            ("verbatimlongitude", ("char",)),
            ("verbatimsrs", ("char",)),
            ("waterbody", ("char",)),
            ("year", ("char",)),
            ("dctype", ("char",)),
            ("modified", ("char",)),
            ("language", ("char",)),
            ("license", ("char",)),
            ("rightsholder", ("char",)),
            ("accessrights", ("char",)),
            ("bibliographiccitation", ("char",)),
            ("dc_references", ("char",)),
            ("institutionid", ("char",)),
            ("collectionid", ("char",)),
            ("datasetid", ("char",)),
            ("institutioncode", ("char",)),
            ("collectioncode", ("char",)),
            ("datasetname", ("char",)),
            ("ownerinstitutioncode", ("char",)),
            ("basisofrecord", ("char",)),
            ("informationwithheld", ("char",)),
            ("datageneralizations", ("char",)),
            ("dynamicproperties", ("char",)),
            ("scientificnameid", ("char",)),
            ("namepublishedinid", ("char",)),
            ("scientificname", ("char",)),
            ("acceptednameusage", ("char",)),
            ("originalnameusage", ("char",)),
            ("namepublishedin", ("char",)),
            ("namepublishedinyear", ("char",)),
            ("higherclassification", ("char",)),
            ("kingdom", ("char",)),
            ("phylum", ("char",)),
            ("class", ("char",)),
            ("order", ("char",)),
            ("family", ("char",)),
            ("genus", ("char",)),
            ("subgenus", ("char",)),
            ("specificepithet", ("char",)),
            ("infraspecificepithet", ("char",)),
            ("taxonrank", ("char",)),
            ("verbatimtaxonrank", ("char",)),
            ("scientificnameauthorship", ("char",)),
            ("vernacularname", ("char",)),
            ("nomenclaturalcode", ("char",)),
            ("taxonomicstatus", ("char",)),
            ("keyname", ("char",)),
            ("haslicense", ("int",)),
            ("vntype", ("char",)),
            ("rank", ("int",)),
            ("mappable", ("int",)),
            ("hashid", ("char",)),
            ("hastypestatus", ("int",)),
            ("wascaptive", ("int",)),
            ("wasinvasive", ("int",)),
            ("hastissue", ("int",)),
            ("hasmedia", ("int",)),
            ("isfossil", ("int",)),
            ("haslength", ("int",)),
            ("haslifestage", ("int",)),
            ("hasmass", ("int",)),
            ("hassex", ("int",)),
            ("lengthinmm", ("double",)),
            ("massing", ("double",)),
            ("lengthunitsinferred", ("char",)),
            ("massunitsinferred", ("char",)),
            ("underivedlifestage", ("char",)),
            ("underivedsex", ("char",))]

        engine.table = table
        if not os.path.isfile(engine.format_filename(filename)):
            engine.download_files_from_archive(self.urls[tablename],
                                               [filename],
                                               filetype="zip",
                                               archivename="vertnet_latest_" + str(tablename))
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename(str(filename)))
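This mammals script repeats the reptiles column list from the previous example verbatim, since VertNet publishes the same schema for each taxon dump. One way to remove the duplication is a shared module-level constant that each script copies and extends; VERTNET_COLUMNS and columns_for below are assumed names, not part of the retriever:

# Sketch: factor the shared VertNet schema into one constant.
VERTNET_COLUMNS = [
    ("record_id", ("pk-auto", )),
    ("beginrecord", ("char", )),
    # ... the remaining shared Darwin Core and derived fields ...
    ("underivedsex", ("char", )),
]

def columns_for(extra=()):
    # shared columns plus any per-dataset additions
    return list(VERTNET_COLUMNS) + list(extra)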
Example No. 48
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        engine = self.engine
        csv_files = []
        request_src = "http://www.data-retriever.org/"
        base_url = "http://www.usanpn.org/npn_portal/observations/getObservations.xml?start_date={startYear}&end_date={endYear_date}&request_src={request_src}"
        header_values = ["observation_id",
                         "update_datetime",
                         "site_id",
                         "latitude",
                         "longitude",
                         "elevation_in_meters",
                         "state",
                         "species_id",
                         "genus",
                         "species",
                         "common_name",
                         "kingdom",
                         "individual_id",
                         "phenophase_id",
                         "phenophase_description",
                         "observation_date",
                         "day_of_year",
                         "phenophase_status",
                         "intensity_category_id",
                         "intensity_value",
                         "abundance_value"
                         ]

        columns = [("record_id", ("pk-auto",)),
                   ("observation_id", ("int",)),  # subsequently refered to as "status record"
                   ("update_datetime", ("char",)),
                   ("site_id", ("int",)),
                   ("latitude", ("double",)),
                   ("longitude", ("double",)),
                   ("elevation_in_meters", ("char",)),
                   ("state", ("char",)),
                   ("species_id", ("int",)),
                   ("genus", ("char",)),
                   ("species", ("char",)),
                   ("common_name", ("char",)),
                   ("kingdom", ("char",)),  # skip kingdom
                   ("individual_id", ("char",)),
                   ("phenophase_id", ("int",)),
                   ("phenophase_description", ("char",)),
                   ("observation_date", ("char",)),
                   ("day_of_year", ("char",)),
                   ("phenophase_status", ("char",)),
                   ("intensity_category_id", ("char",)),
                   ("intensity_value", ("char",)),
                   ("abundance_value", ("char",))
                   ]

        start_date = datetime.date(2009, 1, 1)
        end_date = datetime.date.today()

        while start_date < end_date:
            to_date = start_date + datetime.timedelta(90)
            if to_date >= end_date:
                data_url = base_url.format(startYear=str(start_date), endYear_date=str(end_date),
                                           request_src=request_src)
            else:
                data_url = base_url.format(startYear=str(start_date), endYear_date=str(to_date),
                                           request_src=request_src)

            xml_file_name = '{}'.format(start_date) + ".xml"
            engine.download_file(data_url, xml_file_name)

            # Create csv files for 3 months
            csv_observation = '{}'.format(start_date) + ".csv"
            csv_files.append(csv_observation)
            csv_buff = open_fw(engine.format_filename(csv_observation))
            csv_writer = open_csvw(csv_buff)

            csv_writer.writerow(header_values)

            # Parse xml to read data
            file_read = ""
            # str.strip removes a character set, not a substring; replace
            # the {dataset} placeholder instead
            fname = DATA_WRITE_PATH.replace('{dataset}', '') + 'NPN/' + xml_file_name
            with open(fname, 'r') as fp1:
                file_read = fp1.read()

            root = ET.fromstring(file_read)

            index_map = {val: i for i, val in enumerate(header_values)}
            for elements in root:
                diction = sorted(elements.attrib.items(), key=lambda pair: index_map[pair[0]])
                csv_writer.writerow([x[1] for x in diction])

            csv_buff.close()
            start_date = to_date + datetime.timedelta(1)

        # Create table
        table = Table('observations', delimiter=',', pk='record_id', contains_pk=True)
        table.columns = columns
        engine.table = table
        engine.create_table()
        for data_file in csv_files:
            engine.insert_data_from_file(engine.find_file(data_file))
        return engine
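The while loop above pages the NPN observations API in 90-day windows, clamping the final window to today's date. Isolated with stdlib datetime only, the windowing logic looks like this (quarter_windows is an illustrative name):

# Stdlib sketch of the 90-day request windows used above.
import datetime

def quarter_windows(start, end):
    # yield (window_start, window_end) pairs covering [start, end]
    while start < end:
        stop = min(start + datetime.timedelta(90), end)
        yield start, stop
        start = stop + datetime.timedelta(1)

windows = list(quarter_windows(datetime.date(2009, 1, 1),
                               datetime.date(2009, 7, 1)))
assert windows[0] == (datetime.date(2009, 1, 1), datetime.date(2009, 4, 1))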
Example No. 49
    def download(self, engine=None, debug=False):
        try:
            Script.download(self, engine, debug)

            engine = self.engine

            # Species table
            table = Table("species",
                          cleanup=Cleanup(),
                          contains_pk=True,
                          header_rows=11)
            table.columns = [
                ("species_id", ("pk-int", )),
                ("AOU", ("int", )),
                ("english_common_name", ("char", 50)),
                ("french_common_name", ("char", 50)),
                ("spanish_common_name", ("char", 50)),
                ("sporder", ("char", 30)),
                ("family", ("char", 30)),
                ("genus", ("char", 30)),
                ("species", ("char", 50)),
            ]
            table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]
            engine.table = table
            engine.create_table()
            engine.insert_data_from_url(self.urls["species"])

            # Routes table
            engine.download_files_from_archive(self.urls["routes"],
                                               ["routes.csv"],
                                               archive_name="routes.zip")
            engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                     filename="routes.csv")
            engine.insert_data_from_file(engine.format_filename("routes.csv"))

            # Weather table
            engine.download_files_from_archive(self.urls["weather"],
                                               ["weather.csv"],
                                               archive_name="weather.zip")
            engine.auto_create_table(Table("weather",
                                           pk="RouteDataId",
                                           cleanup=self.cleanup_func_table),
                                     filename="weather.csv")
            engine.insert_data_from_file(engine.format_filename("weather.csv"))

            # Migrations data
            engine.download_files_from_archive(
                self.urls["migrants"], archive_name="MigrantNonBreeder.zip")
            engine.extract_zip(
                engine.format_filename("MigrantNonBreeder/Migrants.zip"),
                engine.format_filename("Migrant"),
            )
            engine.extract_zip(
                engine.format_filename("MigrantNonBreeder/MigrantSummary.zip"),
                engine.format_filename("MigrantSummary"),
            )

            table = Table("migrants", cleanup=Cleanup())
            table.columns = [('routedataid', ('int', )),
                             ('countrynum', ('int', )),
                             ('statenum', ('int', )), ('route', ('int', )),
                             ('rpid', ('int', )), ('year', ('int', )),
                             ('aou', ('int', )), ('stop1', ('int', )),
                             ('stop2', ('int', )), ('stop3', ('int', )),
                             ('stop4', ('int', )), ('stop5', ('int', )),
                             ('stop6', ('int', )), ('stop7', ('int', )),
                             ('stop8', ('int', )), ('stop9', ('int', )),
                             ('stop10', ('int', )), ('stop11', ('int', )),
                             ('stop12', ('int', )), ('stop13', ('int', )),
                             ('stop14', ('int', )), ('stop15', ('int', )),
                             ('stop16', ('int', )), ('stop17', ('int', )),
                             ('stop18', ('int', )), ('stop19', ('int', )),
                             ('stop20', ('int', )), ('stop21', ('int', )),
                             ('stop22', ('int', )), ('stop23', ('int', )),
                             ('stop24', ('int', )), ('stop25', ('int', )),
                             ('stop26', ('int', )), ('stop27', ('int', )),
                             ('stop28', ('int', )), ('stop29', ('int', )),
                             ('stop30', ('int', )), ('stop31', ('int', )),
                             ('stop32', ('int', )), ('stop33', ('int', )),
                             ('stop34', ('int', )), ('stop35', ('int', )),
                             ('stop36', ('int', )), ('stop37', ('int', )),
                             ('stop38', ('int', )), ('stop39', ('int', )),
                             ('stop40', ('int', )), ('stop41', ('int', )),
                             ('stop42', ('int', )), ('stop43', ('int', )),
                             ('stop44', ('int', )), ('stop45', ('int', )),
                             ('stop46', ('int', )), ('stop47', ('int', )),
                             ('stop48', ('int', )), ('stop49', ('int', )),
                             ('stop50', ('int', ))]
            engine.table = table
            engine.create_table()
            engine.insert_data_from_file(
                engine.format_filename("Migrant/Migrants.csv"))

            table = Table("migrantsummary", cleanup=Cleanup())
            table.columns = [('routedataid', ('int', )),
                             ('countrynum', ('int', )),
                             ('statenum', ('int', )), ('route', ('int', )),
                             ('rpid', ('int', )), ('year', ('int', )),
                             ('aou', ('int', )), ('count10', ('int', )),
                             ('count20', ('int', )), ('count30', ('int', )),
                             ('count40', ('int', )), ('count50', ('int', )),
                             ('stoptotal', ('int', )),
                             ('speciestotal', ('int', ))]
            engine.table = table
            engine.create_table()
            engine.insert_data_from_file(
                engine.format_filename("MigrantSummary/MigrantSummary.csv"))

            table = Table("vehicledata", cleanup=Cleanup())
            table.columns = [('routedataid', ('int', )),
                             ('countrynum', ('int', )),
                             ('statenum', ('int', )), ('route', ('int', )),
                             ('rpid', ('int', )), ('year', ('int', )),
                             ('recordedcar', ('char', )), ('car1', ('int', )),
                             ('car2', ('int', )), ('car3', ('int', )),
                             ('car4', ('int', )), ('car5', ('int', )),
                             ('car6', ('int', )), ('car7', ('int', )),
                             ('car8', ('int', )), ('car9', ('int', )),
                             ('car10', ('int', )), ('car11', ('int', )),
                             ('car12', ('int', )), ('car13', ('int', )),
                             ('car14', ('int', )), ('car15', ('int', )),
                             ('car16', ('int', )), ('car17', ('int', )),
                             ('car18', ('int', )), ('car19', ('int', )),
                             ('car20', ('int', )), ('car21', ('int', )),
                             ('car22', ('int', )), ('car23', ('int', )),
                             ('car24', ('int', )), ('car25', ('int', )),
                             ('car26', ('int', )), ('car27', ('int', )),
                             ('car28', ('int', )), ('car29', ('int', )),
                             ('car30', ('int', )), ('car31', ('int', )),
                             ('car32', ('int', )), ('car33', ('int', )),
                             ('car34', ('int', )), ('car35', ('int', )),
                             ('car36', ('int', )), ('car37', ('int', )),
                             ('car38', ('int', )), ('car39', ('int', )),
                             ('car40', ('int', )), ('car41', ('int', )),
                             ('car42', ('int', )), ('car43', ('int', )),
                             ('car44', ('int', )), ('car45', ('int', )),
                             ('car46', ('int', )), ('car47', ('int', )),
                             ('car48', ('int', )), ('car49', ('int', )),
                             ('car50', ('int', )), ('noise1', ('int', )),
                             ('noise2', ('int', )), ('noise3', ('int', )),
                             ('noise4', ('int', )), ('noise5', ('int', )),
                             ('noise6', ('int', )), ('noise7', ('int', )),
                             ('noise8', ('int', )), ('noise9', ('int', )),
                             ('noise10', ('int', )), ('noise11', ('int', )),
                             ('noise12', ('int', )), ('noise13', ('int', )),
                             ('noise14', ('int', )), ('noise15', ('int', )),
                             ('noise16', ('int', )), ('noise17', ('int', )),
                             ('noise18', ('int', )), ('noise19', ('int', )),
                             ('noise20', ('int', )), ('noise21', ('int', )),
                             ('noise22', ('int', )), ('noise23', ('int', )),
                             ('noise24', ('int', )), ('noise25', ('int', )),
                             ('noise26', ('int', )), ('noise27', ('int', )),
                             ('noise28', ('int', )), ('noise29', ('int', )),
                             ('noise30', ('int', )), ('noise31', ('int', )),
                             ('noise32', ('int', )), ('noise33', ('int', )),
                             ('noise34', ('int', )), ('noise35', ('int', )),
                             ('noise36', ('int', )), ('noise37', ('int', )),
                             ('noise38', ('int', )), ('noise39', ('int', )),
                             ('noise40', ('int', )), ('noise41', ('int', )),
                             ('noise42', ('int', )), ('noise43', ('int', )),
                             ('noise44', ('int', )), ('noise45', ('int', )),
                             ('noise46', ('int', )), ('noise47', ('int', )),
                             ('noise48', ('int', )), ('noise49', ('int', )),
                             ('noise50', ('int', ))]
            engine.table = table
            engine.create_table()
            engine.download_files_from_archive(self.urls["Vehicledata"],
                                               archive_name="VehicleData.zip")
            engine.extract_zip(
                engine.format_filename("VehicleData/VehicleData.zip"),
                engine.format_filename("VehicleData"),
            )
            engine.insert_data_from_file(
                engine.format_filename("VehicleData/VehicleData.csv"))

            # Counts table
            table = Table("counts", delimiter=",")
            engine.download_files_from_archive(self.urls["counts"],
                                               archive_name="States.zip")

            table.columns = [("record_id", ("pk-auto", )),
                             ("RouteDataID", ("int", )),
                             ("countrynum", ("int", )),
                             ("statenum", ("int", )), ("Route", ("int", )),
                             ("RPID", ("int", )), ("Year", ("int", )),
                             ("Aou", ("int", )), ("Count10", ("int", )),
                             ("Count20", ("int", )), ("Count30", ("int", )),
                             ("Count40", ("int", )), ("Count50", ("int", )),
                             ("StopTotal", ("int", )),
                             ("SpeciesTotal", ("int", ))]

            stateslist = [
                "Alabama", "Alaska", "Arizona", "Arkansas", "California",
                "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
                "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky",
                "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan",
                "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska",
                "Nevada", ["New Hampshire", "NHampsh"],
                ["New Jersey", "NJersey"], ["New Mexico", "NMexico"],
                ["New York", "NYork"], ["North Carolina", "NCaroli"],
                ["North Dakota",
                 "NDakota"], "Ohio", "Oklahoma", "Oregon", "Pennsylvania",
                ["Rhode Island", "RhodeIs"], ["South Carolina", "SCaroli"],
                ["South Dakota", "SDakota"], "Tennessee", "Texas", "Utah",
                "Vermont", "Virginia", "Washington",
                ["West Virginia",
                 "W_Virgi"], "Wisconsin", "Wyoming", "Alberta",
                ["British Columbia", "BritCol"], "Manitoba",
                ["New Brunswick", "NBrunsw"],
                ["Northwest Territories", "NWTerri"], "Newfoundland",
                ["Nova Scotia", "NovaSco"], "Nunavut", "Ontario",
                ["Prince Edward Island",
                 "PEI"], "Quebec", "Saskatchewan", "Yukon"
            ]

            state = ""
            shortstate = ""

            engine.table = table
            engine.create_table()

            for state in stateslist:
                try:
                    if isinstance(state, (list, )):
                        state, shortstate = state[0], state[1]
                    else:
                        shortstate = state[0:7]

                    print("Inserting data from " + state + "...")
                    # Define the csv path up front so the fallback branch can
                    # reuse it even if extraction fails partway
                    file_path = "{states}/{shortstate}/{shortstate}.csv".format(
                        states="States", shortstate=shortstate)
                    try:
                        engine.table.cleanup = Cleanup()
                        engine.extract_zip(
                            engine.format_filename("States/" + shortstate +
                                                   ".zip"),
                            engine.format_filename("States/" + shortstate),
                        )
                        engine.insert_data_from_file(
                            engine.format_filename(file_path))
                    except:
                        print(state + ": bulk insert failed, inserting manually.")
                        engine.connection.rollback()
                        engine.table.cleanup = self.cleanup_func_clean
                        engine.insert_data_from_file(
                            engine.format_filename(file_path))
                except:
                    print("There was an error in " + state + ".")
                    raise

        except zipfile.BadZipfile:
            print(
                "There was an unexpected error in the Breeding Bird Survey archives."
            )
            raise

        return engine
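The stateslist above mixes bare strings (whose short name is just the first seven characters) with [name, shortname] pairs for the exceptions. Normalizing it once up front, as sketched below, would keep the insert loop free of the isinstance branch:

# Sketch: normalize the mixed states list into (name, shortname) pairs.
def normalize(states):
    for entry in states:
        if isinstance(entry, list):
            yield entry[0], entry[1]
        else:
            yield entry, entry[:7]

assert list(normalize(["Alabama", ["New Hampshire", "NHampsh"]])) == [
    ("Alabama", "Alabama"), ("New Hampshire", "NHampsh")]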
Example No. 50
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        filenames = ['Aquatic_animal_excretion_data.csv',
                     'Aquatic_animal_excretion_variable_descriptions.csv']
        # Download the archive once if any of the data files are missing
        if not all(os.path.isfile(engine.format_filename(f)) for f in filenames):
            url = self.urls["aquatic_animals"]
            engine.download_files_from_archive(url, filenames, "zip")

        # processing Aquatic_animal_excretion_data.csv
        filename = 'Aquatic_animal_excretion_data.csv'
        tablename = 'aquatic_animals'
        table = Table(str(tablename), delimiter=',')
        table.columns = [
            ("index", ("pk-int",)),
            ("sourcenumber", ("int",)),
            ("sourcename", ("char",)),
            ("speciesname", ("char",)),
            ("speciescode", ("char",)),
            ("invert/vert", ("char",)),
            ("phylum", ("char",)),
            ("class", ("char",)),
            ("order", ("char",)),
            ("family", ("char",)),
            ("trophicgild", ("char",)),
            ("drymass", ("double",)),
            ("logdrymass", ("double",)),
            ("ecosystemtype", ("char",)),
            ("energysource", ("char",)),
            ("habitat", ("char",)),
            ("residentecosystem", ("char",)),
            ("temperature", ("double",)),
            ("nexcretionrate", ("double",)),
            ("pexcretionrate", ("double",)),
            ("lognexcretionrate", ("double",)),
            ("logpexcretionrate", ("double",)),
            ("incubationtime", ("double",)),
            ("nform", ("char",)),
            ("pform", ("char",)),
            ("bodyc", ("double",)),
            ("bodyn", ("double",)),
            ("bodyp", ("double",)),
            ("bodyc:n", ("double",)),
            ("bodyc:p", ("double",)),
            ("bodyn:p", ("double",)),
            ("bodydatasource", ("char",)),
            ("datasource", ("char",)),
            ("dataproviders", ("char",))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename(str(filename)))

        # processing Aquatic_animal_excretion_variable_descriptions.csv
        filename = 'Aquatic_animal_excretion_variable_descriptions.csv'
        tablename = 'variable_descriptions'
        table = Table(str(tablename), delimiter=',')
        table.columns = [
            ("Column", ("char",)),
            ("Variable", ("char",)),
            ("Description", ("char",)),
            ("Data Class", ("char",)),
            ("Units", ("char",)),
            ("Minimum_value", ("char",)),
            ("Maximum_value", ("char",)),
            ("Possible_values", ("char",)),
            ("Missing_data_symbol", ("char",)),
            ("Notes", ("char",))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename(str(filename)))
Example No. 51
 def download(self, engine=None, debug=False):
     Script.download(self, engine, debug)
     engine = self.engine
     filename = "database.csv"
     tablename = "predicts_main"
     table = Table(str(tablename), delimiter=',')
     table.columns = [
         ("Source_ID", ("char", )), ("Reference", ("char", )),
         ("Study_number", ("int", )), ("Study_name", ("char", )),
         ("SS", ("char", )), ("Diversity_metric", ("char", )),
         ("Diversity_metric_unit", ("char", )),
         ("Diversity_metric_type", ("char", )),
         ("Diversity_metric_is_effort_sensitive", ("char", )),
         ("Diversity_metric_is_suitable_for_Chao", ("char", )),
         ("Sampling_method", ("char", )),
         ("Sampling_effort_unit", ("char", )),
         ("Study_common_taxon", ("char", )),
         ("Rank_of_study_common_taxon", ("char", )),
         ("Site_number", ("int", )), ("Site_name", ("char", )),
         ("Block", ("char", )), ("SSS", ("char", )), ("SSB", ("char", )),
         ("SSBS", ("char", )), ("Sample_start_earliest", ("char", )),
         ("Sample_end_latest", ("char", )), ("Sample_midpoint", ("char", )),
         ("Sample_date_resolution", ("char", )),
         ("Max_linear_extent_metres", ("double", )),
         ("Habitat_patch_area_square_metres", ("double", )),
         ("Sampling_effort", ("double", )),
         ("Rescaled_sampling_effort", ("double", )),
         ("Habitat_as_described", ("char", )),
         ("Predominant_land_use", ("char", )),
         ("Source_for_predominant_land_use", ("char", )),
         ("Use_intensity", ("char", )),
         ("Km_to_nearest_edge_of_habitat", ("double", )),
         ("Years_since_fragmentation_or_conversion", ("double", )),
         ("Transect_details", ("char", )),
         ("Coordinates_method", ("char", )), ("Longitude", ("double", )),
         ("Latitude", ("double", )),
         ("Country_distance_metres", ("double", )), ("Country", ("char", )),
         ("UN_subregion", ("char", )), ("UN_region", ("char", )),
         ("Ecoregion_distance_metres", ("double", )),
         ("Ecoregion", ("char", )), ("Biome", ("char", )),
         ("Realm", ("char", )), ("Hotspot", ("char", )),
         ("Wilderness_area", ("char", )), ("N_samples", ("double", )),
         ("Taxon_number", ("double", )), ("Taxon_name_entered", ("char", )),
         ("Indication", ("char", )), ("Parsed_name", ("char", )),
         ("Taxon", ("char", )), ("COL_ID", ("double", )),
         ("Name_status", ("char", )), ("Rank", ("char", )),
         ("Kingdom", ("char", )), ("Phylum", ("char", )),
         ("Class", ("char", )), ("Order", ("char", )),
         ("Family", ("char", )), ("Genus", ("char", )),
         ("Species", ("char", )), ("Best_guess_binomial", ("char", )),
         ("Higher_taxa", ("char", )), ("Higher_taxon", ("char", )),
         ("Measurement", ("double", )),
         ("Effort_corrected_measurement", ("double", ))
     ]
     engine.table = table
     if not os.path.isfile(engine.format_filename(filename)):
         engine.download_files_from_archive(self.urls["PREDICTS"],
                                            [filename], "zip", False,
                                            "download.zip")
     engine.create_table()
     engine.insert_data_from_file(engine.format_filename(str(filename)))
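The os.path.isfile guard above makes the download idempotent: the archive is fetched and unpacked only when the extracted member is missing. A hedged sketch of the same pattern with just the standard library (the URL and file names are placeholders, not the PREDICTS endpoints):

import os
import zipfile
import urllib.request

def fetch_from_archive(url, member, dest_dir="."):
    target = os.path.join(dest_dir, member)
    if os.path.isfile(target):
        return target                  # already extracted on a prior run
    archive = os.path.join(dest_dir, "download.zip")
    urllib.request.urlretrieve(url, archive)
    with zipfile.ZipFile(archive) as zf:
        zf.extract(member, dest_dir)
    return target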
Example No. 52
    def download(self, engine=None, debug=False):
        try:
            Script.download(self, engine, debug)

            engine = self.engine

            # Species table
            table = Table("species", cleanup=Cleanup(), contains_pk=True,
                          header_rows=9)

            table.columns=[("species_id",               ("pk-int",)         ),
                           ("AOU",                      ("int",)            ),
                           ("english_common_name",      ("char",50)         ),
                           ("french_common_name",       ("char",50)         ),
                           ("spanish_common_name",      ("char",50)         ),
                           ("sporder",                  ("char",30)         ),
                           ("family",                   ("char",30)         ),
                           ("genus",                    ("char",30)         ),
                           ("species",                  ("char",50)         ),
                           ]
            table.fixed_width = [7,6,51,51,51,51,51,51,50]

            engine.table = table
            engine.create_table()
            engine.insert_data_from_url(self.urls["species"])

            # Routes table
            engine.download_files_from_archive(self.urls["routes"], ["routes.csv"])
            engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                     filename="routes.csv")
            engine.insert_data_from_file(engine.format_filename("routes.csv"))

            # Weather table
            if not os.path.isfile(engine.format_filename("weather_new.csv")):
                engine.download_files_from_archive(self.urls["weather"],
                                                   ["weather.csv"])
                read = open_fr(engine.format_filename("weather.csv"))
                write = open_fw(engine.format_filename("weather_new.csv"))
                print("Cleaning weather data...")
                for line in read:
                    values = line.split(',')
                    newvalues = []
                    for value in values:

                        if ':' in value:
                            newvalues.append(value.replace(':', ''))
                        elif value == "N":
                            newvalues.append(None)
                        else:
                            newvalues.append(value)
                    write.write(','.join(str(value) for value in newvalues))
                write.close()
                read.close()

            engine.auto_create_table(Table("weather", pk="RouteDataId",
                                           cleanup=Cleanup(correct_invalid_value, nulls=['NULL'])),
                                     filename="weather_new.csv")
            engine.insert_data_from_file(engine.format_filename("weather_new.csv"))

            # Region_codes table
            table = Table("region_codes", pk=False, header_rows=11,
                          fixed_width=[11, 11, 30])

            def regioncodes_cleanup(value, engine):
                replace = {chr(225):"a", chr(233):"e", chr(237):"i", chr(243):"o"}
                newvalue = str(value)
                for key in list(replace.keys()):
                    if key in newvalue:
                        newvalue = newvalue.replace(key, replace[key])
                return newvalue
            table.cleanup = Cleanup(regioncodes_cleanup)

            table.columns=[("countrynum"            ,   ("int",)        ),
                           ("regioncode"            ,   ("int",)        ),
                           ("regionname"            ,   ("char",30)     )]

            engine.table = table
            engine.create_table()

            engine.insert_data_from_url(self.urls["region_codes"])

            # Counts table
            table = Table("counts", delimiter=',')

            table.columns=[("record_id"             ,   ("pk-auto",)    ),
                           ("countrynum"            ,   ("int",)        ),
                           ("statenum"              ,   ("int",)        ),
                           ("Route"                 ,   ("int",)        ),
                           ("RPID"                  ,   ("int",)        ),
                           ("Year"                  ,   ("int",)        ),
                           ("Aou"                   ,   ("int",)        ),
                           ("Count10"               ,   ("int",)        ),
                           ("Count20"               ,   ("int",)        ),
                           ("Count30"               ,   ("int",)        ),
                           ("Count40"               ,   ("int",)        ),
                           ("Count50"               ,   ("int",)        ),
                           ("StopTotal"             ,   ("int",)        ),
                           ("SpeciesTotal"          ,   ("int",)        )]

            stateslist = ["Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado",
                          "Connecticut", "Delaware", "Florida", "Georgia", "Idaho",
                          "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine",
                          "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi",
                          "Missouri", "Montana", "Nebraska", "Nevada",
                          ["New Hampshire", "NHampsh"], ["New Jersey", "NJersey"],
                          ["New Mexico", "NMexico"], ["New York", "NYork"],
                          ["North Carolina", "NCaroli"], ["North Dakota", "NDakota"], "Ohio",
                          "Oklahoma", "Oregon", "Pennsylvania", ["Rhode Island", "RhodeIs"],
                          ["South Carolina", "SCaroli"], ["South Dakota", "SDakota"], "Tennessee",
                          "Texas", "Utah", "Vermont", "Virginia", "Washington",
                          ["West Virginia", "W_Virgi"], "Wisconsin", "Wyoming", "Alberta",
                          ["British Columbia", "BritCol"], "Manitoba", ["New Brunswick", "NBrunsw"],
                          ["Northwest Territories", "NWTerri"], "Newfoundland",
                          ["Nova Scotia", "NovaSco"], "Nunavut", "Ontario",
                          ["Prince Edward Island", "PEI"], "Quebec", "Saskatchewan", "Yukon"]

            state = ""
            shortstate = ""

            engine.table = table
            engine.create_table()

            for state in stateslist:
                try:
                    if len(state) > 2:
                        shortstate = state[0:7]
                    else:
                        state, shortstate = state[0], state[1]

                    print("Inserting data from " + state + "...")
                    try:
                        engine.table.cleanup = Cleanup()
                        engine.insert_data_from_archive(self.urls["counts"] + shortstate + ".zip",
                                                        [shortstate + ".csv"])
                    except:
                        print("Failed bulk insert on " + state + ", inserting manually.")
                        engine.connection.rollback()
                        engine.table.cleanup = Cleanup(correct_invalid_value,
                                                       nulls=['*'])
                        engine.insert_data_from_archive(self.urls["counts"] + shortstate + ".zip",
                                                        [shortstate + ".csv"])

                except:
                    print("There was an error in " + state + ".")
                    raise

        except zipfile.BadZipfile:
            print("There was an unexpected error in the Breeding Bird Survey archives.")
            raise

        return engine
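The counts loop above tries a fast bulk insert first and, on failure, rolls back and reinserts with per-value cleanup (treating '*' as null). A sketch of that fallback with sqlite3 standing in for the retriever engine; the STRICT table (SQLite 3.37+) exists only to force the failure that exercises the slow path:

import sqlite3

def insert_rows(conn, rows):
    try:
        with conn:  # fast path: one transaction for everything
            conn.executemany("INSERT INTO counts VALUES (?, ?)", rows)
    except sqlite3.DatabaseError:
        conn.rollback()  # mirrors engine.connection.rollback() above
        for row in rows:  # slow path: clean values one row at a time
            cleaned = [None if v == "*" else v for v in row]
            with conn:
                conn.execute("INSERT INTO counts VALUES (?, ?)", cleaned)

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE counts (aou INTEGER, total INTEGER) STRICT")
insert_rows(conn, [(4740, 12), (4750, "*")])  # second row triggers cleanup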
Example No. 53
 def download(self, engine=None, debug=False):
     if engine.name != "Download Only":
         raise Exception(
             "The mammal-super-tree dataset contains only non-tabular data files, and can only be used with the 'download only' engine.")
     Script.download(self, engine, debug)
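The guard above is the degenerate case: a dataset made only of loose, non-tabular files, so every engine except download-only is refused. The same check in isolation (the class name and message below are stand-ins, not retriever classes):

class DownloadOnlyDataset:
    def check_engine(self, engine):
        # engines expose a human-readable .name in these scripts
        if engine.name != "Download Only":
            raise RuntimeError(
                "this dataset contains only non-tabular files; "
                "use the download-only engine")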
Example No. 54
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        self.engine.auto_create_table(Table("sites"),
                                      url=self.urls["sites"],
                                      filename='gentry_sites.csv')
        self.engine.insert_data_from_url(self.urls["sites"])

        self.engine.download_file(self.urls["stems"], "all_Excel.zip")
        local_zip = zipfile.ZipFile(
            self.engine.format_filename("all_Excel.zip"))
        filelist = local_zip.namelist()
        local_zip.close()
        self.engine.download_files_from_archive(self.urls["stems"], filelist)

        filelist = [os.path.basename(filename) for filename in filelist]

        # Currently all_Excel.zip is missing CURUYUQU.xls
        # Download it separately and add it to the file list
        if not self.engine.find_file('CURUYUQU.xls'):
            self.engine.download_file(
                "http://www.mobot.org/mobot/gentry/123/samerica/CURUYUQU.xls",
                "CURUYUQU.xls")
            filelist.append('CURUYUQU.xls')

        lines = []
        tax = []
        for filename in filelist:
            print("Extracting data from " + filename + "...")
            book = xlrd.open_workbook(self.engine.format_filename(filename))
            sh = book.sheet_by_index(0)
            rows = sh.nrows
            cn = {'stems': []}
            n = 0
            for colnum, c in enumerate(sh.row(0)):
                if not Excel.empty_cell(c):
                    cid = c.value.lower().strip()
                    # line number column is sometimes named differently
                    if cid in ["sub", "number"]:
                        cid = "line"
                    # the "number of individuals" column is named in various
                    # different ways; they always at least contain "nd"
                    if "nd" in cid:
                        cid = "count"
                    # in QUIAPACA.xls the "number of individuals" column is
                    # misnamed "STEMDBH" just like the stems columns, so weep
                    # for the state of scientific data and then fix manually
                    if filename == "QUIAPACA.xls" and colnum == 13:
                        cid = "count"

                    # if column is a stem, add it to the list of stems;
                    # otherwise, make note of the column name/number
                    if "stem" in cid or "dbh" in cid:
                        cn["stems"].append(n)
                    else:
                        cn[cid] = n
                n += 1
            # sometimes, a data file does not contain a liana or count column
            if not "liana" in list(cn.keys()):
                cn["liana"] = -1
            if not "count" in list(cn.keys()):
                cn["count"] = -1
            for i in range(1, rows):
                row = sh.row(i)
                cellcount = len(row)
                # make sure the row is real, not just empty cells
                if not all(Excel.empty_cell(cell) for cell in row):
                    try:
                        this_line = {}

                        # get the following information from the appropriate columns
                        # use "key", not "i", so the row index of the
                        # enclosing loop is not shadowed
                        for key in [
                                "line", "family", "genus", "species", "liana",
                                "count"
                        ]:
                            if cn[key] > -1:
                                if row[cn[key]].ctype != 2:
                                    # ctype 2 is a number; anything else is
                                    # treated as text and normalized
                                    this_line[key] = row[
                                        cn[key]].value.lower().strip().replace(
                                            "\\", "/").replace('"', '')
                                else:
                                    this_line[key] = row[cn[key]].value
                                if this_line[key] == '`':
                                    this_line[key] = 1
                        this_line["stems"] = [
                            row[c] for c in cn["stems"]
                            if not Excel.empty_cell(row[c])
                        ]
                        this_line["site"] = filename[0:-4]

                        # Manually correct CEDRAL data, which has a single line
                        # that is shifted by one to the left starting at Liana
                        if this_line["site"] == "CEDRAL" and type(
                                this_line["liana"]) == float:
                            this_line["liana"] = ""
                            this_line["count"] = 3
                            this_line["stems"] = [2.5, 2.5, 30, 18, 25]

                        lines.append(this_line)

                        # Check how far the species is identified
                        full_id = 0
                        if len(this_line["species"]) < 3:
                            if len(this_line["genus"]) < 3:
                                id_level = "family"
                            else:
                                id_level = "genus"
                        else:
                            id_level = "species"
                            full_id = 1
                        tax.append(
                            (this_line["family"], this_line["genus"],
                             this_line["species"], id_level, str(full_id)))
                    except:
                        # re-raise so bad rows surface instead of being
                        # silently skipped
                        raise

        tax = sorted(
            tax, key=lambda group: group[0] + " " + group[1] + " " + group[2])
        unique_tax = []
        tax_dict = {}
        tax_count = 0

        # Get all unique families/genera/species
        print("\n")
        for group in tax:
            if not (group in unique_tax):
                unique_tax.append(group)
                tax_count += 1
                tax_dict[group[0:3]] = tax_count
                if tax_count % 10 == 0:
                    msg = "Generating taxonomic groups: " + str(
                        tax_count) + " / " + str(TAX_GROUPS)
                    sys.stdout.write(msg + "\b" * len(msg))
                    sys.stdout.flush()
        print("\n")
        # Create species table
        table = Table("species", delimiter=",")
        table.columns = [("species_id", ("pk-int", )), ("family", ("char", )),
                         ("genus", ("char", )), ("species", ("char", )),
                         ("id_level", ("char", 10)), ("full_id", ("int", ))]

        data = [[str(tax_dict[group[:3]])] + ['"%s"' % g for g in group]
                for group in unique_tax]
        table.pk = 'species_id'
        table.contains_pk = True

        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(data)

        # Create stems table
        table = Table("stems", delimiter=",")
        table.columns = [("stem_id", ("pk-auto", )), ("line", ("int", )),
                         ("species_id", ("int", )),
                         ("site_code", ("char", 12)), ("liana", ("char", 10)),
                         ("stem", ("double", ))]
        stems = []
        counts = []
        for line in lines:
            try:
                liana = line["liana"]
            except KeyError:
                liana = ""
            species_info = [
                line["line"],
                tax_dict[(line["family"], line["genus"], line["species"])],
                line["site"], liana
            ]
            try:
                counts.append(
                    [value for value in species_info + [line["count"]]])
            except KeyError:
                pass

            for i in line["stems"]:
                stem = species_info + [str(i)]
                stems.append(stem)

        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(stems)

        # Create counts table
        table = Table("counts", delimiter=",", contains_pk=False)
        table.columns = [("count_id", ("pk-auto", )), ("line", ("int", )),
                         ("species_id", ("int", )),
                         ("site_code", ("char", 12)), ("liana", ("char", 10)),
                         ("count", ("double", ))]
        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(counts)

        return self.engine
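The header scan above is a small heuristic classifier: messy spreadsheet column names are folded onto canonical keys before any rows are read. The same idea in isolation (the sample headers are invented, not taken from a Gentry file):

def classify_header(raw):
    cid = raw.lower().strip()
    if cid in ("sub", "number"):      # aliases for the line-number column
        return "line"
    if "nd" in cid:                   # "No. ind.", "N indiv", ...
        return "count"
    if "stem" in cid or "dbh" in cid:
        return "stem"
    return cid

assert classify_header("Number") == "line"
assert classify_header("No. Ind.") == "count"
assert classify_header("STEMDBH") == "stem"  # the QUIAPACA ambiguity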
Example No. 55
    def download(self, engine=None, debug=False):
        try:
            Script.download(self, engine, debug)

            engine = self.engine

            # Species table
            table = Table("species", cleanup=Cleanup(), contains_pk=True,
                          header_rows=6)

            table.columns=[("species_id", ("pk-int",) ),
                           ("AOU", ("int",) ),
                           ("english_common_name", ("char",50) ),
                           ("french_common_name", ("char",50) ),
                           ("spanish_common_name", ("char",50) ),
                           ("sporder", ("char",30) ),
                           ("family", ("char",30) ),
                           ("genus", ("char",30) ),
                           ("species", ("char",50) ),
                           ]
            table.fixed_width = [7,6,51,51,51,51,51,51,50]

            engine.table = table
            engine.create_table()
            engine.insert_data_from_url(self.urls["species"])

            # Routes table
            if not os.path.isfile(engine.format_filename("routes_new.csv")):
                engine.download_files_from_archive(self.urls["routes"],
                                                   ["routes.csv"])
                read = open(engine.format_filename("routes.csv"), "rb")
                write = open(engine.format_filename("routes_new.csv"), "wb")
                print "Cleaning routes data..."
                write.write(read.readline())
                for line in read:
                    values = line.split(',')
                    v = Decimal(values[5])
                    if  v > 0:
                        values[5] = str(v * Decimal("-1"))
                    write.write(','.join(str(value) for value in values))
                write.close()
                read.close()

            engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                     filename="routes_new.csv")

            engine.insert_data_from_file(engine.format_filename("routes_new.csv"))


            # Weather table
            if not os.path.isfile(engine.format_filename("weather_new.csv")):
                engine.download_files_from_archive(self.urls["weather"],
                                                   ["weather.csv"])
                read = open(engine.format_filename("weather.csv"), "rb")
                write = open(engine.format_filename("weather_new.csv"), "wb")
                print "Cleaning weather data..."
                for line in read:
                    values = line.split(',')
                    newvalues = []
                    for value in values:

                        if ':' in value:
                            newvalues.append(value.replace(':', ''))
                        elif value == "N":
                            newvalues.append(None)
                        else:
                            newvalues.append(value)
                    write.write(','.join(str(value) for value in newvalues))
                write.close()
                read.close()

            engine.auto_create_table(Table("weather", pk="RouteDataId", cleanup=Cleanup()),
                                     filename="weather_new.csv")
            engine.insert_data_from_file(engine.format_filename("weather_new.csv"))


            # Region_codes table
            table = Table("region_codes", pk=False, header_rows=11,
                          fixed_width=[11, 11, 30])
            def regioncodes_cleanup(value, engine):
                replace = {chr(225):"a", chr(233):"e", chr(237):"i", chr(243):"o"}
                newvalue = str(value)
                for key in replace.keys():
                    if key in newvalue:
                        newvalue = newvalue.replace(key, replace[key])
                return newvalue
            table.cleanup = Cleanup(regioncodes_cleanup)

            table.columns=[("countrynum"            ,   ("int",)        ),
                           ("regioncode"            ,   ("int",)        ),
                           ("regionname"            ,   ("char",30)     )]

            engine.table = table
            engine.create_table()

            engine.insert_data_from_url(self.urls["region_codes"])

            # Counts table
            table = Table("counts", pk=False, delimiter=',')
            table.columns=[("RouteDataID"           ,   ("int",)        ),
                           ("countrynum"            ,   ("int",)        ),
                           ("statenum"              ,   ("int",)        ),
                           ("Route"                 ,   ("int",)        ),
                           ("RPID"                  ,   ("int",)        ),
                           ("year"                  ,   ("int",)        ),
                           ("AOU"                   ,   ("int",)        ),
                           ("Stop1"                 ,   ("int",)        ),
                           ("Stop2"                 ,   ("int",)        ),
                           ("Stop3"                 ,   ("int",)        ),
                           ("Stop4"                 ,   ("int",)        ),
                           ("Stop5"                 ,   ("int",)        ),
                           ("Stop6"                 ,   ("int",)        ),
                           ("Stop7"                 ,   ("int",)        ),
                           ("Stop8"                 ,   ("int",)        ),
                           ("Stop9"                 ,   ("int",)        ),
                           ("Stop10"                ,   ("int",)        ),
                           ("Stop11"                ,   ("int",)        ),
                           ("Stop12"                ,   ("int",)        ),
                           ("Stop13"                ,   ("int",)        ),
                           ("Stop14"                ,   ("int",)        ),
                           ("Stop15"                ,   ("int",)        ),
                           ("Stop16"                ,   ("int",)        ),
                           ("Stop17"                ,   ("int",)        ),
                           ("Stop18"                ,   ("int",)        ),
                           ("Stop19"                ,   ("int",)        ),
                           ("Stop20"                ,   ("int",)        ),
                           ("Stop21"                ,   ("int",)        ),
                           ("Stop22"                ,   ("int",)        ),
                           ("Stop23"                ,   ("int",)        ),
                           ("Stop24"                ,   ("int",)        ),
                           ("Stop25"                ,   ("int",)        ),
                           ("Stop26"                ,   ("int",)        ),
                           ("Stop27"                ,   ("int",)        ),
                           ("Stop28"                ,   ("int",)        ),
                           ("Stop29"                ,   ("int",)        ),
                           ("Stop30"                ,   ("int",)        ),
                           ("Stop31"                ,   ("int",)        ),
                           ("Stop32"                ,   ("int",)        ),
                           ("Stop33"                ,   ("int",)        ),
                           ("Stop34"                ,   ("int",)        ),
                           ("Stop35"                ,   ("int",)        ),
                           ("Stop36"                ,   ("int",)        ),
                           ("Stop37"                ,   ("int",)        ),
                           ("Stop38"                ,   ("int",)        ),
                           ("Stop39"                ,   ("int",)        ),
                           ("Stop40"                ,   ("int",)        ),
                           ("Stop41"                ,   ("int",)        ),
                           ("Stop42"                ,   ("int",)        ),
                           ("Stop43"                ,   ("int",)        ),
                           ("Stop44"                ,   ("int",)        ),
                           ("Stop45"                ,   ("int",)        ),
                           ("Stop46"                ,   ("int",)        ),
                           ("Stop47"                ,   ("int",)        ),
                           ("Stop48"                ,   ("int",)        ),
                           ("Stop49"                ,   ("int",)        ),
                           ("Stop50"                ,   ("int",)        )]

            part = ""
            engine.table = table
            engine.create_table()

            for part in range(1,11):
                part = str(part)
                try:
                    print "Inserting data from part " + part + "..."
                    try:
                        engine.table.cleanup = Cleanup()
                        engine.insert_data_from_archive(self.urls["counts"] +
                                                        "Fifty" + part + ".zip",
                                                        ["fifty" + part + ".csv"])
                    except:
                        print "Failed bulk insert on " + part + ", inserting manually."
                        engine.connection.rollback()
                        engine.table.cleanup = Cleanup(correct_invalid_value,
                                                       nulls=['*'])
                        engine.insert_data_from_archive(self.urls["counts"] +
                                                        "Fifty" + part + ".zip",
                                                        ["fifty" + part + ".csv"])

                except:
                    print "There was an error in part " + part + "."
                    raise


        except zipfile.BadZipfile:
            print "There was an unexpected error in the Breeding Bird Survey archives."
            raise

        return engine
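The routes-cleaning loop above assumes BBS longitudes lie in the western hemisphere, so any positive value in the longitude column has simply lost its minus sign. A standalone sketch of that correction (the column index and sample line are illustrative):

from decimal import Decimal

def fix_longitude(line, column=5):
    values = line.rstrip("\n").split(",")
    v = Decimal(values[column])
    if v > 0:                      # west of Greenwich: should be negative
        values[column] = str(-v)
    return ",".join(values)

print(fix_longitude("840,2,1,ALABAMA,31.94,85.87"))
# 840,2,1,ALABAMA,31.94,-85.87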
Example No. 56
    def download(self, engine=None, debug=False):
        try:
            Script.download(self, engine, debug)

            engine = self.engine

            # Species table
            table = Table("species",
                          cleanup=Cleanup(),
                          contains_pk=True,
                          header_rows=9)

            table.columns = [
                ("species_id", ("pk-int", )),
                ("AOU", ("int", )),
                ("english_common_name", ("char", 50)),
                ("french_common_name", ("char", 50)),
                ("spanish_common_name", ("char", 50)),
                ("sporder", ("char", 30)),
                ("family", ("char", 30)),
                ("genus", ("char", 30)),
                ("species", ("char", 50)),
            ]
            table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]

            engine.table = table
            engine.create_table()
            engine.insert_data_from_url(self.urls["species"])

            # Routes table
            engine.download_files_from_archive(self.urls["routes"],
                                               ["routes.csv"])
            engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                     filename="routes.csv")
            engine.insert_data_from_file(engine.format_filename("routes.csv"))

            # Weather table
            if not os.path.isfile(engine.format_filename("weather_new.csv")):
                engine.download_files_from_archive(self.urls["weather"],
                                                   ["weather.csv"])
                read = open_fr(engine.format_filename("weather.csv"))
                write = open_fw(engine.format_filename("weather_new.csv"))
                print("Cleaning weather data...")
                for line in read:
                    values = line.split(',')
                    newvalues = []
                    for value in values:

                        if ':' in value:
                            newvalues.append(value.replace(':', ''))
                        elif value == "N":
                            newvalues.append(None)
                        else:
                            newvalues.append(value)
                    write.write(','.join(str(value) for value in newvalues))
                write.close()
                read.close()

            engine.auto_create_table(Table("weather",
                                           pk="RouteDataId",
                                           cleanup=self.cleanup_func_table),
                                     filename="weather_new.csv")
            engine.insert_data_from_file(
                engine.format_filename("weather_new.csv"))

            # Region_codes table
            table = Table("region_codes",
                          pk=False,
                          header_rows=11,
                          fixed_width=[11, 11, 30])

            def regioncodes_cleanup(value, engine):
                replace = {
                    chr(225): "a",
                    chr(233): "e",
                    chr(237): "i",
                    chr(243): "o"
                }
                newvalue = str(value)
                for key in list(replace.keys()):
                    if key in newvalue:
                        newvalue = newvalue.replace(key, replace[key])
                return newvalue

            table.cleanup = Cleanup(regioncodes_cleanup)

            table.columns = [("countrynum", ("int", )),
                             ("regioncode", ("int", )),
                             ("regionname", ("char", 30))]

            engine.table = table
            engine.create_table()

            engine.insert_data_from_url(self.urls["region_codes"])

            # Counts table
            table = Table("counts", delimiter=',')

            table.columns = [("record_id", ("pk-auto", )),
                             ("countrynum", ("int", )),
                             ("statenum", ("int", )), ("Route", ("int", )),
                             ("RPID", ("int", )), ("Year", ("int", )),
                             ("Aou", ("int", )), ("Count10", ("int", )),
                             ("Count20", ("int", )), ("Count30", ("int", )),
                             ("Count40", ("int", )), ("Count50", ("int", )),
                             ("StopTotal", ("int", )),
                             ("SpeciesTotal", ("int", ))]

            stateslist = [
                "Alabama", "Alaska", "Arizona", "Arkansas", "California",
                "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
                "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky",
                "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan",
                "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska",
                "Nevada", ["New Hampshire", "NHampsh"],
                ["New Jersey", "NJersey"], ["New Mexico", "NMexico"],
                ["New York", "NYork"], ["North Carolina", "NCaroli"],
                ["North Dakota",
                 "NDakota"], "Ohio", "Oklahoma", "Oregon", "Pennsylvania",
                ["Rhode Island", "RhodeIs"], ["South Carolina", "SCaroli"],
                ["South Dakota", "SDakota"], "Tennessee", "Texas", "Utah",
                "Vermont", "Virginia", "Washington",
                ["West Virginia",
                 "W_Virgi"], "Wisconsin", "Wyoming", "Alberta",
                ["British Columbia", "BritCol"], "Manitoba",
                ["New Brunswick", "NBrunsw"],
                ["Northwest Territories", "NWTerri"], "Newfoundland",
                ["Nova Scotia", "NovaSco"], "Nunavut", "Ontario",
                ["Prince Edward Island",
                 "PEI"], "Quebec", "Saskatchewan", "Yukon"
            ]

            state = ""
            shortstate = ""

            engine.table = table
            engine.create_table()

            for state in stateslist:
                try:
                    if len(state) > 2:
                        shortstate = state[0:7]
                    else:
                        state, shortstate = state[0], state[1]

                    print("Inserting data from " + state + "...")
                    try:
                        engine.table.cleanup = Cleanup()
                        engine.insert_data_from_archive(
                            self.urls["counts"] + shortstate + ".zip",
                            [shortstate + ".csv"])
                    except:
                        print("Failed bulk insert on " + state +
                              ", inserting manually.")
                        engine.connection.rollback()
                        engine.table.cleanup = self.cleanup_func_clean
                        engine.insert_data_from_archive(
                            self.urls["counts"] + shortstate + ".zip",
                            [shortstate + ".csv"])

                except:
                    print("There was an error in " + state + ".")
                    raise

        except zipfile.BadZipfile:
            print(
                "There was an unexpected error in the Breeding Bird Survey archives."
            )
            raise

        return engine
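The species table above is parsed by column width (table.fixed_width) rather than by delimiter. A minimal sketch of fixed-width splitting; the widths echo the ones above, but the sample record is invented:

def split_fixed_width(line, widths):
    fields, pos = [], 0
    for width in widths:
        fields.append(line[pos:pos + width].strip())
        pos += width
    return fields

record = "6      4740  Melospiza lincolnii"
print(split_fixed_width(record, [7, 6, 51]))
# ['6', '4740', 'Melospiza lincolnii']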
Example No. 57
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        # Python 2 only: reload(sys) restores the setdefaultencoding hook
        # that site.py removes at startup; Python 3 needs neither step
        if sys.version_info[0] == 2:
            reload(sys)
            sys.setdefaultencoding("utf-8")

        self.engine.download_file(self.urls["GWDD"], "GlobalWoodDensityDatabase.xls")
        filename = "GlobalWoodDensityDatabase.xls"
        book = xlrd.open_workbook(self.engine.format_filename(filename))
        sh = book.sheet_by_index(1)
        rows = sh.nrows

        # Creating data files
        file_path = self.engine.format_filename("gwdd_data.csv")
        gwdd_data = open_fw(file_path)
        csv_writer = open_csvw(gwdd_data)
        csv_writer.writerow(["Number", "Family", "Binomial", "Wood_Density", "Region", "Reference_Number"])

        for index in range(1, rows):
            row = sh.row(index)
            # get each row and format the cell value.
            row_as_list = [to_str(column_value.value) for column_value in row]
            csv_writer.writerow(row_as_list)
        gwdd_data.close()

        table = Table("data", delimiter=",")
        table.columns = [("Number", ("pk-int",)),
                         ("Family", ("char",)),
                         ("Binomial", ("char",)),
                         ("Wood_Density", ("double",)),
                         ("Region", ("char",)),
                         ("Reference_Number", ("int",))]
        table.pk = 'Number'
        table.contains_pk = True

        self.engine.table = table
        self.engine.create_table()
        # file_path is already formatted; avoid re-formatting it through the
        # possibly-None "engine" parameter
        self.engine.insert_data_from_file(file_path)

        # Creating reference table file
        file_path = self.engine.format_filename("gwdd_ref.csv")
        ref_file = open_fw(file_path)
        csv_writerd = open_csvw(ref_file)
        csv_writerd.writerow(["Reference_Number", "Reference"])
        sh = book.sheet_by_index(2)
        rows = sh.nrows
        for index in range(1, rows):
            row = sh.row(index)
            # get each row and format the cell value.
            row_as_list = [to_str(column_value.value, object_encoding=sys.stdout) for column_value in row]
            csv_writerd.writerow(row_as_list)
        ref_file.close()

        table = Table("reference", delimiter=",")
        table.columns = [("Reference_Number", ("pk-int",)), ("Reference", ("char",))]
        table.pk = 'Reference_Number'
        table.contains_pk = True
        self.engine.table = table
        self.engine.create_table()
        self.engine.insert_data_from_file(file_path)

        return self.engine
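The GWDD script above converts each worksheet to CSV before loading it. A hedged sketch of that conversion step, assuming xlrd is installed and the path points at a real .xls workbook:

import csv
import xlrd

def sheet_to_csv(xls_path, sheet_index, csv_path, header):
    book = xlrd.open_workbook(xls_path)
    sh = book.sheet_by_index(sheet_index)
    with open(csv_path, "w", newline="") as out:
        writer = csv.writer(out)
        writer.writerow(header)            # replace the sheet's own header
        for i in range(1, sh.nrows):       # skip row 0
            writer.writerow([cell.value for cell in sh.row(i)])

# e.g. sheet_to_csv("GlobalWoodDensityDatabase.xls", 2, "gwdd_ref.csv",
#                   ["Reference_Number", "Reference"])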
Example No. 58
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        
        self.engine.auto_create_table(Table("sites"), url=self.urls["sites"])
        self.engine.insert_data_from_url(self.urls["sites"])
              
        self.engine.download_file(self.urls["stems"], "all_Excel.zip")
        local_zip = zipfile.ZipFile(self.engine.format_filename("all_Excel.zip"))        
        filelist = local_zip.namelist()
        local_zip.close()        
        self.engine.download_files_from_archive(self.urls["stems"], filelist)
        
        filelist = [os.path.basename(filename) for filename in filelist]
        
        lines = []
        tax = []
        for filename in filelist:
            print "Extracting data from " + filename + "..."
            book = xlrd.open_workbook(self.engine.format_filename(filename))
            sh = book.sheet_by_index(0)
            rows = sh.nrows
            cn = {'stems': []}
            n = 0
            for c in sh.row(0):
                if not Excel.empty_cell(c):
                    cid = Excel.cell_value(c).lower()
                    # line number column is sometimes named differently
                    if cid in ["sub", "number"]:
                        cid = "line"
                    # the "number of individuals" column is named in various
                    # different ways; they always at least contain "nd"
                    if "nd" in cid:
                        cid = "count"
                    # if column is a stem, add it to the list of stems;
                    # otherwise, make note of the column name/number
                    if "stem" in cid:
                        cn["stems"].append(n)
                    else:
                        cn[cid] = n
                n += 1
            # sometimes, a data file does not contain a liana or count column
            if not "liana" in cn.keys():
                cn["liana"] = -1
            if not "count" in cn.keys():
                cn["count"] = -1
            for i in range(1, rows):
                row = sh.row(i)
                cellcount = len(row)
                # make sure the row is real, not just empty cells
                if cellcount > 4 and not Excel.empty_cell(row[0]):
                    try:
                        this_line = {}
                        
                        def format_value(s):
                            s = Excel.cell_value(s)
                            return str(s).title().replace("\\", "/").replace('"', '')
                        
                        # get the following information from the appropriate columns
                        for i in ["line", "family", "genus", "species", 
                                  "liana", "count"]:
                            if cn[i] > -1:
                                this_line[i] = format_value(row[cn[i]])
                                if this_line[i] == '`':
                                    this_line[i] = 1

                        this_line["stems"] = [Excel.cell_value(row[c]) 
                                              for c in cn["stems"]
                                              if not Excel.empty_cell(row[c])]
                        this_line["site"] = filename[0:-4]
                        
                        lines.append(this_line)
                        
                        # Check how far the species is identified
                        full_id = 0
                        if len(this_line["species"]) < 3:
                            if len(this_line["genus"]) < 3:
                                id_level = "family"
                            else:
                                id_level = "genus"
                        else:
                            id_level = "species"
                            full_id = 1
                        tax.append((this_line["family"], 
                                    this_line["genus"], 
                                    this_line["species"].lower().replace('\\', '').replace('"', ''), 
                                    id_level, 
                                    str(full_id)))
                    except:
                        # propagate the error; skipping rows here would
                        # hide data problems
                        raise
        
        tax = sorted(tax, key=lambda group: group[0] + " " + group[1] + " " + group[2])
        unique_tax = []
        tax_dict = dict()
        tax_count = 0
        
        # Get all unique families/genera/species
        for group in tax:
            if not (group in unique_tax):
                unique_tax.append(group)
                tax_count += 1
                tax_dict[group[0:3]] = tax_count
                if tax_count % 10 == 0:
                    msg = "Generating taxonomic groups: " + str(tax_count) + " / " + str(TAX_GROUPS)
                    sys.stdout.write(msg + "\b" * len(msg))
                    sys.stdout.flush()
        print("Generating taxonomic groups: " + str(TAX_GROUPS) + " / " + str(TAX_GROUPS))
        
        
        # Create species table
        table = Table("species", delimiter=",")
        table.columns=[("species_id"            ,   ("pk-int",)    ),
                       ("family"                ,   ("char", )    ),
                       ("genus"                 ,   ("char", )    ),
                       ("species"               ,   ("char", )    ),
                       ("id_level"              ,   ("char", 10)    ),
                       ("full_id"               ,   ("bool",)       )]

        data = [','.join([str(tax_dict[group[:3]])] + ['"%s"' % g for g in group]) 
                for group in unique_tax]
        table.pk = 'species_id'
        table.contains_pk = True
        
        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(data)
        
        
        # Create stems table
        table = Table("stems", delimiter=",", contains_pk=False)
        table.columns=[("stem_id"               ,   ("pk-auto",)    ),
                       ("line"                  ,   ("int",)        ),
                       ("species_id"            ,   ("int",)        ),
                       ("site_code"             ,   ("char", 12)    ),
                       ("liana"                 ,   ("char", 10)    ),
                       ("stem"                  ,   ("double",)     )]
        stems = []
        counts = []
        for line in lines:
            try:
                liana = line["liana"]
            except KeyError:
                liana = ""
            species_info = [line["line"], 
                            tax_dict[(line["family"], 
                                      line["genus"], 
                                      line["species"].lower())],
                            line["site"],
                            liana
                            ]
            try:
                counts.append([str(value) for value in species_info + [line["count"]]])
            except KeyError:
                pass

            for i in line["stems"]:
                stem = species_info + [i]
                stems.append([str(value) for value in stem])
            
        data = [','.join(stem) for stem in stems]
        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(data)
        
        
        # Create counts table
        table = Table("counts", delimiter=",", contains_pk=False)
        table.columns=[("count_id"              ,   ("pk-auto",)    ),
                       ("line"                  ,   ("int",)        ),
                       ("species_id"            ,   ("int",)        ),
                       ("site_code"             ,   ("char", 12)    ),
                       ("liana"                 ,   ("char", 10)    ),
                       ("count"                 ,   ("double",)     )]
        data = [','.join(count) for count in counts]
        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(data)
            
        return self.engine
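The taxonomy step above sorts the accumulated family/genus/species tuples, deduplicates them, and numbers each unique group so the stems and counts tables can reference a species_id. The same idea reduced to its core (the sample triples are invented; the real script keys on the first three fields of a five-tuple):

def build_tax_dict(tax):
    tax_dict = {}
    for group in sorted(set(tax)):
        tax_dict[group] = len(tax_dict) + 1   # 1-based species_id
    return tax_dict

tax = [("fabaceae", "inga", "edulis"),
       ("fabaceae", "inga", "edulis"),        # duplicates collapse
       ("moraceae", "ficus", "insipida")]
print(build_tax_dict(tax))
# {('fabaceae', 'inga', 'edulis'): 1, ('moraceae', 'ficus', 'insipida'): 2}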