    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        filenames = [
            'Aquatic_animal_excretion_data.csv',
            'Aquatic_animal_excretion_variable_descriptions.csv'
        ]
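        # fetch and unpack the archive only if an expected file is missing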
        for file_name in filenames:
            if not os.path.isfile(engine.format_filename(file_name)):
                url = self.urls["aquatic_animals"]
                engine.download_files_from_archive(url, filenames, "zip")

        # processing Aquatic_animal_excretion_data.csv
        filename = 'Aquatic_animal_excretion_data.csv'
        tablename = 'aquatic_animals'
        table = Table(str(tablename), delimiter=',')
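        # each column is declared as (name, (type,)) or (name, (type, size));
        # "pk-int" marks the integer primary key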
        table.columns = [("index", ("pk-int", )), ("sourcenumber", ("int", )),
                         ("sourcename", ("char", )),
                         ("speciesname", ("char", )),
                         ("speciescode", ("char", )),
                         ("invert/vert", ("char", )), ("phylum", ("char", )),
                         ("class", ("char", )), ("order", ("char", )),
                         ("family", ("char", )), ("trophicgild", ("char", )),
                         ("drymass", ("double", )),
                         ("logdrymass", ("double", )),
                         ("ecosystemtype", ("char", )),
                         ("energysource", ("char", )), ("habitat", ("char", )),
                         ("residentecosystem", ("char", )),
                         ("temperature", ("double", )),
                         ("nexcretionrate", ("double", )),
                         ("pexcretionrate", ("double", )),
                         ("lognexcretionrate", ("double", )),
                         ("logpexcretionrate", ("double", )),
                         ("incubationtime", ("double", )),
                         ("nform", ("char", )), ("pform", ("char", )),
                         ("bodyc", ("double", )), ("bodyn", ("double", )),
                         ("bodyp", ("double", )), ("bodyc:n", ("double", )),
                         ("bodyc:p", ("double", )), ("bodyn:p", ("double", )),
                         ("bodydatasource", ("char", )),
                         ("datasource", ("char", )),
                         ("dataproviders", ("char", ))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename(str(filename)))

        # processing Aquatic_animal_excretion_variable_descriptions.csv
        filename = 'Aquatic_animal_excretion_variable_descriptions.csv'
        tablename = 'variable_descriptions'
        table = Table(str(tablename), delimiter=',')
        table.columns = [("Column", ("char", )), ("Variable", ("char", )),
                         ("Description", ("char", )),
                         ("Data Class", ("char", )), ("Units", ("char", )),
                         ("Minimum_value", ("char", )),
                         ("Maximum_value", ("char", )),
                         ("Possible_values", ("char", )),
                         ("Missing_data_symbol", ("char", )),
                         ("Notes", ("char", ))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename(str(filename)))
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        # structure_plot_year table
        self.engine.auto_create_table(Table("structure_plot_year"), url=self.urls["structure_plot_year"])
        self.engine.insert_data_from_url(self.urls["structure_plot_year"])

        # plots table
        self.engine.auto_create_table(Table("plots"), url=self.urls["plots"])
        self.engine.insert_data_from_url(self.urls["plots"])

        # species table
        self.engine.download_file(self.urls["species"], "original_MSH_SPECIES_DESCRIPTORS.csv")
        data_path = self.engine.format_filename("MSH_SPECIES_DESCRIPTORS.csv")

        old_data = os.path.normpath(self.engine.find_file("original_MSH_SPECIES_DESCRIPTORS.csv"))

        with open(old_data, 'r') as infile, open(data_path, 'w') as new_data:
            for line in infile:
                new_data.write(line)

        self.engine.auto_create_table(Table("species"),
                                      filename="MSH_SPECIES_DESCRIPTORS.csv")
        self.engine.insert_data_from_file(data_path)

        # species_plot_year tables
        table = Table("species_plot_year")
        table.delimiter = ','
        table.columns = [
            ('record_id', ('pk-auto',)),
            ('plot_id_year', ('char',)),
            ('plot_name', ('char',)),
            ('plot_number', ('int',)),
            ('year', ('int',)),
            ('species', ('ct_column',)),
            ('count', ('ct-double',))
        ]

        table.ct_column = 'species'
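        # ct_names below lists the wide-format species columns in the source
        # file; they are unpivoted into (species, count) rows keyed by the
        # ct_column set above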
        table.ct_names = ['Abilas', 'Abipro', 'Achmil', 'Achocc', 'Agoaur', 'Agrexa', 'Agrpal', 'Agrsca', 'Alnvir',
                          'Anamar', 'Antmic', 'Antros', 'Aqifor', 'Arcnev', 'Arnlat', 'Astled', 'Athdis', 'Blespi',
                          'Brocar', 'Brosit', 'Carmer', 'Carmic', 'Carpac', 'Carpay', 'Carpha', 'Carros', 'Carspe',
                          'Casmin', 'Chaang', 'Cirarv', 'Cisumb', 'Crycas', 'Danint', 'Descae', 'Elyely', 'Epiana',
                          'Eriova', 'Eripyr', 'Fesocc', 'Fravir', 'Gencal', 'Hiealb', 'Hiegra', 'Hyprad', 'Junmer',
                          'Junpar', 'Juncom', 'Leppun', 'Lommar', 'Luepec', 'Luihyp', 'Luplat', 'Luplep', 'Luzpar',
                          'Maiste', 'Pencar', 'Pencon', 'Penser', 'Phahas', 'Phlalp', 'Phldif', 'Phyemp', 'Pincon',
                          'Poasec', 'Poldav', 'Polmin', 'Pollon', 'Poljun', 'Popbal', 'Potarg', 'Psemen', 'Raccan',
                          'Rumace', 'Salsit', 'Saxfer', 'Senspp', 'Sibpro', 'Sorsit', 'Spiden', 'Trispi', 'Tsumer',
                          'Vacmem', 'Vervir', 'Vioadu', 'Xerten']

        self.engine.table = table
        self.engine.create_table()
        self.engine.insert_data_from_url(self.urls["species_plot_year"])
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        files = ["Macroplot_data_Rev.txt", "Microplot_data.txt", "Site_variables.txt", "Species_list.txt"]
        engine.download_files_from_archive(self.urls["data"], files, filetype="zip")

        # Create table species
        engine.auto_create_table(Table('species', cleanup=Cleanup(correct_invalid_value, nulls=['NA'])),
                                 filename="Species_list.txt")
        engine.insert_data_from_file(engine.format_filename("Species_list.txt"))

        # Create table sites
        engine.auto_create_table(Table('sites', cleanup=Cleanup(correct_invalid_value, nulls=['NA'])),
                                 filename="Site_variables.txt")
        engine.insert_data_from_file(engine.format_filename("Site_variables.txt"))

        # Create table microplots
        table = Table('microplots')
        table.columns = [('record_id', ('pk-auto',)), ('SpCode', ('char', '30')), ('Count', ('ct-int',))]
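        # the source file is wide-format, one BSP* count column per plot;
        # ct_column/ct_names below unpivot them into (PlotID, Count) rows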
        table.ct_names = ['BSP1', 'BSP2', 'BSP3', 'BSP4', 'BSP5', 'BSP6', 'BSP7', 'BSP8', 'BSP9',
                          'BSP10', 'BSP11', 'BSP12', 'BSP13', 'BSP14', 'BSP15', 'BSP16', 'BSP17',
                          'BSP18', 'BSP20', 'BSP21', 'BSP22', 'BSP23', 'BSP24', 'BSP25', 'BSP26',
                          'BSP27', 'BSP28', 'BSP29', 'BSP30', 'BSP31', 'BSP33', 'BSP34', 'BSP35',
                          'BSP36', 'BSP37', 'BSP41', 'BSP42', 'BSP43', 'BSP44', 'BSP45', 'BSP46',
                          'BSP47', 'BSP48', 'BSP49', 'BSP50', 'BSP51', 'BSP52', 'BSP53', 'BSP54',
                          'BSP55', 'BSP56', 'BSP57', 'BSP58', 'BSP59', 'BSP60', 'BSP61', 'BSP62',
                          'BSP63', 'BSP64', 'BSP65', 'BSP66', 'BSP67', 'BSP68', 'BSP69', 'BSP70',
                          'BSP71', 'BSP72', 'BSP73', 'BSP74', 'BSP75', 'BSP76', 'BSP78', 'BSP79',
                          'BSP80', 'BSP82', 'BSP83', 'BSP84', 'BSP85', 'BSP86', 'BSP87', 'BSP88',
                          'BSP89', 'BSP90', 'BSP91', 'BSP92', 'BSP93', 'BSP94', 'BSP95', 'BSP96',
                          'BSP97', 'BSP98', 'BSP99', 'BSP100', 'BSP101', 'BSP102', 'BSP104']
        table.ct_column = 'PlotID'
        engine.auto_create_table(table, filename="Microplot_data.txt")
        engine.insert_data_from_file(engine.format_filename("Microplot_data.txt"))

        # Create table macroplots
        table = Table('macroplots')
        table.ct_names = ['TreeGirth1', 'TreeGirth2', 'TreeGirth3', 'TreeGirth4', 'TreeGirth5']
        table.ct_column = 'Tree'
        table.columns = [('record_id', ('pk-auto',)), ('PlotID', ('char', '20')), ('SpCode', ('char', '30')),
                         ('Girth', ('ct-int',))]
        engine.auto_create_table(table, filename="Macroplot_data_Rev.txt")
        engine.insert_data_from_file(engine.format_filename("Macroplot_data_Rev.txt"))
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        files = ["Macroplot_data_Rev.txt", "Microplot_data.txt", "Site_variables.txt", "Species_list.txt"]
        engine.download_files_from_archive(self.urls["data"], files, filetype="zip")

        # Create table species
        engine.auto_create_table(Table('species', cleanup=self.cleanup_func_table),
                                 filename="Species_list.txt")
        engine.insert_data_from_file(engine.format_filename("Species_list.txt"))

        # Create table sites
        engine.auto_create_table(Table('sites', cleanup=self.cleanup_func_table),
                                 filename="Site_variables.txt")
        engine.insert_data_from_file(engine.format_filename("Site_variables.txt"))

        # Create table microplots
        table = Table('microplots')
        table.columns = [('record_id', ('pk-auto',)), ('SpCode', ('char', '30')), ('Count', ('ct-int',))]
        table.ct_names = ['BSP1', 'BSP2', 'BSP3', 'BSP4', 'BSP5', 'BSP6', 'BSP7', 'BSP8', 'BSP9',
                          'BSP10', 'BSP11', 'BSP12', 'BSP13', 'BSP14', 'BSP15', 'BSP16', 'BSP17',
                          'BSP18', 'BSP20', 'BSP21', 'BSP22', 'BSP23', 'BSP24', 'BSP25', 'BSP26',
                          'BSP27', 'BSP28', 'BSP29', 'BSP30', 'BSP31', 'BSP33', 'BSP34', 'BSP35',
                          'BSP36', 'BSP37', 'BSP41', 'BSP42', 'BSP43', 'BSP44', 'BSP45', 'BSP46',
                          'BSP47', 'BSP48', 'BSP49', 'BSP50', 'BSP51', 'BSP52', 'BSP53', 'BSP54',
                          'BSP55', 'BSP56', 'BSP57', 'BSP58', 'BSP59', 'BSP60', 'BSP61', 'BSP62',
                          'BSP63', 'BSP64', 'BSP65', 'BSP66', 'BSP67', 'BSP68', 'BSP69', 'BSP70',
                          'BSP71', 'BSP72', 'BSP73', 'BSP74', 'BSP75', 'BSP76', 'BSP78', 'BSP79',
                          'BSP80', 'BSP82', 'BSP83', 'BSP84', 'BSP85', 'BSP86', 'BSP87', 'BSP88',
                          'BSP89', 'BSP90', 'BSP91', 'BSP92', 'BSP93', 'BSP94', 'BSP95', 'BSP96',
                          'BSP97', 'BSP98', 'BSP99', 'BSP100', 'BSP101', 'BSP102', 'BSP104']
        table.ct_column = 'PlotID'
        engine.auto_create_table(table, filename="Microplot_data.txt")
        engine.insert_data_from_file(engine.format_filename("Microplot_data.txt"))

        # Create table macroplots
        table = Table('macroplots')
        table.ct_names = ['TreeGirth1', 'TreeGirth2', 'TreeGirth3', 'TreeGirth4', 'TreeGirth5']
        table.ct_column = 'Tree'
        table.columns = [('record_id', ('pk-auto',)), ('PlotID', ('char', '20')), ('SpCode', ('char', '30')),
                         ('Girth', ('ct-int',))]
        engine.auto_create_table(table, filename="Macroplot_data_Rev.txt")
        engine.insert_data_from_file(engine.format_filename("Macroplot_data_Rev.txt"))
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        filename = 'Aquatic_animal_excretion_data.csv'
        tablename = 'aquatic_animals'

        table = Table(str(tablename), delimiter=',')
        table.columns = [
            ("index", ("pk-int",)),
            ("sourcenumber", ("int",)),
            ("sourcename", ("char",)),
            ("speciesname", ("char",)),
            ("speciescode", ("char",)),
            ("invert/vert", ("char",)),
            ("phylum", ("char",)),
            ("class", ("char",)),
            ("order", ("char",)),
            ("family", ("char",)),
            ("trophicgild", ("char",)),
            ("drymass", ("double",)),
            ("logdrymass", ("double",)),
            ("ecosystemtype", ("char",)),
            ("energysource", ("char",)),
            ("habitat", ("char",)),
            ("residentecosystem", ("char",)),
            ("temperature", ("double",)),
            ("nexcretionrate", ("double",)),
            ("pexcretionrate", ("double",)),
            ("lognexcretionrate", ("double",)),
            ("logpexcretionrate", ("double",)),
            ("incubationtime", ("double",)),
            ("nform", ("char",)),
            ("pform", ("char",)),
            ("bodyc", ("double",)),
            ("bodyn", ("double",)),
            ("bodyp", ("double",)),
            ("bodyc:n", ("double",)),
            ("bodyc:p", ("double",)),
            ("bodyn:p", ("double",)),
            ("bodydatasource", ("char",)),
            ("datasource", ("char",)),
            ("dataproviders", ("char",))]

        engine.table = table
        if not os.path.isfile(engine.format_filename(filename)):
            engine.download_files_from_archive(self.urls[tablename], [filename], filetype="zip")

        engine.create_table()
        engine.insert_data_from_file(engine.format_filename(str(filename)))
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        engine = self.engine

        taxa = ('Plant', 'Animal')

        for tax in taxa:
            table = Table(tax.lower() + 's', delimiter=',', header_rows=3,
                          pk='record_id', contains_pk=True)

            table.columns = [("record_id", ("pk-int",)),
                             ("station_id", ("int",)),
                             ("obs_date", ("char",)),
                             ("ind_id", ("int",)),
                             ("sci_name", ("char",)),
                             ("com_name", ("char",)),
                             ("kingdom", ("char",)),
                             ("pheno_cat", ("char",)),
                             ("pheno_name", ("char",)),
                             ("pheno_status", ("char",)),
                             ("lat", ("double",)),
                             ("lon", ("double",)),
                             ("elevation", ("int",)),
                             ("network_name", ("char",))]

            engine.table = table
            engine.create_table()

            base_url = 'http://www.usanpn.org/getObs/observations/'
            years = range(2009, 2013)

            for year in years:
                if year == 2009 and tax == 'Animal':
                    continue

                url = base_url + 'get%s%sDataNoDefinitions' % (year, tax)

                filename = '%s_%s.csv' % (tax, year)
                engine.download_file(url, filename)

                engine.insert_data_from_file(engine.find_file(filename))

        return engine
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        filename = 'Aquatic_animal_excretion_data.csv'
        tablename = 'aquatic_animals'

        table = Table(str(tablename), delimiter=',')
        table.columns = [("index", ("pk-int", )), ("sourcenumber", ("int", )),
                         ("sourcename", ("char", )),
                         ("speciesname", ("char", )),
                         ("speciescode", ("char", )),
                         ("invert/vert", ("char", )), ("phylum", ("char", )),
                         ("class", ("char", )), ("order", ("char", )),
                         ("family", ("char", )), ("trophicgild", ("char", )),
                         ("drymass", ("double", )),
                         ("logdrymass", ("double", )),
                         ("ecosystemtype", ("char", )),
                         ("energysource", ("char", )), ("habitat", ("char", )),
                         ("residentecosystem", ("char", )),
                         ("temperature", ("double", )),
                         ("nexcretionrate", ("double", )),
                         ("pexcretionrate", ("double", )),
                         ("lognexcretionrate", ("double", )),
                         ("logpexcretionrate", ("double", )),
                         ("incubationtime", ("double", )),
                         ("nform", ("char", )), ("pform", ("char", )),
                         ("bodyc", ("double", )), ("bodyn", ("double", )),
                         ("bodyp", ("double", )), ("bodyc:n", ("double", )),
                         ("bodyc:p", ("double", )), ("bodyn:p", ("double", )),
                         ("bodydatasource", ("char", )),
                         ("datasource", ("char", )),
                         ("dataproviders", ("char", ))]

        engine.table = table
        if not os.path.isfile(engine.format_filename(filename)):
            engine.download_files_from_archive(self.urls[tablename],
                                               [filename],
                                               filetype="zip")

        engine.create_table()
        engine.insert_data_from_file(engine.format_filename(str(filename)))
    def download(self, engine=None, debug=False):
        try:
            Script.download(self, engine, debug)

            engine = self.engine

            # Species table
            table = Table("species",
                          cleanup=Cleanup(),
                          contains_pk=True,
                          header_rows=11)
            table.columns = [
                ("species_id", ("pk-int", )),
                ("AOU", ("int", )),
                ("english_common_name", ("char", 50)),
                ("french_common_name", ("char", 50)),
                ("spanish_common_name", ("char", 50)),
                ("sporder", ("char", 30)),
                ("family", ("char", 30)),
                ("genus", ("char", 30)),
                ("species", ("char", 50)),
            ]
            table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]
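            # the nine widths give the character span of each fixed-width
            # field, matching the columns declared above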
            engine.table = table
            engine.create_table()
            engine.insert_data_from_url(self.urls["species"])

            # Routes table
            engine.download_files_from_archive(self.urls["routes"],
                                               ["routes.csv"],
                                               archive_name="routes.zip")
            engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                     filename="routes.csv")
            engine.insert_data_from_file(engine.format_filename("routes.csv"))

            # Weather table
            engine.download_files_from_archive(self.urls["weather"],
                                               ["weather.csv"],
                                               archive_name="weather.zip")
            engine.auto_create_table(Table("weather",
                                           pk="RouteDataId",
                                           cleanup=self.cleanup_func_table),
                                     filename="weather.csv")
            engine.insert_data_from_file(engine.format_filename("weather.csv"))

            # Migrations data
            engine.download_files_from_archive(
                self.urls["migrants"], archive_name="MigrantNonBreeder.zip")
            engine.extract_zip(
                engine.format_filename("MigrantNonBreeder/Migrants.zip"),
                engine.format_filename("Migrant"),
            )
            engine.extract_zip(
                engine.format_filename("MigrantNonBreeder/MigrantSummary.zip"),
                engine.format_filename("MigrantSummary"),
            )

            table = Table("migrants", cleanup=Cleanup())
            table.columns = ([('routedataid', ('int',)),
                              ('countrynum', ('int',)),
                              ('statenum', ('int',)),
                              ('route', ('int',)),
                              ('rpid', ('int',)),
                              ('year', ('int',)),
                              ('aou', ('int',))] +
                             [('stop' + str(i), ('int',)) for i in range(1, 51)])
            engine.table = table
            engine.create_table()
            engine.insert_data_from_file(
                engine.format_filename("Migrant/Migrants.csv"))

            table = Table("migrantsummary", cleanup=Cleanup())
            table.columns = [('routedataid', ('int', )),
                             ('countrynum', ('int', )),
                             ('statenum', ('int', )), ('route', ('int', )),
                             ('rpid', ('int', )), ('year', ('int', )),
                             ('aou', ('int', )), ('count10', ('int', )),
                             ('count20', ('int', )), ('count30', ('int', )),
                             ('count40', ('int', )), ('count50', ('int', )),
                             ('stoptotal', ('int', )),
                             ('speciestotal', ('int', ))]
            engine.table = table
            engine.create_table()
            engine.insert_data_from_file(
                engine.format_filename("MigrantSummary/MigrantSummary.csv"))

            table = Table("vehicledata", cleanup=Cleanup())
            table.columns = ([('routedataid', ('int',)),
                              ('countrynum', ('int',)),
                              ('statenum', ('int',)),
                              ('route', ('int',)),
                              ('rpid', ('int',)),
                              ('year', ('int',)),
                              ('recordedcar', ('char',))] +
                             [('car' + str(i), ('int',)) for i in range(1, 51)] +
                             [('noise' + str(i), ('int',)) for i in range(1, 51)])
            engine.table = table
            engine.create_table()
            engine.download_files_from_archive(self.urls["Vehicledata"],
                                               archive_name="VehicleData.zip")
            engine.extract_zip(
                engine.format_filename("VehicleData/VehicleData.zip"),
                engine.format_filename("VehicleData"),
            )
            engine.insert_data_from_file(
                engine.format_filename("VehicleData/VehicleData.csv"))

            # Counts table
            table = Table("counts", delimiter=",")
            engine.download_files_from_archive(self.urls["counts"],
                                               archive_name="States.zip")

            table.columns = [("record_id", ("pk-auto", )),
                             ("RouteDataID", ("int", )),
                             ("countrynum", ("int", )),
                             ("statenum", ("int", )), ("Route", ("int", )),
                             ("RPID", ("int", )), ("Year", ("int", )),
                             ("Aou", ("int", )), ("Count10", ("int", )),
                             ("Count20", ("int", )), ("Count30", ("int", )),
                             ("Count40", ("int", )), ("Count50", ("int", )),
                             ("StopTotal", ("int", )),
                             ("SpeciesTotal", ("int", ))]

            stateslist = [
                "Alabama", "Alaska", "Arizona", "Arkansas", "California",
                "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
                "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky",
                "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan",
                "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska",
                "Nevada", ["New Hampshire", "NHampsh"],
                ["New Jersey", "NJersey"], ["New Mexico", "NMexico"],
                ["New York", "NYork"], ["North Carolina", "NCaroli"],
                ["North Dakota",
                 "NDakota"], "Ohio", "Oklahoma", "Oregon", "Pennsylvania",
                ["Rhode Island", "RhodeIs"], ["South Carolina", "SCaroli"],
                ["South Dakota", "SDakota"], "Tennessee", "Texas", "Utah",
                "Vermont", "Virginia", "Washington",
                ["West Virginia",
                 "W_Virgi"], "Wisconsin", "Wyoming", "Alberta",
                ["British Columbia", "BritCol"], "Manitoba",
                ["New Brunswick", "NBrunsw"],
                ["Northwest Territories", "NWTerri"], "Newfoundland",
                ["Nova Scotia", "NovaSco"], "Nunavut", "Ontario",
                ["Prince Edward Island",
                 "PEI"], "Quebec", "Saskatchewan", "Yukon"
            ]
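            # each entry is either "StateName" (zip basename = its first seven
            # characters) or ["StateName", "basename"] where the two differ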

            state = ""
            shortstate = ""

            engine.table = table
            engine.create_table()

            for state in stateslist:
                try:
                    if isinstance(state, (list, )):
                        state, shortstate = state[0], state[1]
                    else:
                        shortstate = state[0:7]

                    print("Inserting data from " + state + "...")
                    try:
                        engine.table.cleanup = Cleanup()
                        engine.extract_zip(
                            engine.format_filename("States/" + shortstate +
                                                   ".zip"),
                            engine.format_filename("States/" + shortstate),
                        )
                        file_path = "{states}/{shortstate}/{shortstate}.csv".format(
                            states="States", shortstate=shortstate)
                        engine.insert_data_from_file(
                            engine.format_filename(file_path))
                    except Exception:
                        print(state + ": Failed bulk insert, inserting manually.")
                        engine.connection.rollback()
                        engine.table.cleanup = self.cleanup_func_clean
                        engine.insert_data_from_file(
                            engine.format_filename(file_path))
                except Exception:
                    print("There was an error in " + state + ".")
                    raise

        except zipfile.BadZipfile:
            print(
                "There was an unexpected error in the Breeding Bird Survey archives."
            )
            raise

        return engine
    def download(self, engine=None, debug=False):
        try:
            Script.download(self, engine, debug)

            engine = self.engine

            # Species table
            table = Table("species", cleanup=Cleanup(), contains_pk=True,
                          header_rows=6)

            table.columns = [("species_id", ("pk-int",)),
                             ("AOU", ("int",)),
                             ("english_common_name", ("char", 50)),
                             ("french_common_name", ("char", 50)),
                             ("spanish_common_name", ("char", 50)),
                             ("sporder", ("char", 30)),
                             ("family", ("char", 30)),
                             ("genus", ("char", 30)),
                             ("species", ("char", 50))]
            table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]

            engine.table = table
            engine.create_table()
            engine.insert_data_from_url(self.urls["species"])

            # Routes table
            if not os.path.isfile(engine.format_filename("routes_new.csv")):
                engine.download_files_from_archive(self.urls["routes"],
                                                   ["routes.csv"])
                read = open(engine.format_filename("routes.csv"), "r")
                write = open(engine.format_filename("routes_new.csv"), "w")
                print("Cleaning routes data...")
                write.write(read.readline())
                for line in read:
                    values = line.split(',')
                    v = Decimal(values[5])
                    if v > 0:
                        values[5] = str(v * Decimal("-1"))
                    write.write(','.join(str(value) for value in values))
                write.close()
                read.close()

            engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                     filename="routes_new.csv")

            engine.insert_data_from_file(engine.format_filename("routes_new.csv"))


            # Weather table
            if not os.path.isfile(engine.format_filename("weather_new.csv")):
                engine.download_files_from_archive(self.urls["weather"],
                                                   ["weather.csv"])
                read = open(engine.format_filename("weather.csv"), "r")
                write = open(engine.format_filename("weather_new.csv"), "w")
                print("Cleaning weather data...")
                for line in read:
                    values = line.split(',')
                    newvalues = []
                    for value in values:

                        if ':' in value:
                            newvalues.append(value.replace(':', ''))
                        elif value == "N":
                            newvalues.append(None)
                        else:
                            newvalues.append(value)
                    write.write(','.join(str(value) for value in newvalues))
                write.close()
                read.close()

            engine.auto_create_table(Table("weather", pk="RouteDataId",
                                           cleanup=Cleanup(correct_invalid_value, nulls=['NULL'])),
                                     filename="weather_new.csv")
            engine.insert_data_from_file(engine.format_filename("weather_new.csv"))


            # Region_codes table
            table = Table("region_codes", pk=False, header_rows=11,
                          fixed_width=[11, 11, 30])
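            # replace Latin-1 accented characters (á, é, í, ó) with their
            # ASCII equivalents so region names load cleanly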
            def regioncodes_cleanup(value, engine):
                replace = {chr(225):"a", chr(233):"e", chr(237):"i", chr(243):"o"}
                newvalue = str(value)
                for key in replace.keys():
                    if key in newvalue:
                        newvalue = newvalue.replace(key, replace[key])
                return newvalue
            table.cleanup = Cleanup(regioncodes_cleanup)

            table.columns = [("countrynum", ("int",)),
                             ("regioncode", ("int",)),
                             ("regionname", ("char", 30))]

            engine.table = table
            engine.create_table()

            engine.insert_data_from_url(self.urls["region_codes"])

            # Counts table
            table = Table("counts", pk=False, delimiter=',')
            table.columns = ([("RouteDataID", ("int",)),
                              ("countrynum", ("int",)),
                              ("statenum", ("int",)),
                              ("Route", ("int",)),
                              ("RPID", ("int",)),
                              ("year", ("int",)),
                              ("AOU", ("int",))] +
                             [("Stop" + str(i), ("int",)) for i in range(1, 51)])

            part = ""
            engine.table = table
            engine.create_table()

            for part in range(1, 11):
                part = str(part)
                try:
                    print("Inserting data from part " + part + "...")
                    try:
                        engine.table.cleanup = Cleanup()
                        engine.insert_data_from_archive(self.urls["counts"] +
                                                        "Fifty" + part + ".zip",
                                                        ["fifty" + part + ".csv"])
                    except Exception:
                        print("Failed bulk insert on " + part + ", inserting manually.")
                        engine.connection.rollback()
                        engine.table.cleanup = Cleanup(correct_invalid_value,
                                                       nulls=['*'])
                        engine.insert_data_from_archive(self.urls["counts"] +
                                                        "Fifty" + part + ".zip",
                                                        ["fifty" + part + ".csv"])

                except Exception:
                    print("There was an error in part " + part + ".")
                    raise


        except zipfile.BadZipfile:
            print("There was an unexpected error in the Breeding Bird Survey archives.")
            raise

        return engine
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        self.engine.download_file(self.urls["GWDD"],
                                  "GlobalWoodDensityDatabase.xls")
        filename = os.path.basename("GlobalWoodDensityDatabase.xls")

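        # the wood-density data are on the workbook's second sheet (index 1);
        # the reference list on the third sheet (index 2) is read further down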
        book = xlrd.open_workbook(self.engine.format_filename(filename))
        sh = book.sheet_by_index(1)
        rows = sh.nrows

        # Creating data table
        lines = []
        for i in range(1, rows):
            row = sh.row(i)
            if not all(Excel.empty_cell(cell) for cell in row):
                this_line = {}

                def format_value(s):
                    s = Excel.cell_value(s)
                    return str(s).title().replace("\\", "/").replace('"', '')

                for num, label in enumerate([
                        "Number", "Family", "Binomial", "Wood_Density",
                        "Region", "Reference_Number"
                ]):
                    this_line[label] = format_value(row[num])
                lines.append(this_line)

        table = Table("data", delimiter="\t")
        table.columns = [("Number", ("pk-int", )), ("Family", ("char", )),
                         ("Binomial", ("char", )),
                         ("Wood_Density", ("double", )),
                         ("Region", ("char", )),
                         ("Reference_Number", ("int", ))]
        table.pk = 'Number'
        table.contains_pk = True
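        # contains_pk=True: the "Number" values come from the data itself,
        # so no auto-incrementing key is generated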

        gwdd = []
        for line in lines:
            gwdd_data = [
                line["Number"], line["Family"], line["Binomial"],
                line["Wood_Density"], line["Region"], line["Reference_Number"]
            ]
            gwdd.append(gwdd_data)

        data = ['\t'.join(gwdd_line) for gwdd_line in gwdd]
        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(data)

        # Creating reference table
        lines = []
        sh = book.sheet_by_index(2)
        rows = sh.nrows
        for i in range(1, rows):
            row = sh.row(i)
            if not all(Excel.empty_cell(cell) for cell in row):
                this_line = {}

                def format_value(s):
                    s = Excel.cell_value(s)
                    return str(s).title().replace("\\", "/").replace('"', '')

                for num, label in enumerate(["Reference_Number", "Reference"]):
                    this_line[label] = format_value(row[num])
                lines.append(this_line)

        table = Table("reference", delimiter="\t")
        table.columns = [("Reference_Number", ("pk-int", )),
                         ("Reference", ("char", ))]
        table.pk = 'Reference_Number'
        table.contains_pk = True

        gwdd = []
        for line in lines:
            gwdd_ref = [line["Reference_Number"], line["Reference"]]
            gwdd.append(gwdd_ref)

        data = ['\t'.join(gwdd_line) for gwdd_line in gwdd]
        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(data)

        return self.engine
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        self.engine.auto_create_table(Table("sites"), url=self.urls["sites"], filename='gentry_sites.csv')
        self.engine.insert_data_from_url(self.urls["sites"])

        self.engine.download_file(self.urls["stems"], "all_Excel.zip")
        local_zip = zipfile.ZipFile(self.engine.format_filename("all_Excel.zip"))
        filelist = local_zip.namelist()
        local_zip.close()
        self.engine.download_files_from_archive(self.urls["stems"], filelist)

        filelist = [os.path.basename(filename) for filename in filelist]

        # Currently all_Excel.zip is missing CURUYUQU.xls
        # Download it separately and add it to the file list
        if not self.engine.find_file('CURUYUQU.xls'):
            self.engine.download_file("http://www.mobot.org/mobot/gentry/123/samerica/CURUYUQU.xls", "CURUYUQU.xls")
            filelist.append('CURUYUQU.xls')

        lines = []
        tax = []
        for filename in filelist:
            print("Extracting data from " + filename + "...")
            book = xlrd.open_workbook(self.engine.format_filename(filename))
            sh = book.sheet_by_index(0)
            rows = sh.nrows
            cn = {'stems': []}
            for colnum, c in enumerate(sh.row(0)):
                if not Excel.empty_cell(c):
                    cid = c.value.lower().strip()
                    # line number column is sometimes named differently
                    if cid in ["sub", "number"]:
                        cid = "line"
                    # the "number of individuals" column is named in various
                    # different ways; they always at least contain "nd"
                    if "nd" in cid:
                        cid = "count"
                    # in QUIAPACA.xls the "number of individuals" column is
                    # misnamed "STEMDBH" just like the stems columns, so weep
                    # for the state of scientific data and then fix manually
                    if filename == "QUIAPACA.xls" and colnum == 13:
                        cid = "count"

                    # if column is a stem, add it to the list of stems;
                    # otherwise, make note of the column name/number
                    if "stem" in cid or "dbh" in cid:
                        cn["stems"].append(n)
                    else:
                        cn[cid] = n
                n += 1
            # sometimes, a data file does not contain a liana or count column
            if "liana" not in cn:
                cn["liana"] = -1
            if "count" not in cn:
                cn["count"] = -1
            for i in range(1, rows):
                row = sh.row(i)
                # make sure the row is real, not just empty cells
                if not all(Excel.empty_cell(cell) for cell in row):
                    try:
                        this_line = {}

                        # get the following information from the appropriate columns
                        for key in ["line", "family", "genus", "species",
                                    "liana", "count"]:
                            if cn[key] > -1:
                                if row[cn[key]].ctype != 2:
                                    # if the cell type (ctype) is not a number
                                    this_line[key] = row[cn[key]].value.lower().strip().replace("\\", "/").replace('"', '')
                                else:
                                    this_line[key] = row[cn[key]].value
                                if this_line[key] == '`':
                                    this_line[key] = 1
                        this_line["stems"] = [row[c]
                                              for c in cn["stems"]
                                              if not Excel.empty_cell(row[c])]
                        this_line["site"] = filename[0:-4]

                        # Manually correct CEDRAL data, which has a single line
                        # that is shifted by one to the left starting at Liana
                        if this_line["site"] == "CEDRAL" and type(this_line["liana"]) == float:
                            this_line["liana"] = ""
                            this_line["count"] = 3
                            this_line["stems"] = [2.5, 2.5, 30, 18, 25]

                        lines.append(this_line)

                        # Check how far the species is identified
                        full_id = 0
                        if len(this_line["species"]) < 3:
                            if len(this_line["genus"]) < 3:
                                id_level = "family"
                            else:
                                id_level = "genus"
                        else:
                            id_level = "species"
                            full_id = 1
                        tax.append((this_line["family"],
                                    this_line["genus"],
                                    this_line["species"],
                                    id_level,
                                    str(full_id)))
                    except Exception:
                        raise

        tax = sorted(tax, key=lambda group: group[0] + " " + group[1] + " " + group[2])
        unique_tax = []
        tax_dict = {}
        tax_count = 0

        # Get all unique families/genera/species
        print("\n")
        for group in tax:
            if group not in unique_tax:
                unique_tax.append(group)
                tax_count += 1
                tax_dict[group[0:3]] = tax_count
                if tax_count % 10 == 0:
                    msg = "Generating taxonomic groups: " + str(tax_count) + " / " + str(TAX_GROUPS)
                    sys.stdout.flush()
                    sys.stdout.write(msg + "\b" * len(msg))
        print("\n")
        # Create species table
        table = Table("species", delimiter=",")
        table.columns = [("species_id", ("pk-int",)),
                         ("family", ("char",)),
                         ("genus", ("char",)),
                         ("species", ("char",)),
                         ("id_level", ("char", 10)),
                         ("full_id", ("int",))]

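        # build CSV-style rows: the numeric species_id first, then each
        # taxonomy field wrapped in double quotes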
        data = [[str(tax_dict[group[:3]])] + ['"%s"' % g for g in group]
                for group in unique_tax]
        table.pk = 'species_id'
        table.contains_pk = True

        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(data)

        # Create stems table
        table = Table("stems", delimiter=",")
        table.columns = [("stem_id", ("pk-auto",)),
                         ("line", ("int",)),
                         ("species_id", ("int",)),
                         ("site_code", ("char", 12)),
                         ("liana", ("char", 10)),
                         ("stem", ("double",))]
        stems = []
        counts = []
        for line in lines:
            try:
                liana = line["liana"]
            except KeyError:
                liana = ""
            species_info = [line["line"],
                            tax_dict[(line["family"],
                                      line["genus"],
                                      line["species"])],
                            line["site"],
                            liana
                            ]
            try:
                counts.append(species_info + [line["count"]])
            except KeyError:
                pass

            for i in line["stems"]:
                stem = species_info + [str(i)]
                stems.append(stem)

        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(stems)

        # Create counts table
        table = Table("counts", delimiter=",", contains_pk=False)
        table.columns = [("count_id", ("pk-auto",)),
                         ("line", ("int",)),
                         ("species_id", ("int",)),
                         ("site_code", ("char", 12)),
                         ("liana", ("char", 10)),
                         ("count", ("double",))]
        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(counts)

        return self.engine
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        original_sql_file = "BioTIMESQL02_04_2018.sql"
        engine.download_file(self.urls["sql_file"], original_sql_file)
        sql_data = open_fr(self.engine.format_filename(original_sql_file))

        set_open = False
        csv_writer = None
        csv_file = None
        table_name = None
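        # the SQL dump writes missing values as the bare token NULL; binding
        # NULL to None lets the eval() below resolve those tokens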
        NULL = None
        for line in sql_data:
            table_indicator = "-- Table structure for table "
            if line.startswith(table_indicator):
                table_name = line[len(table_indicator):].replace("`", "").strip()
                # a new table section begins: close the previous table's CSV
                # before opening the output file for this one
                if set_open:
                    csv_file.close()
                out_file = "{name}.csv".format(name=table_name)
                csv_file = open_fw(engine.format_filename(out_file))
                csv_writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
                set_open = True

            if line.startswith("INSERT INTO `{table_name}`".format(
                    table_name=table_name)):
                row_val = line[line.index("VALUES (") + 8:-3]
                table_rows = row_val.replace("\r\n", "").split("),(")
                for i_row in table_rows:
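                    # each i_row is the body of one SQL tuple, e.g. 1,'x',NULL;
                    # eval() turns it into a Python list (the dump is trusted)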
                    v = eval('[' + str(i_row) + ']')
                    csv_writer.writerows([v])
        if csv_file:
            csv_file.close()

        # Create abundance table
        table = Table("ID_ABUNDANCE",
                      delimiter=",",
                      header_rows=0,
                      contains_pk=False)
        table.columns = [
            ("ID_ABUNDANCE", ("int", )),
            ("ABUNDANCE_TYPE", ("char", "100")),
        ]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename("abundance.csv"))

        # Create allrawdata table
        table = Table("allrawdata",
                      delimiter=",",
                      header_rows=0,
                      contains_pk=False)
        table.columns = [
            ("ID_ALL_RAW_DATA", ("int", )),
            ("ABUNDANCE", ("double", )),
            ("BIOMASS", ("double", )),
            ("ID_SPECIES", ("int", )),
            ("SAMPLE_DESC", ("char", 200)),
            ("PLOT", ("char", 150)),
            ("LATITUDE", ("double", )),
            ("LONGITUDE", ("double", )),
            ("DEPTH", ("double", )),
            ("DAY", ("int", )),
            ("MONTH", ("int", )),
            ("YEAR", ("int", )),
            ("STUDY_ID", ("int", )),
        ]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename("allrawdata.csv"))

        # Create biomass table
        table = Table("biomass",
                      delimiter=",",
                      header_rows=0,
                      contains_pk=False)
        table.columns = [("ID_BIOMASS", ("int", )),
                         ("BIOMASS_TYPE", ("char", "100"))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename("biomass.csv"))

        # Create citation1 table
        table = Table("citation1",
                      delimiter=",",
                      header_rows=0,
                      contains_pk=False)
        table.columns = [
            ("ID_CITATION1", ("int", )),
            ("STUDY_ID", ("int", )),
            ("CITATION_LINE", ("char", )),
        ]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename("citation1.csv"))

        # Create contacts table
        table = Table("contacts",
                      delimiter=",",
                      header_rows=0,
                      contains_pk=False)
        table.columns = [
            ("ID_CONTACTS", ("int", )),
            ("STUDY_ID", ("int", )),
            ("CONTACT_1", ("char", 500)),
            ("CONTACT_2", ("char", 500)),
            ("CONT_1_MAIL", ("char", 60)),
            ("CONT_2_MAIL", ("char", 60)),
            ("LICENSE", ("char", 200)),
            ("WEB_LINK", ("char", 200)),
            ("DATA_SOURCE", ("char", 250)),
        ]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename("contacts.csv"))

        # Create countries table
        table = Table("countries",
                      delimiter=",",
                      header_rows=0,
                      contains_pk=False)
        table.columns = [("COUNT_ID", ("int", )),
                         ("COUNTRY_NAME", ("char", 200))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename("countries.csv"))

        # Create curation table
        table = Table("curation",
                      delimiter=",",
                      header_rows=0,
                      contains_pk=False)
        table.columns = [
            ("ID_CURATION", ("int", )),
            ("STUDY_ID", ("int", )),
            ("LINK_ID", ("int", )),
            ("COMMENTS", ("char", )),
            ("DATE_STUDY_ADDED", ("char", 50)),
        ]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename("curation.csv"))

        # Create datasets table
        table = Table("datasets",
                      delimiter=",",
                      header_rows=0,
                      contains_pk=False)
        table.columns = [
            ("ID_DATASETS", ("int", )),
            ("STUDY_ID", ("int", )),
            ("TAXA", ("char", 50)),
            ("ORGANISMS", ("char", 200)),
            ("TITLE", ("char", 800)),
            ("AB_BIO", ("char", 2)),
            ("HAS_PLOT", ("char", 10)),
            ("DATA_POINTS", ("char", )),
            ("START_YEAR", ("char", )),
            ("END_YEAR", ("char", )),
            ("CENT_LAT", ("double", )),
            ("CENT_LONG", ("double", )),
            ("NUMBER_OF_SPECIES", ("char", )),
            ("NUMBER_OF_SAMPLES", ("char", )),
            ("NUMBER_LAT_LONG", ("char", )),
            ("TOTAL", ("char", )),
            ("GRAIN_SIZE_TEXT", ("char", )),
            ("GRAIN_SQ_KM", ("double", )),
            ("AREA_SQ_KM", ("double", )),
            ("AB_TYPE", ("char", )),
            ("BIO_TYPE", ("char", )),
            ("SAMPLE_TYPE", ("char", )),
        ]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename("datasets.csv"))

        # Create downloads table
        table = Table("downloads",
                      delimiter=",",
                      header_rows=0,
                      contains_pk=False)
        table.columns = [
            ("D_ID", ("int", )),
            ("STUDY", ("char", 25)),
            ("NAME", ("char", 150)),
            ("EMAIL", ("char", 150)),
            ("COUNTRY", ("char", 200)),
            ("ROLE", ("char", 150)),
            ("PURPOSE", ("char", 500)),
            ("LOCATION", ("char", 250)),
            ("DATE_STAMP", ("char", )),
        ]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename("downloads.csv"))

        # Create methods table
        table = Table("methods",
                      delimiter=",",
                      header_rows=0,
                      contains_pk=False)
        table.columns = [
            ("ID_METHODS", ("int", )),
            ("STUDY_ID", ("int", )),
            ("METHODS", ("char", )),
            ("SUMMARY_METHODS", ("char", 500)),
        ]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename("methods.csv"))

        # Create sample table
        table = Table("sample",
                      delimiter=",",
                      header_rows=0,
                      contains_pk=False)
        table.columns = [
            ("ID_SAMPLE", ("int", )),
            ("ID_TREAT", ("int", )),
            ("SAMPLE_DESC_NAME", ("char", 200)),
        ]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename("sample.csv"))

        # Create site table
        table = Table("site", delimiter=",", header_rows=0, contains_pk=False)
        table.columns = [("ID_SITE", ("int", )), ("STUDY_ID", ("int", )),
                         ("REALM", ("char", 11)), ("CLIMATE", ("char", 20)),
                         ("GENERAL_TREAT", ("char", 200)),
                         ("TREATMENT", ("char", 200)),
                         ("TREAT_COMMENTS", ("char", 250)),
                         ("TREAT_DATE", ("char", 100)),
                         ("CEN_LATITUDE", ("double", )),
                         ("CEN_LONGITUDE", ("double", )),
                         ("HABITAT", ("char", 100)),
                         ("PROTECTED_AREA", ("char", 50)),
                         ("AREA", ("double", )), ("BIOME_MAP", ("char", 500))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename("site.csv"))

        # Create species table
        table = Table("species",
                      delimiter=",",
                      header_rows=0,
                      contains_pk=False)
        table.columns = [("ID_SPECIES", ("int", )), ("GENUS", ("char", 100)),
                         ("SPECIES", ("char", 100)),
                         ("GENUS_SPECIES", ("char", 100))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename("species.csv"))
Exemple #13
    def download(self, engine=None, debug=False):
        try:
            Script.download(self, engine, debug)

            engine = self.engine

            # Species table
            table = Table("species", cleanup=Cleanup(), contains_pk=True,
                          header_rows=9)

            table.columns=[("species_id",               ("pk-int",)         ),
                           ("AOU",                      ("int",)            ),
                           ("english_common_name",      ("char",50)         ),
                           ("french_common_name",       ("char",50)         ),
                           ("spanish_common_name",      ("char",50)         ),
                           ("sporder",                  ("char",30)         ),
                           ("family",                   ("char",30)         ),
                           ("genus",                    ("char",30)         ),
                           ("species",                  ("char",50)         ),
                           ]
            table.fixed_width = [7,6,51,51,51,51,51,51,50]

            engine.table = table
            engine.create_table()
            engine.insert_data_from_url(self.urls["species"])

            # Routes table
            engine.download_files_from_archive(self.urls["routes"], ["routes.csv"])
            engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                     filename="routes.csv")
            engine.insert_data_from_file(engine.format_filename("routes.csv"))

            # Weather table
            if not os.path.isfile(engine.format_filename("weather_new.csv")):
                engine.download_files_from_archive(self.urls["weather"],
                                                   ["weather.csv"])
                read = open_fr(engine.format_filename("weather.csv"))
                write = open_fw(engine.format_filename("weather_new.csv"))
                print("Cleaning weather data...")
                for line in read:
                    values = line.split(',')
                    newvalues = []
                    for value in values:

                        if ':' in value:
                            newvalues.append(value.replace(':', ''))
                        elif value == "N":
                            # "N" marks a missing value; write an empty field
                            # rather than the literal string "None"
                            newvalues.append("")
                        else:
                            newvalues.append(value)
                    write.write(','.join(str(value) for value in newvalues))
                write.close()
                read.close()

            engine.auto_create_table(Table("weather", pk="RouteDataId",
                                           cleanup=Cleanup(correct_invalid_value, nulls=['NULL'])),
                                     filename="weather_new.csv")
            engine.insert_data_from_file(engine.format_filename("weather_new.csv"))

            # Region_codes table
            table = Table("region_codes", pk=False, header_rows=11,
                          fixed_width=[11, 11, 30])

            def regioncodes_cleanup(value, engine):
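                # Map accented vowels (á, é, í, ó) back to plain ASCII so
                # the region names load cleanly.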
                replace = {chr(225):"a", chr(233):"e", chr(237):"i", chr(243):"o"}
                newvalue = str(value)
                for key in list(replace.keys()):
                    if key in newvalue:
                        newvalue = newvalue.replace(key, replace[key])
                return newvalue
            table.cleanup = Cleanup(regioncodes_cleanup)

            table.columns=[("countrynum"            ,   ("int",)        ),
                           ("regioncode"            ,   ("int",)        ),
                           ("regionname"            ,   ("char",30)     )]

            engine.table = table
            engine.create_table()

            engine.insert_data_from_url(self.urls["region_codes"])

            # Counts table
            table = Table("counts", delimiter=',')

            table.columns=[("record_id"             ,   ("pk-auto",)    ),
                           ("countrynum"            ,   ("int",)        ),
                           ("statenum"              ,   ("int",)        ),
                           ("Route"                 ,   ("int",)        ),
                           ("RPID"                  ,   ("int",)        ),
                           ("Year"                  ,   ("int",)        ),
                           ("Aou"                   ,   ("int",)        ),
                           ("Count10"               ,   ("int",)        ),
                           ("Count20"               ,   ("int",)        ),
                           ("Count30"               ,   ("int",)        ),
                           ("Count40"               ,   ("int",)        ),
                           ("Count50"               ,   ("int",)        ),
                           ("StopTotal"             ,   ("int",)        ),
                           ("SpeciesTotal"          ,   ("int",)        )]

            stateslist = ["Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado",
                          "Connecticut", "Delaware", "Florida", "Georgia", "Idaho",
                          "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine",
                          "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi",
                          "Missouri", "Montana", "Nebraska", "Nevada",
                          ["New Hampshire", "NHampsh"], ["New Jersey", "NJersey"],
                          ["New Mexico", "NMexico"], ["New York", "NYork"],
                          ["North Carolina", "NCaroli"], ["North Dakota", "NDakota"], "Ohio",
                          "Oklahoma", "Oregon", "Pennsylvania", ["Rhode Island", "RhodeIs"],
                          ["South Carolina", "SCaroli"], ["South Dakota", "SDakota"], "Tennessee",
                          "Texas", "Utah", "Vermont", "Virginia", "Washington",
                          ["West Virginia", "W_Virgi"], "Wisconsin", "Wyoming", "Alberta",
                          ["British Columbia", "BritCol"], "Manitoba", ["New Brunswick", "NBrunsw"],
                          ["Northwest Territories", "NWTerri"], "Newfoundland",
                          ["Nova Scotia", "NovaSco"], "Nunavut", "Ontario",
                          ["Prince Edward Island", "PEI"], "Quebec", "Saskatchewan", "Yukon"]

            state = ""
            shortstate = ""

            engine.table = table
            engine.create_table()

            for state in stateslist:
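                # Each entry is either a plain state/province name (whose file
                # prefix is its first seven letters) or a [name, prefix] pair.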
                try:
                    if len(state) > 2:
                        shortstate = state[0:7]
                    else:
                        state, shortstate = state[0], state[1]

                    print("Inserting data from " + state + "...")
                    try:
                        engine.table.cleanup = Cleanup()
                        engine.insert_data_from_archive(self.urls["counts"] + shortstate + ".zip",
                                                        [shortstate + ".csv"])
                    except:
                        print("Failed bulk insert on " + state + ", inserting manually.")
                        engine.connection.rollback()
                        engine.table.cleanup = Cleanup(correct_invalid_value,
                                                       nulls=['*'])
                        engine.insert_data_from_archive(self.urls["counts"] + shortstate + ".zip",
                                                        [shortstate + ".csv"])

                except:
                    print("There was an error in " + state + ".")
                    raise

        except zipfile.BadZipfile:
            print("There was an unexpected error in the Breeding Bird Survey archives.")
            raise

        return engine
    def download(self, engine=None, debug=False):
        try:
            Script.download(self, engine, debug)

            engine = self.engine

            # Species table
            table = Table("species",
                          cleanup=Cleanup(),
                          contains_pk=True,
                          header_rows=9)

            table.columns = [
                ("species_id", ("pk-int", )),
                ("AOU", ("int", )),
                ("english_common_name", ("char", 50)),
                ("french_common_name", ("char", 50)),
                ("spanish_common_name", ("char", 50)),
                ("sporder", ("char", 30)),
                ("family", ("char", 30)),
                ("genus", ("char", 30)),
                ("species", ("char", 50)),
            ]
            table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]

            engine.table = table
            engine.create_table()
            engine.insert_data_from_url(self.urls["species"])

            # Routes table
            engine.download_files_from_archive(self.urls["routes"],
                                               ["routes.csv"])
            engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                     filename="routes.csv")
            engine.insert_data_from_file(engine.format_filename("routes.csv"))

            # Weather table
            if not os.path.isfile(engine.format_filename("weather_new.csv")):
                engine.download_files_from_archive(self.urls["weather"],
                                                   ["weather.csv"])
                read = open_fr(engine.format_filename("weather.csv"))
                write = open_fw(engine.format_filename("weather_new.csv"))
                print("Cleaning weather data...")
                for line in read:
                    values = line.split(',')
                    newvalues = []
                    for value in values:

                        if ':' in value:
                            newvalues.append(value.replace(':', ''))
                        elif value == "N":
                            # "N" marks a missing value; write an empty field
                            # rather than the literal string "None"
                            newvalues.append("")
                        else:
                            newvalues.append(value)
                    write.write(','.join(str(value) for value in newvalues))
                write.close()
                read.close()

            engine.auto_create_table(Table("weather",
                                           pk="RouteDataId",
                                           cleanup=self.cleanup_func_table),
                                     filename="weather_new.csv")
            engine.insert_data_from_file(
                engine.format_filename("weather_new.csv"))

            # Region_codes table
            table = Table("region_codes",
                          pk=False,
                          header_rows=11,
                          fixed_width=[11, 11, 30])

            def regioncodes_cleanup(value, engine):
                replace = {
                    chr(225): "a",
                    chr(233): "e",
                    chr(237): "i",
                    chr(243): "o"
                }
                newvalue = str(value)
                for key in list(replace.keys()):
                    if key in newvalue:
                        newvalue = newvalue.replace(key, replace[key])
                return newvalue

            table.cleanup = Cleanup(regioncodes_cleanup)

            table.columns = [("countrynum", ("int", )),
                             ("regioncode", ("int", )),
                             ("regionname", ("char", 30))]

            engine.table = table
            engine.create_table()

            engine.insert_data_from_url(self.urls["region_codes"])

            # Counts table
            table = Table("counts", delimiter=',')

            table.columns = [("record_id", ("pk-auto", )),
                             ("countrynum", ("int", )),
                             ("statenum", ("int", )), ("Route", ("int", )),
                             ("RPID", ("int", )), ("Year", ("int", )),
                             ("Aou", ("int", )), ("Count10", ("int", )),
                             ("Count20", ("int", )), ("Count30", ("int", )),
                             ("Count40", ("int", )), ("Count50", ("int", )),
                             ("StopTotal", ("int", )),
                             ("SpeciesTotal", ("int", ))]

            stateslist = [
                "Alabama", "Alaska", "Arizona", "Arkansas", "California",
                "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
                "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky",
                "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan",
                "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska",
                "Nevada", ["New Hampshire", "NHampsh"],
                ["New Jersey", "NJersey"], ["New Mexico", "NMexico"],
                ["New York", "NYork"], ["North Carolina", "NCaroli"],
                ["North Dakota",
                 "NDakota"], "Ohio", "Oklahoma", "Oregon", "Pennsylvania",
                ["Rhode Island", "RhodeIs"], ["South Carolina", "SCaroli"],
                ["South Dakota", "SDakota"], "Tennessee", "Texas", "Utah",
                "Vermont", "Virginia", "Washington",
                ["West Virginia",
                 "W_Virgi"], "Wisconsin", "Wyoming", "Alberta",
                ["British Columbia", "BritCol"], "Manitoba",
                ["New Brunswick", "NBrunsw"],
                ["Northwest Territories", "NWTerri"], "Newfoundland",
                ["Nova Scotia", "NovaSco"], "Nunavut", "Ontario",
                ["Prince Edward Island",
                 "PEI"], "Quebec", "Saskatchewan", "Yukon"
            ]

            state = ""
            shortstate = ""

            engine.table = table
            engine.create_table()

            for state in stateslist:
                try:
                    if len(state) > 2:
                        shortstate = state[0:7]
                    else:
                        state, shortstate = state[0], state[1]

                    print("Inserting data from " + state + "...")
                    try:
                        engine.table.cleanup = Cleanup()
                        engine.insert_data_from_archive(
                            self.urls["counts"] + shortstate + ".zip",
                            [shortstate + ".csv"])
                    except:
                        print("Failed bulk insert on " + state +
                              ", inserting manually.")
                        engine.connection.rollback()
                        engine.table.cleanup = self.cleanup_func_clean
                        engine.insert_data_from_archive(
                            self.urls["counts"] + shortstate + ".zip",
                            [shortstate + ".csv"])

                except:
                    print("There was an error in " + state + ".")
                    raise

        except zipfile.BadZipfile:
            print(
                "There was an unexpected error in the Breeding Bird Survey archives."
            )
            raise

        return engine
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        filename = "database.csv"
        tablename = "predicts_main"
        table = Table(str(tablename), delimiter=',')
        table.columns = [
            ("Source_ID", ("char", )), ("Reference", ("char", )),
            ("Study_number", ("int", )), ("Study_name", ("char", )),
            ("SS", ("char", )), ("Diversity_metric", ("char", )),
            ("Diversity_metric_unit", ("char", )),
            ("Diversity_metric_type", ("char", )),
            ("Diversity_metric_is_effort_sensitive", ("char", )),
            ("Diversity_metric_is_suitable_for_Chao", ("char", )),
            ("Sampling_method", ("char", )),
            ("Sampling_effort_unit", ("char", )),
            ("Study_common_taxon", ("char", )),
            ("Rank_of_study_common_taxon", ("char", )),
            ("Site_number", ("int", )), ("Site_name", ("char", )),
            ("Block", ("char", )), ("SSS", ("char", )), ("SSB", ("char", )),
            ("SSBS", ("char", )), ("Sample_start_earliest", ("char", )),
            ("Sample_end_latest", ("char", )), ("Sample_midpoint", ("char", )),
            ("Sample_date_resolution", ("char", )),
            ("Max_linear_extent_metres", ("double", )),
            ("Habitat_patch_area_square_metres", ("double", )),
            ("Sampling_effort", ("double", )),
            ("Rescaled_sampling_effort", ("double", )),
            ("Habitat_as_described", ("char", )),
            ("Predominant_land_use", ("char", )),
            ("Source_for_predominant_land_use", ("char", )),
            ("Use_intensity", ("char", )),
            ("Km_to_nearest_edge_of_habitat", ("double", )),
            ("Years_since_fragmentation_or_conversion", ("double", )),
            ("Transect_details", ("char", )),
            ("Coordinates_method", ("char", )), ("Longitude", ("double", )),
            ("Latitude", ("double", )),
            ("Country_distance_metres", ("double", )), ("Country", ("char", )),
            ("UN_subregion", ("char", )), ("UN_region", ("char", )),
            ("Ecoregion_distance_metres", ("double", )),
            ("Ecoregion", ("char", )), ("Biome", ("char", )),
            ("Realm", ("char", )), ("Hotspot", ("char", )),
            ("Wilderness_area", ("char", )), ("N_samples", ("double", )),
            ("Taxon_number", ("double", )), ("Taxon_name_entered", ("char", )),
            ("Indication", ("char", )), ("Parsed_name", ("char", )),
            ("Taxon", ("char", )), ("COL_ID", ("double", )),
            ("Name_status", ("char", )), ("Rank", ("char", )),
            ("Kingdom", ("char", )), ("Phylum", ("char", )),
            ("Class", ("char", )), ("Order", ("char", )),
            ("Family", ("char", )), ("Genus", ("char", )),
            ("Species", ("char", )), ("Best_guess_binomial", ("char", )),
            ("Higher_taxa", ("char", )), ("Higher_taxon", ("char", )),
            ("Measurement", ("double", )),
            ("Effort_corrected_measurement", ("double", ))
        ]
        engine.table = table
        if not os.path.isfile(engine.format_filename(filename)):
            engine.download_files_from_archive(self.urls["PREDICTS"],
                                               [filename], "zip", False,
                                               "download.zip")
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename(str(filename)))
Exemple #16
    def download(self, engine=None, debug=False):
        try:
            Script.download(self, engine, debug)

            engine = self.engine

            # Species table
            table = Table("species", cleanup=Cleanup(), contains_pk=True,
                          header_rows=6)

            table.columns=[("species_id", ("pk-int",) ),
                           ("AOU", ("int",) ),
                           ("english_common_name", ("char",50) ),
                           ("french_common_name", ("char",50) ),
                           ("spanish_common_name", ("char",50) ),
                           ("sporder", ("char",30) ),
                           ("family", ("char",30) ),
                           ("genus", ("char",30) ),
                           ("species", ("char",50) ),
                           ]
            table.fixed_width = [7,6,51,51,51,51,51,51,50]

            engine.table = table
            engine.create_table()
            engine.insert_data_from_url(self.urls["species"])

            # Routes table
            if not os.path.isfile(engine.format_filename("routes_new.csv")):
                engine.download_files_from_archive(self.urls["routes"],
                                                   ["routes.csv"])
                read = open(engine.format_filename("routes.csv"), "r")
                write = open(engine.format_filename("routes_new.csv"), "w")
                print("Cleaning routes data...")
                write.write(read.readline())
                for line in read:
                    values = line.split(',')
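                    # values[5] holds the route longitude; flip any positive
                    # values negative, since BBS routes lie in the western
                    # hemisphere.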
                    v = Decimal(values[5])
                    if v > 0:
                        values[5] = str(v * Decimal("-1"))
                    write.write(','.join(str(value) for value in values))
                write.close()
                read.close()

            engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                     filename="routes_new.csv")

            engine.insert_data_from_file(engine.format_filename("routes_new.csv"))


            # Weather table
            if not os.path.isfile(engine.format_filename("weather_new.csv")):
                engine.download_files_from_archive(self.urls["weather"],
                                                   ["weather.csv"])
                read = open(engine.format_filename("weather.csv"), "r")
                write = open(engine.format_filename("weather_new.csv"), "w")
                print("Cleaning weather data...")
                for line in read:
                    values = line.split(',')
                    newvalues = []
                    for value in values:

                        if ':' in value:
                            newvalues.append(value.replace(':', ''))
                        elif value == "N":
                            # "N" marks a missing value; write an empty field
                            # rather than the literal string "None"
                            newvalues.append("")
                        else:
                            newvalues.append(value)
                    write.write(','.join(str(value) for value in newvalues))
                write.close()
                read.close()

            engine.auto_create_table(Table("weather", pk="RouteDataId", cleanup=Cleanup()),
                                     filename="weather_new.csv")
            engine.insert_data_from_file(engine.format_filename("weather_new.csv"))


            # Region_codes table
            table = Table("region_codes", pk=False, header_rows=11,
                          fixed_width=[11, 11, 30])
            def regioncodes_cleanup(value, engine):
                replace = {chr(225):"a", chr(233):"e", chr(237):"i", chr(243):"o"}
                newvalue = str(value)
                for key in replace.keys():
                    if key in newvalue:
                        newvalue = newvalue.replace(key, replace[key])
                return newvalue
            table.cleanup = Cleanup(regioncodes_cleanup)

            table.columns=[("countrynum"            ,   ("int",)        ),
                           ("regioncode"            ,   ("int",)        ),
                           ("regionname"            ,   ("char",30)     )]

            engine.table = table
            engine.create_table()

            engine.insert_data_from_url(self.urls["region_codes"])

            # Counts table
            table = Table("counts", pk=False, delimiter=',')
            table.columns=[("RouteDataID"           ,   ("int",)        ),
                           ("countrynum"            ,   ("int",)        ),
                           ("statenum"              ,   ("int",)        ),
                           ("Route"                 ,   ("int",)        ),
                           ("RPID"                  ,   ("int",)        ),
                           ("year"                  ,   ("int",)        ),
                           ("AOU"                   ,   ("int",)        ),
                           ("Stop1"                 ,   ("int",)        ),
                           ("Stop2"                 ,   ("int",)        ),
                           ("Stop3"                 ,   ("int",)        ),
                           ("Stop4"                 ,   ("int",)        ),
                           ("Stop5"                 ,   ("int",)        ),
                           ("Stop6"                 ,   ("int",)        ),
                           ("Stop7"                 ,   ("int",)        ),
                           ("Stop8"                 ,   ("int",)        ),
                           ("Stop9"                 ,   ("int",)        ),
                           ("Stop10"                ,   ("int",)        ),
                           ("Stop11"                ,   ("int",)        ),
                           ("Stop12"                ,   ("int",)        ),
                           ("Stop13"                ,   ("int",)        ),
                           ("Stop14"                ,   ("int",)        ),
                           ("Stop15"                ,   ("int",)        ),
                           ("Stop16"                ,   ("int",)        ),
                           ("Stop17"                ,   ("int",)        ),
                           ("Stop18"                ,   ("int",)        ),
                           ("Stop19"                ,   ("int",)        ),
                           ("Stop20"                ,   ("int",)        ),
                           ("Stop21"                ,   ("int",)        ),
                           ("Stop22"                ,   ("int",)        ),
                           ("Stop23"                ,   ("int",)        ),
                           ("Stop24"                ,   ("int",)        ),
                           ("Stop25"                ,   ("int",)        ),
                           ("Stop26"                ,   ("int",)        ),
                           ("Stop27"                ,   ("int",)        ),
                           ("Stop28"                ,   ("int",)        ),
                           ("Stop29"                ,   ("int",)        ),
                           ("Stop30"                ,   ("int",)        ),
                           ("Stop31"                ,   ("int",)        ),
                           ("Stop32"                ,   ("int",)        ),
                           ("Stop33"                ,   ("int",)        ),
                           ("Stop34"                ,   ("int",)        ),
                           ("Stop35"                ,   ("int",)        ),
                           ("Stop36"                ,   ("int",)        ),
                           ("Stop37"                ,   ("int",)        ),
                           ("Stop38"                ,   ("int",)        ),
                           ("Stop39"                ,   ("int",)        ),
                           ("Stop40"                ,   ("int",)        ),
                           ("Stop41"                ,   ("int",)        ),
                           ("Stop42"                ,   ("int",)        ),
                           ("Stop43"                ,   ("int",)        ),
                           ("Stop44"                ,   ("int",)        ),
                           ("Stop45"                ,   ("int",)        ),
                           ("Stop46"                ,   ("int",)        ),
                           ("Stop47"                ,   ("int",)        ),
                           ("Stop48"                ,   ("int",)        ),
                           ("Stop49"                ,   ("int",)        ),
                           ("Stop50"                ,   ("int",)        )]

            part = ""
            engine.table = table
            engine.create_table()

            for part in range(1,11):
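                # The fifty-stop data is distributed as ten archives,
                # Fifty1.zip through Fifty10.zip.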
                part = str(part)
                try:
                    print "Inserting data from part " + part + "..."
                    try:
                        engine.table.cleanup = Cleanup()
                        engine.insert_data_from_archive(self.urls["counts"] +
                                                        "Fifty" + part + ".zip",
                                                        ["fifty" + part + ".csv"])
                    except:
                        print "Failed bulk insert on " + part + ", inserting manually."
                        engine.connection.rollback()
                        engine.table.cleanup = Cleanup(correct_invalid_value,
                                                       nulls=['*'])
                        engine.insert_data_from_archive(self.urls["counts"] +
                                                        "Fifty" + part + ".zip",
                                                        ["fifty" + part + ".csv"])

                except:
                    print "There was an error in part " + part + "."
                    raise


        except zipfile.BadZipfile:
            print "There was an unexpected error in the Breeding Bird Survey archives."
            raise

        return engine
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        # Python 2 only: reload(sys) restores setdefaultencoding (removed at
        # interpreter start-up); Python 3 needs neither step.
        if sys.version_info[0] == 2:
            reload(sys)
            sys.setdefaultencoding("utf-8")

        self.engine.download_file(self.urls["GWDD"],
                                  "GlobalWoodDensityDatabase.xls")
        filename = os.path.basename("GlobalWoodDensityDatabase.xls")
        book = xlrd.open_workbook(self.engine.format_filename(filename))
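        # The density data live on the workbook's second sheet (index 1);
        # the reference list loaded below is on the third (index 2).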
        sh = book.sheet_by_index(1)
        rows = sh.nrows

        # Creating data files
        file_path = self.engine.format_filename("gwdd_data.csv")
        gwdd_data = open_fw(file_path)
        csv_writer = open_csvw(gwdd_data)
        csv_writer.writerow([
            "Number", "Family", "Binomial", "Wood_Density", "Region",
            "Reference_Number"
        ])

        for index in range(1, rows):
            row = sh.row(index)
            # get each row and format the cell value.
            row_as_list = [to_str(column_value.value) for column_value in row]
            csv_writer.writerow(row_as_list)
        gwdd_data.close()

        table = Table("data", delimiter=",")
        table.columns = [("Number", ("pk-int", )), ("Family", ("char", )),
                         ("Binomial", ("char", )),
                         ("Wood_Density", ("double", )),
                         ("Region", ("char", )),
                         ("Reference_Number", ("int", ))]
        table.pk = 'Number'
        table.contains_pk = True

        self.engine.table = table
        self.engine.create_table()
        # file_path is already a fully formatted path (see above), and the
        # engine parameter may be None, so use it directly.
        self.engine.insert_data_from_file(file_path)

        # Creating reference table file
        file_path = self.engine.format_filename("gwdd_ref.csv")
        ref_file = open_fw(file_path)
        csv_writerd = open_csvw(ref_file)
        csv_writerd.writerow(["Reference_Number", "Reference"])
        sh = book.sheet_by_index(2)
        rows = sh.nrows
        for index in range(1, rows):
            row = sh.row(index)
            # get each row and format the cell value.
            row_as_list = [
                to_str(column_value.value, object_encoding=sys.stdout)
                for column_value in row
            ]
            csv_writerd.writerow(row_as_list)
        ref_file.close()

        table = Table("reference", delimiter=",")
        table.columns = [("Reference_Number", ("pk-int", )),
                         ("Reference", ("char", ))]
        table.pk = 'Reference_Number'
        table.contains_pk = True
        self.engine.table = table
        self.engine.create_table()
        self.engine.insert_data_from_file(file_path)

        return self.engine
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        filenames = ['Aquatic_animal_excretion_data.csv',
                     'Aquatic_animal_excretion_variable_descriptions.csv']
        for file_paths in filenames:
            if not os.path.isfile(engine.format_filename(file_paths)):
                url = self.urls["aquatic_animals"]
                engine.download_files_from_archive(url, filenames, "zip")

        # processing Aquatic_animal_excretion_data.csv
        filename = 'Aquatic_animal_excretion_data.csv'
        tablename = 'aquatic_animals'
        table = Table(str(tablename), delimiter=',')
        table.columns = [
            ("index", ("pk-int",)),
            ("sourcenumber", ("int",)),
            ("sourcename", ("char",)),
            ("speciesname", ("char",)),
            ("speciescode", ("char",)),
            ("invert/vert", ("char",)),
            ("phylum", ("char",)),
            ("class", ("char",)),
            ("order", ("char",)),
            ("family", ("char",)),
            ("trophicgild", ("char",)),
            ("drymass", ("double",)),
            ("logdrymass", ("double",)),
            ("ecosystemtype", ("char",)),
            ("energysource", ("char",)),
            ("habitat", ("char",)),
            ("residentecosystem", ("char",)),
            ("temperature", ("double",)),
            ("nexcretionrate", ("double",)),
            ("pexcretionrate", ("double",)),
            ("lognexcretionrate", ("double",)),
            ("logpexcretionrate", ("double",)),
            ("incubationtime", ("double",)),
            ("nform", ("char",)),
            ("pform", ("char",)),
            ("bodyc", ("double",)),
            ("bodyn", ("double",)),
            ("bodyp", ("double",)),
            ("bodyc:n", ("double",)),
            ("bodyc:p", ("double",)),
            ("bodyn:p", ("double",)),
            ("bodydatasource", ("char",)),
            ("datasource", ("char",)),
            ("dataproviders", ("char",))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename(str(filename)))

        # processing Aquatic_animal_excretion_variable_descriptions.csv
        filename = 'Aquatic_animal_excretion_variable_descriptions.csv'
        tablename = 'variable_descriptions'
        table = Table(str(tablename), delimiter=',')
        table.columns = [
            ("Column", ("char",)),
            ("Variable", ("char",)),
            ("Description", ("char",)),
            ("Data Class", ("char",)),
            ("Units", ("char",)),
            ("Minimum_value", ("char",)),
            ("Maximum_value", ("char",)),
            ("Possible_values", ("char",)),
            ("Missing_data_symbol", ("char",)),
            ("Notes", ("char",))]
        engine.table = table
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename(str(filename)))
Exemple #19
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        self.engine.download_file(self.urls["GWDD"], "GlobalWoodDensityDatabase.xls")
        filename = os.path.basename("GlobalWoodDensityDatabase.xls")

        book = xlrd.open_workbook(self.engine.format_filename(filename))
        sh = book.sheet_by_index(1)
        rows = sh.nrows

        # Creating data table
        def format_value(s):
            s = Excel.cell_value(s)
            return str(s).title().replace("\\", "/").replace('"', '')

        lines = []
        for i in range(1, rows):
            row = sh.row(i)
            if not all(Excel.empty_cell(cell) for cell in row):
                this_line = {}
                for num, label in enumerate(["Number", "Family", "Binomial",
                                             "Wood_Density", "Region",
                                             "Reference_Number"]):
                    this_line[label] = format_value(row[num])
                lines.append(this_line)

        table = Table("data", delimiter="\t")
        table.columns=[("Number"                ,   ("pk-int",) ),
                       ("Family"                ,   ("char",)   ),
                       ("Binomial"              ,   ("char",)   ),
                       ("Wood_Density"          ,   ("double",) ),
                       ("Region"                ,   ("char",)   ),
                       ("Reference_Number"      ,   ("int",)    )]
        table.pk = 'Number'
        table.contains_pk = True

        gwdd = []
        for line in lines:
            gwdd_data = [line["Number"],
                         line["Family"],
                         line["Binomial"],
                         line["Wood_Density"],
                         line["Region"],
                         line["Reference_Number"]]
            gwdd.append(gwdd_data)

        data = ['\t'.join(gwdd_line) for gwdd_line in gwdd]
        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(data)

        # Creating reference table (reuses format_value from above)
        lines = []
        sh = book.sheet_by_index(2)
        rows = sh.nrows
        for i in range(1, rows):
            row = sh.row(i)
            if not all(Excel.empty_cell(cell) for cell in row):
                this_line = {}
                for num, label in enumerate(["Reference_Number", "Reference"]):
                    this_line[label] = format_value(row[num])
                lines.append(this_line)

        table = Table("reference", delimiter="\t")
        table.columns=[("Reference_Number"  ,   ("pk-int",) ),
                       ("Reference"         ,   ("char",)   )]
        table.pk = 'Reference_Number'
        table.contains_pk = True

        gwdd = []
        for line in lines:
            gwdd_ref = [line["Reference_Number"],
                        line["Reference"]]
            gwdd.append(gwdd_ref)

        data = ['\t'.join(gwdd_line) for gwdd_line in gwdd]
        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(data)
        
        return self.engine
    def download(self, engine=None, debug=False):
        try:
            Script.download(self, engine, debug)

            engine = self.engine

            # Species table
            table = Table("species",
                          cleanup=Cleanup(),
                          contains_pk=True,
                          header_rows=11)
            table.columns = [
                ("species_id", ("pk-int", )),
                ("AOU", ("int", )),
                ("english_common_name", ("char", 50)),
                ("french_common_name", ("char", 50)),
                ("spanish_common_name", ("char", 50)),
                ("sporder", ("char", 30)),
                ("family", ("char", 30)),
                ("genus", ("char", 30)),
                ("species", ("char", 50)),
            ]
            table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]
            engine.table = table
            engine.create_table()
            engine.insert_data_from_url(self.urls["species"])

            # Routes table
            engine.download_files_from_archive(self.urls["routes"],
                                               ["routes.csv"],
                                               archive_name="routes.zip")
            engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                     filename="routes.csv")
            engine.insert_data_from_file(engine.format_filename("routes.csv"))

            # Weather table
            engine.download_files_from_archive(self.urls["weather"],
                                               ["weather.csv"],
                                               archive_name="weather.zip")
            engine.auto_create_table(Table("weather",
                                           pk="RouteDataId",
                                           cleanup=self.cleanup_func_table),
                                     filename="weather.csv")
            engine.insert_data_from_file(engine.format_filename("weather.csv"))

            # Migrations data
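            # MigrantNonBreeder.zip contains nested zip archives; extract
            # Migrants.zip and MigrantSummary.zip into their own folders
            # before loading.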
            engine.download_files_from_archive(
                self.urls["migrants"], archive_name="MigrantNonBreeder.zip")
            engine.extract_zip(
                engine.format_filename("MigrantNonBreeder/Migrants.zip"),
                engine.format_filename("Migrant"),
            )
            engine.extract_zip(
                engine.format_filename("MigrantNonBreeder/MigrantSummary.zip"),
                engine.format_filename("MigrantSummary"),
            )

            table = Table("migrants", cleanup=Cleanup())
            table.columns = [('routedataid', ('int', )),
                             ('countrynum', ('int', )),
                             ('statenum', ('int', )), ('route', ('int', )),
                             ('rpid', ('int', )), ('year', ('int', )),
                             ('aou', ('int', )), ('stop1', ('int', )),
                             ('stop2', ('int', )), ('stop3', ('int', )),
                             ('stop4', ('int', )), ('stop5', ('int', )),
                             ('stop6', ('int', )), ('stop7', ('int', )),
                             ('stop8', ('int', )), ('stop9', ('int', )),
                             ('stop10', ('int', )), ('stop11', ('int', )),
                             ('stop12', ('int', )), ('stop13', ('int', )),
                             ('stop14', ('int', )), ('stop15', ('int', )),
                             ('stop16', ('int', )), ('stop17', ('int', )),
                             ('stop18', ('int', )), ('stop19', ('int', )),
                             ('stop20', ('int', )), ('stop21', ('int', )),
                             ('stop22', ('int', )), ('stop23', ('int', )),
                             ('stop24', ('int', )), ('stop25', ('int', )),
                             ('stop26', ('int', )), ('stop27', ('int', )),
                             ('stop28', ('int', )), ('stop29', ('int', )),
                             ('stop30', ('int', )), ('stop31', ('int', )),
                             ('stop32', ('int', )), ('stop33', ('int', )),
                             ('stop34', ('int', )), ('stop35', ('int', )),
                             ('stop36', ('int', )), ('stop37', ('int', )),
                             ('stop38', ('int', )), ('stop39', ('int', )),
                             ('stop40', ('int', )), ('stop41', ('int', )),
                             ('stop42', ('int', )), ('stop43', ('int', )),
                             ('stop44', ('int', )), ('stop45', ('int', )),
                             ('stop46', ('int', )), ('stop47', ('int', )),
                             ('stop48', ('int', )), ('stop49', ('int', )),
                             ('stop50', ('int', ))]
            engine.table = table
            engine.create_table()
            engine.insert_data_from_file(
                engine.format_filename("Migrant/Migrants.csv"))

            table = Table("migrantsummary", cleanup=Cleanup())
            table.columns = [('routedataid', ('int', )),
                             ('countrynum', ('int', )),
                             ('statenum', ('int', )), ('route', ('int', )),
                             ('rpid', ('int', )), ('year', ('int', )),
                             ('aou', ('int', )), ('count10', ('int', )),
                             ('count20', ('int', )), ('count30', ('int', )),
                             ('count40', ('int', )), ('count50', ('int', )),
                             ('stoptotal', ('int', )),
                             ('speciestotal', ('int', ))]
            engine.table = table
            engine.create_table()
            engine.insert_data_from_file(
                engine.format_filename("MigrantSummary/MigrantSummary.csv"))

            table = Table("vehicledata", cleanup=Cleanup())
            table.columns = [('routedataid', ('int', )),
                             ('countrynum', ('int', )),
                             ('statenum', ('int', )), ('route', ('int', )),
                             ('rpid', ('int', )), ('year', ('int', )),
                             ('recordedcar', ('char', )), ('car1', ('int', )),
                             ('car2', ('int', )), ('car3', ('int', )),
                             ('car4', ('int', )), ('car5', ('int', )),
                             ('car6', ('int', )), ('car7', ('int', )),
                             ('car8', ('int', )), ('car9', ('int', )),
                             ('car10', ('int', )), ('car11', ('int', )),
                             ('car12', ('int', )), ('car13', ('int', )),
                             ('car14', ('int', )), ('car15', ('int', )),
                             ('car16', ('int', )), ('car17', ('int', )),
                             ('car18', ('int', )), ('car19', ('int', )),
                             ('car20', ('int', )), ('car21', ('int', )),
                             ('car22', ('int', )), ('car23', ('int', )),
                             ('car24', ('int', )), ('car25', ('int', )),
                             ('car26', ('int', )), ('car27', ('int', )),
                             ('car28', ('int', )), ('car29', ('int', )),
                             ('car30', ('int', )), ('car31', ('int', )),
                             ('car32', ('int', )), ('car33', ('int', )),
                             ('car34', ('int', )), ('car35', ('int', )),
                             ('car36', ('int', )), ('car37', ('int', )),
                             ('car38', ('int', )), ('car39', ('int', )),
                             ('car40', ('int', )), ('car41', ('int', )),
                             ('car42', ('int', )), ('car43', ('int', )),
                             ('car44', ('int', )), ('car45', ('int', )),
                             ('car46', ('int', )), ('car47', ('int', )),
                             ('car48', ('int', )), ('car49', ('int', )),
                             ('car50', ('int', )), ('noise1', ('int', )),
                             ('noise2', ('int', )), ('noise3', ('int', )),
                             ('noise4', ('int', )), ('noise5', ('int', )),
                             ('noise6', ('int', )), ('noise7', ('int', )),
                             ('noise8', ('int', )), ('noise9', ('int', )),
                             ('noise10', ('int', )), ('noise11', ('int', )),
                             ('noise12', ('int', )), ('noise13', ('int', )),
                             ('noise14', ('int', )), ('noise15', ('int', )),
                             ('noise16', ('int', )), ('noise17', ('int', )),
                             ('noise18', ('int', )), ('noise19', ('int', )),
                             ('noise20', ('int', )), ('noise21', ('int', )),
                             ('noise22', ('int', )), ('noise23', ('int', )),
                             ('noise24', ('int', )), ('noise25', ('int', )),
                             ('noise26', ('int', )), ('noise27', ('int', )),
                             ('noise28', ('int', )), ('noise29', ('int', )),
                             ('noise30', ('int', )), ('noise31', ('int', )),
                             ('noise32', ('int', )), ('noise33', ('int', )),
                             ('noise34', ('int', )), ('noise35', ('int', )),
                             ('noise36', ('int', )), ('noise37', ('int', )),
                             ('noise38', ('int', )), ('noise39', ('int', )),
                             ('noise40', ('int', )), ('noise41', ('int', )),
                             ('noise42', ('int', )), ('noise43', ('int', )),
                             ('noise44', ('int', )), ('noise45', ('int', )),
                             ('noise46', ('int', )), ('noise47', ('int', )),
                             ('noise48', ('int', )), ('noise49', ('int', )),
                             ('noise50', ('int', ))]
            engine.table = table
            engine.create_table()
            engine.download_files_from_archive(self.urls["Vehicledata"],
                                               archive_name="VehicleData.zip")
            engine.extract_zip(
                engine.format_filename("VehicleData/VehicleData.zip"),
                engine.format_filename("VehicleData"),
            )
            engine.insert_data_from_file(
                engine.format_filename("VehicleData/VehicleData.csv"))

            # Counts table
            table = Table("counts", pk=False, delimiter=',')
            engine.download_files_from_archive(self.urls["counts"],
                                               archive_name="50-StopData.zip")
            table.columns = [("RouteDataID", ("int", )),
                             ("countrynum", ("int", )),
                             ("statenum", ("int", )), ("Route", ("int", )),
                             ("RPID", ("int", )), ("year", ("int", )),
                             ("AOU", ("int", )), ("Stop1", ("int", )),
                             ("Stop2", ("int", )), ("Stop3", ("int", )),
                             ("Stop4", ("int", )), ("Stop5", ("int", )),
                             ("Stop6", ("int", )), ("Stop7", ("int", )),
                             ("Stop8", ("int", )), ("Stop9", ("int", )),
                             ("Stop10", ("int", )), ("Stop11", ("int", )),
                             ("Stop12", ("int", )), ("Stop13", ("int", )),
                             ("Stop14", ("int", )), ("Stop15", ("int", )),
                             ("Stop16", ("int", )), ("Stop17", ("int", )),
                             ("Stop18", ("int", )), ("Stop19", ("int", )),
                             ("Stop20", ("int", )), ("Stop21", ("int", )),
                             ("Stop22", ("int", )), ("Stop23", ("int", )),
                             ("Stop24", ("int", )), ("Stop25", ("int", )),
                             ("Stop26", ("int", )), ("Stop27", ("int", )),
                             ("Stop28", ("int", )), ("Stop29", ("int", )),
                             ("Stop30", ("int", )), ("Stop31", ("int", )),
                             ("Stop32", ("int", )), ("Stop33", ("int", )),
                             ("Stop34", ("int", )), ("Stop35", ("int", )),
                             ("Stop36", ("int", )), ("Stop37", ("int", )),
                             ("Stop38", ("int", )), ("Stop39", ("int", )),
                             ("Stop40", ("int", )), ("Stop41", ("int", )),
                             ("Stop42", ("int", )), ("Stop43", ("int", )),
                             ("Stop44", ("int", )), ("Stop45", ("int", )),
                             ("Stop46", ("int", )), ("Stop47", ("int", )),
                             ("Stop48", ("int", )), ("Stop49", ("int", )),
                             ("Stop50", ("int", ))]

            part = ""
            engine.table = table
            engine.create_table()

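            # The fifty-stop data ships as ten zipped parts (Fifty1.zip ... Fifty10.zip)
            # inside 50-StopData.zip; each part is loaded separately below.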
            for part in range(1, 11):
                part = str(part)
                try:
                    print("Inserting data from part " + part + "...")
                    try:
                        # the fifty-stop files live under 1997ToPresent_SurveyWide/
                        engine.table.cleanup = Cleanup()
                        engine.extract_zip(
                            engine.format_filename(
                                "50-StopData/1997ToPresent_SurveyWide/Fifty" +
                                part + ".zip"),
                            engine.format_filename("fifty" + part + ".csv"),
                        )
                    except Exception:
                        print(
                            "fifty{}: bulk insert failed, inserting manually."
                            .format(part))
                        engine.connection.rollback()
                        engine.table.cleanup = self.cleanup_func_clean
                        engine.insert_data_from_archive(
                            self.urls["counts"] + "Fifty" + part + ".zip",
                            ["fifty" + part + ".csv"])

                except Exception:
                    print("There was an error in part " + part + ".")
                    raise

        except zipfile.BadZipfile:
            print(
                "There was an unexpected error in the Breeding Bird Survey archives."
            )
            raise

        return engine
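
The try/except cascade above is a common loader pattern: attempt a fast bulk path first, then fall back to a slower row-by-row path when it fails. A minimal, self-contained sketch of that pattern (the loader callables here are hypothetical, not the retriever API):

def load_with_fallback(path, bulk_insert, row_insert, rollback):
    """Try a fast bulk insert; fall back to row-by-row on any error."""
    try:
        bulk_insert(path)
    except Exception:
        rollback()  # undo whatever the partial bulk attempt wrote
        with open(path) as f:
            for line in f:
                row_insert(line.rstrip("\n"))
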
Exemple #21
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        # Complete Plants Checklist
        file_name = "complete_plant_checklist.csv"
        table_name = "complete_plant_checklist"
        complete_plant_url = "https://plants.sc.egov.usda.gov/java/downloadData?fileName=plantlst.txt&static=true"
        self.engine.download_file(complete_plant_url, filename=file_name)
        data_path = self.engine.format_filename(file_name)
        table = Table(table_name, delimiter=",")
        table.columns = [
            ("symbol", ("char", "7")),
            ("synonym_symbol", ("char", "7")),
            ("scientific_name_with_author", ("char", "183")),
            ("common_name", ("char", "42")),
            ("family", ("char", "30")),
        ]
        self.engine.auto_create_table(table, filename=file_name)
        self.engine.insert_data_from_file(data_path)

        # Symbols for Unknown Plants
        file_name = "symbols_unknown_plants.csv"
        table_name = "unknown_plants"
        unknown_plants_url = "https://plants.sc.egov.usda.gov/Data/unknown_plants.txt"
        self.engine.download_file(unknown_plants_url, filename=file_name)
        data_path = self.engine.format_filename(file_name)
        table = Table(table_name, delimiter=",")
        table.columns = [("symbol", ("char", "7")),
                         ("common_name", ("char", "56"))]
        self.engine.auto_create_table(table, filename=file_name)
        self.engine.insert_data_from_file(data_path)

        # State PLANTS Checklist
        base_url = "https://plants.sc.egov.usda.gov/"
        state_plant_checklist_base_url = "{base}java/stateDownload?statefips={id}"
        state_plant_checklist_file = "all_state_plant_checklist.csv"
        table_name = "state_plant_checklist"
        state_plant_checklist = [
            ("US01", "Alabama", "US"),
            ("US02", "Alaska", "US"),
            ("US05", "Arkansas", "US"),
            ("US04", "Arizona", "US"),
            ("US06", "California", "US"),
            ("US08", "Colorado", "US"),
            ("US09", "Connecticut", "US"),
            ("US10", "Delaware", "US"),
            ("US11", "District of Columbia", "US"),
            ("US12", "Florida", "US"),
            ("US13", "Georgia", "US"),
            ("US15", "Hawaii", "US"),
            ("US16", "Idaho", "US"),
            ("US17", "Illinois", "US"),
            ("US18", "Indiana", "US"),
            ("US19", "Iowa", "US"),
            ("US20", "Kansas", "US"),
            ("US21", "Kentucky", "US"),
            ("US22", "Louisiana", "US"),
            ("US23", "Maine", "US"),
            ("US24", "Maryland", "US"),
            ("US25", "Massachusetts", "US"),
            ("US26", "Michigan", "US"),
            ("US27", "Minnesota", "US"),
            ("US28", "Mississippi", "US"),
            ("US29", "Missouri", "US"),
            ("US30", "Montana", "US"),
            ("US31", "Nebraska", "US"),
            ("US32", "Nevada", "US"),
            ("US33", "New Hampshire", "US"),
            ("US34", "New Jersey", "US"),
            ("US35", "New Mexico", "US"),
            ("US36", "New York", "US"),
            ("US37", "North Carolina", "US"),
            ("US38", "North Dakota", "US"),
            ("US39", "Ohio", "US"),
            ("US40", "Oklahoma", "US"),
            ("US41", "Oregon", "US"),
            ("US42", "Pennsylvania", "US"),
            ("US44", "Rhode Island", "US"),
            ("US45", "South Carolina", "US"),
            ("US46", "South Dakota", "US"),
            ("US47", "Tennessee", "US"),
            ("US48", "Texas", "US"),
            ("US49", "Utah", "US"),
            ("US50", "Vermont", "US"),
            ("US51", "Virginia", "US"),
            ("US53", "Washington", "US"),
            ("US54", "West Virginia", "US"),
            ("US55", "Wisconsin", "US"),
            ("US56", "Wyoming", "US"),
            ("US72", "Puerto Rico", "US"),
            ("US78", "Virgin Islands", "US"),
            ("CA01", "Alberta", "Canada"),
            ("CA02", "British Columbia", "Canada"),
            ("CA03", "Manitoba", "Canada"),
            ("CA04", "New Brunswick", "Canada"),
            ("CALB", "Labrador", "Canada"),
            ("CANF", "Newfoundland", "Canada"),
            ("CA13", "Northwest Territories", "Canada"),
            ("CA07", "Nova Scotia", "Canada"),
            ("CA14", "Nunavut", "Canada"),
            ("CA08", "Ontario", "Canada"),
            ("CA09", "Prince Edward Island", "Canada"),
            ("CA10", "Québec", "Canada"),
            ("CA11", "Saskatchewan", "Canada"),
            ("CA12", "Yukon", "Canada"),
            ("GL", "Greenland", "Denmark"),
            ("SB", "St. Pierre and Miquelon", "France"),
        ]

        with open_fw(engine.format_filename(
                state_plant_checklist_file)) as write_object:
            csv_writer = open_csvw(write_object)
            for state_info in state_plant_checklist:
                file_name = state_info[1].replace(".", "").replace(
                    " ", "_").lower() + ".csv"
                file_name = "old_state_plant_checklist_" + file_name
                state_url = state_plant_checklist_base_url.format(
                    base=base_url, id=state_info[0])
                self.engine.download_file(state_url, filename=file_name)
                with open_fr(engine.format_filename(file_name)) as read_object:
                    # Read state file and only write the data minus header
                    next(read_object)
                    for row in csv.reader(read_object, delimiter=","):
                        csv_writer.writerow([state_info[2]] + [state_info[1]] +
                                            row)

        data_path = self.engine.format_filename(state_plant_checklist_file)
        table = Table(table_name, delimiter=",", header_rows=0)
        table.columns = [
            ("country", ("char", "7")),
            ("state", ("char", "23")),
            ("symbol", ("char", "7")),
            ("synonym_symbol", ("char", "7")),
            ("scientific_name_with_author", ("char", "183")),
            ("national_common_name", ("char", "42")),
            ("family", ("char", "17")),
        ]
        self.engine.auto_create_table(table,
                                      filename=state_plant_checklist_file)
        self.engine.insert_data_from_file(data_path)

        # NRCS State GSAT Lists
        base_url = "https://www.plants.usda.gov/"
        nrcs_state_gsat_base_url = "{base}java/gsatDownload?gsatid={id}"
        nrcs_state_gsat_file = "all_nrcs_state_gsat.csv"
        table_name = "nrcs_state_gsat"
        nrcs_state_gsat = [
            ("Alabama", "2"),
            ("Alaska", ""),
            ("Arkansas", ""),
            ("Arizona", "2"),
            ("California", ""),
            ("Colorado", ""),
            ("Connecticut", ""),
            ("Delaware", ""),
            ("Florida", ""),
            ("Georgia", ""),
            ("Hawaii", ""),
            ("Idaho", "9"),
            ("Illinois", ""),
            ("Indiana", ""),
            ("Iowa ", ""),
            ("Kansas", "6"),
            ("Kentucky", ""),
            ("Louisiana", "16"),
            ("Maine", ""),
            ("Maryland", ""),
            ("Massachusetts", ""),
            ("Michigan", ""),
            ("Minnesota", "11"),
            ("Mississippi", ""),
            ("Missouri", "14"),
            ("Montana", ""),
            ("Nebraska", "17"),
            ("Nevada", "4"),
            ("New Hampshire", ""),
            ("New Jersey ", ""),
            ("New Mexico", "1"),
            ("New York", ""),
            ("Noth Carolina", ""),
            ("North Dakota", "5"),
            ("Ohio", ""),
            ("Oklahoma", "12"),
            ("Oregon", "3"),
            ("Pennsylvania", "15"),
            ("Rhode Island", ""),
            ("South Carolina", ""),
            ("South Dakota", "7"),
            ("Tennessee", ""),
            ("Texas", "13"),
            ("Utah", ""),
            ("Vermont ", ""),
            ("Virginia", ""),
            ("Washington", "8"),
            ("West Virginia", ""),
            ("Wisconsin", ""),
            ("Wyoming", "10"),
        ]

        with open_fw(
                engine.format_filename(nrcs_state_gsat_file)) as write_object:
            for state_info in nrcs_state_gsat:
                if state_info[1]:
                    # skip states with an empty gsat id, i.e. no data
                    file_name = state_info[0].replace(" ", "_").replace(
                        ".", "").lower() + ".csv"
                    file_name = "old_nrcs_state_gsat_" + file_name
                    state_url = nrcs_state_gsat_base_url.format(
                        base=base_url, id=state_info[1])
                    self.engine.download_file(state_url, filename=file_name)
                    with open_fr(
                            engine.format_filename(file_name)) as read_object:
                        # Read state file and only write the data minus header
                        next(read_object)
                        state_quoted = '"{state}",'.format(state=state_info[0])
                        for line in read_object:
                            write_object.write(state_quoted + line)

        data_path = self.engine.format_filename(nrcs_state_gsat_file)
        table = Table(table_name, delimiter=",", header_rows=0)
        table.columns = [
            ("state", ("char", "12")),
            ("symbol", ("char", "7")),
            ("scientific_name_with_author", ("char", "183")),
            ("gsat_common_name", ("char", "93")),
        ]
        self.engine.auto_create_table(table, filename=nrcs_state_gsat_file)
        self.engine.insert_data_from_file(data_path)

        base_url = "https://plants.sc.egov.usda.gov/"
        nrcs_state_plant_lists_url = "{base}java/nrcsStateDownload?statefips={id}"
        nrcs_state_plant_file = "all_nrcs_state_plant.csv"
        table_name = "nrcs_state_plant"
        nrcs_state_plant_lists = [
            ("01", "Alabama"),
            ("02", "Alaska"),
            ("05", "Arkansas"),
            ("04", "Arizona"),
            ("06", "California"),
            ("08", "Colorado"),
            ("09", "Connecticut"),
            ("10", "Delaware"),
            ("12", "Florida"),
            ("13", "Georgia"),
            ("15", "Hawaii"),
            ("16", "Idaho"),
            ("17", "Illinois"),
            ("18", "Indiana"),
            ("19", "Iowa"),
            ("20", "Kansas"),
            ("21", "Kentucky"),
            ("22", "Louisiana"),
            ("23", "Maine"),
            ("24", "Maryland"),
            ("25", "Massachusetts"),
            ("26", "Michigan"),
            ("27", "Minnesota"),
            ("28", "Mississippi"),
            ("29", "Missouri"),
            ("30", "Montana"),
            ("31", "Nebraska"),
            ("32", "Nevada"),
            ("33", "New Hampshire"),
            ("34", "New Jersey"),
            ("35", "New Mexico"),
            ("36", "New York"),
            ("37", "North Carolina"),
            ("38", "North Dakota"),
            ("39", "Ohio"),
            ("40", "Oklahoma"),
            ("41", "Oregon"),
            ("42", "Pennsylvania"),
            ("44", "Rhode Island"),
            ("45", "South Carolina"),
            ("46", "South Dakota"),
            ("47", "Tennessee"),
            ("48", "Texas"),
            ("49", "Utah"),
            ("50", "Vermont"),
            ("51", "Virginia"),
            ("53", "Washington"),
            ("54", "West Virginia"),
            ("55", "Wisconsin"),
            ("56", "Wyoming"),
            ("72", "Puerto Rico"),
            ("78", "Virgin Islands"),
        ]

        with open_fw(
                engine.format_filename(nrcs_state_plant_file)) as write_object:
            for state_info in nrcs_state_plant_lists:
                file_name = state_info[1].replace(" ", "_").replace(
                    ".", "").lower() + ".csv"
                file_name = "old_nrcs_state_plant_" + file_name
                state_url = nrcs_state_plant_lists_url.format(base=base_url,
                                                              id=state_info[0])
                self.engine.download_file(state_url, filename=file_name)
                with open_fr(engine.format_filename(file_name)) as read_object:
                    # Read state file and only write the data minus header
                    next(read_object)
                    state_quoted = '"{state}",'.format(state=state_info[1])
                    for line in read_object:
                        write_object.write(state_quoted + line)

        data_path = self.engine.format_filename(nrcs_state_plant_file)
        table = Table(table_name, delimiter=",", header_rows=0)
        table.columns = [
            ("state", ("char", "17")),
            ("symbol", ("char", "7")),
            ("synonym_symbol", ("char", "7")),
            ("scientific_name_with_author", ("char", "183")),
            ("state_common_name", ("char", "42")),
            ("family", ("char", "17")),
        ]
        self.engine.auto_create_table(table, filename=nrcs_state_plant_file)
        self.engine.insert_data_from_file(data_path)
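
Each of the three per-state merges above repeats one move: read a state file, drop its header row, prepend the state (and country) fields, and append the rows to a single combined csv. A minimal sketch of that pattern, with hypothetical file names:

import csv

def merge_state_files(state_files, combined_path):
    """state_files: iterable of (state_name, csv_path) pairs."""
    with open(combined_path, "w", newline="") as out:
        writer = csv.writer(out)
        for state, path in state_files:
            with open(path, newline="") as src:
                reader = csv.reader(src)
                next(reader)  # skip the per-file header row
                for row in reader:
                    writer.writerow([state] + row)
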
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        engine.download_files_from_archive(self.urls["capture"], archive_type="zip")

        # Convert xlsx to csv.
        xlsx_file = self.engine.format_filename("DSD_FI_CAPTURE.xlsx")
        file_path = self.engine.format_filename("DSD_CAPTURE.csv")
        book = xlrd.open_workbook(xlsx_file)
        sh = book.sheet_by_index(0)
        rows = sh.nrows

        # Creating data files
        new_data = open_fw(file_path)
        csv_writer = open_csvw(new_data)
        csv_writer.writerow(["Order", "Concept_id",
                             "Role_Type", "Codelist_id",
                             "Codelist_Code_id", "Description"])

        for index in range(2, rows):
            row = sh.row(index)
            # Get each row and format each cell value.
            # Data starts at row index 2
            row_as_list = [to_str(column_value.value) for column_value in row]
            csv_writer.writerow(row_as_list)
        new_data.close()

        file_names = [
            ('CL_FI_UNIT.csv', 'unit_data'),
            ('CL_FI_WATERAREA_GROUPS.csv', 'waterarea_groups'),
            ('DSD_CAPTURE.csv', 'dsd_capture_data'),
            ('CL_FI_SPECIES_GROUPS.csv', 'species_group')
        ]

        for (filename, tablename) in file_names:
            data_path = self.engine.format_filename(filename)
            table = Table(tablename, delimiter=',', cleanup=self.cleanup_func_table)
            self.engine.auto_create_table(table, filename=filename)
            self.engine.insert_data_from_file(data_path)

        # CL_FI_COUNTRY_GROUPS.csv mixes encodings, so its columns are defined explicitly
        file_names_encoded = [
            ('CL_FI_COUNTRY_GROUPS.csv', 'country_groups'),
        ]
        for (filename, tablename) in file_names_encoded:
            data_path = self.engine.format_filename(filename)
            table = Table(tablename, delimiter=',', cleanup=self.cleanup_func_table)
            table.columns = [('UN_Code', ('int', )),
                             ('Identifier', ('int', )),
                             ('ISO2_Code', ('char', '5')),
                             ('ISO3_Code', ('char', '5')),
                             ('Name_En', ('char', '50')),
                             ('Name_Fr', ('char', '50')),
                             ('Name_Es', ('char', '50')),
                             ('Name_Ar', ('char', '120')),
                             ('Name_Cn', ('char', '90')),
                             ('Name_Ru', ('char', '150')),
                             ('Official_Name_En', ('char', '70')),
                             ('Official_Name_Fr', ('char', '70')),
                             ('Official_Name_Es', ('char', '70')),
                             ('Official_Name_Ar', ('char', '1100')),
                             ('Official_Name_Cn', ('char', '70')),
                             ('Official_Name_Ru', ('char', '130')),
                             ('Continent_Group', ('char', '15')),
                             ('EcoClass_Group', ('char', '50')),
                             ('GeoRegion_Group', ('char', '30'))]
            self.engine.auto_create_table(table, filename=filename)
            self.engine.insert_data_from_file(data_path)

        # TS_FI_CAPTURE also needs its columns defined explicitly
        file_names_encoded = [
            ('TS_FI_CAPTURE.csv', 'ts_capture_data',)
        ]
        for (filename, tablename) in file_names_encoded:
            data_path = self.engine.format_filename(filename)
            table = Table(tablename, delimiter=',', cleanup=self.cleanup_func_table)
            table.columns = [('COUNTRY', ('int', )),
                             ('FISHING_AREA', ('int', )),
                             ('SPECIES', ('char', '10')),
                             ('YEAR', ('int', )),
                             ('UNIT', ('char', '5')),
                             ('QUANTITY', ('double', )),
                             ('SYMBOL', ('char', '4'))]
            self.engine.auto_create_table(table, filename=filename)
            self.engine.insert_data_from_file(data_path)
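
One caveat on the xlsx-to-csv step above: xlrd dropped .xlsx support in version 2.0, so the open_workbook call on DSD_FI_CAPTURE.xlsx needs xlrd < 2.0. An equivalent sheet-to-csv sketch using openpyxl instead (an assumption for newer environments, not what this script uses):

import csv
from openpyxl import load_workbook

def sheet_to_csv(xlsx_path, csv_path, sheet_index=0, skip_rows=2):
    """Copy one worksheet to csv, skipping leading header rows."""
    ws = load_workbook(xlsx_path, read_only=True).worksheets[sheet_index]
    with open(csv_path, "w", newline="") as out:
        writer = csv.writer(out)
        for i, row in enumerate(ws.iter_rows(values_only=True)):
            if i < skip_rows:
                continue
            writer.writerow(["" if v is None else v for v in row])
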
Exemple #23
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine
        engine.download_files_from_archive(self.urls["data"], ["Data_Files/Amniote_Database_Aug_2015.csv",
                                                               "Data_Files/Amniote_Database_References_Aug_2015.csv",
                                                               "Data_Files/Amniote_Range_Count_Aug_2015.csv"],
                                           filetype="zip")

        ct_column = 'trait'  # all tables use the same ct_column name
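        # ct_column/ct_names drive the retriever's cross-tab handling: the wide
        # per-trait columns are folded into long (trait, trait_value) rows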

        # Create tables from Amniote_Database_Aug_2015.csv and Amniote_Database_References_Aug_2015.csv
        # Both the reference and main tables have the same headers

        ct_names = ['female_maturity_d', 'litter_or_clutch_size_n', 'litters_or_clutches_per_y', 'adult_body_mass_g',
                    'maximum_longevity_y', 'gestation_d', 'weaning_d', 'birth_or_hatching_weight_g', 'weaning_weight_g',
                    'egg_mass_g', 'incubation_d', 'fledging_age_d', 'longevity_y', 'male_maturity_d',
                    'inter_litter_or_interbirth_interval_y', 'female_body_mass_g', 'male_body_mass_g',
                    'no_sex_body_mass_g', 'egg_width_mm', 'egg_length_mm', 'fledging_mass_g', 'adult_svl_cm',
                    'male_svl_cm', 'female_svl_cm', 'birth_or_hatching_svl_cm', 'female_svl_at_maturity_cm',
                    'female_body_mass_at_maturity_g', 'no_sex_svl_cm', 'no_sex_maturity_d']

        # Create table main from Amniote_Database_Aug_2015.csv

        columns = [
            ('record_id', ('pk-auto',)), ('class', ('char', '20')), ('order', ('char', '20')),
            ('family', ('char', '20')), ('genus', ('char', '20')), ('species', ('char', '50')),
            ('subspecies', ('char', '20')), ('common_name', ('char', '400')), ('trait_value', ('ct-double',))]
        table_main = Table('main', delimiter=',', cleanup=self.cleanup_func_table)
        table_main.ct_column = ct_column
        table_main.ct_names = ct_names
        table_main.columns = columns
        engine.auto_create_table(table_main,
                                 filename="Amniote_Database_Aug_2015.csv")
        engine.insert_data_from_file(engine.format_filename("Amniote_Database_Aug_2015.csv"))

        # Create table reference from Amniote_Database_References_Aug_2015.csv
        reference_columns = [
            ('record_id', ('pk-auto',)), ('class', ('char', '20')), ('order', ('char', '20')),
            ('family', ('char', '20')), ('genus', ('char', '20')), ('species', ('char', '50')),
            ('subspecies', ('char', '20')), ('common_name', ('char', '400')), ('reference', ('ct-char',))]

        table_references = Table('references', delimiter=',', cleanup=self.cleanup_func_table)
        table_references.ct_column = ct_column
        table_references.ct_names = ct_names
        table_references.columns = reference_columns
        engine.auto_create_table(table_references,
                                 filename="Amniote_Database_References_Aug_2015.csv")
        engine.insert_data_from_file(engine.format_filename("Amniote_Database_References_Aug_2015.csv"))

        # Create table Range
        # This table has different values for headers from the above tables.
        range_ct_names = ["min_female_maturity", "max_female_maturity", "count_female_maturity", "min_litter_clutch_size",
                    "max_litter_clutch_size", "count_litter_clutch_size", "min_litters_clutches",
                    "max_litters_clutches", "count_litters_clutches", "min_adult_body_mass", "max_adult_body_mass",
                    "count_adult_body_mass", "min_maximum_longevity", "max_maximum_longevity",
                    "count_maximum_longevity", "min_gestation", "max_gestation", "count_gestation", "min_weaning",
                    "max_weaning", "count_weaning", "min_birth_hatching_weight", "max_birth_hatching_weight",
                    "count_birth_hatching_weight", "min_weaning_weight", "max_weaning_weight", "count_weaning_weight",
                    "min_egg_mass", "max_egg_mass", "count_egg_mass", "min_incubation", "max_incubation",
                    "count_incubation", "min_fledging_age", "max_fledging_age", "count_fledging_age",
                    "min_male_maturity", "max_male_maturity", "count_male_maturity",
                    "min_inter_litter_interbirth_interval", "max_inter_litter_interbirth_interval",
                    "count_inter_litter_interbirth_interval", "min_female_body_mass", "max_female_body_mass",
                    "count_female_body_mass", "min_male_body_mass", "max_male_body_mass", "count_male_body_mass",
                    "min_no_sex_body_mass", "max_no_sex_body_mass", "count_no_sex_body_mass", "min_egg_width",
                    "max_egg_width", "count_egg_width", "min_egg_length", "max_egg_length", "count_egg_length",
                    "min_fledging_mass", "max_fledging_mass", "count_fledging_mass", "min_adult_svl", "max_adult_svl",
                    "count_adult_svl", "min_male_svl", "max_male_svl", "count_male_svl", "min_female_svl",
                    "max_female_svl", "count_female_svl", "min_hatching_svl", "max_hatching_svl", "count_hatching_svl",
                    "min_female_svl_at_maturity", "max_female_svl_at_maturity", "count_female_svl_at_maturity",
                    "min_female_body_mass_at_maturity", "max_female_body_mass_at_maturity",
                    "count_female_body_mass_at_maturity", "min_no_sex_svl", "max_no_sex_svl", "count_no_sex_svl",
                    "min_no_sex_maturity", "max_no_sex_maturity", "count_no_sex_maturity"]
        range_columns = [
            ('record_id', ('pk-auto',)), ('classx', ('char', '20')), ('orderx', ('char', '20')),
            ('familyx', ('char', '20')), ('genus', ('char', '20')), ('species', ('char', '50')),
            ('subspecies', ('char', '20')), ('common_name', ('char', '400')), ('trait_value', ('ct-double',))]

        table_range = Table('range', delimiter=',', cleanup=self.cleanup_func_table)
        table_range.ct_column = ct_column
        table_range.ct_names = range_ct_names
        table_range.columns = range_columns
        engine.auto_create_table(table_range,
                                 filename="Amniote_Range_Count_Aug_2015.csv")
        engine.insert_data_from_file(engine.format_filename("Amniote_Range_Count_Aug_2015.csv"))
Exemple #24
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        engine = self.engine
        csv_files = []
        request_src = "http://www.data-retriever.org/"
        base_url = "http://www.usanpn.org/npn_portal/observations/getObservations.xml?start_date={startYear}&end_date={endYear_date}&request_src={request_src}"
        header_values = ["observation_id",
                         "update_datetime",
                         "site_id",
                         "latitude",
                         "longitude",
                         "elevation_in_meters",
                         "state",
                         "species_id",
                         "genus",
                         "species",
                         "common_name",
                         "kingdom",
                         "individual_id",
                         "phenophase_id",
                         "phenophase_description",
                         "observation_date",
                         "day_of_year",
                         "phenophase_status",
                         "intensity_category_id",
                         "intensity_value",
                         "abundance_value"
                         ]

        columns = [("record_id", ("pk-auto",)),
                   ("observation_id", ("int",)),  # subsequently refered to as "status record"
                   ("update_datetime", ("char",)),
                   ("site_id", ("int",)),
                   ("latitude", ("double",)),
                   ("longitude", ("double",)),
                   ("elevation_in_meters", ("char",)),
                   ("state", ("char",)),
                   ("species_id", ("int",)),
                   ("genus", ("char",)),
                   ("species", ("char",)),
                   ("common_name", ("char",)),
                   ("kingdom", ("char",)),  # skip kingdom
                   ("individual_id", ("char",)),
                   ("phenophase_id", ("int",)),
                   ("phenophase_description", ("char",)),
                   ("observation_date", ("char",)),
                   ("day_of_year", ("char",)),
                   ("phenophase_status", ("char",)),
                   ("intensity_category_id", ("char",)),
                   ("intensity_value", ("char",)),
                   ("abundance_value", ("char",))
                   ]

        start_date = datetime.date(2009, 1, 1)
        end_date = datetime.date.today()

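        # Fetch observations in ~90-day windows, one API request per window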
        while start_date < end_date:
            to_date = start_date + datetime.timedelta(90)
            if to_date >= end_date:
                data_url = base_url.format(startYear=str(start_date), endYear_date=str(end_date),
                                           request_src=request_src)
            else:
                data_url = base_url.format(startYear=str(start_date), endYear_date=str(to_date),
                                           request_src=request_src)

            xml_file_name = '{}'.format(start_date) + ".xml"
            engine.download_file(data_url, xml_file_name)

            # Create a csv file for this 3-month window
            csv_observation = '{}'.format(start_date) + ".csv"
            csv_files.append(csv_observation)
            csv_buff = open_fw(engine.format_filename(csv_observation))
            csv_writer = open_csvw(csv_buff)

            csv_writer.writerow(header_values)

            # Parse xml to read data
            file_read = ""
            fname = DATA_WRITE_PATH.strip('{dataset}') + 'NPN/' + xml_file_name
            with open(fname, 'r') as fp1:
                file_read = fp1.read()

            root = ET.fromstring(file_read)

            for elements in root:
                index_map = {val: i for i, val in enumerate(header_values)}
                diction = sorted(elements.attrib.items(), key=lambda pair: index_map[pair[0]])
                csv_writer.writerow([x[1] for x in diction])

            csv_buff.close()
            start_date = to_date + datetime.timedelta(1)

        # Create table
        table = Table('observations', delimiter=',', pk='record_id', contains_pk=True)
        table.columns = columns
        engine.table = table
        engine.create_table()
        for data_file in csv_files:
            engine.insert_data_from_file(engine.find_file(data_file))
        return engine
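
The index_map/sorted step above is the one non-obvious move: XML attributes arrive in arbitrary order, so each element's attributes are sorted into the csv header's order before writing. A self-contained sketch with a hypothetical two-column header:

import xml.etree.ElementTree as ET

header = ["observation_id", "site_id"]
index_map = {name: i for i, name in enumerate(header)}

root = ET.fromstring(
    '<results><obs site_id="7" observation_id="42"/></results>')
for element in root:
    ordered = sorted(element.attrib.items(),
                     key=lambda pair: index_map[pair[0]])
    print([value for _, value in ordered])  # ['42', '7']
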
Exemple #25
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        engine = self.engine
        csv_files = []
        request_src = "http://www.data-retriever.org/"
        base_url = "http://www.usanpn.org/npn_portal/observations/getObservations.xml?start_date={startYear}&end_date={endYear_date}&request_src={request_src}"
        header_values = [
            "observation_id", "update_datetime", "site_id", "latitude",
            "longitude", "elevation_in_meters", "state", "species_id", "genus",
            "species", "common_name", "kingdom", "individual_id",
            "phenophase_id", "phenophase_description", "observation_date",
            "day_of_year", "phenophase_status", "intensity_category_id",
            "intensity_value", "abundance_value"
        ]

        columns = [
            ("record_id", ("pk-auto", )),
            ("observation_id",
             ("int", )),  # subsequently refered to as "status record"
            ("update_datetime", ("char", )),
            ("site_id", ("int", )),
            ("latitude", ("double", )),
            ("longitude", ("double", )),
            ("elevation_in_meters", ("char", )),
            ("state", ("char", )),
            ("species_id", ("int", )),
            ("genus", ("char", )),
            ("species", ("char", )),
            ("common_name", ("char", )),
            ("kingdom", ("char", )),  # skip kingdom
            ("individual_id", ("char", )),
            ("phenophase_id", ("int", )),
            ("phenophase_description", ("char", )),
            ("observation_date", ("char", )),
            ("day_of_year", ("char", )),
            ("phenophase_status", ("char", )),
            ("intensity_category_id", ("char", )),
            ("intensity_value", ("char", )),
            ("abundance_value", ("char", ))
        ]

        start_date = datetime.date(2009, 1, 1)
        end_date = datetime.date.today()

        while start_date < end_date:
            to_date = start_date + datetime.timedelta(90)
            if to_date >= end_date:
                data_url = base_url.format(startYear=str(start_date),
                                           endYear_date=str(end_date),
                                           request_src=request_src)
            else:
                data_url = base_url.format(startYear=str(start_date),
                                           endYear_date=str(to_date),
                                           request_src=request_src)

            xml_file_name = '{}'.format(start_date) + ".xml"
            engine.download_file(data_url, xml_file_name)

            # Create a csv file for this 3-month window
            csv_observation = '{}'.format(start_date) + ".csv"
            csv_files.append(csv_observation)
            csv_buff = open_fw(engine.format_filename(csv_observation))
            csv_writer = open_csvw(csv_buff)

            csv_writer.writerow(header_values)

            # Parse xml to read data
            file_read = ""
            fname = DATA_WRITE_PATH.strip('{dataset}') + 'NPN/' + xml_file_name
            with open(fname, 'r') as fp1:
                file_read = fp1.read()

            root = ET.fromstring(file_read)

            for elements in root:
                index_map = {val: i for i, val in enumerate(header_values)}
                diction = sorted(elements.attrib.items(),
                                 key=lambda pair: index_map[pair[0]])
                csv_writer.writerow([x[1] for x in diction])

            csv_buff.close()
            start_date = to_date + datetime.timedelta(1)

        # Create table
        table = Table('observations',
                      delimiter=',',
                      pk='record_id',
                      contains_pk=True)
        table.columns = columns
        engine.table = table
        engine.create_table()
        for data_file in csv_files:
            engine.insert_data_from_file(engine.find_file(data_file))
        return engine
Exemple #26
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        self.engine.auto_create_table(Table("sites"),
                                      url=self.urls["sites"],
                                      filename='gentry_sites.csv')
        self.engine.insert_data_from_url(self.urls["sites"])

        self.engine.download_file(self.urls["stems"], "all_Excel.zip")
        local_zip = zipfile.ZipFile(
            self.engine.format_filename("all_Excel.zip"))
        filelist = local_zip.namelist()
        local_zip.close()
        self.engine.download_files_from_archive(self.urls["stems"], filelist)

        filelist = [os.path.basename(filename) for filename in filelist]

        # Currently all_Excel.zip is missing CURUYUQU.xls
        # Download it separately and add it to the file list
        if not self.engine.find_file('CURUYUQU.xls'):
            self.engine.download_file(
                "http://www.mobot.org/mobot/gentry/123/samerica/CURUYUQU.xls",
                "CURUYUQU.xls")
            filelist.append('CURUYUQU.xls')

        lines = []
        tax = []
        for filename in filelist:
            print("Extracting data from " + filename + "...")
            book = xlrd.open_workbook(self.engine.format_filename(filename))
            sh = book.sheet_by_index(0)
            rows = sh.nrows
            cn = {'stems': []}
            for colnum, c in enumerate(sh.row(0)):
                if not Excel.empty_cell(c):
                    cid = c.value.lower().strip()
                    # line number column is sometimes named differently
                    if cid in ["sub", "number"]:
                        cid = "line"
                    # the "number of individuals" column is named in various
                    # different ways; they always at least contain "nd"
                    if "nd" in cid:
                        cid = "count"
                    # in QUIAPACA.xls the "number of individuals" column is
                    # misnamed "STEMDBH" just like the stems columns, so weep
                    # for the state of scientific data and then fix manually
                    if filename == "QUIAPACA.xls" and colnum == 13:
                        cid = "count"

                    # if column is a stem, add it to the list of stems;
                    # otherwise, make note of the column name/number
                    if "stem" in cid or "dbh" in cid:
                        cn["stems"].append(colnum)
                    else:
                        cn[cid] = colnum
            # sometimes, a data file does not contain a liana or count column
            if "liana" not in cn:
                cn["liana"] = -1
            if "count" not in cn:
                cn["count"] = -1
            for i in range(1, rows):
                row = sh.row(i)
                # make sure the row is real, not just empty cells
                if not all(Excel.empty_cell(cell) for cell in row):
                    try:
                        this_line = {}

                        # get the following information from the appropriate columns
                        for key in [
                                "line", "family", "genus", "species", "liana",
                                "count"
                        ]:
                            if cn[key] > -1:
                                if row[cn[key]].ctype != 2:
                                    # if the cell type (ctype) is not a number
                                    this_line[key] = row[
                                        cn[key]].value.lower().strip().replace(
                                            "\\", "/").replace('"', '')
                                else:
                                    this_line[key] = row[cn[key]].value
                                if this_line[key] == '`':
                                    this_line[key] = 1
                        this_line["stems"] = [
                            row[c] for c in cn["stems"]
                            if not Excel.empty_cell(row[c])
                        ]
                        this_line["site"] = filename[0:-4]

                        # Manually correct CEDRAL data, which has a single line
                        # that is shifted by one to the left starting at Liana
                        if this_line["site"] == "CEDRAL" and type(
                                this_line["liana"]) == float:
                            this_line["liana"] = ""
                            this_line["count"] = 3
                            this_line["stems"] = [2.5, 2.5, 30, 18, 25]

                        lines.append(this_line)

                        # Check how far the species is identified
                        full_id = 0
                        if len(this_line["species"]) < 3:
                            if len(this_line["genus"]) < 3:
                                id_level = "family"
                            else:
                                id_level = "genus"
                        else:
                            id_level = "species"
                            full_id = 1
                        tax.append(
                            (this_line["family"], this_line["genus"],
                             this_line["species"], id_level, str(full_id)))
                    except Exception:
                        raise

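        # Deduplicate the collected (family, genus, species, ...) tuples and
        # assign each unique group an integer id for the species table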
        tax = sorted(
            tax, key=lambda group: group[0] + " " + group[1] + " " + group[2])
        unique_tax = []
        tax_dict = {}
        tax_count = 0

        # Get all unique families/genera/species
        print("\n")
        for group in tax:
            if not (group in unique_tax):
                unique_tax.append(group)
                tax_count += 1
                tax_dict[group[0:3]] = tax_count
                if tax_count % 10 == 0:
                    msg = "Generating taxonomic groups: " + str(
                        tax_count) + " / " + str(TAX_GROUPS)
                    sys.stdout.flush()
                    sys.stdout.write(msg + "\b" * len(msg))
        print("\n")
        # Create species table
        table = Table("species", delimiter=",")
        table.columns = [("species_id", ("pk-int", )), ("family", ("char", )),
                         ("genus", ("char", )), ("species", ("char", )),
                         ("id_level", ("char", 10)), ("full_id", ("int", ))]

        data = [[str(tax_dict[group[:3]])] + ['"%s"' % g for g in group]
                for group in unique_tax]
        table.pk = 'species_id'
        table.contains_pk = True

        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(data)

        # Create stems table
        table = Table("stems", delimiter=",")
        table.columns = [("stem_id", ("pk-auto", )), ("line", ("int", )),
                         ("species_id", ("int", )),
                         ("site_code", ("char", 12)), ("liana", ("char", 10)),
                         ("stem", ("double", ))]
        stems = []
        counts = []
        for line in lines:
            try:
                liana = line["liana"]
            except KeyError:
                liana = ""
            species_info = [
                line["line"],
                tax_dict[(line["family"], line["genus"], line["species"])],
                line["site"], liana
            ]
            try:
                counts.append(
                    [value for value in species_info + [line["count"]]])
            except KeyError:
                pass

            for i in line["stems"]:
                stem = species_info + [str(i)]
                stems.append(stem)

        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(stems)

        # Create counts table
        table = Table("counts", delimiter=",", contains_pk=False)
        table.columns = [("count_id", ("pk-auto", )), ("line", ("int", )),
                         ("species_id", ("int", )),
                         ("site_code", ("char", 12)), ("liana", ("char", 10)),
                         ("count", ("double", ))]
        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(counts)

        return self.engine
Exemple #27
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        if hasattr(sys, 'setdefaultencoding'):
            # Python 2 only: reload sys to restore setdefaultencoding
            reload(sys)
            sys.setdefaultencoding("utf-8")

        self.engine.download_file(self.urls["GWDD"], "GlobalWoodDensityDatabase.xls")
        filename = os.path.basename("GlobalWoodDensityDatabase.xls")
        book = xlrd.open_workbook(self.engine.format_filename(filename))
        sh = book.sheet_by_index(1)
        rows = sh.nrows

        # Creating data files
        file_path = self.engine.format_filename("gwdd_data.csv")
        gwdd_data = open_fw(file_path)
        csv_writer = open_csvw(gwdd_data)
        csv_writer.writerow(["Number", "Family", "Binomial", "Wood_Density", "Region", "Reference_Number"])

        for index in range(1, rows):
            row = sh.row(index)
            # get each row and format each cell value.
            row_as_list = [to_str(column_value.value) for column_value in row]
            csv_writer.writerow(row_as_list)
        gwdd_data.close()

        table = Table("data", delimiter=",")
        table.columns = [("Number", ("pk-int",)),
                         ("Family", ("char",)),
                         ("Binomial", ("char",)),
                         ("Wood_Density", ("double",)),
                         ("Region", ("char",)),
                         ("Reference_Number", ("int",))]
        table.pk = 'Number'
        table.contains_pk = True

        self.engine.table = table
        self.engine.create_table()
        self.engine.insert_data_from_file(file_path)

        # Creating reference table file
        file_path = self.engine.format_filename("gwdd_ref.csv")
        ref_file = open_fw(file_path)
        csv_writerd = open_csvw(ref_file)
        csv_writerd.writerow(["Reference_Number", "Reference"])
        sh = book.sheet_by_index(2)
        rows = sh.nrows
        for index in range(1, rows):
            row = sh.row(index)
            # get each row and format each cell value.
            row_as_list = [to_str(column_value.value, object_encoding=sys.stdout) for column_value in row]
            csv_writerd.writerow(row_as_list)
        ref_file.close()

        table = Table("reference", delimiter=",")
        table.columns = [("Reference_Number", ("pk-int",)), ("Reference", ("char",))]
        table.pk = 'Reference_Number'
        table.contains_pk = True
        self.engine.table = table
        self.engine.create_table()
        self.engine.insert_data_from_file(file_path)

        return self.engine
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        # Download both full and abbreviated versions and extract the data files
        abbrev_version = ["ABBREV.txt"]
        full_version = [
            "DERIV_CD.txt", "FOOTNOTE.txt", "NUTR_DEF.txt", "WEIGHT.txt",
            "DATA_SRC.txt", "FD_GROUP.txt", "LANGDESC.txt", "NUT_DATA.txt",
            "DATSRCLN.txt", "FOOD_DES.txt", "LANGUAL.txt", "SRC_CD.txt"
        ]

        self.engine.download_files_from_archive(self.urls["full_version"],
                                                archive_type="zip",
                                                file_names=full_version)
        self.engine.download_files_from_archive(
            self.urls["abbreviated_version"],
            archive_type="zip",
            file_names=abbrev_version,
        )

        # Convert original txt to csv
        convert_to_csv(self.engine.format_data_dir())

        # FOOD_DES table
        new_file_name = "food_des.csv"
        table = Table("food_des", delimiter=",", header_rows=0)
        table.columns = [
            ("ndb_no", ("int", )),
            ("fdgrp_cd", ("int", )),
            ("long_desc", ("char", "205")),
            ("shrt_desc", ("char", "65")),
            ("comname", ("char", "105")),
            ("manufacname", ("char", "70")),
            ("survey", ("char", "1")),
            ("ref_desc", ("char", "140")),
            ("refuse", ("double", )),
            ("sciname", ("char", "67")),
            ("n_factor", ("double", )),
            ("pro_factor", ("double", )),
            ("fat_factor", ("double", )),
            ("cho_factor", ("double", )),
        ]
        self.create_and_install(new_file_name, table)

        # FdGrp_Cd table
        new_file_name = "fd_group.csv"
        table = Table("fd_group", delimiter=",", header_rows=0)
        table.columns = [("fdgrp_cd", ("int", )),
                         ("fdgrp_desc", ("char", "65"))]
        self.create_and_install(new_file_name, table)

        # LANGUAL table
        new_file_name = "langual.csv"
        table = Table("langual", delimiter=",", header_rows=0)
        table.columns = [("ndb_no", ("int", )), ("factor_code", ("char", "5"))]
        self.create_and_install(new_file_name, table)

        # LANGDESC Table
        new_file_name = "langdesc.csv"
        table = Table("langdesc", delimiter=",", header_rows=0)
        table.columns = [
            ("factor_code", ("char", "5")),
            ("description", ("char", "145")),
        ]
        self.create_and_install(new_file_name, table)

        # NUT_DATA table
        new_file_name = "nut_data.csv"
        missingValues = [
            "Unnamed: 6", "Unnamed: 7", "Unnamed: 8", "Unnamed: 9",
            "Unnamed: 10", "Unnamed: 11", "Unnamed: 12", "Unnamed: 13",
            "Unnamed: 14", "Unnamed: 15", "Unnamed: 17"
        ]
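        # The stray "Unnamed: N" tokens are treated as missing values; bulk
        # insert is disabled for this table, so rows go in one at a time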
        table = Table(
            "nut_data",
            delimiter=",",
            header_rows=0,
            missingValues=missingValues,
            do_not_bulk_insert=True,
        )
        table.columns = [
            ("ndb_no", ("int", )),
            ("nutr_no", ("int", )),
            ("nutr_val", ("double", )),
            ("num_data_pts", ("int", )),
            ("std_error", ("double", )),
            ("src_cd", ("int", )),
            ("deriv_cd", ("char", "12")),
            ("ref_ndb_no", ("double", )),
            ("add_nutr_mark", ("char", "12")),
            ("num_studies", ("double", )),
            ("min", ("double", )),
            ("max", ("double", )),
            ("df", ("double", )),
            ("low_eb", ("double", )),
            ("up_eb", ("double", )),
            ("stat_cmt", ("char", "12")),
            ("addmod_date", ("char", "12")),
            ("cc", ("char", "12")),
        ]
        self.create_and_install(new_file_name, table)

        # NUTR_DEF table
        new_file_name = "nutr_def.csv"
        table = Table("nutr_def", delimiter=",", header_rows=0)
        table.columns = [
            ("nutr_no", ("int", )),
            ("units", ("char", "10")),
            ("tagname", ("char", "25")),
            ("nutrdesc", ("char", "60")),
            ("num_dec", ("int", )),
            ("sr_order", ("int", )),
        ]
        self.create_and_install(new_file_name, table)

        # SRC_CD table
        new_file_name = "src_cd.csv"
        table = Table("src_cd", delimiter=",", header_rows=0)
        table.columns = [("src_cd", ("int", )), ("srccd_desc", ("char", "65"))]
        self.create_and_install(new_file_name, table)

        # DERIV_CD table
        new_file_name = "deriv_cd.csv"
        table = Table("deriv_cd", delimiter=",", header_rows=0)
        table.columns = [("deriv_cd", ("char", "5")),
                         ("deriv_desc", ("char", "130"))]
        self.create_and_install(new_file_name, table)

        # WEIGHT table
        new_file_name = "weight.csv"
        table = Table(
            "weight",
            delimiter=",",
            header_rows=0,
            missingValues=["Unnamed: 5", "Unnamed: 6"],
        )
        table.columns = [
            ("ndb_no", ("int", )),
            ("seq", ("int", )),
            ("amount", ("double", )),
            ("msre_desc", ("char", "130")),
            ("gm_wgt", ("double", )),
            ("num_data_pts", ("double", )),
            ("std_dev", ("double", )),
        ]
        self.create_and_install(new_file_name, table)

        # FOOTNOTE table
        new_file_name = "footnote.csv"
        table = Table("footnote",
                      delimiter=",",
                      header_rows=0,
                      missingValues=["Unnamed: 3"])
        table.columns = [
            ("ndb_no", ("int", )),
            ("footnt_no", ("int", )),
            ("footnt_typ", ("char", "2")),
            ("nutr_no", ("double", )),
            ("footnt_txt", ("char", "200")),
        ]
        self.create_and_install(new_file_name, table)

        # DATSRCLN table
        new_file_name = "datsrcln.csv"
        table = Table("datsrcln", delimiter=",", header_rows=0)
        table.columns = [
            ("ndb_no", ("int", )),
            ("nutr_no", ("int", )),
            ("datasrc_id", ("char", "7")),
        ]

        self.create_and_install(new_file_name, table)

        # DATA_SRC table
        new_file_name = "data_src.csv"
        table = Table("data_src", delimiter=",", header_rows=0)
        table.columns = [
            ("datasrc_id", ("char", "7")),
            ("authors", ("char", "257")),
            ("title", ("char", "257")),
            ("year", ("char", "5")),
            ("journal", ("char", "137")),
            ("vol_city", ("char", "17")),
            ("issue_state", ("char", "5")),
            ("start_page", ("char", "5")),
            ("end_page", ("char", "5")),
        ]
        self.create_and_install(new_file_name, table)

        # ABBREV table
        new_file_name = "abbrev.csv"
        table = Table("abbrev", delimiter=",", header_rows=0)
        table.columns = [
            ("ndb_no", ("char", "7")),
            ("shrt_desc", ("char", "60")),
            ("water", ("double", )),
            ("energ_kcal", ("int", )),
            ("protein", ("double", )),
            ("lipid_tot", ("double", )),
            ("ash", ("double", )),
            ("carbohydrt", ("double", )),
            ("fiber_td", ("double", )),
            ("sugar_tot", ("char", "6")),
            ("calcium", ("int", )),
            ("iron", ("double", )),
            ("magnesium", ("int", )),
            ("phosphorus", ("int", )),
            ("potassium", ("int", )),
            ("sodium", ("int", )),
            ("zinc", ("double", )),
            ("copper", ("double", )),
            ("manganese", ("double", )),
            ("selenium", ("double", )),
            ("vit_c", ("double", )),
            ("thiamin", ("double", )),
            ("riboflavin", ("double", )),
            ("niacin", ("double", )),
            ("panto_acid", ("double", )),
            ("vit_b6", ("double", )),
            ("folate_tot", ("int", )),
            ("folic_acid", ("int", )),
            ("food_folate", ("int", )),
            ("folate_dfe", ("int", )),
            ("choline_tot", ("double", )),
            ("vit_b12", ("double", )),
            ("vit_a_iu", ("int", )),
            ("vit_a_rae", ("int", )),
            ("retinol", ("int", )),
            ("alpha_carot", ("int", )),
            ("beta_carot", ("int", )),
            ("beta_crypt", ("int", )),
            ("lycopene", ("int", )),
            ("lut_zea", ("int", )),
            ("vit_e", ("double", )),
            ("vit_d_mcg", ("double", )),
            ("vit_d_iu", ("int", )),
            ("vit_k", ("double", )),
            ("fa_sat", ("double", )),
            ("fa_mono", ("double", )),
            ("fa_poly", ("double", )),
            ("cholestrl", ("int", )),
            ("gmwt_1", ("double", )),
            ("gmwt_desc1", ("char", "80")),
            ("gmwt_2", ("double", )),
            ("gmwt_desc2", ("char", "80")),
            ("refuse_pct", ("int", )),
        ]
        self.create_and_install(new_file_name, table)
Exemple #29
 def download(self, engine=None, debug=False):
     Script.download(self, engine, debug)
     engine = self.engine
     filename = "database.csv"
     tablename = "predicts_main"
     table = Table(str(tablename), delimiter=',')
     table.columns = [("Source_ID", ("char",)),
                      ("Reference", ("char",)),
                      ("Study_number", ("int",)),
                      ("Study_name", ("char",)),
                      ("SS", ("char",)),
                      ("Diversity_metric", ("char",)),
                      ("Diversity_metric_unit", ("char",)),
                      ("Diversity_metric_type", ("char",)),
                      ("Diversity_metric_is_effort_sensitive", ("char",)),
                      ("Diversity_metric_is_suitable_for_Chao", ("char",)),
                      ("Sampling_method", ("char",)),
                      ("Sampling_effort_unit", ("char",)),
                      ("Study_common_taxon", ("char",)),
                      ("Rank_of_study_common_taxon", ("char",)),
                      ("Site_number", ("int",)),
                      ("Site_name", ("char",)),
                      ("Block", ("char",)),
                      ("SSS", ("char",)),
                      ("SSB", ("char",)),
                      ("SSBS", ("char",)),
                      ("Sample_start_earliest", ("char",)),
                      ("Sample_end_latest", ("char",)),
                      ("Sample_midpoint", ("char",)),
                      ("Sample_date_resolution", ("char",)),
                      ("Max_linear_extent_metres", ("double",)),
                      ("Habitat_patch_area_square_metres", ("double",)),
                      ("Sampling_effort", ("double",)),
                      ("Rescaled_sampling_effort", ("double",)),
                      ("Habitat_as_described", ("char",)),
                      ("Predominant_land_use", ("char",)),
                      ("Source_for_predominant_land_use", ("char",)),
                      ("Use_intensity", ("char",)),
                      ("Km_to_nearest_edge_of_habitat", ("double",)),
                      ("Years_since_fragmentation_or_conversion", ("double",)),
                      ("Transect_details", ("char",)),
                      ("Coordinates_method", ("char",)),
                      ("Longitude", ("double",)),
                      ("Latitude", ("double",)),
                      ("Country_distance_metres", ("double",)),
                      ("Country", ("char")),
                      ("UN_subregion", ("char",)),
                      ("UN_region", ("char",)),
                      ("Ecoregion_distance_metres", ("double",)),
                      ("Ecoregion", ("char",)),
                      ("Biome", ("char",)),
                      ("Realm", ("char",)),
                      ("Hotspot", ("char",)),
                      ("Wilderness_area", ("char",)),
                      ("N_samples", ("double",)),
                      ("Taxon_number", ("double",)),
                      ("Taxon_name_entered", ("char",)),
                      ("Indication", ("char",)),
                      ("Parsed_name", ("char",)),
                      ("Taxon", ("char",)),
                      ("COL_ID", ("double",)),
                      ("Name_status", ("char",)),
                      ("Rank", ("char",)),
                      ("Kingdom", ("char",)),
                      ("Phylum", ("char",)),
                      ("Class", ("char",)),
                      ("Order", ("char",)),
                      ("Family", ("char",)),
                      ("Genus", ("char",)),
                      ("Species", ("char",)),
                      ("Best_guess_binomial", ("char",)),
                      ("Higher_taxa", ("char",)),
                      ("Higher_taxon", ("char",)),
                      ("Measurement", ("double",)),
                      ("Effort_corrected_measurement", ("double",))]
     engine.table = table
     if not os.path.isfile(engine.format_filename(filename)):
         engine.download_files_from_archive(self.urls["PREDICTS"],
                                            [filename],
                                            "zip",
                                            False,
                                            "download.zip")
     engine.create_table()
     engine.insert_data_from_file(engine.format_filename(str(filename)))
Exemple #30
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        filename = 'vertnet_latest_mammals.csv'
        tablename = 'mammals'

        table = Table(str(tablename), delimiter=',')
        table.columns = [
            ("record_id", ("pk-auto",)),
            ("beginrecord", ("char",)),
            ("icode", ("char",)),
            ("title", ("char",)),
            ("citation", ("char",)),
            ("contact", ("char",)),
            ("email", ("char",)),
            ("emlrights", ("char",)),
            ("gbifdatasetid", ("char",)),
            ("gbifpublisherid", ("char",)),
            ("doi", ("char",)),
            ("migrator", ("char",)),
            ("networks", ("char",)),
            ("orgcountry", ("char",)),
            ("orgname", ("char",)),
            ("orgstateprovince", ("char",)),
            ("pubdate", ("char",)),
            ("source_url", ("char",)),
            ("iptrecordid", ("char",)),
            ("associatedmedia", ("char",)),
            ("associatedoccurrences", ("char",)),
            ("associatedorganisms", ("char",)),
            ("associatedreferences", ("char",)),
            ("associatedsequences", ("char",)),
            ("associatedtaxa", ("char",)),
            ("bed", ("char",)),
            ("behavior", ("char",)),
            ("catalognumber", ("char",)),
            ("continent", ("char",)),
            ("coordinateprecision", ("char",)),
            ("coordinateuncertaintyinmeters", ("char",)),
            ("country", ("char",)),
            ("countrycode", ("char",)),
            ("county", ("char",)),
            ("dateidentified", ("char",)),
            ("day", ("char",)),
            ("decimallatitude", ("char",)),
            ("decimallongitude", ("char",)),
            ("disposition", ("char",)),
            ("earliestageorloweststage", ("char",)),
            ("earliesteonorlowesteonothem", ("char",)),
            ("earliestepochorlowestseries", ("char",)),
            ("earliesteraorlowesterathem", ("char",)),
            ("earliestperiodorlowestsystem", ("char",)),
            ("enddayofyear", ("char",)),
            ("establishmentmeans", ("char",)),
            ("eventdate", ("char",)),
            ("eventid", ("char",)),
            ("eventremarks", ("char",)),
            ("eventtime", ("char",)),
            ("fieldnotes", ("char",)),
            ("fieldnumber", ("char",)),
            ("footprintspatialfit", ("char",)),
            ("footprintsrs", ("char",)),
            ("footprintwkt", ("char",)),
            ("formation", ("char",)),
            ("geodeticdatum", ("char",)),
            ("geologicalcontextid", ("char",)),
            ("georeferencedby", ("char",)),
            ("georeferenceddate", ("char",)),
            ("georeferenceprotocol", ("char",)),
            ("georeferenceremarks", ("char",)),
            ("georeferencesources", ("char",)),
            ("georeferenceverificationstatus", ("char",)),
            ("group", ("char",)),
            ("habitat", ("char",)),
            ("highergeography", ("char",)),
            ("highergeographyid", ("char",)),
            ("highestbiostratigraphiczone", ("char",)),
            ("identificationid", ("char",)),
            ("identificationqualifier", ("char",)),
            ("identificationreferences", ("char",)),
            ("identificationremarks", ("char",)),
            ("identificationverificationstatus", ("char",)),
            ("identifiedby", ("char",)),
            ("individualcount", ("char",)),
            ("island", ("char",)),
            ("islandgroup", ("char",)),
            ("latestageorhigheststage", ("char",)),
            ("latesteonorhighesteonothem", ("char",)),
            ("latestepochorhighestseries", ("char",)),
            ("latesteraorhighesterathem", ("char",)),
            ("latestperiodorhighestsystem", ("char",)),
            ("lifestage", ("char",)),
            ("lithostratigraphicterms", ("char",)),
            ("locality", ("char",)),
            ("locationaccordingto", ("char",)),
            ("locationid", ("char",)),
            ("locationremarks", ("char",)),
            ("lowestbiostratigraphiczone", ("char",)),
            ("materialsampleid", ("char",)),
            ("maximumdepthinmeters", ("char",)),
            ("maximumdistanceabovesurfaceinmeters", ("char",)),
            ("maximumelevationinmeters", ("char",)),
            ("member", ("char",)),
            ("minimumdepthinmeters", ("char",)),
            ("minimumdistanceabovesurfaceinmeters", ("char",)),
            ("minimumelevationinmeters", ("char",)),
            ("month", ("char",)),
            ("municipality", ("char",)),
            ("occurrenceid", ("char",)),
            ("occurrenceremarks", ("char",)),
            ("occurrencestatus", ("char",)),
            ("organismid", ("char",)),
            ("organismname", ("char",)),
            ("organismremarks", ("char",)),
            ("organismscope", ("char",)),
            ("othercatalognumbers", ("char",)),
            ("pointradiusspatialfit", ("char",)),
            ("preparations", ("char",)),
            ("previousidentifications", ("char",)),
            ("recordedby", ("char",)),
            ("recordnumber", ("char",)),
            ("reproductivecondition", ("char",)),
            ("samplingeffort", ("char",)),
            ("samplingprotocol", ("char",)),
            ("sex", ("char",)),
            ("startdayofyear", ("char",)),
            ("stateprovince", ("char",)),
            ("typestatus", ("char",)),
            ("verbatimcoordinates", ("char",)),
            ("verbatimcoordinatesystem", ("char",)),
            ("verbatimdepth", ("char",)),
            ("verbatimelevation", ("char",)),
            ("verbatimeventdate", ("char",)),
            ("verbatimlatitude", ("char",)),
            ("verbatimlocality", ("char",)),
            ("verbatimlongitude", ("char",)),
            ("verbatimsrs", ("char",)),
            ("waterbody", ("char",)),
            ("year", ("char",)),
            ("dctype", ("char",)),
            ("modified", ("char",)),
            ("language", ("char",)),
            ("license", ("char",)),
            ("rightsholder", ("char",)),
            ("accessrights", ("char",)),
            ("bibliographiccitation", ("char",)),
            ("dc_references", ("char",)),
            ("institutionid", ("char",)),
            ("collectionid", ("char",)),
            ("datasetid", ("char",)),
            ("institutioncode", ("char",)),
            ("collectioncode", ("char",)),
            ("datasetname", ("char",)),
            ("ownerinstitutioncode", ("char",)),
            ("basisofrecord", ("char",)),
            ("informationwithheld", ("char",)),
            ("datageneralizations", ("char",)),
            ("dynamicproperties", ("char",)),
            ("scientificnameid", ("char",)),
            ("namepublishedinid", ("char",)),
            ("scientificname", ("char",)),
            ("acceptednameusage", ("char",)),
            ("originalnameusage", ("char",)),
            ("namepublishedin", ("char",)),
            ("namepublishedinyear", ("char",)),
            ("higherclassification", ("char",)),
            ("kingdom", ("char",)),
            ("phylum", ("char",)),
            ("class", ("char",)),
            ("order", ("char",)),
            ("family", ("char",)),
            ("genus", ("char",)),
            ("subgenus", ("char",)),
            ("specificepithet", ("char",)),
            ("infraspecificepithet", ("char",)),
            ("taxonrank", ("char",)),
            ("verbatimtaxonrank", ("char",)),
            ("scientificnameauthorship", ("char",)),
            ("vernacularname", ("char",)),
            ("nomenclaturalcode", ("char",)),
            ("taxonomicstatus", ("char",)),
            ("keyname", ("char",)),
            ("haslicense", ("int",)),
            ("vntype", ("char",)),
            ("rank", ("int",)),
            ("mappable", ("int",)),
            ("hashid", ("char",)),
            ("hastypestatus", ("int",)),
            ("wascaptive", ("int",)),
            ("wasinvasive", ("int",)),
            ("hastissue", ("int",)),
            ("hasmedia", ("int",)),
            ("isfossil", ("int",)),
            ("haslength", ("int",)),
            ("haslifestage", ("int",)),
            ("hasmass", ("int",)),
            ("hassex", ("int",)),
            ("lengthinmm", ("double",)),
            ("massing", ("double",)),
            ("lengthunitsinferred", ("char",)),
            ("massunitsinferred", ("char",)),
            ("underivedlifestage", ("char",)),
            ("underivedsex", ("char",))]

        engine.table = table
        if not os.path.isfile(engine.format_filename(filename)):
            engine.download_files_from_archive(self.urls[tablename],
                                               [filename],
                                               filetype="zip",
                                               archivename="vertnet_latest_" + str(tablename))
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename(str(filename)))
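The os.path.isfile guard above keeps the download idempotent: the archive is fetched and unpacked only when the extracted CSV is missing. The same pattern as a standalone helper (the name fetch_once is ours; the engine methods are used exactly as in the loader above):

import os

def fetch_once(engine, url, filename, archivename):
    """Download and unpack filename from the zip at url only if it is missing."""
    if not os.path.isfile(engine.format_filename(filename)):
        engine.download_files_from_archive(url, [filename],
                                           filetype="zip",
                                           archivename=archivename)
    return engine.format_filename(filename)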
Exemple #31
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        
        self.engine.auto_create_table(Table("sites"), url=self.urls["sites"])
        self.engine.insert_data_from_url(self.urls["sites"])
              
        self.engine.download_file(self.urls["stems"], "all_Excel.zip")
        local_zip = zipfile.ZipFile(self.engine.format_filename("all_Excel.zip"))        
        filelist = local_zip.namelist()
        local_zip.close()        
        self.engine.download_files_from_archive(self.urls["stems"], filelist)
        
        filelist = [os.path.basename(filename) for filename in filelist]
        
        lines = []
        tax = []
        for filename in filelist:
            print "Extracting data from " + filename + "..."
            book = xlrd.open_workbook(self.engine.format_filename(filename))
            sh = book.sheet_by_index(0)
            rows = sh.nrows
            cn = {'stems': []}
            n = 0
            for c in sh.row(0):
                if not Excel.empty_cell(c):
                    cid = Excel.cell_value(c).lower()
                    # line number column is sometimes named differently
                    if cid in ["sub", "number"]:
                        cid = "line"
                    # the "number of individuals" column is named in various
                    # different ways; they always at least contain "nd"
                    if "nd" in cid:
                        cid = "count"
                    # if column is a stem, add it to the list of stems;
                    # otherwise, make note of the column name/number
                    if "stem" in cid:
                        cn["stems"].append(n)
                    else:
                        cn[cid] = n
                n += 1
            # sometimes, a data file does not contain a liana or count column
            if not "liana" in cn.keys():
                cn["liana"] = -1
            if not "count" in cn.keys():
                cn["count"] = -1
            for i in range(1, rows):
                row = sh.row(i)
                cellcount = len(row)
                # make sure the row is real, not just empty cells
                if cellcount > 4 and not Excel.empty_cell(row[0]):
                    try:
                        this_line = {}
                        
                        def format_value(s):
                            s = Excel.cell_value(s)
                            return str(s).title().replace("\\", "/").replace('"', '')
                        
                        # get the following information from the appropriate
                        # columns ("col", not "i", to avoid shadowing the row
                        # index of the enclosing loop)
                        for col in ["line", "family", "genus", "species",
                                    "liana", "count"]:
                            if cn[col] > -1:
                                this_line[col] = format_value(row[cn[col]])
                                if this_line[col] == '`':
                                    this_line[col] = 1

                        this_line["stems"] = [Excel.cell_value(row[c]) 
                                              for c in cn["stems"]
                                              if not Excel.empty_cell(row[c])]
                        this_line["site"] = filename[0:-4]
                        
                        lines.append(this_line)
                        
                        # Check how far the species is identified
                        full_id = 0
                        if len(this_line["species"]) < 3:
                            if len(this_line["genus"]) < 3:
                                id_level = "family"
                            else:
                                id_level = "genus"
                        else:
                            id_level = "species"
                            full_id = 1
                        tax.append((this_line["family"], 
                                    this_line["genus"], 
                                    this_line["species"].lower().replace('\\', '').replace('"', ''), 
                                    id_level, 
                                    str(full_id)))
                    except:
                        raise
        
        tax = sorted(tax, key=lambda group: group[0] + " " + group[1] + " " + group[2])
        unique_tax = []
        tax_dict = dict()
        tax_count = 0
        
        # Get all unique families/genera/species
        for group in tax:
            if group not in unique_tax:
                unique_tax.append(group)
                tax_count += 1
                tax_dict[group[0:3]] = tax_count
                if tax_count % 10 == 0:
                    msg = "Generating taxonomic groups: " + str(tax_count) + " / " + str(TAX_GROUPS)
                    sys.stdout.write(msg + "\b" * len(msg))
        print "Generating taxonomic groups: " + str(TAX_GROUPS) + " / " + str(TAX_GROUPS)
        
        
        # Create species table
        table = Table("species", delimiter=",")
        table.columns=[("species_id"            ,   ("pk-int",)    ),
                       ("family"                ,   ("char", )    ),
                       ("genus"                 ,   ("char", )    ),
                       ("species"               ,   ("char", )    ),
                       ("id_level"              ,   ("char", 10)    ),
                       ("full_id"               ,   ("bool",)       )]

        data = [','.join([str(tax_dict[group[:3]])] + ['"%s"' % g for g in group]) 
                for group in unique_tax]
        table.pk = 'species_id'
        table.contains_pk = True
        
        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(data)
        
        
        # Create stems table
        table = Table("stems", delimiter=",", contains_pk=False)
        table.columns=[("stem_id"               ,   ("pk-auto",)    ),
                       ("line"                  ,   ("int",)        ),
                       ("species_id"            ,   ("int",)        ),
                       ("site_code"             ,   ("char", 12)    ),
                       ("liana"                 ,   ("char", 10)    ),
                       ("stem"                  ,   ("double",)     )]
        stems = []
        counts = []
        for line in lines:
            try:
                liana = line["liana"]
            except KeyError:
                liana = ""
            species_info = [line["line"], 
                            tax_dict[(line["family"], 
                                      line["genus"], 
                                      line["species"].lower())],
                            line["site"],
                            liana
                            ]
            try:
                counts.append([str(value) for value in species_info + [line["count"]]])
            except KeyError:
                pass

            for i in line["stems"]:
                stem = species_info + [i]
                stems.append([str(value) for value in stem])
            
        data = [','.join(stem) for stem in stems]
        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(data)
        
        
        # Create counts table
        table = Table("counts", delimiter=",", contains_pk=False)
        table.columns=[("count_id"              ,   ("pk-auto",)    ),
                       ("line"                  ,   ("int",)        ),
                       ("species_id"            ,   ("int",)        ),
                       ("site_code"             ,   ("char", 12)    ),
                       ("liana"                 ,   ("char", 10)    ),
                       ("count"                 ,   ("double",)     )]
        data = [','.join(count) for count in counts]
        self.engine.table = table
        self.engine.create_table()
        self.engine.add_to_table(data)
            
        return self.engine
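The taxonomy bookkeeping above boils down to assigning a stable integer id to each unique (family, genus, species) triple in sorted order. A simplified, self-contained version of that idiom (the sample rows are invented, and the dict is keyed directly on the triple rather than deduplicating the full group first):

tax = [("Fabaceae", "Inga", "edulis", "species", "1"),
       ("Fabaceae", "Inga", "edulis", "species", "1"),
       ("Moraceae", "Ficus", "sp.", "genus", "0")]

tax = sorted(tax, key=lambda group: " ".join(group[:3]))
tax_dict = {}
for group in tax:
    if group[:3] not in tax_dict:  # first sighting gets the next id
        tax_dict[group[:3]] = len(tax_dict) + 1

print(tax_dict[("Moraceae", "Ficus", "sp.")])  # 2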
Exemple #32
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        filename = 'vertnet_latest_reptiles.csv'
        tablename = 'reptiles'

        table = Table(str(tablename), delimiter=',')
        table.columns = [
            ("record_id", ("pk-auto",)),
            ("beginrecord", ("char",)),
            ("icode", ("char",)),
            ("title", ("char",)),
            ("citation", ("char",)),
            ("contact", ("char",)),
            ("email", ("char",)),
            ("emlrights", ("char",)),
            ("gbifdatasetid", ("char",)),
            ("gbifpublisherid", ("char",)),
            ("doi", ("char",)),
            ("migrator", ("char",)),
            ("networks", ("char",)),
            ("orgcountry", ("char",)),
            ("orgname", ("char",)),
            ("orgstateprovince", ("char",)),
            ("pubdate", ("char",)),
            ("source_url", ("char",)),
            ("iptrecordid", ("char",)),
            ("associatedmedia", ("char",)),
            ("associatedoccurrences", ("char",)),
            ("associatedorganisms", ("char",)),
            ("associatedreferences", ("char",)),
            ("associatedsequences", ("char",)),
            ("associatedtaxa", ("char",)),
            ("bed", ("char",)),
            ("behavior", ("char",)),
            ("catalognumber", ("char",)),
            ("continent", ("char",)),
            ("coordinateprecision", ("char",)),
            ("coordinateuncertaintyinmeters", ("char",)),
            ("country", ("char",)),
            ("countrycode", ("char",)),
            ("county", ("char",)),
            ("dateidentified", ("char",)),
            ("day", ("char",)),
            ("decimallatitude", ("char",)),
            ("decimallongitude", ("char",)),
            ("disposition", ("char",)),
            ("earliestageorloweststage", ("char",)),
            ("earliesteonorlowesteonothem", ("char",)),
            ("earliestepochorlowestseries", ("char",)),
            ("earliesteraorlowesterathem", ("char",)),
            ("earliestperiodorlowestsystem", ("char",)),
            ("enddayofyear", ("char",)),
            ("establishmentmeans", ("char",)),
            ("eventdate", ("char",)),
            ("eventid", ("char",)),
            ("eventremarks", ("char",)),
            ("eventtime", ("char",)),
            ("fieldnotes", ("char",)),
            ("fieldnumber", ("char",)),
            ("footprintspatialfit", ("char",)),
            ("footprintsrs", ("char",)),
            ("footprintwkt", ("char",)),
            ("formation", ("char",)),
            ("geodeticdatum", ("char",)),
            ("geologicalcontextid", ("char",)),
            ("georeferencedby", ("char",)),
            ("georeferenceddate", ("char",)),
            ("georeferenceprotocol", ("char",)),
            ("georeferenceremarks", ("char",)),
            ("georeferencesources", ("char",)),
            ("georeferenceverificationstatus", ("char",)),
            ("group", ("char",)),
            ("habitat", ("char",)),
            ("highergeography", ("char",)),
            ("highergeographyid", ("char",)),
            ("highestbiostratigraphiczone", ("char",)),
            ("identificationid", ("char",)),
            ("identificationqualifier", ("char",)),
            ("identificationreferences", ("char",)),
            ("identificationremarks", ("char",)),
            ("identificationverificationstatus", ("char",)),
            ("identifiedby", ("char",)),
            ("individualcount", ("char",)),
            ("island", ("char",)),
            ("islandgroup", ("char",)),
            ("latestageorhigheststage", ("char",)),
            ("latesteonorhighesteonothem", ("char",)),
            ("latestepochorhighestseries", ("char",)),
            ("latesteraorhighesterathem", ("char",)),
            ("latestperiodorhighestsystem", ("char",)),
            ("lifestage", ("char",)),
            ("lithostratigraphicterms", ("char",)),
            ("locality", ("char",)),
            ("locationaccordingto", ("char",)),
            ("locationid", ("char",)),
            ("locationremarks", ("char",)),
            ("lowestbiostratigraphiczone", ("char",)),
            ("materialsampleid", ("char",)),
            ("maximumdepthinmeters", ("char",)),
            ("maximumdistanceabovesurfaceinmeters", ("char",)),
            ("maximumelevationinmeters", ("char",)),
            ("member", ("char",)),
            ("minimumdepthinmeters", ("char",)),
            ("minimumdistanceabovesurfaceinmeters", ("char",)),
            ("minimumelevationinmeters", ("char",)),
            ("month", ("char",)),
            ("municipality", ("char",)),
            ("occurrenceid", ("char",)),
            ("occurrenceremarks", ("char",)),
            ("occurrencestatus", ("char",)),
            ("organismid", ("char",)),
            ("organismname", ("char",)),
            ("organismremarks", ("char",)),
            ("organismscope", ("char",)),
            ("othercatalognumbers", ("char",)),
            ("pointradiusspatialfit", ("char",)),
            ("preparations", ("char",)),
            ("previousidentifications", ("char",)),
            ("recordedby", ("char",)),
            ("recordnumber", ("char",)),
            ("reproductivecondition", ("char",)),
            ("samplingeffort", ("char",)),
            ("samplingprotocol", ("char",)),
            ("sex", ("char",)),
            ("startdayofyear", ("char",)),
            ("stateprovince", ("char",)),
            ("typestatus", ("char",)),
            ("verbatimcoordinates", ("char",)),
            ("verbatimcoordinatesystem", ("char",)),
            ("verbatimdepth", ("char",)),
            ("verbatimelevation", ("char",)),
            ("verbatimeventdate", ("char",)),
            ("verbatimlatitude", ("char",)),
            ("verbatimlocality", ("char",)),
            ("verbatimlongitude", ("char",)),
            ("verbatimsrs", ("char",)),
            ("waterbody", ("char",)),
            ("year", ("char",)),
            ("dctype", ("char",)),
            ("modified", ("char",)),
            ("language", ("char",)),
            ("license", ("char",)),
            ("rightsholder", ("char",)),
            ("accessrights", ("char",)),
            ("bibliographiccitation", ("char",)),
            ("dc_references", ("char",)),
            ("institutionid", ("char",)),
            ("collectionid", ("char",)),
            ("datasetid", ("char",)),
            ("institutioncode", ("char",)),
            ("collectioncode", ("char",)),
            ("datasetname", ("char",)),
            ("ownerinstitutioncode", ("char",)),
            ("basisofrecord", ("char",)),
            ("informationwithheld", ("char",)),
            ("datageneralizations", ("char",)),
            ("dynamicproperties", ("char",)),
            ("scientificnameid", ("char",)),
            ("namepublishedinid", ("char",)),
            ("scientificname", ("char",)),
            ("acceptednameusage", ("char",)),
            ("originalnameusage", ("char",)),
            ("namepublishedin", ("char",)),
            ("namepublishedinyear", ("char",)),
            ("higherclassification", ("char",)),
            ("kingdom", ("char",)),
            ("phylum", ("char",)),
            ("class", ("char",)),
            ("order", ("char",)),
            ("family", ("char",)),
            ("genus", ("char",)),
            ("subgenus", ("char",)),
            ("specificepithet", ("char",)),
            ("infraspecificepithet", ("char",)),
            ("taxonrank", ("char",)),
            ("verbatimtaxonrank", ("char",)),
            ("scientificnameauthorship", ("char",)),
            ("vernacularname", ("char",)),
            ("nomenclaturalcode", ("char",)),
            ("taxonomicstatus", ("char",)),
            ("keyname", ("char",)),
            ("haslicense", ("int",)),
            ("vntype", ("char",)),
            ("rank", ("int",)),
            ("mappable", ("int",)),
            ("hashid", ("char",)),
            ("hastypestatus", ("int",)),
            ("wascaptive", ("int",)),
            ("wasinvasive", ("int",)),
            ("hastissue", ("int",)),
            ("hasmedia", ("int",)),
            ("isfossil", ("int",)),
            ("haslength", ("int",)),
            ("haslifestage", ("int",)),
            ("hasmass", ("int",)),
            ("hassex", ("int",)),
            ("lengthinmm", ("double",)),
            ("massing", ("double",)),
            ("lengthunitsinferred", ("char",)),
            ("massunitsinferred", ("char",)),
            ("underivedlifestage", ("char",)),
            ("underivedsex", ("char",))]

        engine.table = table
        if not os.path.isfile(engine.format_filename(filename)):
            engine.download_files_from_archive(self.urls[tablename],
                                               [filename],
                                               "zip",
                                               False,
                                               "vertnet_latest_" + str(tablename))
        engine.create_table()
        engine.insert_data_from_file(engine.format_filename(str(filename)))
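This loader is the mammals loader from Exemple #30 with a different table name, so the long VertNet column list is duplicated wholesale. One way to factor that out (a hypothetical refactoring, not part of the original scripts; it assumes the same retriever imports, Table and os, as the loaders above):

def load_vertnet_group(self, engine, tablename, columns):
    """Create and fill one VertNet table; columns is the shared column list."""
    filename = "vertnet_latest_%s.csv" % tablename
    table = Table(tablename, delimiter=',')
    table.columns = columns
    engine.table = table
    if not os.path.isfile(engine.format_filename(filename)):
        engine.download_files_from_archive(self.urls[tablename], [filename],
                                           "zip", False,
                                           "vertnet_latest_" + tablename)
    engine.create_table()
    engine.insert_data_from_file(engine.format_filename(filename))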
Exemple #33
    def download(self, engine=None, debug=False):
        try:
            Script.download(self, engine, debug)
            
            engine = self.engine
            
            # Routes table            
            if not os.path.isfile(engine.format_filename("routes_new.csv")):
                engine.download_files_from_archive(self.urls["routes"],
                                                   ["routes.csv"])
                read = open(engine.format_filename("routes.csv"), "rb")
                write = open(engine.format_filename("routes_new.csv"), "wb")
                print "Cleaning routes data..."
                write.write(read.readline())
                for line in read:
                    values = line.split(',')
                    v = Decimal(values[5])
                    if v > 0:
                        values[5] = str(v * Decimal("-1"))
                    write.write(','.join(str(value) for value in values))
                write.close()
                read.close()
                
            engine.auto_create_table(Table("routes", cleanup=Cleanup()), 
                                     filename="routes_new.csv")
                
            engine.insert_data_from_file(engine.format_filename("routes_new.csv"))

            
            # Weather table                
            if not os.path.isfile(engine.format_filename("weather_new.csv")):
                engine.download_files_from_archive(self.urls["weather"], 
                                                   ["weather.csv"])            
                read = open(engine.format_filename("weather.csv"), "rb")
                write = open(engine.format_filename("weather_new.csv"), "wb")
                print "Cleaning weather data..."
                for line in read:
                    values = line.split(',')
                    newvalues = []
                    for value in values:
                        
                        if ':' in value:
                            newvalues.append(value.replace(':', ''))
                        elif value == "N":
                            newvalues.append(None)
                        else:
                            newvalues.append(value)
                    write.write(','.join(str(value) for value in newvalues))
                write.close()
                read.close()
            
            engine.auto_create_table(Table("weather", pk="RouteDataId", cleanup=Cleanup()), 
                                     filename="weather_new.csv")
            engine.insert_data_from_file(engine.format_filename("weather_new.csv"))
            
            
            # Species table
            table = Table("species", pk=False, delimiter=',')
            
            table.columns=[("species_id"            ,   ("pk-auto",)        ),
                           ("AOU"                   ,   ("int",)            ),
                           ("genus"                 ,   ("char",30)         ),
                           ("species"               ,   ("char",50)         ),
                           ("subspecies"            ,   ("char",30)         ),
                           ("id_to_species"         ,   ("bool",)           )]
            
            engine.table = table
            engine.create_table()
            
            engine.download_file(self.urls["species"], "SpeciesList.txt")
            species_list = open(engine.format_filename("SpeciesList.txt"), "rb")
            for n in range(8):
                species_list.readline()
            
            rows = []
            for line in species_list:
                if line and len(line) > 273:
                    latin_name = line[273:].split()
                    if len(latin_name) < 2:
                        # If there's no species given, add "None" value
                        latin_name.append("None")
                    subspecies = ' '.join(latin_name[2:]) if len(latin_name) > 2 else "None"                    
                    id_to_species = "1" if latin_name[1] != "None" else "0"
                    if latin_name[1] == "sp.":
                        latin_name[1] = "None"
                        id_to_species = "0"
                    if ("x" in latin_name or "/" in latin_name
                        or "/" in subspecies or "or" in latin_name):
                        # Hybrid species or only identified to a group of species
                        latin_name[1] = ' '.join(latin_name[1:])
                        subspecies = "None"
                        id_to_species = "0"
                    
                    rows.append(','.join([
                                          line.split()[1], 
                                          latin_name[0],
                                          latin_name[1],
                                          subspecies,
                                          id_to_species
                                          ]))
                    
            engine.add_to_table(rows)
            
            species_list.close()
            
            
            # Region_codes table
            table = Table("region_codes", pk=False, header_rows=11,
                          fixed_width=[11, 11, 30])
            def regioncodes_cleanup(value, engine):
                replace = {chr(225):"a", chr(233):"e", chr(237):"i", chr(243):"o"}
                newvalue = str(value)
                for key in replace.keys():
                    if key in newvalue:
                        newvalue = newvalue.replace(key, replace[key])
                return newvalue
            table.cleanup = Cleanup(regioncodes_cleanup)
            
            table.columns=[("countrynum"            ,   ("int",)        ),
                           ("regioncode"            ,   ("int",)        ),
                           ("regionname"            ,   ("char",30)     )]
            
            engine.table = table
            engine.create_table()
                                    
            engine.insert_data_from_url(self.urls["region_codes"])
                        
            # Counts table
            table = Table("counts", delimiter=',')
            table.columns=[("countrynum"            ,   ("int",)        ),
                           ("statenum"              ,   ("int",)        ),
                           ("Route"                 ,   ("int",)        ),
                           ("RPID"                  ,   ("int",)        ),
                           ("year"                  ,   ("int",)        ),
                           ("AOU"                   ,   ("int",)        ),
                           ("Stop1"                 ,   ("int",)        ),
                           ("Stop2"                 ,   ("int",)        ),
                           ("Stop3"                 ,   ("int",)        ),
                           ("Stop4"                 ,   ("int",)        ),
                           ("Stop5"                 ,   ("int",)        ),
                           ("Stop6"                 ,   ("int",)        ),
                           ("Stop7"                 ,   ("int",)        ),
                           ("Stop8"                 ,   ("int",)        ),
                           ("Stop9"                 ,   ("int",)        ),
                           ("Stop10"                ,   ("int",)        ),
                           ("Stop11"                ,   ("int",)        ),
                           ("Stop12"                ,   ("int",)        ),
                           ("Stop13"                ,   ("int",)        ),
                           ("Stop14"                ,   ("int",)        ),
                           ("Stop15"                ,   ("int",)        ),
                           ("Stop16"                ,   ("int",)        ),
                           ("Stop17"                ,   ("int",)        ),
                           ("Stop18"                ,   ("int",)        ),
                           ("Stop19"                ,   ("int",)        ),
                           ("Stop20"                ,   ("int",)        ),
                           ("Stop21"                ,   ("int",)        ),
                           ("Stop22"                ,   ("int",)        ),
                           ("Stop23"                ,   ("int",)        ),
                           ("Stop24"                ,   ("int",)        ),
                           ("Stop25"                ,   ("int",)        ),
                           ("Stop26"                ,   ("int",)        ),
                           ("Stop27"                ,   ("int",)        ),
                           ("Stop28"                ,   ("int",)        ),
                           ("Stop29"                ,   ("int",)        ),
                           ("Stop30"                ,   ("int",)        ),
                           ("Stop31"                ,   ("int",)        ),
                           ("Stop32"                ,   ("int",)        ),
                           ("Stop33"                ,   ("int",)        ),
                           ("Stop34"                ,   ("int",)        ),
                           ("Stop35"                ,   ("int",)        ),
                           ("Stop36"                ,   ("int",)        ),
                           ("Stop37"                ,   ("int",)        ),
                           ("Stop38"                ,   ("int",)        ),
                           ("Stop39"                ,   ("int",)        ),
                           ("Stop40"                ,   ("int",)        ),
                           ("Stop41"                ,   ("int",)        ),
                           ("Stop42"                ,   ("int",)        ),
                           ("Stop43"                ,   ("int",)        ),
                           ("Stop44"                ,   ("int",)        ),
                           ("Stop45"                ,   ("int",)        ),
                           ("Stop46"                ,   ("int",)        ),
                           ("Stop47"                ,   ("int",)        ),
                           ("Stop48"                ,   ("int",)        ),
                           ("Stop49"                ,   ("int",)        ),
                           ("Stop50"                ,   ("int",)        )]
            
            part = ""
            engine.table = table
            engine.create_table()

            for part in range(1,11):
                part = str(part)
                try:
                    print "Inserting data from part " + part + "..."
                    try:
                        engine.table.cleanup = Cleanup()
                        engine.insert_data_from_archive(self.urls["counts"] + 
                                                        "Fifty" + part + ".exe", 
                                                        ["fifty" + part + ".csv"])
                    except:               
                        print "Failed bulk insert on " + part + ", inserting manually."
                        engine.connection.rollback()
                        engine.table.cleanup = Cleanup(correct_invalid_value,
                                                       nulls=['*'])
                        engine.insert_data_from_archive(self.urls["counts"] + 
                                                        "Fifty" + part + ".exe", 
                                                        ["fifty" + part + ".csv"])
                            
                except:
                    print "There was an error in part " + part + "."
                    raise
            
            
        except zipfile.BadZipfile:            
            print "There was an unexpected error in the Breeding Bird Survey archives."
            raise    
        
        return engine
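The counts loop above tries a fast bulk insert first and falls back to a stricter per-value cleanup only when that fails. The same two-phase pattern in isolation (an illustrative sketch reusing the calls shown above; the function name is ours):

def insert_with_fallback(engine, url, archive_files):
    """Try a plain bulk insert; on failure, roll back and retry with cleanup."""
    try:
        engine.table.cleanup = Cleanup()  # fast path: no value cleanup
        engine.insert_data_from_archive(url, archive_files)
    except Exception:
        engine.connection.rollback()      # discard the partial bulk load
        engine.table.cleanup = Cleanup(correct_invalid_value, nulls=['*'])
        engine.insert_data_from_archive(url, archive_files)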