Example #1
    def to_csv(self, sort=True, path=None):
        # Due to cyclic imports we cannot move this import to the top
        from retriever.lib.engine_tools import sort_csv

        for table_name in self.script_table_registry[self.script.name]:

            csv_file_output = os.path.normpath(
                os.path.join(path if path else '', table_name[0] + '.csv'))
            csv_file = open_fw(csv_file_output)
            csv_writer = open_csvw(csv_file)
            self.get_cursor()
            self.set_engine_encoding()
            self.cursor.execute("SELECT * FROM {};".format(table_name[0]))
            row = self.cursor.fetchone()
            column_names = [
                u'{}'.format(tuple_i[0]) for tuple_i in self.cursor.description
            ]
            csv_writer.writerow(column_names)
            while row is not None:
                csv_writer.writerow(row)
                row = self.cursor.fetchone()
            csv_file.close()
            if sort:
                sort_csv(csv_file_output)
        self.disconnect()
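All of these examples lean on the retriever file helpers open_fw, open_fr, and open_csvw. To experiment outside the library, a minimal sketch of what such helpers might look like follows; this is an assumption for illustration, not the retriever's actual implementation:

import csv

def open_fw(file_name, encoding='utf-8'):
    # Open a file for writing with the given encoding.
    return open(file_name, 'w', encoding=encoding, newline='')

def open_fr(file_name, encoding='utf-8'):
    # Open a file for reading with the given encoding.
    return open(file_name, 'r', encoding=encoding, newline='')

def open_csvw(csv_file):
    # Wrap an already-open file object in a csv writer.
    return csv.writer(csv_file, escapechar='\\', lineterminator='\n')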
Example #2
 def create_table(self):
     """Create the table by creating an empty csv file"""
     self.auto_column_number = 1
     self.file = open_fw(self.table_name())
     self.output_file = open_csvw(self.file)
     self.output_file.writerow([u'{}'.format(val) for val in self.table.get_insert_columns(join=False, create=True)])
     self.table_names.append((self.file, self.table_name()))
Example #3
    def create_table(self):
        """Create the table by creating an empty csv file"""
        self.auto_column_number = 1
        table_path = os.path.join(self.opts["data_dir"], self.table_name())
        self.file = open_fw(table_path)
        self.output_file = open_csvw(self.file)
        column_list = self.table.get_insert_columns(join=False, create=True)
        self.output_file.writerow([u'{}'.format(val) for val in column_list])
        self.table_names.append((self.file, table_path))

        # Register all tables created to enable
        # testing python files having custom download function
        Engine.register_tables(self)
Example #4
def sort_csv(filename, encoding=ENCODING):
    """Sort CSV rows minus the header and return the file.

    This function is used only for testing and is intended for files of modest size.
    """
    filename = os.path.normpath(filename)
    input_file = open_fr(filename, encoding)
    csv_reader_infile = csv.reader(input_file, escapechar="\\")
    #  write the data to a temporary file and sort it
    temp_path = os.path.normpath("tempfile")
    temp_file = open_fw(temp_path, encoding)

    csv_writer = open_csvw(temp_file)
    i = 0
    infields = None
    for row in csv_reader_infile:
        if i == 0:
            # The first entry is the header line
            infields = row
            i += 1
        else:
            csv_writer.writerow(row)
    input_file.close()
    temp_file.close()

    # sort the temp file
    sorted_txt = sort_file(temp_path, encoding)
    tmp = open_fr(sorted_txt, encoding)
    in_txt = csv.reader(tmp, delimiter=',', escapechar="\\")
    csv_file = open_fw(filename, encoding)
    csv_writer = open_csvw(csv_file)
    csv_writer.writerow(infields)
    csv_writer.writerows(in_txt)
    tmp.close()
    csv_file.close()
    os.remove(os.path.normpath(temp_path))
    return filename
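A minimal usage sketch, assuming the retriever package is installed; the file name and contents are hypothetical:

# Write a small unsorted CSV, then sort its data rows in place while
# keeping the header line first.
from retriever.lib.engine_tools import sort_csv

with open("demo.csv", "w") as demo:
    demo.write("name,value\n")
    demo.write("b,2\n")
    demo.write("a,1\n")

sort_csv("demo.csv")  # "a,1" now precedes "b,2" in demo.csv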
Example #5
    def create_table(self):
        """Create the table by creating an empty csv file"""
        self.auto_column_number = 1
        self.file = open_fw(self.table_name())
        self.output_file = open_csvw(self.file)
        column_list = self.table.get_insert_columns(join=False, create=True)
        self.output_file.writerow([u'{}'.format(val) for val in column_list])
        self.table_names.append((self.file, self.table_name()))

        # Register all tables created to enable
        # testing python files having custom download function
        if self.script.name not in self.script_table_registry:
            self.script_table_registry[self.script.name] = []
        self.script_table_registry[self.script.name].append(
            (self.table_name(), self.table))
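The registry bookkeeping at the end is plain dict-of-lists housekeeping keyed by script name. A standalone sketch of the same pattern, with illustrative names, using dict.setdefault in place of the explicit membership check:

script_table_registry = {}

def register_table(script_name, table_name, table):
    # Append (table_name, table) to this script's list,
    # creating the list on first use.
    script_table_registry.setdefault(script_name, []).append(
        (table_name, table))

register_table("demo-script", "demo_table", object())
print(script_table_registry)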
Example #6
    def to_csv(self,
               sort=True,
               path=None,
               select_columns=None,
               select_table=None):
        """Create a CSV file from the a data store.

        sort flag to create a sorted file,
        path to write the flag else write to the PWD,
        select_columns flag is used by large files to select
        columns data and has SELECT LIMIT 3.
        """
        # Due to cyclic imports we cannot move this import to the top
        from retriever.lib.engine_tools import sort_csv

        for table_name in self.script_table_registry[self.script.name]:

            csv_file_output = os.path.normpath(
                os.path.join(path if path else '', table_name[0] + '.csv'))
            self.get_cursor()
            self.set_engine_encoding()
            csv_file = open_fw(csv_file_output, encoding=self.encoding)
            csv_writer = open_csvw(csv_file)

            limit = ""
            cols = "*"
            if select_columns:
                limit = "LIMIT 3"
                cols = ",".join(select_columns)
            sql_query = "SELECT {cols} FROM  {tab} {limit};"
            self.cursor.execute(
                sql_query.format(cols=cols, tab=table_name[0], limit=limit))
            row = self.cursor.fetchone()
            column_names = [
                u'{}'.format(tuple_i[0]) for tuple_i in self.cursor.description
            ]
            csv_writer.writerow(column_names)
            while row is not None:
                csv_writer.writerow(row)
                row = self.cursor.fetchone()
            csv_file.close()
            if sort:
                sort_csv(csv_file_output)
        self.disconnect()
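A hedged usage sketch of these flags; engine stands for any connected retriever engine whose script has already loaded its tables, and the column names are hypothetical:

engine.to_csv()                                     # sorted CSVs in the PWD
engine.to_csv(sort=False, path="/tmp/out")          # unsorted, custom directory
engine.to_csv(select_columns=["genus", "species"])  # 3 rows of the two columns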
Example #7
 def to_csv(self):
     # Due to cyclic imports we cannot move this import to the top
     from retriever.lib.engine_tools import sort_csv
     for _ in list(self.script.urls.keys()):
         table_name = self.table_name()
         csv_file_output = os.path.normpath(table_name + '.csv')
         csv_file = open_fw(csv_file_output)
         csv_writer = open_csvw(csv_file)
         self.get_cursor()
         self.set_engine_encoding()
         self.cursor.execute("SELECT * FROM {};".format(table_name))
         row = self.cursor.fetchone()
         colnames = [u'{}'.format(tuple_i[0]) for tuple_i in self.cursor.description]
         csv_writer.writerow(colnames)
         while row is not None:
             csv_writer.writerow(row)
             row = self.cursor.fetchone()
         csv_file.close()
         sort_csv(csv_file_output)
     self.disconnect()
Example #8
 def to_csv(self, sort=True):
     # Due to cyclic imports we cannot move this import to the top
     from retriever.lib.engine_tools import sort_csv
     for table_n in list(self.script.tables.keys()):
         table_name = self.table_name(name=table_n)
         csv_file_output = os.path.normpath(table_name + '.csv')
         csv_file = open_fw(csv_file_output)
         csv_writer = open_csvw(csv_file)
         self.get_cursor()
         self.set_engine_encoding()
         self.cursor.execute("SELECT * FROM {};".format(table_name))
         row = self.cursor.fetchone()
         colnames = [u'{}'.format(tuple_i[0]) for tuple_i in self.cursor.description]
         csv_writer.writerow(colnames)
         while row is not None:
             csv_writer.writerow(row)
             row = self.cursor.fetchone()
         csv_file.close()
         if sort:
             sort_csv(csv_file_output)
     self.disconnect()
Example #9
    def to_csv(self, sort=True, path=None, select_columns=None):
        """Create a CSV file from the a data store.

        sort flag to create a sorted file,
        path to write the flag else write to the PWD,
        select_columns flag is used by large files to select
        columns data and has SELECT LIMIT 3.
        """
        # Due to cyclic imports we cannot move this import to the top
        from retriever.lib.engine_tools import sort_csv

        for table_name in self.script_table_registry[self.script.name]:

            csv_file_output = os.path.normpath(os.path.join(path if path else '',
                                                            table_name[0] + '.csv'))
            csv_file = open_fw(csv_file_output)
            csv_writer = open_csvw(csv_file)
            self.get_cursor()
            self.set_engine_encoding()
            limit = ""
            cols = "*"
            if select_columns:
                limit = "LIMIT 3"
                cols = ",".join(select_columns)
            sql_query = "SELECT {cols} FROM  {tab} {limit};"
            self.cursor.execute(sql_query.format(cols=cols, tab=table_name[0], limit=limit))
            row = self.cursor.fetchone()
            column_names = [u'{}'.format(tuple_i[0])
                            for tuple_i in self.cursor.description]
            csv_writer.writerow(column_names)
            while row is not None:
                csv_writer.writerow(row)
                row = self.cursor.fetchone()
            csv_file.close()
            if sort:
                sort_csv(csv_file_output)
        self.disconnect()
Example #10
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        engine = self.engine
        csv_files = []
        request_src = "http://www.data-retriever.org/"
        base_url = "http://www.usanpn.org/npn_portal/observations/getObservations.xml?start_date={startYear}&end_date={endYear_date}&request_src={request_src}"
        header_values = ["observation_id",
                         "update_datetime",
                         "site_id",
                         "latitude",
                         "longitude",
                         "elevation_in_meters",
                         "state",
                         "species_id",
                         "genus",
                         "species",
                         "common_name",
                         "kingdom",
                         "individual_id",
                         "phenophase_id",
                         "phenophase_description",
                         "observation_date",
                         "day_of_year",
                         "phenophase_status",
                         "intensity_category_id",
                         "intensity_value",
                         "abundance_value"
                         ]

        columns = [("record_id", ("pk-auto",)),
                   ("observation_id", ("int",)),  # subsequently refered to as "status record"
                   ("update_datetime", ("char",)),
                   ("site_id", ("int",)),
                   ("latitude", ("double",)),
                   ("longitude", ("double",)),
                   ("elevation_in_meters", ("char",)),
                   ("state", ("char",)),
                   ("species_id", ("int",)),
                   ("genus", ("char",)),
                   ("species", ("char",)),
                   ("common_name", ("char",)),
                   ("kingdom", ("char",)),  # skip kingdom
                   ("individual_id", ("char",)),
                   ("phenophase_id", ("int",)),
                   ("phenophase_description", ("char",)),
                   ("observation_date", ("char",)),
                   ("day_of_year", ("char",)),
                   ("phenophase_status", ("char",)),
                   ("intensity_category_id", ("char",)),
                   ("intensity_value", ("char",)),
                   ("abundance_value", ("char",))
                   ]

        start_date = datetime.date(2009, 1, 1)
        end_date = datetime.date.today()

        while start_date < end_date:
            to_date = start_date + datetime.timedelta(90)
            if to_date >= end_date:
                data_url = base_url.format(startYear=str(start_date), endYear_date=str(end_date),
                                           request_src=request_src)
            else:
                data_url = base_url.format(startYear=str(start_date), endYear_date=str(to_date),
                                           request_src=request_src)

            xml_file_name = '{}'.format(start_date) + ".xml"
            engine.download_file(data_url, xml_file_name)

            # Create csv files for 3 months
            csv_observation = '{}'.format(start_date) + ".csv"
            csv_files.append(csv_observation)
            csv_buff = open_fw(engine.format_filename(csv_observation))
            csv_writer = open_csvw(csv_buff)

            csv_writer.writerow(header_values)

            # Parse xml to read data
            file_read = ""
            # Note: str.strip('{dataset}') strips any of those characters
            # from both ends rather than the literal substring; it happens
            # to work for the default path layout.
            fname = DATA_WRITE_PATH.strip('{dataset}') + 'NPN/' + xml_file_name
            with open(fname, 'r') as fp1:
                file_read = fp1.read()

            root = ET.fromstring(file_read)

            for elements in root:
                index_map = {val: i for i, val in enumerate(header_values)}
                diction = sorted(elements.attrib.items(), key=lambda pair: index_map[pair[0]])
                csv_writer.writerow([x[1] for x in diction])

            csv_buff.close()
            start_date = to_date + datetime.timedelta(1)

        # Create table
        table = Table('observations', delimiter=',', pk='record_id', contains_pk=True)
        table.columns = columns
        engine.table = table
        engine.create_table()
        for data_file in csv_files:
            engine.insert_data_from_file(engine.find_file(data_file))
        return engine
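The 90-day windowing above can be isolated into a small, self-contained sketch that yields (start, end) date pairs; the function name is illustrative:

import datetime

def date_windows(start, end, days=90):
    # Yield (window_start, window_end) pairs covering [start, end],
    # mirroring the chunking loop in the download method above.
    while start < end:
        stop = start + datetime.timedelta(days)
        yield start, min(stop, end)
        start = stop + datetime.timedelta(1)

for begin, finish in date_windows(datetime.date(2009, 1, 1),
                                  datetime.date(2009, 12, 31)):
    print(begin, finish)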
Example #11
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        engine = self.engine
        csv_files = []
        request_src = "http://www.data-retriever.org/"
        base_url = "http://www.usanpn.org/npn_portal/observations/getObservations.xml?start_date={startYear}&end_date={endYear_date}&request_src={request_src}"
        header_values = [
            "observation_id", "update_datetime", "site_id", "latitude",
            "longitude", "elevation_in_meters", "state", "species_id", "genus",
            "species", "common_name", "kingdom", "individual_id",
            "phenophase_id", "phenophase_description", "observation_date",
            "day_of_year", "phenophase_status", "intensity_category_id",
            "intensity_value", "abundance_value"
        ]

        columns = [
            ("record_id", ("pk-auto", )),
            ("observation_id",
             ("int", )),  # subsequently refered to as "status record"
            ("update_datetime", ("char", )),
            ("site_id", ("int", )),
            ("latitude", ("double", )),
            ("longitude", ("double", )),
            ("elevation_in_meters", ("char", )),
            ("state", ("char", )),
            ("species_id", ("int", )),
            ("genus", ("char", )),
            ("species", ("char", )),
            ("common_name", ("char", )),
            ("kingdom", ("char", )),  # skip kingdom
            ("individual_id", ("char", )),
            ("phenophase_id", ("int", )),
            ("phenophase_description", ("char", )),
            ("observation_date", ("char", )),
            ("day_of_year", ("char", )),
            ("phenophase_status", ("char", )),
            ("intensity_category_id", ("char", )),
            ("intensity_value", ("char", )),
            ("abundance_value", ("char", ))
        ]

        start_date = datetime.date(2009, 1, 1)
        end_date = datetime.date.today()

        while start_date < end_date:
            to_date = start_date + datetime.timedelta(90)
            if to_date >= end_date:
                data_url = base_url.format(startYear=str(start_date),
                                           endYear_date=str(end_date),
                                           request_src=request_src)
            else:
                data_url = base_url.format(startYear=str(start_date),
                                           endYear_date=str(to_date),
                                           request_src=request_src)

            xml_file_name = '{}'.format(start_date) + ".xml"
            engine.download_file(data_url, xml_file_name)

            # Create csv files for 3 months
            csv_observation = '{}'.format(start_date) + ".csv"
            csv_files.append(csv_observation)
            csv_buff = open_fw(engine.format_filename(csv_observation))
            csv_writer = open_csvw(csv_buff)

            csv_writer.writerow(header_values)

            # Parse xml to read data
            file_read = ""
            # str.strip removes characters, not the substring; this relies
            # on the default path layout (see the note in Example #10).
            fname = DATA_WRITE_PATH.strip('{dataset}') + 'NPN/' + xml_file_name
            with open(fname, 'r') as fp1:
                file_read = fp1.read()

            root = ET.fromstring(file_read)

            for elements in root:
                index_map = {val: i for i, val in enumerate(header_values)}
                diction = sorted(elements.attrib.items(),
                                 key=lambda pair: index_map[pair[0]])
                csv_writer.writerow([x[1] for x in diction])

            csv_buff.close()
            start_date = to_date + datetime.timedelta(1)

        # Create table
        table = Table('observations',
                      delimiter=',',
                      pk='record_id',
                      contains_pk=True)
        table.columns = columns
        engine.table = table
        engine.create_table()
        for data_file in csv_files:
            engine.insert_data_from_file(engine.find_file(data_file))
        return engine
Example #12
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        # Python 2 legacy: force the default encoding to UTF-8
        reload(sys)
        if hasattr(sys, 'setdefaultencoding'):
            sys.setdefaultencoding("utf-8")

        self.engine.download_file(self.urls["GWDD"],
                                  "GlobalWoodDensityDatabase.xls")
        filename = os.path.basename("GlobalWoodDensityDatabase.xls")
        book = xlrd.open_workbook(self.engine.format_filename(filename))
        sh = book.sheet_by_index(1)
        rows = sh.nrows

        # Creating data files
        file_path = self.engine.format_filename("gwdd_data.csv")
        gwdd_data = open_fw(file_path)
        csv_writer = open_csvw(gwdd_data)
        csv_writer.writerow([
            "Number", "Family", "Binomial", "Wood_Density", "Region",
            "Reference_Number"
        ])

        for index in range(1, rows):
            row = sh.row(index)
            # get each row and format the cell value.
            row_as_list = [to_str(column_value.value) for column_value in row]
            csv_writer.writerow(row_as_list)
        gwdd_data.close()

        table = Table("data", delimiter=",")
        table.columns = [("Number", ("pk-int", )), ("Family", ("char", )),
                         ("Binomial", ("char", )),
                         ("Wood_Density", ("double", )),
                         ("Region", ("char", )),
                         ("Reference_Number", ("int", ))]
        table.pk = 'Number'
        table.contains_pk = True

        self.engine.table = table
        self.engine.create_table()
        self.engine.insert_data_from_file(file_path)

        # Creating the reference table file
        file_path = self.engine.format_filename("gwdd_ref.csv")
        ref_file = open_fw(file_path)
        csv_writerd = open_csvw(ref_file)
        csv_writerd.writerow(["Reference_Number", "Reference"])
        sh = book.sheet_by_index(2)
        rows = sh.nrows
        for index in range(1, rows):
            row = sh.row(index)
            # get each row and format the cell value.
            row_as_list = [
                to_str(column_value.value, object_encoding=sys.stdout)
                for column_value in row
            ]
            csv_writerd.writerow(row_as_list)
        ref_file.close()

        table = Table("reference", delimiter=",")
        table.columns = [("Reference_Number", ("pk-int", )),
                         ("Reference", ("char", ))]
        table.pk = 'Reference_Number'
        table.contains_pk = True
        self.engine.table = table
        self.engine.create_table()
        self.engine.insert_data_from_file(file_path)

        return self.engine
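The sheet-to-CSV conversion here follows a reusable pattern. A standalone sketch, assuming xlrd is installed and a demo.xls file exists (both hypothetical):

import csv
import xlrd

book = xlrd.open_workbook("demo.xls")
sheet = book.sheet_by_index(0)
with open("demo.csv", "w", newline='') as out:
    writer = csv.writer(out)
    for index in range(sheet.nrows):
        # Write every row, converting each cell value to a string.
        writer.writerow([str(cell.value) for cell in sheet.row(index)])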
Example #13
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        # Python 2 legacy: force the default encoding to UTF-8
        reload(sys)
        if hasattr(sys, 'setdefaultencoding'):
            sys.setdefaultencoding("utf-8")

        self.engine.download_file(self.urls["GWDD"], "GlobalWoodDensityDatabase.xls")
        filename = os.path.basename("GlobalWoodDensityDatabase.xls")
        book = xlrd.open_workbook(self.engine.format_filename(filename))
        sh = book.sheet_by_index(1)
        rows = sh.nrows

        # Creating data files
        file_path = self.engine.format_filename("gwdd_data.csv")
        gwdd_data = open_fw(file_path)
        csv_writer = open_csvw(gwdd_data)
        csv_writer.writerow(["Number", "Family", "Binomial", "Wood_Density", "Region", "Reference_Number"])

        for index in range(1, rows):
            row = sh.row(index)
            # get each row and format the cell value.
            row_as_list = [to_str(column_value.value) for column_value in row]
            csv_writer.writerow(row_as_list)
        gwdd_data.close()

        table = Table("data", delimiter=",")
        table.columns = [("Number", ("pk-int",)),
                         ("Family", ("char",)),
                         ("Binomial", ("char",)),
                         ("Wood_Density", ("double",)),
                         ("Region", ("char",)),
                         ("Reference_Number", ("int",))]
        table.pk = 'Number'
        table.contains_pk = True

        self.engine.table = table
        self.engine.create_table()
        self.engine.insert_data_from_file(file_path)

        # Creating the reference table file
        file_path = self.engine.format_filename("gwdd_ref.csv")
        ref_file = open_fw(file_path)
        csv_writerd = open_csvw(ref_file)
        csv_writerd.writerow(["Reference_Number", "Reference"])
        sh = book.sheet_by_index(2)
        rows = sh.nrows
        for index in range(1, rows):
            row = sh.row(index)
            # get each row and format the cell value.
            row_as_list = [to_str(column_value.value, object_encoding=sys.stdout) for column_value in row]
            csv_writerd.writerow(row_as_list)
        ref_file.close()

        table = Table("reference", delimiter=",")
        table.columns = [("Reference_Number", ("pk-int",)), ("Reference", ("char",))]
        table.pk = 'Reference_Number'
        table.contains_pk = True
        self.engine.table = table
        self.engine.create_table()
        self.engine.insert_data_from_file(file_path)

        return self.engine
Example #14
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        # Complete Plants Checklist
        file_name = "complete_plant_checklist.csv"
        table_name = "complete_plant_checklist"
        complete_plant_url = "https://plants.sc.egov.usda.gov/java/downloadData?fileName=plantlst.txt&static=true"
        self.engine.download_file(complete_plant_url, filename=file_name)
        data_path = self.engine.format_filename(file_name)
        table = Table(table_name, delimiter=",")
        table.columns = [
            ("symbol", ("char", "7")),
            ("synonym_symbol", ("char", "7")),
            ("scientific_name_with_author", ("char", "183")),
            ("common_name", ("char", "42")),
            ("family", ("char", "30")),
        ]
        self.engine.auto_create_table(table, filename=file_name)
        self.engine.insert_data_from_file(data_path)

        # Symbols for Unknown Plants
        file_name = "symbols_unknown_plants.csv"
        table_name = "unknown_plants"
        unknown_plants_url = "https://plants.sc.egov.usda.gov/Data/unknown_plants.txt"
        self.engine.download_file(unknown_plants_url, filename=file_name)
        data_path = self.engine.format_filename(file_name)
        table = Table(table_name, delimiter=",")
        table.columns = [("symbol", ("char", "7")),
                         ("common_name", ("char", "56"))]
        self.engine.auto_create_table(table, filename=file_name)
        self.engine.insert_data_from_file(data_path)

        # State PLANTS Checklist
        base_url = "https://plants.sc.egov.usda.gov/"
        state_plant_checklist_base_url = "{base}java/stateDownload?statefips={id}"
        state_plant_checklist_file = "all_state_plant_checklist.csv"
        table_name = "state_plant_checklist"
        state_plant_checklist = [
            ("US01", "Alabama", "US"),
            ("US02", "Alaska", "US"),
            ("US05", "Arkansas", "US"),
            ("US04", "Arizona", "US"),
            ("US06", "California", "US"),
            ("US08", "Colorado", "US"),
            ("US09", "Connecticut", "US"),
            ("US10", "Delaware", "US"),
            ("US11", "District of Columbia", "US"),
            ("US12", "Florida", "US"),
            ("US13", "Georgia", "US"),
            ("US15", "Hawaii", "US"),
            ("US16", "Idaho", "US"),
            ("US17", "Illinois", "US"),
            ("US18", "Indiana", "US"),
            ("US19", "Iowa", "US"),
            ("US20", "Kansas", "US"),
            ("US21", "Kentucky", "US"),
            ("US22", "Louisiana", "US"),
            ("US23", "Maine", "US"),
            ("US24", "Maryland", "US"),
            ("US25", "Massachusetts", "US"),
            ("US26", "Michigan", "US"),
            ("US27", "Minnesota", "US"),
            ("US28", "Mississippi", "US"),
            ("US29", "Missouri", "US"),
            ("US30", "Montana", "US"),
            ("US31", "Nebraska", "US"),
            ("US32", "Nevada", "US"),
            ("US33", "New Hampshire", "US"),
            ("US34", "New Jersey", "US"),
            ("US35", "New Mexico", "US"),
            ("US36", "New York", "US"),
            ("US37", "North Carolina", "US"),
            ("US38", "North Dakota", "US"),
            ("US39", "Ohio", "US"),
            ("US40", "Oklahoma", "US"),
            ("US41", "Oregon", "US"),
            ("US42", "Pennsylvania", "US"),
            ("US44", "Rhode Island", "US"),
            ("US45", "South Carolina", "US"),
            ("US46", "South Dakota", "US"),
            ("US47", "Tennessee", "US"),
            ("US48", "Texas", "US"),
            ("US49", "Utah", "US"),
            ("US50", "Vermont", "US"),
            ("US51", "Virginia", "US"),
            ("US53", "Washington", "US"),
            ("US54", "West Virginia", "US"),
            ("US55", "Wisconsin", "US"),
            ("US56", "Wyoming", "US"),
            ("US72", "Puerto Rico", "US"),
            ("US78", "Virgin Islands", "US"),
            ("CA01", "Alberta", "Canada"),
            ("CA02", "British Columbia", "Canada"),
            ("CA03", "Manitoba", "Canada"),
            ("CA04", "New Brunswick", "Canada"),
            ("CALB", "Labrador", "Canada"),
            ("CANF", "Newfoundland", "Canada"),
            ("CA13", "Northwest Territories", "Canada"),
            ("CA07", "Nova Scotia", "Canada"),
            ("CA14", "Nunavut", "Canada"),
            ("CA08", "Ontario", "Canada"),
            ("CA09", "Prince Edward Island", "Canada"),
            ("CA10", "Québec", "Canada"),
            ("CA11", "Saskatchewan", "Canada"),
            ("CA12", "Yukon", "Canada"),
            ("GL", "Greenland", "Denmark"),
            ("SB", "St. Pierre and Miquelon", "France"),
        ]

        with open_fw(engine.format_filename(
                state_plant_checklist_file)) as write_object:
            csv_writer = open_csvw(write_object)
            for state_info in state_plant_checklist:
                file_name = state_info[1].replace(".", "").replace(
                    " ", "_").lower() + ".csv"
                file_name = "old_state_plant_checklist_" + file_name
                state_url = state_plant_checklist_base_url.format(
                    base=base_url, id=state_info[0])
                self.engine.download_file(state_url, filename=file_name)
                with open_fr(engine.format_filename(file_name)) as read_object:
                    # Read state file and only write the data minus header
                    next(read_object)
                    for row in csv.reader(read_object, delimiter=","):
                        csv_writer.writerow([state_info[2]] + [state_info[1]] +
                                            row)

        data_path = self.engine.format_filename(state_plant_checklist_file)
        table = Table(table_name, delimiter=",", header_rows=0)
        table.columns = [
            ("country", ("char", "7")),
            ("state", ("char", "23")),
            ("symbol", ("char", "7")),
            ("synonym_symbol", ("char", "7")),
            ("scientific_name_with_author", ("char", "183")),
            ("national_common_name", ("char", "42")),
            ("family", ("char", "17")),
        ]
        self.engine.auto_create_table(table,
                                      filename=state_plant_checklist_file)
        self.engine.insert_data_from_file(data_path)

        # NRCS State GSAT Lists
        base_url = "https://www.plants.usda.gov/"
        nrcs_state_gsat_base_url = "{base}java/gsatDownload?gsatid={id}"
        nrcs_state_gsat_file = "all_nrcs_state_gsat.csv"
        table_name = "nrcs_state_gsat"
        nrcs_state_gsat = [
            ("Alabama", "2"),
            ("Alaska", ""),
            ("Arkansas", ""),
            ("Arizona", "2"),
            ("California", ""),
            ("Colorado", ""),
            ("Connecticut", ""),
            ("Delaware", ""),
            ("Florida", ""),
            ("Georgia", ""),
            ("Hawaii", ""),
            ("Idaho", "9"),
            ("Illinois", ""),
            ("Indiana", ""),
            ("Iowa ", ""),
            ("Kansas", "6"),
            ("Kentucky", ""),
            ("Louisiana", "16"),
            ("Maine", ""),
            ("Maryland", ""),
            ("Massachusetts", ""),
            ("Michigan", ""),
            ("Minnesota", "11"),
            ("Mississippi", ""),
            ("Missouri", "14"),
            ("Montana", ""),
            ("Nebraska", "17"),
            ("Nevada", "4"),
            ("New Hampshire", ""),
            ("New Jersey ", ""),
            ("New Mexico", "1"),
            ("New York", ""),
            ("Noth Carolina", ""),
            ("North Dakota", "5"),
            ("Ohio", ""),
            ("Oklahoma", "12"),
            ("Oregon", "3"),
            ("Pennsylvania", "15"),
            ("Rhode Island", ""),
            ("South Carolina", ""),
            ("South Dakota", "7"),
            ("Tennessee", ""),
            ("Texas", "13"),
            ("Utah", ""),
            ("Vermont ", ""),
            ("Virginia", ""),
            ("Washington", "8"),
            ("West Virginia", ""),
            ("Wisconsin", ""),
            ("Wyoming", "10"),
        ]

        with open_fw(
                engine.format_filename(nrcs_state_gsat_file)) as write_object:
            for state_info in nrcs_state_gsat:
                if state_info[1]:
                    # skip states with no data ("state", ""),
                    file_name = state_info[0].replace(" ", "_").replace(
                        ".", "").lower() + ".csv"
                    file_name = "old_nrcs_state_gsat_" + file_name
                    state_url = nrcs_state_gsat_base_url.format(
                        base=base_url, id=state_info[1])
                    self.engine.download_file(state_url, filename=file_name)
                    with open_fr(
                            engine.format_filename(file_name)) as read_object:
                        # Read state file and only write the data minus header
                        next(read_object)
                        state_quoted = '"{state}",'.format(state=state_info[0])
                        for line in read_object:
                            write_object.write(state_quoted + line)

        data_path = self.engine.format_filename(nrcs_state_gsat_file)
        table = Table(table_name, delimiter=",", header_rows=0)
        table.columns = [
            ("state", ("char", "12")),
            ("symbol", ("char", "7")),
            ("scientific_name_with_author", ("char", "183")),
            ("gsat_common_name", ("char", "93")),
        ]
        self.engine.auto_create_table(table, filename=nrcs_state_gsat_file)
        self.engine.insert_data_from_file(data_path)

        base_url = "https://plants.sc.egov.usda.gov/"
        nrcs_state_plant_lists_url = "{base}java/nrcsStateDownload?statefips={id}"
        nrcs_state_plant_file = "all_nrcs_state_plant.csv"
        table_name = "nrcs_state_plant"
        nrcs_state_plant_lists = [
            ("01", "Alabama"),
            ("02", "Alaska"),
            ("05", "Arkansas"),
            ("04", "Arizona"),
            ("06", "California"),
            ("08", "Colorado"),
            ("09", "Connecticut"),
            ("10", "Delaware"),
            ("12", "Florida"),
            ("13", "Georgia"),
            ("15", "Hawaii"),
            ("16", "Idaho"),
            ("17", "Illinois"),
            ("18", "Indiana"),
            ("19", "Iowa"),
            ("20", "Kansas"),
            ("21", "Kentucky"),
            ("22", "Louisiana"),
            ("23", "Maine"),
            ("24", "Maryland"),
            ("25", "Massachusetts"),
            ("26", "Michigan"),
            ("27", "Minnesota"),
            ("28", "Mississippi"),
            ("29", "Missouri"),
            ("30", "Montana"),
            ("31", "Nebraska"),
            ("32", "Nevada"),
            ("33", "New Hampshire"),
            ("34", "New Jersey"),
            ("35", "New Mexico"),
            ("36", "New York"),
            ("37", "North Carolina"),
            ("38", "North Dakota"),
            ("39", "Ohio"),
            ("40", "Oklahoma"),
            ("41", "Oregon"),
            ("42", "Pennsylvania"),
            ("44", "Rhode Island"),
            ("45", "South Carolina"),
            ("46", "South Dakota"),
            ("47", "Tennessee"),
            ("48", "Texas"),
            ("49", "Utah"),
            ("50", "Vermont"),
            ("51", "Virginia"),
            ("53", "Washington"),
            ("54", "West Virginia"),
            ("55", "Wisconsin"),
            ("56", "Wyoming"),
            ("72", "Puerto Rico"),
            ("78", "Virgin Islands"),
        ]

        with open_fw(
                engine.format_filename(nrcs_state_plant_file)) as write_object:
            for state_info in nrcs_state_plant_lists:
                file_name = state_info[1].replace(" ", "_").replace(
                    ".", "").lower() + ".csv"
                file_name = "old_nrcs_state_plant_" + file_name
                state_url = nrcs_state_plant_lists_url.format(base=base_url,
                                                              id=state_info[0])
                self.engine.download_file(state_url, filename=file_name)
                with open_fr(engine.format_filename(file_name)) as read_object:
                    # Read state file and only write the data minus header
                    next(read_object)
                    state_quoted = '"{state}",'.format(state=state_info[1])
                    for line in read_object:
                        write_object.write(state_quoted + line)

        data_path = self.engine.format_filename(nrcs_state_plant_file)
        table = Table(table_name, delimiter=",", header_rows=0)
        table.columns = [
            ("state", ("char", "17")),
            ("symbol", ("char", "7")),
            ("synonym_symbol", ("char", "7")),
            ("scientific_name_with_author", ("char", "183")),
            ("state_common_name", ("char", "42")),
            ("family", ("char", "17")),
        ]
        self.engine.auto_create_table(table, filename=nrcs_state_plant_file)
        self.engine.insert_data_from_file(data_path)
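The per-state merge used three times in this method reduces to: download each file, skip its header, and prepend identifying columns. A standalone sketch with hypothetical file names:

import csv

state_files = [("Alabama", "alabama.csv"), ("Alaska", "alaska.csv")]
with open("all_states.csv", "w", newline='') as out:
    writer = csv.writer(out)
    for state, file_name in state_files:
        with open(file_name, newline='') as source:
            next(source)  # skip the per-file header row
            for row in csv.reader(source):
                writer.writerow([state] + row)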
Example #15
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        engine.download_files_from_archive(self.urls["capture"], archive_type="zip")

        # Convert xlsx to csv.
        xlsx_file = self.engine.format_filename("DSD_FI_CAPTURE.xlsx")
        file_path = self.engine.format_filename("DSD_CAPTURE.csv")
        book = xlrd.open_workbook(xlsx_file)
        sh = book.sheet_by_index(0)
        rows = sh.nrows

        # Creating data files
        new_data = open_fw(file_path)
        csv_writer = open_csvw(new_data)
        csv_writer.writerow(["Order", "Concept_id",
                             "Role_Type", "Codelist_id",
                             "Codelist_Code_id", "Description"])

        for index in range(2, rows):
            row = sh.row(index)
            # Get each row and format the cell value.
            # Data starts at index 2
            row_as_list = [to_str(column_value.value) for column_value in row]
            csv_writer.writerow(row_as_list)
        new_data.close()

        file_names = [
            ('CL_FI_UNIT.csv', 'unit_data'),
            ('CL_FI_WATERAREA_GROUPS.csv', 'waterarea_groups'),
            ('DSD_CAPTURE.csv', 'dsd_capture_data'),
            ('CL_FI_SPECIES_GROUPS.csv', 'species_group')
        ]

        for (filename, tablename) in file_names:
            data_path = self.engine.format_filename(filename)
            table = Table(tablename, delimiter=',', cleanup=self.cleanup_func_table)
            self.engine.auto_create_table(table, filename=filename)
            self.engine.insert_data_from_file(data_path)

        # File CL_FI_COUNTRY_GROUPS.csv has mixed encodings
        file_names_encoded = [
            ('CL_FI_COUNTRY_GROUPS.csv', 'country_groups'),
        ]
        for (filename, tablename) in file_names_encoded:
            data_path = self.engine.format_filename(filename)
            table = Table(tablename, delimiter=',', cleanup=self.cleanup_func_table)
            table.columns = [('UN_Code', ('int', )),
                             ('Identifier', ('int', )),
                             ('ISO2_Code', ('char', '5')),
                             ('ISO3_Code', ('char', '5')),
                             ('Name_En', ('char', '50')),
                             ('Name_Fr', ('char', '50')),
                             ('Name_Es', ('char', '50')),
                             ('Name_Ar', ('char', '120')),
                             ('Name_Cn', ('char', '90')),
                             ('Name_Ru', ('char', '150')),
                             ('Official_Name_En', ('char', '70')),
                             ('Official_Name_Fr', ('char', '70')),
                             ('Official_Name_Es', ('char', '70')),
                             ('Official_Name_Ar', ('char', '1100')),
                             ('Official_Name_Cn', ('char', '70')),
                             ('Official_Name_Ru', ('char', '130')),
                             ('Continent_Group', ('char', '15')),
                             ('EcoClass_Group', ('char', '50')),
                             ('GeoRegion_Group', ('char', '30'))]
            self.engine.auto_create_table(table, filename=filename)
            self.engine.insert_data_from_file(data_path)

        # TS_FI_CAPTURE is loaded with explicit column types
        file_names_encoded = [
            ('TS_FI_CAPTURE.csv', 'ts_capture_data',)
        ]
        for (filename, tablename) in file_names_encoded:
            data_path = self.engine.format_filename(filename)
            table = Table(tablename, delimiter=',', cleanup=self.cleanup_func_table)
            table.columns = [('COUNTRY', ('int', )),
                             ('FISHING_AREA', ('int', )),
                             ('SPECIES', ('char', '10')),
                             ('YEAR', ('int', )),
                             ('UNIT', ('char', '5')),
                             ('QUANTITY', ('double', )),
                             ('SYMBOL', ('char', '4'))]
            self.engine.auto_create_table(table, filename=filename)
            self.engine.insert_data_from_file(data_path)