Example #1
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        for key in self.urls:
            original_file_name = "trade_prdct_{}.txt".format(key)
            new_file_name = "trade_prdct_{}.csv".format(key)

            engine.download_file(self.urls[key], original_file_name)

            old_path = self.engine.format_filename(original_file_name)
            new_path = self.engine.format_filename(new_file_name)

            # Re-write the file with a single delimiter
            old_data = open_fr(old_path)
            new_data = open_fw(new_path)

            # Read header line and convert "," to "|"
            line1 = old_data.readline().strip().replace(",", "|")
            new_data.write(line1 + "\n")
            for line in old_data:
                # Remove leading "|" from the data
                new_data.write(line.strip("|"))
            new_data.close()
            old_data.close()
            table = Table(key, delimiter="|")
            engine.auto_create_table(table, filename=new_file_name)
            engine.insert_data_from_file(new_path)
Example #2
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        for key in self.urls:
            self.engine.download_file(self.urls[key],
                                      self.urls[key].rpartition('/')[-1])
            new_file_path = self.engine.format_filename("new" + key)
            old_data = open_fr(
                self.engine.find_file(self.urls[key].rpartition('/')[-1]))
            new_data = open_fw(new_file_path)
            with old_data as file_block:

                # after the metadata lines, set data to True
                data = False
                for line in file_block.readlines():
                    # metadata contains lines with no ";" and may have "(;;;;)+" or empty lines
                    if not data and (";" not in line or ";;;;" in line):
                        pass
                    else:
                        data = True
                        new_data.write(line)
            new_data.close()
            self.engine.auto_create_table(Table(
                key, cleanup=self.cleanup_func_table),
                                          filename=str("new" + key))
            self.engine.insert_data_from_file(new_file_path)
Example #3
def json2csv(input_file,
             output_file=None,
             header_values=None,
             encoding=ENCODING):
    """Convert Json file to CSV.

    This function is only used for testing and can handle files the size of the test samples.
    """
    file_out = open_fr(input_file, encoding=encoding)
    # set output file name and write header
    if output_file is None:
        output_file = os.path.splitext(
            os.path.basename(input_file))[0] + ".csv"
    csv_out = open_fw(output_file, encoding=encoding)
    if os.name == 'nt':
        outfile = csv.DictWriter(csv_out,
                                 dialect='excel',
                                 escapechar="\\",
                                 lineterminator='\n',
                                 fieldnames=header_values)
    else:
        outfile = csv.DictWriter(csv_out,
                                 dialect='excel',
                                 escapechar="\\",
                                 fieldnames=header_values)
    raw_data = json.loads(file_out.read())
    outfile.writeheader()

    for item in raw_data:
        outfile.writerow(item)
    file_out.close()
    subprocess.call(['rm', '-r', input_file])
    return output_file
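A minimal usage sketch for the helper above (the file name and field list are hypothetical; note that the function deletes its input file via `rm` once the CSV is written):

# records.json contains: [{"id": 1, "name": "a"}, {"id": 2, "name": "b"}]
out_path = json2csv("records.json", header_values=["id", "name"])
print(out_path)  # -> "records.csv"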
Example #4
    def create_table(self):
        """Create the table by creating an empty XML file."""
        self.output_file = open_fw(self.table_name())
        self.output_file.write(u'<?xml version="1.0" encoding="UTF-8"?>')
        self.output_file.write(u'\n<root>')
        self.table_names.append((self.output_file, self.table_name()))
        self.auto_column_number = 1
Example #5
def xml2csv_test(input_file,
                 outputfile=None,
                 header_values=None,
                 row_tag="row"):
    """Convert xml to csv.

    This function is only used for testing and can handle files the size of the test samples.
    """
    file_output = open_fr(input_file, encoding=ENCODING)
    # set output file name and write header
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(input_file))[0] + ".csv"
    csv_out = open_fw(outputfile)
    if os.name == 'nt':
        csv_writer = csv.writer(csv_out,
                                dialect='excel',
                                escapechar='\\',
                                lineterminator='\n')
    else:
        csv_writer = csv.writer(csv_out, dialect='excel', escapechar='\\')

    v = file_output.read()
    csv_writer.writerow(header_values)
    tree = ET.parse(NewFile(v))
    root = tree.getroot()
    for row in root.findall(row_tag):
        # collect the text of each header field's element within this row
        values = [
            element.text for tag in header_values for element in row.findall(tag)
        ]
        csv_writer.writerow(values)
    file_output.close()
    subprocess.call(['rm', '-r', input_file])
    return outputfile
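A hypothetical usage sketch, assuming NewFile is the surrounding module's file-like wrapper around the XML string and that each row element wraps one tag per header value:

# rows.xml contains: <root><row><id>1</id><name>a</name></row></root>
out_path = xml2csv_test("rows.xml", header_values=["id", "name"], row_tag="row")
print(out_path)  # -> "rows.csv"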
Example #6
def write_out_scripts(script_dict, path, out_path):
    """Writes scripts out to a given path"""
    names = os.path.basename(path)
    file_name = names.lower().replace("-", "_") + ".json"
    path_dir = get_directory(os.path.expanduser(path))

    if out_path is not None:
        path_dir = os.path.expanduser(out_path)
        if not os.path.exists(path_dir):
            os.mkdir(path_dir)

    write_path = os.path.join(path_dir, file_name)

    if not (script_dict and "resources" in script_dict):
        print(write_path + " creation skipped because resources were empty.")
        return
    if os.path.exists(write_path):
        choice = clean_input(write_path +
                             " already exists. Overwrite the script? [y/n]")
        if choice == "n":
            print(write_path + " creation skipped.")
            return
    try:
        with open_fw(write_path) as output_file:
            sorted_dict = collections.OrderedDict(script_dict.items())
            json_str = json.dumps(sorted_dict, sort_keys=True, indent=4)
            output_file.write(json_str)
            print("Successfully wrote scripts to " +
                  os.path.abspath(write_path))
    except Exception as error:
        # Python 3 exceptions have no `.message` attribute
        print(write_path + " could not be created. {}".format(error))
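A hypothetical usage sketch (the script_dict contents and paths are made up; helpers like get_directory and clean_input are assumed to come from the surrounding module). The function skips writing when "resources" is empty and prompts before overwriting:

script_dict = {"name": "sample-data",
               "resources": [{"name": "main", "url": "http://example.com/data.csv"}]}
write_out_scripts(script_dict, path="./sample-data", out_path="./converted_scripts")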
Example #8
    def to_csv(self, sort=True, path=None):
        # Due to cyclic imports we cannot move this import to the top
        from retriever.lib.engine_tools import sort_csv

        for table_name in self.script_table_registry[self.script.name]:

            csv_file_output = os.path.normpath(
                os.path.join(path if path else '', table_name[0] + '.csv'))
            csv_file = open_fw(csv_file_output)
            csv_writer = open_csvw(csv_file)
            self.get_cursor()
            self.set_engine_encoding()
            self.cursor.execute("SELECT * FROM  {};".format(table_name[0]))
            row = self.cursor.fetchone()
            column_names = [
                u'{}'.format(tuple_i[0]) for tuple_i in self.cursor.description
            ]
            csv_writer.writerow(column_names)
            while row is not None:
                csv_writer.writerow(row)
                row = self.cursor.fetchone()
            csv_file.close()
            if sort:
                sort_csv(csv_file_output)
        self.disconnect()
Example #9
    def create_table(self):
        """Create the table by creating an empty csv file"""
        self.auto_column_number = 1
        self.file = open_fw(self.table_name())
        self.output_file = open_csvw(self.file)
        column_list = self.table.get_insert_columns(join=False, create=True)
        self.output_file.writerow([u'{}'.format(val) for val in column_list])
        self.table_names.append((self.file, self.table_name()))
Example #10
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        self.engine.download_file(
            self.urls["main"], "Succession_sampling_03-07_data_original.txt")
        data_path = self.engine.format_filename(
            "Succession_sampling_03-07_data.txt")
        old_data = open_fr(
            self.engine.find_file(
                "Succession_sampling_03-07_data_original.txt"))
        new_data = open_fw(data_path)
        # The original file's header contains an end-of-line character in the
        # middle, splitting it across two lines.
        # Read in the two lines and reassemble the full header
        line1 = old_data.readline().strip()
        line2 = old_data.readline()
        newline = line1 + "\t" + line2
        new_data.write(newline)
        for line in old_data:
            new_data.write(line)
        new_data.close()
        old_data.close()

        self.engine.auto_create_table(
            self.tables["main"], filename="Succession_sampling_03-07_data.txt")
        self.engine.insert_data_from_file(data_path)
Example #11
    def create_table(self):
        """Create the table by creating an empty csv file"""
        self.auto_column_number = 1
        table_path = os.path.join(self.opts["data_dir"], self.table_name())
        self.file = open_fw(table_path)
        self.output_file = open_csvw(self.file)
        column_list = self.table.get_insert_columns(join=False, create=True)
        self.output_file.writerow([u'{}'.format(val) for val in column_list])
        self.table_names.append((self.file, table_path))

        # Register all tables created to enable
        # testing python files having custom download function
        Engine.register_tables(self)
Example #12
    def create_table(self):
        """Create the table by creating an empty json file"""
        self.output_file = open_fw(self.table_name())
        self.output_file.write("[")
        self.table_names.append((self.output_file, self.table_name()))
        self.auto_column_number = 1

        # Register all tables created to enable
        # testing python files having custom download function
        if self.script.name not in self.script_table_registry:
            self.script_table_registry[self.script.name] = []
        self.script_table_registry[self.script.name].append(
            (self.table_name(), self.table))
Example #13
    def create_table(self):
        """Create the table by creating an empty json file"""
        table_path = os.path.join(self.opts["data_dir"], self.table_name())
        self.output_file = open_fw(table_path, encoding=self.encoding)
        self.output_file.write("[")
        self.table_names.append((self.output_file, table_path))
        self.auto_column_number = 1

        # Register all tables created to enable
        # testing python files having custom download function
        if self.script.name not in self.script_table_registry:
            self.script_table_registry[self.script.name] = []
        self.script_table_registry[self.script.name].append(
            (self.table_name(), self.table))
Example #14
    def create_table(self):
        """Create the table by creating an empty XML file."""
        self.output_file = open_fw(self.table_name())
        self.output_file.write(u'<?xml version="1.0" encoding="UTF-8"?>')
        self.output_file.write(u'\n<root>')
        self.table_names.append((self.output_file, self.table_name()))
        self.auto_column_number = 1

        # Register all tables created to enable
        # testing python files having custom download function
        if self.script.name not in self.script_table_registry:
            self.script_table_registry[self.script.name] = []
        self.script_table_registry[self.script.name].append(
            (self.table_name(), self.table))
Example #15
def sort_csv(filename, encoding=ENCODING):
    """Sort CSV rows minus the header and return the file.

    This function is only used for testing and can handle files the size of the test samples.
    """
    filename = os.path.normpath(filename)
    input_file = open_fr(filename, encoding)
    csv_reader_infile = csv.reader(input_file, escapechar="\\")
    #  write the data to a temporary file and sort it
    temp_path = os.path.normpath("tempfile")
    temp_file = open_fw(temp_path, encoding)

    csv_writer = open_csvw(temp_file)
    i = 0
    infields = None
    for row in csv_reader_infile:
        if i == 0:
            # The first entry is the header line
            infields = row
            i += 1
        else:
            csv_writer.writerow(row)
    input_file.close()
    temp_file.close()

    # sort the temp file
    sorted_txt = sort_file(temp_path, encoding)
    tmp = open_fr(sorted_txt, encoding)
    in_txt = csv.reader(tmp, delimiter=',', escapechar="\\")
    csv_file = open_fw(filename, encoding)
    csv_writer = open_csvw(csv_file)
    csv_writer.writerow(infields)
    csv_writer.writerows(in_txt)
    tmp.close()
    csv_file.close()
    os.remove(os.path.normpath(temp_path))
    return filename
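A minimal usage sketch (hypothetical file name): the header row stays first while the data rows below it are sorted in place.

sorted_path = sort_csv("observations.csv")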
Example #16
def sort_file(file_path, encoding=ENCODING):
    """Sort file by line and return the file.

    This function is only used for testing and can handle files the size of the test samples.
    """
    file_path = os.path.normpath(file_path)
    input_file = open_fr(file_path, encoding)
    lines = [line.strip() for line in input_file]
    input_file.close()
    outfile = open_fw(file_path, encoding)
    lines.sort()
    for line in lines:
        outfile.write(line + "\n")
    outfile.close()
    return file_path
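A minimal usage sketch (hypothetical file name): the file's lines are sorted and written back to the same path, which is how sort_csv above sorts its temporary data file.

sorted_path = sort_file("tempfile")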
Example #17
    def create_table(self):
        """Create the table by creating an empty csv file"""
        self.auto_column_number = 1
        self.file = open_fw(self.table_name())
        self.output_file = open_csvw(self.file)
        column_list = self.table.get_insert_columns(join=False, create=True)
        self.output_file.writerow([u'{}'.format(val) for val in column_list])
        self.table_names.append((self.file, self.table_name()))

        # Register all tables created to enable
        # testing python files having custom download function
        if self.script.name not in self.script_table_registry:
            self.script_table_registry[self.script.name] = []
        self.script_table_registry[self.script.name].append(
            (self.table_name(), self.table))
Example #18
    def create_table(self):
        """Create the table by creating an empty json file"""
        table_path = os.path.join(self.opts["data_dir"], self.table_name())
        self.output_file = open_fw(table_path)
        self.output_file.write("[")
        self.table_names.append((self.output_file, table_path))
        self.auto_column_number = 1

        # Register all tables created to enable
        # testing python files having custom download function
        if self.script.name not in self.script_table_registry:
            self.script_table_registry[self.script.name] = []
        self.script_table_registry[self.script.name].append(
            (self.table_name(), self.table)
        )
Example #19
    def disconnect(self):
        """Close out the JSON with a `\\n]}` and close the file.

        Close all the file objects that have been created
        Re-write the files stripping off the last comma and then close with a `\\n]}`.
        """
        if self.table_names:
            for output_file_i, file_name in self.table_names:
                output_file_i.close()
                current_input_file = open_fr(file_name)
                file_contents = current_input_file.readlines()
                current_input_file.close()
                file_contents[-1] = file_contents[-1].strip(',\n')
                current_output_file = open_fw(file_name)
                current_output_file.writelines(file_contents)
                current_output_file.writelines(['\n]'])
                current_output_file.close()
            self.table_names = []
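To illustrate the clean-up above, here is the assumed lifecycle of one of these JSON files (row writing happens elsewhere in the engine, so the middle state is an assumption):

# after create_table():        [
# after two rows are written:  [{"a": 1},{"a": 2},
# after disconnect():          [{"a": 1},{"a": 2}
#                              ]   <- trailing comma stripped, "\n]" appended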
Example #21
    def disconnect(self):
        """Close out the xml files

        Close all the file objects that have been created
        Re-write the files stripping off the last comma and then close with a closing tag)
        """
        if self.table_names:
            for output_file_i, file_name in self.table_names:
                output_file_i.close()
                current_input_file = open_fr(file_name)
                file_contents = current_input_file.readlines()
                current_input_file.close()
                file_contents[-1] = file_contents[-1].strip(',')
                current_output_file = open_fw(file_name)
                current_output_file.writelines(file_contents)
                current_output_file.write(u'\n</root>')
                current_output_file.close()
            self.table_names = []
Example #22
    def to_csv(self,
               sort=True,
               path=None,
               select_columns=None,
               select_table=None):
        """Create a CSV file from the a data store.

        sort flag to create a sorted file,
        path to write the flag else write to the PWD,
        select_columns flag is used by large files to select
        columns data and has SELECT LIMIT 3.
        """
        # Due to cyclic imports we cannot move this import to the top
        from retriever.lib.engine_tools import sort_csv

        for table_name in self.script_table_registry[self.script.name]:

            csv_file_output = os.path.normpath(
                os.path.join(path if path else '', table_name[0] + '.csv'))
            self.get_cursor()
            self.set_engine_encoding()
            csv_file = open_fw(csv_file_output, encoding=self.encoding)
            csv_writer = open_csvw(csv_file)

            limit = ""
            cols = "*"
            if select_columns:
                limit = "LIMIT 3"
                cols = ",".join(select_columns)
            sql_query = "SELECT {cols} FROM  {tab} {limit};"
            self.cursor.execute(
                sql_query.format(cols=cols, tab=table_name[0], limit=limit))
            row = self.cursor.fetchone()
            column_names = [
                u'{}'.format(tuple_i[0]) for tuple_i in self.cursor.description
            ]
            csv_writer.writerow(column_names)
            while row is not None:
                csv_writer.writerow(row)
                row = self.cursor.fetchone()
            csv_file.close()
            if sort:
                sort_csv(csv_file_output)
        self.disconnect()
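A hypothetical call, assuming engine is an instance of this engine class with tables already registered for the current script (the path and column names are made up):

engine.to_csv(sort=False, path="/tmp", select_columns=["genus", "species"])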
Example #23
def convert_to_csv(dir_name):
    """Convert the '^'-delimited .txt files in a directory to comma-delimited .csv files"""
    for file_name in os.listdir(dir_name):
        file_path = os.path.join(dir_name, file_name)
        if file_path.endswith(".txt"):
            csv_file_name = file_name.replace(".txt", ".csv").lower()
            output_file = os.path.join(dir_name, csv_file_name)
            with open_fr(file_path,
                         encoding="latin-1") as read_object, open_fw(
                             output_file) as outputfw:
                fr = csv.reader(read_object, delimiter="^", quotechar="~")
                fw = csv.writer(outputfw,
                                delimiter=",",
                                quoting=csv.QUOTE_MINIMAL)
                for line in fr:
                    if line:
                        fw.writerow(line)
            # delete the text files
            os.remove(file_path)
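A minimal usage sketch (hypothetical directory): every "^"-delimited .txt file in the directory is rewritten as a comma-delimited .csv and the original .txt is deleted.

convert_to_csv("./raw_downloads")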
Example #24
    def to_csv(self):
        # Due to cyclic imports we cannot move this import to the top
        from retriever.lib.engine_tools import sort_csv
        for _ in list(self.script.urls.keys()):
            table_name = self.table_name()
            csv_file_output = os.path.normpath(table_name + '.csv')
            csv_file = open_fw(csv_file_output)
            csv_writer = open_csvw(csv_file)
            self.get_cursor()
            self.set_engine_encoding()
            self.cursor.execute("SELECT * FROM {};".format(table_name))
            row = self.cursor.fetchone()
            colnames = [u'{}'.format(tuple_i[0]) for tuple_i in self.cursor.description]
            csv_writer.writerow(colnames)
            while row is not None:
                csv_writer.writerow(row)
                row = self.cursor.fetchone()
            csv_file.close()
            sort_csv(csv_file_output)
        self.disconnect()
Example #25
    def to_csv(self, sort=True):
        # Due to cyclic imports we cannot move this import to the top
        from retriever.lib.engine_tools import sort_csv
        for table_n in list(self.script.tables.keys()):
            table_name = self.table_name(name=table_n)
            csv_file_output = os.path.normpath(table_name + '.csv')
            csv_file = open_fw(csv_file_output)
            csv_writer = open_csvw(csv_file)
            self.get_cursor()
            self.set_engine_encoding()
            self.cursor.execute("SELECT * FROM {};".format(table_name))
            row = self.cursor.fetchone()
            colnames = [u'{}'.format(tuple_i[0]) for tuple_i in self.cursor.description]
            csv_writer.writerow(colnames)
            while row is not None:
                csv_writer.writerow(row)
                row = self.cursor.fetchone()
            csv_file.close()
            if sort:
                sort_csv(csv_file_output)
        self.disconnect()
Example #26
def json2csv(input_file,
             output_file=None,
             header_values=None,
             encoding=ENCODING,
             row_key=None):
    """Convert Json file to CSV."""
    file_out = open_fr(input_file, encoding=encoding)
    # set output file name and write header
    if output_file is None:
        output_file = os.path.splitext(
            os.path.basename(input_file))[0] + ".csv"
    csv_out = open_fw(output_file, encoding=encoding)
    if os.name == 'nt':
        outfile = csv.writer(csv_out,
                             dialect='excel',
                             escapechar="\\",
                             lineterminator='\n')
    else:
        outfile = csv.writer(csv_out, dialect='excel', escapechar="\\")

    raw_data = json.loads(file_out.read(), object_pairs_hook=OrderedDict)

    raw_data, header_values = walker(raw_data,
                                     row_key=row_key,
                                     header_values=header_values,
                                     rows=[],
                                     normalize=False)

    if isinstance(raw_data[0], dict):
        # row values are in a list of dictionaries
        raw_data = [list(row.values()) for row in raw_data]
    else:
        raw_data = [row.tolist() for row in raw_data]
    if header_values:
        outfile.writerow(header_values)
    outfile.writerows(raw_data)
    file_out.close()
    subprocess.call(['rm', '-r', input_file])
    return output_file
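A hypothetical usage sketch for this variant, which flattens nested JSON through walker before writing (the row_key value and file layout are assumptions):

# nested.json contains: {"data": [{"id": 1, "value": "a"}, {"id": 2, "value": "b"}]}
out_path = json2csv("nested.json", row_key="data", header_values=["id", "value"])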
Example #27
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        original_data = "Succession_sampling_03-07_data_original.txt"
        new_date = "Succession_sampling_03-07_data.txt"
        self.engine.download_file(self.urls["main"], original_data)
        data_path = self.engine.format_filename(new_date)
        old_data = open_fr(self.engine.find_file(original_data))
        new_data = open_fw(data_path)
        # The original file's header contains an end-of-line character
        # in the middle, splitting it across two lines.
        # Read in the two lines and reassemble the full header
        line1 = old_data.readline().strip()
        line2 = old_data.readline()
        newline = line1 + "\t" + line2
        new_data.write(newline)
        for line in old_data:
            new_data.write(line)
        new_data.close()
        old_data.close()

        self.engine.auto_create_table(self.tables["main"], filename=new_date)
        self.engine.insert_data_from_file(data_path)
Example #29
    def to_csv(self, sort=True, path=None, select_columns=None):
        """Create a CSV file from the a data store.

        sort flag to create a sorted file,
        path to write the flag else write to the PWD,
        select_columns flag is used by large files to select
        columns data and has SELECT LIMIT 3.
        """
        # Due to cyclic imports we cannot move this import to the top
        from retriever.lib.engine_tools import sort_csv

        for table_name in self.script_table_registry[self.script.name]:

            csv_file_output = os.path.normpath(os.path.join(path if path else '',
                                                            table_name[0] + '.csv'))
            csv_file = open_fw(csv_file_output)
            csv_writer = open_csvw(csv_file)
            self.get_cursor()
            self.set_engine_encoding()
            limit = ""
            cols = "*"
            if select_columns:
                limit = "LIMIT 3"
                cols = ",".join(select_columns)
            sql_query = "SELECT {cols} FROM  {tab} {limit};"
            self.cursor.execute(sql_query.format(cols=cols, tab=table_name[0], limit=limit))
            row = self.cursor.fetchone()
            column_names = [u'{}'.format(tuple_i[0])
                            for tuple_i in self.cursor.description]
            csv_writer.writerow(column_names)
            while row is not None:
                csv_writer.writerow(row)
                row = self.cursor.fetchone()
            csv_file.close()
            if sort:
                sort_csv(csv_file_output)
        self.disconnect()
Example #30
    def create_table(self):
        """Create the table by creating an empty json file"""
        self.output_file = open_fw(self.table_name())
        self.output_file.write("[")
        self.table_names.append((self.output_file, self.table_name()))
        self.auto_column_number = 1
Example #31
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        engine.download_files_from_archive(self.urls["capture"], archive_type="zip")

        # Convert xlsx to csv.
        xlsx_file = self.engine.format_filename("DSD_FI_CAPTURE.xlsx")
        file_path = self.engine.format_filename("DSD_CAPTURE.csv")
        book = xlrd.open_workbook(xlsx_file)
        sh = book.sheet_by_index(0)
        rows = sh.nrows

        # Creating data files
        new_data = open_fw(file_path)
        csv_writer = open_csvw(new_data)
        csv_writer.writerow(["Order", "Concept_id",
                             "Role_Type", "Codelist_id",
                             "Codelist_Code_id", "Description"])

        for index in range(2, rows):
            row = sh.row(index)
            # Get each row and format the cell values.
            # Data starts at index 2
            row_as_list = [to_str(column_value.value) for column_value in row]
            csv_writer.writerow(row_as_list)
        new_data.close()

        file_names = [
            ('CL_FI_UNIT.csv', 'unit_data'),
            ('CL_FI_WATERAREA_GROUPS.csv', 'waterarea_groups'),
            ('DSD_CAPTURE.csv', 'dsd_capture_data'),
            ('CL_FI_SPECIES_GROUPS.csv', 'species_group')
        ]

        for (filename, tablename) in file_names:
            data_path = self.engine.format_filename(filename)
            table = Table(tablename, delimiter=',', cleanup=self.cleanup_func_table)
            self.engine.auto_create_table(table, filename=filename)
            self.engine.insert_data_from_file(data_path)

        # File CL_FI_COUNTRY_GROUPS.csv uses multiple encodings
        file_names_encoded = [
            ('CL_FI_COUNTRY_GROUPS.csv', 'country_groups'),
        ]
        for (filename, tablename) in file_names_encoded:
            data_path = self.engine.format_filename(filename)
            table = Table(tablename, delimiter=',', cleanup=self.cleanup_func_table)
            table.columns = [('UN_Code', ('int', )),
                             ('Identifier', ('int', )),
                             ('ISO2_Code', ('char', '5')),
                             ('ISO3_Code', ('char', '5')),
                             ('Name_En', ('char', '50')),
                             ('Name_Fr', ('char', '50')),
                             ('Name_Es', ('char', '50')),
                             ('Name_Ar', ('char', '120')),
                             ('Name_Cn', ('char', '90')),
                             ('Name_Ru', ('char', '150')),
                             ('Official_Name_En', ('char', '70')),
                             ('Official_Name_Fr', ('char', '70')),
                             ('Official_Name_Es', ('char', '70')),
                             ('Official_Name_Ar', ('char', '1100')),
                             ('Official_Name_Cn', ('char', '70')),
                             ('Official_Name_Ru', ('char', '130')),
                             ('Continent_Group', ('char', '15')),
                             ('EcoClass_Group', ('char', '50')),
                             ('GeoRegion_Group', ('char', '30'))]
            self.engine.auto_create_table(table, filename=filename)
            self.engine.insert_data_from_file(data_path)

            # File TS_FI_CAPTURE.csv also uses multiple encodings
            file_names_encoded = [
                ('TS_FI_CAPTURE.csv', 'ts_capture_data',)
            ]
            for (filename, tablename) in file_names_encoded:
                data_path = self.engine.format_filename(filename)
                table = Table(tablename, delimiter=',', cleanup=self.cleanup_func_table)
                table.columns = [('COUNTRY', ('int', )),
                                 ('FISHING_AREA', ('int', )),
                                 ('SPECIES', ('char', '10')),
                                 ('YEAR', ('int', )),
                                 ('UNIT', ('char', '5')),
                                 ('QUANTITY', ('double', )),
                                 ('SYMBOL', ('char', '4'))]
                self.engine.auto_create_table(table, filename=filename)
                self.engine.insert_data_from_file(data_path)
Example #32
from retriever.lib.defaults import ENCODING

encoding = ENCODING.lower()

from retriever.lib.defaults import VERSION, COPYRIGHT
from retriever.lib.scripts import SCRIPT_LIST, reload_scripts
from retriever.lib.tools import open_fw
from retriever.lib.repository import check_for_updates


def to_str(object, object_encoding=encoding):
    return str(object).encode('UTF-8').decode(encoding)


# Create the .rst file for the available datasets
datasetfile = open_fw("datasets_list.rst")
datasetfile_title = """==================
Datasets Available
==================


"""
check_for_updates()
reload_scripts()
script_list = SCRIPT_LIST()

# write the title of dataset rst file
# ref:http://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html
datasetfile.write(datasetfile_title)

# get info from the scripts using specified encoding
Example #33
    def download(self, engine=None, debug=False):
        try:
            Script.download(self, engine, debug)

            engine = self.engine

            # Species table
            table = Table("species",
                          cleanup=Cleanup(),
                          contains_pk=True,
                          header_rows=9)

            table.columns = [
                ("species_id", ("pk-int", )),
                ("AOU", ("int", )),
                ("english_common_name", ("char", 50)),
                ("french_common_name", ("char", 50)),
                ("spanish_common_name", ("char", 50)),
                ("sporder", ("char", 30)),
                ("family", ("char", 30)),
                ("genus", ("char", 30)),
                ("species", ("char", 50)),
            ]
            table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]

            engine.table = table
            engine.create_table()
            engine.insert_data_from_url(self.urls["species"])

            # Routes table
            engine.download_files_from_archive(self.urls["routes"],
                                               ["routes.csv"])
            engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                     filename="routes.csv")
            engine.insert_data_from_file(engine.format_filename("routes.csv"))

            # Weather table
            if not os.path.isfile(engine.format_filename("weather_new.csv")):
                engine.download_files_from_archive(self.urls["weather"],
                                                   ["weather.csv"])
                read = open_fr(engine.format_filename("weather.csv"))
                write = open_fw(engine.format_filename("weather_new.csv"))
                print("Cleaning weather data...")
                for line in read:
                    values = line.split(',')
                    newvalues = []
                    for value in values:

                        if ':' in value:
                            newvalues.append(value.replace(':', ''))
                        elif value == "N":
                            newvalues.append(None)
                        else:
                            newvalues.append(value)
                    write.write(','.join(str(value) for value in newvalues))
                write.close()
                read.close()

            engine.auto_create_table(Table("weather",
                                           pk="RouteDataId",
                                           cleanup=self.cleanup_func_table),
                                     filename="weather_new.csv")
            engine.insert_data_from_file(
                engine.format_filename("weather_new.csv"))

            # Region_codes table
            table = Table("region_codes",
                          pk=False,
                          header_rows=11,
                          fixed_width=[11, 11, 30])

            def regioncodes_cleanup(value, engine):
                replace = {
                    chr(225): "a",
                    chr(233): "e",
                    chr(237): "i",
                    chr(243): "o"
                }
                newvalue = str(value)
                for key in list(replace.keys()):
                    if key in newvalue:
                        newvalue = newvalue.replace(key, replace[key])
                return newvalue

            table.cleanup = Cleanup(regioncodes_cleanup)

            table.columns = [("countrynum", ("int", )),
                             ("regioncode", ("int", )),
                             ("regionname", ("char", 30))]

            engine.table = table
            engine.create_table()

            engine.insert_data_from_url(self.urls["region_codes"])

            # Counts table
            table = Table("counts", delimiter=',')

            table.columns = [("record_id", ("pk-auto", )),
                             ("countrynum", ("int", )),
                             ("statenum", ("int", )), ("Route", ("int", )),
                             ("RPID", ("int", )), ("Year", ("int", )),
                             ("Aou", ("int", )), ("Count10", ("int", )),
                             ("Count20", ("int", )), ("Count30", ("int", )),
                             ("Count40", ("int", )), ("Count50", ("int", )),
                             ("StopTotal", ("int", )),
                             ("SpeciesTotal", ("int", ))]

            stateslist = [
                "Alabama", "Alaska", "Arizona", "Arkansas", "California",
                "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
                "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky",
                "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan",
                "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska",
                "Nevada", ["New Hampshire", "NHampsh"],
                ["New Jersey", "NJersey"], ["New Mexico", "NMexico"],
                ["New York", "NYork"], ["North Carolina", "NCaroli"],
                ["North Dakota",
                 "NDakota"], "Ohio", "Oklahoma", "Oregon", "Pennsylvania",
                ["Rhode Island", "RhodeIs"], ["South Carolina", "SCaroli"],
                ["South Dakota", "SDakota"], "Tennessee", "Texas", "Utah",
                "Vermont", "Virginia", "Washington",
                ["West Virginia",
                 "W_Virgi"], "Wisconsin", "Wyoming", "Alberta",
                ["British Columbia", "BritCol"], "Manitoba",
                ["New Brunswick", "NBrunsw"],
                ["Northwest Territories", "NWTerri"], "Newfoundland",
                ["Nova Scotia", "NovaSco"], "Nunavut", "Ontario",
                ["Prince Edward Island",
                 "PEI"], "Quebec", "Saskatchewan", "Yukon"
            ]

            state = ""
            shortstate = ""

            engine.table = table
            engine.create_table()

            for state in stateslist:
                try:
                    if len(state) > 2:
                        shortstate = state[0:7]
                    else:
                        state, shortstate = state[0], state[1]

                    print("Inserting data from " + state + "...")
                    try:
                        engine.table.cleanup = Cleanup()
                        engine.insert_data_from_archive(
                            self.urls["counts"] + shortstate + ".zip",
                            [shortstate + ".csv"])
                    except:
                        print("Failed bulk insert on " + state +
                              ", inserting manually.")
                        engine.connection.rollback()
                        engine.table.cleanup = self.cleanup_func_clean
                        engine.insert_data_from_archive(
                            self.urls["counts"] + shortstate + ".zip",
                            [shortstate + ".csv"])

                except:
                    print("There was an error in " + state + ".")
                    raise

        except zipfile.BadZipfile:
            print(
                "There was an unexpected error in the Breeding Bird Survey archives."
            )
            raise

        return engine
Example #34
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        engine = self.engine
        csv_files = []
        request_src = "http://www.data-retriever.org/"
        base_url = "http://www.usanpn.org/npn_portal/observations/getObservations.xml?start_date={startYear}&end_date={endYear_date}&request_src={request_src}"
        header_values = ["observation_id",
                         "update_datetime",
                         "site_id",
                         "latitude",
                         "longitude",
                         "elevation_in_meters",
                         "state",
                         "species_id",
                         "genus",
                         "species",
                         "common_name",
                         "kingdom",
                         "individual_id",
                         "phenophase_id",
                         "phenophase_description",
                         "observation_date",
                         "day_of_year",
                         "phenophase_status",
                         "intensity_category_id",
                         "intensity_value",
                         "abundance_value"
                         ]

        columns = [("record_id", ("pk-auto",)),
                   ("observation_id", ("int",)),  # subsequently refered to as "status record"
                   ("update_datetime", ("char",)),
                   ("site_id", ("int",)),
                   ("latitude", ("double",)),
                   ("longitude", ("double",)),
                   ("elevation_in_meters", ("char",)),
                   ("state", ("char",)),
                   ("species_id", ("int",)),
                   ("genus", ("char",)),
                   ("species", ("char",)),
                   ("common_name", ("char",)),
                   ("kingdom", ("char",)),  # skip kingdom
                   ("individual_id", ("char",)),
                   ("phenophase_id", ("int",)),
                   ("phenophase_description", ("char",)),
                   ("observation_date", ("char",)),
                   ("day_of_year", ("char",)),
                   ("phenophase_status", ("char",)),
                   ("intensity_category_id", ("char",)),
                   ("intensity_value", ("char",)),
                   ("abundance_value", ("char",))
                   ]

        start_date = datetime.date(2009, 1, 1)
        end_date = datetime.date.today()

        while start_date < end_date:
            to_date = start_date + datetime.timedelta(90)
            if to_date >= end_date:
                data_url = base_url.format(startYear=str(start_date), endYear_date=str(end_date),
                                           request_src=request_src)
            else:
                data_url = base_url.format(startYear=str(start_date), endYear_date=str(to_date),
                                           request_src=request_src)

            xml_file_name = '{}'.format(start_date) + ".xml"
            engine.download_file(data_url, xml_file_name)

            # Create csv files for 3 months
            csv_observation = '{}'.format(start_date) + ".csv"
            csv_files.append(csv_observation)
            csv_buff = open_fw(engine.format_filename(csv_observation))
            csv_writer = open_csvw(csv_buff)

            csv_writer.writerow(header_values)

            # Parse xml to read data
            file_read = ""
            fname = DATA_WRITE_PATH.strip('{dataset}') + 'NPN/' + xml_file_name
            with open(fname, 'r') as fp1:
                file_read = fp1.read()

            root = ET.fromstring(file_read)

            for elements in root:
                index_map = {val: i for i, val in enumerate(header_values)}
                diction = sorted(elements.attrib.items(), key=lambda pair: index_map[pair[0]])
                csv_writer.writerow([x[1] for x in diction])

            csv_buff.close()
            start_date = to_date + datetime.timedelta(1)

        # Create table
        table = Table('observations', delimiter=',', pk='record_id', contains_pk=True)
        table.columns = columns
        engine.table = table
        engine.create_table()
        for data_file in csv_files:
            engine.insert_data_from_file(engine.find_file(data_file))
        return engine
Example #35
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        # Complete Plants Checklist
        file_name = "complete_plant_checklist.csv"
        table_name = "complete_plant_checklist"
        complete_plant_url = "https://plants.sc.egov.usda.gov/java/downloadData?fileName=plantlst.txt&static=true"
        self.engine.download_file(complete_plant_url, filename=file_name)
        data_path = self.engine.format_filename(file_name)
        table = Table(table_name, delimiter=",")
        table.columns = [
            ("symbol", ("char", "7")),
            ("synonym_symbol", ("char", "7")),
            ("scientific_name_with_author", ("char", "183")),
            ("common_name", ("char", "42")),
            ("family", ("char", "30")),
        ]
        self.engine.auto_create_table(table, filename=file_name)
        self.engine.insert_data_from_file(data_path)

        # Symbols for Unknown Plants
        file_name = "symbols_unknown_plants.csv"
        table_name = "unknown_plants"
        unknown_plants_url = "https://plants.sc.egov.usda.gov/Data/unknown_plants.txt"
        self.engine.download_file(unknown_plants_url, filename=file_name)
        data_path = self.engine.format_filename(file_name)
        table = Table(table_name, delimiter=",")
        table.columns = [("symbol", ("char", "7")),
                         ("common_name", ("char", "56"))]
        self.engine.auto_create_table(table, filename=file_name)
        self.engine.insert_data_from_file(data_path)

        # State PLANTS Checklist
        base_url = "https://plants.sc.egov.usda.gov/"
        state_plant_checklist_base_url = "{base}java/stateDownload?statefips={id}"
        state_plant_checklist_file = "all_state_plant_checklist.csv"
        table_name = "state_plant_checklist"
        state_plant_checklist = [
            ("US01", "Alabama", "US"),
            ("US02", "Alaska", "US"),
            ("US05", "Arkansas", "US"),
            ("US04", "Arizona", "US"),
            ("US06", "California", "US"),
            ("US08", "Colorado", "US"),
            ("US09", "Connecticut", "US"),
            ("US10", "Delaware", "US"),
            ("US11", "District of Columbia", "US"),
            ("US12", "Florida", "US"),
            ("US13", "Georgia", "US"),
            ("US15", "Hawaii", "US"),
            ("US16", "Idaho", "US"),
            ("US17", "Illinois", "US"),
            ("US18", "Indiana", "US"),
            ("US19", "Iowa", "US"),
            ("US20", "Kansas", "US"),
            ("US21", "Kentucky", "US"),
            ("US22", "Louisiana", "US"),
            ("US23", "Maine", "US"),
            ("US24", "Maryland", "US"),
            ("US25", "Massachusetts", "US"),
            ("US26", "Michigan", "US"),
            ("US27", "Minnesota", "US"),
            ("US28", "Mississippi", "US"),
            ("US29", "Missouri", "US"),
            ("US30", "Montana", "US"),
            ("US31", "Nebraska", "US"),
            ("US32", "Nevada", "US"),
            ("US33", "New Hampshire", "US"),
            ("US34", "New Jersey", "US"),
            ("US35", "New Mexico", "US"),
            ("US36", "New York", "US"),
            ("US37", "North Carolina", "US"),
            ("US38", "North Dakota", "US"),
            ("US39", "Ohio", "US"),
            ("US40", "Oklahoma", "US"),
            ("US41", "Oregon", "US"),
            ("US42", "Pennsylvania", "US"),
            ("US44", "Rhode Island", "US"),
            ("US45", "South Carolina", "US"),
            ("US46", "South Dakota", "US"),
            ("US47", "Tennessee", "US"),
            ("US48", "Texas", "US"),
            ("US49", "Utah", "US"),
            ("US50", "Vermont", "US"),
            ("US51", "Virginia", "US"),
            ("US53", "Washington", "US"),
            ("US54", "West Virginia", "US"),
            ("US55", "Wisconsin", "US"),
            ("US56", "Wyoming", "US"),
            ("US72", "Puerto Rico", "US"),
            ("US78", "Virgin Islands", "US"),
            ("CA01", "Alberta", "Canada"),
            ("CA02", "British Columbia", "Canada"),
            ("CA03", "Manitoba", "Canada"),
            ("CA04", "New Brunswick", "Canada"),
            ("CALB", "Labrador", "Canada"),
            ("CANF", "Newfoundland", "Canada"),
            ("CA13", "Northwest Territories", "Canada"),
            ("CA07", "Nova Scotia", "Canada"),
            ("CA14", "Nunavut", "Canada"),
            ("CA08", "Ontario", "Canada"),
            ("CA09", "Prince Edward Island", "Canada"),
            ("CA10", "Québec", "Canada"),
            ("CA11", "Saskatchewan", "Canada"),
            ("CA12", "Yukon", "Canada"),
            ("GL", "Greenland", "Denmark"),
            ("SB", "St. Pierre and Miquelon", "France"),
        ]

        with open_fw(engine.format_filename(
                state_plant_checklist_file)) as write_object:
            csv_writer = open_csvw(write_object)
            for state_info in state_plant_checklist:
                file_name = state_info[1].replace(".", "").replace(
                    " ", "_").lower() + ".csv"
                file_name = "old_state_plant_checklist_" + file_name
                state_url = state_plant_checklist_base_url.format(
                    base=base_url, id=state_info[0])
                self.engine.download_file(state_url, filename=file_name)
                with open_fr(engine.format_filename(file_name)) as read_object:
                    # Read state file and only write the data minus header
                    next(read_object)
                    for row in csv.reader(read_object, delimiter=","):
                        csv_writer.writerow([state_info[2]] + [state_info[1]] +
                                            row)

        data_path = self.engine.format_filename(state_plant_checklist_file)
        table = Table(table_name, delimiter=",", header_rows=0)
        table.columns = [
            ("country", ("char", "7")),
            ("state", ("char", "23")),
            ("symbol", ("char", "7")),
            ("synonym_symbol", ("char", "7")),
            ("scientific_name_with_author", ("char", "183")),
            ("national_common_name", ("char", "42")),
            ("family", ("char", "17")),
        ]
        self.engine.auto_create_table(table,
                                      filename=state_plant_checklist_file)
        self.engine.insert_data_from_file(data_path)

        # NRCS State GSAT Lists
        base_url = "https://www.plants.usda.gov/"
        nrcs_state_gsat_base_url = "{base}java/gsatDownload?gsatid={id}"
        nrcs_state_gsat_file = "all_nrcs_state_gsat.csv"
        table_name = "nrcs_state_gsat"
        nrcs_state_gsat = [
            ("Alabama", "2"),
            ("Alaska", ""),
            ("Arkansas", ""),
            ("Arizona", "2"),
            ("California", ""),
            ("Colorado", ""),
            ("Connecticut", ""),
            ("Delaware", ""),
            ("Florida", ""),
            ("Georgia", ""),
            ("Hawaii", ""),
            ("Idaho", "9"),
            ("Illinois", ""),
            ("Indiana", ""),
            ("Iowa ", ""),
            ("Kansas", "6"),
            ("Kentucky", ""),
            ("Louisiana", "16"),
            ("Maine", ""),
            ("Maryland", ""),
            ("Massachusetts", ""),
            ("Michigan", ""),
            ("Minnesota", "11"),
            ("Mississippi", ""),
            ("Missouri", "14"),
            ("Montana", ""),
            ("Nebraska", "17"),
            ("Nevada", "4"),
            ("New Hampshire", ""),
            ("New Jersey ", ""),
            ("New Mexico", "1"),
            ("New York", ""),
            ("Noth Carolina", ""),
            ("North Dakota", "5"),
            ("Ohio", ""),
            ("Oklahoma", "12"),
            ("Oregon", "3"),
            ("Pennsylvania", "15"),
            ("Rhode Island", ""),
            ("South Carolina", ""),
            ("South Dakota", "7"),
            ("Tennessee", ""),
            ("Texas", "13"),
            ("Utah", ""),
            ("Vermont ", ""),
            ("Virginia", ""),
            ("Washington", "8"),
            ("West Virginia", ""),
            ("Wisconsin", ""),
            ("Wyoming", "10"),
        ]

        with open_fw(
                engine.format_filename(nrcs_state_gsat_file)) as write_object:
            for state_info in nrcs_state_gsat:
                if state_info[1]:
                    # skip states with no data ("state", ""),
                    file_name = state_info[0].replace(" ", "_").replace(
                        ".", "").lower() + ".csv"
                    file_name = "old_nrcs_state_gsat_" + file_name
                    state_url = nrcs_state_gsat_base_url.format(
                        base=base_url, id=state_info[1])
                    self.engine.download_file(state_url, filename=file_name)
                    with open_fr(
                            engine.format_filename(file_name)) as read_object:
                        # Read state file and only write the data minus header
                        next(read_object)
                        state_quoted = '"{state}",'.format(state=state_info[0])
                        for line in read_object:
                            write_object.write(state_quoted + line)

        data_path = self.engine.format_filename(nrcs_state_gsat_file)
        table = Table(table_name, delimiter=",", header_rows=0)
        table.columns = [
            ("state", ("char", "12")),
            ("symbol", ("char", "7")),
            ("scientific_name_with_author", ("char", "183")),
            ("gsat_common_name", ("char", "93")),
        ]
        self.engine.auto_create_table(table, filename=nrcs_state_gsat_file)
        self.engine.insert_data_from_file(data_path)

        base_url = "https://plants.sc.egov.usda.gov/"
        nrcs_state_plant_lists_url = "{base}java/nrcsStateDownload?statefips={id}"
        nrcs_state_plant_file = "all_nrcs_state_plant.csv"
        table_name = "nrcs_state_plant"
        nrcs_state_plant_lists = [
            ("01", "Alabama"),
            ("02", "Alaska"),
            ("05", "Arkansas"),
            ("04", "Arizona"),
            ("06", "California"),
            ("08", "Colorado"),
            ("09", "Connecticut"),
            ("10", "Delaware"),
            ("12", "Florida"),
            ("13", "Georgia"),
            ("15", "Hawaii"),
            ("16", "Idaho"),
            ("17", "Illinois"),
            ("18", "Indiana"),
            ("19", "Iowa"),
            ("20", "Kansas"),
            ("21", "Kentucky"),
            ("22", "Louisiana"),
            ("23", "Maine"),
            ("24", "Maryland"),
            ("25", "Massachusetts"),
            ("26", "Michigan"),
            ("27", "Minnesota"),
            ("28", "Mississippi"),
            ("29", "Missouri"),
            ("30", "Montana"),
            ("31", "Nebraska"),
            ("32", "Nevada"),
            ("33", "New Hampshire"),
            ("34", "New Jersey"),
            ("35", "New Mexico"),
            ("36", "New York"),
            ("37", "North Carolina"),
            ("38", "North Dakota"),
            ("39", "Ohio"),
            ("40", "Oklahoma"),
            ("41", "Oregon"),
            ("42", "Pennsylvania"),
            ("44", "Rhode Island"),
            ("45", "South Carolina"),
            ("46", "South Dakota"),
            ("47", "Tennessee"),
            ("48", "Texas"),
            ("49", "Utah"),
            ("50", "Vermont"),
            ("51", "Virginia"),
            ("53", "Washington"),
            ("54", "West Virginia"),
            ("55", "Wisconsin"),
            ("56", "Wyoming"),
            ("72", "Puerto Rico"),
            ("78", "Virgin Islands"),
        ]

        with open_fw(
                engine.format_filename(nrcs_state_plant_file)) as write_object:
            for state_info in nrcs_state_plant_lists:
                file_name = state_info[1].replace(" ", "_").replace(
                    ".", "").lower() + ".csv"
                file_name = "old_nrcs_state_plant_" + file_name
                state_url = nrcs_state_plant_lists_url.format(base=base_url,
                                                              id=state_info[0])
                self.engine.download_file(state_url, filename=file_name)
                with open_fr(engine.format_filename(file_name)) as read_object:
                    # Read the state file and write only the data rows,
                    # skipping the header
                    next(read_object)
                    state_quoted = '"{state}",'.format(state=state_info[1])
                    for line in read_object:
                        write_object.write(state_quoted + line)

        data_path = self.engine.format_filename(nrcs_state_plant_file)
        table = Table(table_name, delimiter=",", header_rows=0)
        table.columns = [
            ("state", ("char", "17")),
            ("symbol", ("char", "7")),
            ("synonym_symbol", ("char", "7")),
            ("scientific_name_with_author", ("char", "183")),
            ("state_common_name", ("char", "42")),
            ("family", ("char", "17")),
        ]
        self.engine.auto_create_table(table, filename=nrcs_state_plant_file)
        self.engine.insert_data_from_file(data_path)
    def download(self, engine=None, debug=False):
        try:
            Script.download(self, engine, debug)

            engine = self.engine

            # Species table
            table = Table("species",
                          cleanup=Cleanup(),
                          contains_pk=True,
                          header_rows=9)

            table.columns = [
                ("species_id", ("pk-int", )),
                ("AOU", ("int", )),
                ("english_common_name", ("char", 50)),
                ("french_common_name", ("char", 50)),
                ("spanish_common_name", ("char", 50)),
                ("sporder", ("char", 30)),
                ("family", ("char", 30)),
                ("genus", ("char", 30)),
                ("species", ("char", 50)),
            ]
            table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]

            engine.table = table
            engine.create_table()
            engine.insert_data_from_url(self.urls["species"])

            # Routes table
            engine.download_files_from_archive(self.urls["routes"],
                                               ["routes.csv"])
            engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                     filename="routes.csv")
            engine.insert_data_from_file(engine.format_filename("routes.csv"))

            # Weather table
            if not os.path.isfile(engine.format_filename("weather_new.csv")):
                engine.download_files_from_archive(self.urls["weather"],
                                                   ["weather.csv"])
                read = open_fr(engine.format_filename("weather.csv"))
                write = open_fw(engine.format_filename("weather_new.csv"))
                print("Cleaning weather data...")
                for line in read:
                    values = line.split(',')
                    newvalues = []
                    for value in values:
                        if ':' in value:
                            newvalues.append(value.replace(':', ''))
                        elif value == "N":
                            newvalues.append(None)
                        else:
                            newvalues.append(value)
                    write.write(','.join(str(value) for value in newvalues))
                write.close()
                read.close()
            engine.auto_create_table(Table("weather",
                                           pk="RouteDataId",
                                           cleanup=self.cleanup_func_table),
                                     filename="weather_new.csv")
            engine.insert_data_from_file(
                engine.format_filename("weather_new.csv"))

            # Region_codes table
            table = Table("region_codes",
                          pk=False,
                          header_rows=11,
                          fixed_width=[11, 11, 30])

            def regioncodes_cleanup(value, engine):
                replace = {
                    chr(225): "a",
                    chr(233): "e",
                    chr(237): "i",
                    chr(243): "o"
                }
                newvalue = str(value)
                for key in list(replace.keys()):
                    if key in newvalue:
                        newvalue = newvalue.replace(key, replace[key])
                return newvalue

            table.cleanup = Cleanup(regioncodes_cleanup)

            table.columns = [("countrynum", ("int", )),
                             ("regioncode", ("int", )),
                             ("regionname", ("char", 30))]

            engine.table = table
            engine.create_table()

            engine.insert_data_from_url(self.urls["region_codes"])

            # Counts table
            table = Table("counts", pk=False, delimiter=',')
            table.columns = [("RouteDataID", ("int", )),
                             ("countrynum", ("int", )),
                             ("statenum", ("int", )), ("Route", ("int", )),
                             ("RPID", ("int", )), ("year", ("int", )),
                             ("AOU", ("int", )), ("Stop1", ("int", )),
                             ("Stop2", ("int", )), ("Stop3", ("int", )),
                             ("Stop4", ("int", )), ("Stop5", ("int", )),
                             ("Stop6", ("int", )), ("Stop7", ("int", )),
                             ("Stop8", ("int", )), ("Stop9", ("int", )),
                             ("Stop10", ("int", )), ("Stop11", ("int", )),
                             ("Stop12", ("int", )), ("Stop13", ("int", )),
                             ("Stop14", ("int", )), ("Stop15", ("int", )),
                             ("Stop16", ("int", )), ("Stop17", ("int", )),
                             ("Stop18", ("int", )), ("Stop19", ("int", )),
                             ("Stop20", ("int", )), ("Stop21", ("int", )),
                             ("Stop22", ("int", )), ("Stop23", ("int", )),
                             ("Stop24", ("int", )), ("Stop25", ("int", )),
                             ("Stop26", ("int", )), ("Stop27", ("int", )),
                             ("Stop28", ("int", )), ("Stop29", ("int", )),
                             ("Stop30", ("int", )), ("Stop31", ("int", )),
                             ("Stop32", ("int", )), ("Stop33", ("int", )),
                             ("Stop34", ("int", )), ("Stop35", ("int", )),
                             ("Stop36", ("int", )), ("Stop37", ("int", )),
                             ("Stop38", ("int", )), ("Stop39", ("int", )),
                             ("Stop40", ("int", )), ("Stop41", ("int", )),
                             ("Stop42", ("int", )), ("Stop43", ("int", )),
                             ("Stop44", ("int", )), ("Stop45", ("int", )),
                             ("Stop46", ("int", )), ("Stop47", ("int", )),
                             ("Stop48", ("int", )), ("Stop49", ("int", )),
                             ("Stop50", ("int", ))]

            part = ""
            engine.table = table
            engine.create_table()

            for part in range(1, 11):
                part = str(part)
                try:
                    print("Inserting data from part " + part + "...")
                    try:
                        engine.table.cleanup = Cleanup()
                        engine.insert_data_from_archive(
                            self.urls["counts"] + "Fifty" + part + ".zip",
                            ["fifty" + part + ".csv"])
                    except Exception:
                        print("Failed bulk insert on " + part +
                              ", inserting manually.")
                        engine.connection.rollback()
                        engine.table.cleanup = self.cleanup_func_clean
                        engine.insert_data_from_archive(
                            self.urls["counts"] + "Fifty" + part + ".zip",
                            ["fifty" + part + ".csv"])

                except Exception:
                    print("There was an error in part " + part + ".")
                    raise

        except zipfile.BadZipfile:
            print(
                "There was an unexpected error in the Breeding Bird Survey archives."
            )
            raise

        return engine
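
The weather-cleaning pass above is easier to inspect in isolation. Below is a sketch of the per-line transformation exactly as the loop performs it; note that the "N" marker becomes Python's None, which str() then serializes as the literal text "None" in the output file:

def clean_weather_line(line):
    """Strip ':' from time-like fields and null out the "N" marker."""
    values = line.split(',')
    newvalues = []
    for value in values:
        if ':' in value:
            newvalues.append(value.replace(':', ''))
        elif value == "N":
            newvalues.append(None)  # written out as the text "None"
        else:
            newvalues.append(value)
    return ','.join(str(value) for value in newvalues)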
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        reload(sys)
        if hasattr(sys, 'setdefaultencoding'):
            sys.setdefaultencoding("utf-8")

        self.engine.download_file(self.urls["GWDD"], "GlobalWoodDensityDatabase.xls")
        filename = "GlobalWoodDensityDatabase.xls"
        book = xlrd.open_workbook(self.engine.format_filename(filename))
        sh = book.sheet_by_index(1)
        rows = sh.nrows

        # Creating data files
        file_path = self.engine.format_filename("gwdd_data.csv")
        gwdd_data = open_fw(file_path)
        csv_writer = open_csvw(gwdd_data)
        csv_writer.writerow(["Number", "Family", "Binomial", "Wood_Density", "Region", "Reference_Number"])

        for index in range(1, rows):
            row = sh.row(index)
            # get each row and format each cell value.
            row_as_list = [to_str(column_value.value) for column_value in row]
            csv_writer.writerow(row_as_list)
        gwdd_data.close()

        table = Table("data", delimiter=",")
        table.columns = [("Number", ("pk-int",)),
                         ("Family", ("char",)),
                         ("Binomial", ("char",)),
                         ("Wood_Density", ("double",)),
                         ("Region", ("char",)),
                         ("Reference_Number", ("int",))]
        table.pk = 'Number'
        table.contains_pk = True

        self.engine.table = table
        self.engine.create_table()
        self.engine.insert_data_from_file(engine.format_filename(file_path))

        # Creating the reference table file
        file_path = self.engine.format_filename("gwdd_ref.csv")
        ref_file = open_fw(file_path)
        csv_writerd = open_csvw(ref_file)
        csv_writerd.writerow(["Reference_Number", "Reference"])
        sh = book.sheet_by_index(2)
        rows = sh.nrows
        for index in range(1, rows):
            row = sh.row(index)
            # get each row and format each cell value.
            row_as_list = [to_str(column_value.value, object_encoding=sys.stdout) for column_value in row]
            csv_writerd.writerow(row_as_list)
        ref_file.close()

        table = Table("reference", delimiter=",")
        table.columns = [("Reference_Number", ("pk-int",)), ("Reference", ("char",))]
        table.pk = 'Reference_Number'
        table.contains_pk = True
        self.engine.table = table
        self.engine.create_table()
        self.engine.insert_data_from_file(engine.format_filename(file_path))

        return self.engine
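
Both worksheet dumps above follow the same xlrd pattern. A compact sketch with the sheet index, header, and paths as parameters instead of the GWDD-specific values:

import csv
import xlrd

def sheet_to_csv(xls_path, sheet_index, header, csv_path):
    """Copy one worksheet to CSV, replacing its header row."""
    book = xlrd.open_workbook(xls_path)
    sheet = book.sheet_by_index(sheet_index)
    with open(csv_path, "w", newline="") as out:
        writer = csv.writer(out)
        writer.writerow(header)
        # row 0 holds the sheet's own header, so start at 1
        for index in range(1, sheet.nrows):
            writer.writerow([str(cell.value) for cell in sheet.row(index)])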
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        engine = self.engine

        # download and create species table
        table = Table('species')
        self.engine.auto_create_table(table, url=self.urls['species'])
        self.engine.insert_data_from_url(self.urls['species'])

        # State abbreviations with the year annual inventory began for that state
        stateslist = [('AL', 2001), ('AK', 2004), ('AZ', 2001), ('AR', 2000),
                      ('CA', 2001), ('CO', 2002), ('CT', 2003), ('DE', 2004),
                      ('FL', 2003), ('GA', 1998), ('ID', 2004), ('IL', 2001),
                      ('IN', 1999), ('IA', 1999), ('KS', 2001), ('KY', 1999),
                      ('LA', 2001), ('ME', 1999), ('MD', 2004), ('MA', 2003),
                      ('MI', 2000), ('MN', 1999), ('MO', 1999), ('MS', 2006),
                      ('MT', 2003), ('NE', 2001), ('NV', 2004), ('NH', 2002),
                      ('NJ', 2004), ('NM', 1999), ('NY', 2002), ('NC', 2003),
                      ('ND', 2001), ('OH', 2001), ('OK', 2008), ('OR', 2001),
                      ('PA', 2000), ('RI', 2003), ('SC', 1999), ('SD', 2001),
                      ('TN', 2000), ('TX', 2001), ('UT', 2000), ('VT', 2003),
                      ('VA', 1998), ('WA', 2002), ('WV', 2004), ('WI', 2000),
                      ('WY', 2000), ('PR', 2001)]

        tablelist = [
            "SURVEY", "PLOT", "COND", "SUBPLOT", "SUBP_COND", "TREE",
            "SEEDLING"
        ]

        for table in tablelist:
            for state, year in stateslist:

                engine.download_files_from_archive(
                    self.urls["main"] + state + "_" + table + ".ZIP",
                    [state + "_" + table + ".csv"])

        for table in tablelist:
            print("Scanning data for table %s..." % table)
            prep_file_name = "%s.csv" % table
            prep_file = open_fw(engine.format_filename(prep_file_name))
            this_file = open_fr(
                engine.format_filename(stateslist[0][0] + "_" + table +
                                       ".csv"))
            col_names = this_file.readline()
            prep_file.write(col_names)
            column_names = [col.strip('"') for col in col_names.split(',')]
            year_column = column_names.index("INVYR")
            this_file.close()

            for state, year in stateslist:
                this_file = open_fr(
                    engine.format_filename(state + "_" + table + ".csv"))
                this_file.readline()
                for line in this_file:
                    values = line.split(',')
                    this_year = values[year_column]
                    if int(this_year) >= year:
                        prep_file.write(line)
            prep_file.close()
            engine.auto_create_table(Table(table), filename=prep_file_name)
            engine.insert_data_from_file(
                engine.format_filename(prep_file_name))
            try:
                os.remove(engine.format_filename(prep_file_name))
            except OSError:
                pass
        return engine
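
The scan above keeps only rows whose INVYR value is at or after the year annual inventory began in that state. The core filter, as a standalone generator under the same assumptions about column layout:

def rows_since_year(csv_path, year_column, start_year):
    """Yield data lines whose inventory year meets the state's threshold."""
    with open(csv_path) as src:
        next(src)  # skip the header row
        for line in src:
            values = line.split(',')
            if int(values[year_column]) >= start_year:
                yield line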
    def download(self, engine=None, debug=False):
        try:
            Script.download(self, engine, debug)

            engine = self.engine

            # Species table
            table = Table("species", cleanup=Cleanup(), contains_pk=True,
                          header_rows=9)

            table.columns = [("species_id", ("pk-int",)),
                             ("AOU", ("int",)),
                             ("english_common_name", ("char", 50)),
                             ("french_common_name", ("char", 50)),
                             ("spanish_common_name", ("char", 50)),
                             ("sporder", ("char", 30)),
                             ("family", ("char", 30)),
                             ("genus", ("char", 30)),
                             ("species", ("char", 50)),
                             ]
            table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]

            engine.table = table
            engine.create_table()
            engine.insert_data_from_url(self.urls["species"])

            # Routes table
            engine.download_files_from_archive(
                self.urls["routes"], ["routes.csv"])
            engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                     filename="routes.csv")
            engine.insert_data_from_file(engine.format_filename("routes.csv"))

            # Weather table
            if not os.path.isfile(engine.format_filename("weather_new.csv")):
                engine.download_files_from_archive(self.urls["weather"],
                                                   ["weather.csv"])
                read = open_fr(engine.format_filename("weather.csv"))
                write = open_fw(engine.format_filename("weather_new.csv"))
                print("Cleaning weather data...")
                for line in read:
                    values = line.split(',')
                    newvalues = []
                    for value in values:
                        if ':' in value:
                            newvalues.append(value.replace(':', ''))
                        elif value == "N":
                            newvalues.append(None)
                        else:
                            newvalues.append(value)
                    write.write(','.join(str(value) for value in newvalues))
                write.close()
                read.close()
            engine.auto_create_table(Table("weather", pk="RouteDataId",
                                           cleanup=self.cleanup_func_table),
                                     filename="weather_new.csv")
            engine.insert_data_from_file(
                engine.format_filename("weather_new.csv"))

            # Region_codes table
            table = Table("region_codes", pk=False, header_rows=11,
                          fixed_width=[11, 11, 30])

            def regioncodes_cleanup(value, engine):
                replace = {
                    chr(225): "a",
                    chr(233): "e",
                    chr(237): "i",
                    chr(243): "o"}
                newvalue = str(value)
                for key in list(replace.keys()):
                    if key in newvalue:
                        newvalue = newvalue.replace(key, replace[key])
                return newvalue

            table.cleanup = Cleanup(regioncodes_cleanup)

            table.columns = [("countrynum", ("int",)),
                             ("regioncode", ("int",)),
                             ("regionname", ("char", 30))]

            engine.table = table
            engine.create_table()

            engine.insert_data_from_url(self.urls["region_codes"])

            # Counts table
            table = Table("counts", pk=False, delimiter=',')
            table.columns = [("RouteDataID", ("int",)),
                             ("countrynum", ("int",)),
                             ("statenum", ("int",)),
                             ("Route", ("int",)),
                             ("RPID", ("int",)),
                             ("year", ("int",)),
                             ("AOU", ("int",)),
                             ("Stop1", ("int",)),
                             ("Stop2", ("int",)),
                             ("Stop3", ("int",)),
                             ("Stop4", ("int",)),
                             ("Stop5", ("int",)),
                             ("Stop6", ("int",)),
                             ("Stop7", ("int",)),
                             ("Stop8", ("int",)),
                             ("Stop9", ("int",)),
                             ("Stop10", ("int",)),
                             ("Stop11", ("int",)),
                             ("Stop12", ("int",)),
                             ("Stop13", ("int",)),
                             ("Stop14", ("int",)),
                             ("Stop15", ("int",)),
                             ("Stop16", ("int",)),
                             ("Stop17", ("int",)),
                             ("Stop18", ("int",)),
                             ("Stop19", ("int",)),
                             ("Stop20", ("int",)),
                             ("Stop21", ("int",)),
                             ("Stop22", ("int",)),
                             ("Stop23", ("int",)),
                             ("Stop24", ("int",)),
                             ("Stop25", ("int",)),
                             ("Stop26", ("int",)),
                             ("Stop27", ("int",)),
                             ("Stop28", ("int",)),
                             ("Stop29", ("int",)),
                             ("Stop30", ("int",)),
                             ("Stop31", ("int",)),
                             ("Stop32", ("int",)),
                             ("Stop33", ("int",)),
                             ("Stop34", ("int",)),
                             ("Stop35", ("int",)),
                             ("Stop36", ("int",)),
                             ("Stop37", ("int",)),
                             ("Stop38", ("int",)),
                             ("Stop39", ("int",)),
                             ("Stop40", ("int",)),
                             ("Stop41", ("int",)),
                             ("Stop42", ("int",)),
                             ("Stop43", ("int",)),
                             ("Stop44", ("int",)),
                             ("Stop45", ("int",)),
                             ("Stop46", ("int",)),
                             ("Stop47", ("int",)),
                             ("Stop48", ("int",)),
                             ("Stop49", ("int",)),
                             ("Stop50", ("int",))]

            part = ""
            engine.table = table
            engine.create_table()

            for part in range(1, 11):
                part = str(part)
                try:
                    print("Inserting data from part " + part + "...")
                    try:
                        engine.table.cleanup = Cleanup()
                        engine.insert_data_from_archive(self.urls["counts"] +
                                                        "Fifty" + part + ".zip",
                                                        ["fifty" + part + ".csv"])
                    except Exception:
                        print(
                            "Failed bulk insert on " +
                            part +
                            ", inserting manually.")
                        engine.connection.rollback()
                        engine.table.cleanup = self.cleanup_func_clean
                        engine.insert_data_from_archive(self.urls["counts"] +
                                                        "Fifty" + part + ".zip",
                                                        ["fifty" + part + ".csv"])

                except Exception:
                    print("There was an error in part " + part + ".")
                    raise

        except zipfile.BadZipfile:
            print("There was an unexpected error in the Breeding Bird Survey archives.")
            raise

        return engine
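
The nested try/except above is a bulk-insert-first, clean-and-retry pattern. Factored out as a sketch that assumes the same retriever objects (Cleanup, engine.table, engine.insert_data_from_archive) already in scope in the methods above:

def insert_with_fallback(engine, archive_url, filenames, strict_cleanup):
    """Try a fast insert; on failure, roll back and retry with cleanup."""
    try:
        engine.table.cleanup = Cleanup()
        engine.insert_data_from_archive(archive_url, filenames)
    except Exception:
        engine.connection.rollback()
        engine.table.cleanup = strict_cleanup
        engine.insert_data_from_archive(archive_url, filenames)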
Example #40
0
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)
        reload(sys)
        if hasattr(sys, 'setdefaultencoding'):
            sys.setdefaultencoding("utf-8")

        self.engine.download_file(self.urls["GWDD"],
                                  "GlobalWoodDensityDatabase.xls")
        filename = "GlobalWoodDensityDatabase.xls"
        book = xlrd.open_workbook(self.engine.format_filename(filename))
        sh = book.sheet_by_index(1)
        rows = sh.nrows

        # Creating data files
        file_path = self.engine.format_filename("gwdd_data.csv")
        gwdd_data = open_fw(file_path)
        csv_writer = open_csvw(gwdd_data)
        csv_writer.writerow([
            "Number", "Family", "Binomial", "Wood_Density", "Region",
            "Reference_Number"
        ])

        for index in range(1, rows):
            row = sh.row(index)
            # get each row and format each cell value.
            row_as_list = [to_str(column_value.value) for column_value in row]
            csv_writer.writerow(row_as_list)
        gwdd_data.close()

        table = Table("data", delimiter=",")
        table.columns = [("Number", ("pk-int", )), ("Family", ("char", )),
                         ("Binomial", ("char", )),
                         ("Wood_Density", ("double", )),
                         ("Region", ("char", )),
                         ("Reference_Number", ("int", ))]
        table.pk = 'Number'
        table.contains_pk = True

        self.engine.table = table
        self.engine.create_table()
        self.engine.insert_data_from_file(engine.format_filename(file_path))

        # Creating the reference table file
        file_path = self.engine.format_filename("gwdd_ref.csv")
        ref_file = open_fw(file_path)
        csv_writerd = open_csvw(ref_file)
        csv_writerd.writerow(["Reference_Number", "Reference"])
        sh = book.sheet_by_index(2)
        rows = sh.nrows
        for index in range(1, rows):
            row = sh.row(index)
            # get each row and format each cell value.
            row_as_list = [
                to_str(column_value.value, object_encoding=sys.stdout)
                for column_value in row
            ]
            csv_writerd.writerow(row_as_list)
        ref_file.close()

        table = Table("reference", delimiter=",")
        table.columns = [("Reference_Number", ("pk-int", )),
                         ("Reference", ("char", ))]
        table.pk = 'Reference_Number'
        table.contains_pk = True
        self.engine.table = table
        self.engine.create_table()
        self.engine.insert_data_from_file(engine.format_filename(file_path))

        return self.engine
    def download(self, engine=None, debug=False):
        try:
            Script.download(self, engine, debug)

            engine = self.engine

            # Species table
            table = Table("species", cleanup=Cleanup(), contains_pk=True,
                          header_rows=9)

            table.columns = [("species_id", ("pk-int",)),
                             ("AOU", ("int",)),
                             ("english_common_name", ("char", 50)),
                             ("french_common_name", ("char", 50)),
                             ("spanish_common_name", ("char", 50)),
                             ("sporder", ("char", 30)),
                             ("family", ("char", 30)),
                             ("genus", ("char", 30)),
                             ("species", ("char", 50)),
                             ]
            table.fixed_width = [7, 6, 51, 51, 51, 51, 51, 51, 50]

            engine.table = table
            engine.create_table()
            engine.insert_data_from_url(self.urls["species"])

            # Routes table
            engine.download_files_from_archive(self.urls["routes"],
                                               ["routes.csv"])
            engine.auto_create_table(Table("routes", cleanup=Cleanup()),
                                     filename="routes.csv")
            engine.insert_data_from_file(engine.format_filename("routes.csv"))

            # Weather table
            if not os.path.isfile(engine.format_filename("weather_new.csv")):
                engine.download_files_from_archive(self.urls["weather"],
                                                   ["weather.csv"])
                read = open_fr(engine.format_filename("weather.csv"))
                write = open_fw(engine.format_filename("weather_new.csv"))
                print("Cleaning weather data...")
                for line in read:
                    values = line.split(',')
                    newvalues = []
                    for value in values:
                        if ':' in value:
                            newvalues.append(value.replace(':', ''))
                        elif value == "N":
                            newvalues.append(None)
                        else:
                            newvalues.append(value)
                    write.write(','.join(str(value) for value in newvalues))
                write.close()
                read.close()

            engine.auto_create_table(Table("weather", pk="RouteDataId",
                                           cleanup=self.cleanup_func_table),
                                     filename="weather_new.csv")
            engine.insert_data_from_file(
                engine.format_filename("weather_new.csv"))

            # Region_codes table
            table = Table("region_codes", pk=False, header_rows=11,
                          fixed_width=[11, 11, 30])

            def regioncodes_cleanup(value, engine):
                replace = {chr(225): "a", chr(233): "e", chr(237): "i", chr(243): "o"}
                newvalue = str(value)
                for key in list(replace.keys()):
                    if key in newvalue:
                        newvalue = newvalue.replace(key, replace[key])
                return newvalue

            table.cleanup = Cleanup(regioncodes_cleanup)

            table.columns = [("countrynum", ("int",)),
                             ("regioncode", ("int",)),
                             ("regionname", ("char", 30))]

            engine.table = table
            engine.create_table()

            engine.insert_data_from_url(self.urls["region_codes"])

            # Counts table
            table = Table("counts", delimiter=',')

            table.columns = [("record_id", ("pk-auto",)),
                             ("RouteDataID", ("int",)),
                             ("countrynum", ("int",)),
                             ("statenum", ("int",)),
                             ("Route", ("int",)),
                             ("RPID", ("int",)),
                             ("Year", ("int",)),
                             ("Aou", ("int",)),
                             ("Count10", ("int",)),
                             ("Count20", ("int",)),
                             ("Count30", ("int",)),
                             ("Count40", ("int",)),
                             ("Count50", ("int",)),
                             ("StopTotal", ("int",)),
                             ("SpeciesTotal", ("int",))]

            stateslist = ["Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado",
                          "Connecticut", "Delaware", "Florida", "Georgia", "Idaho",
                          "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine",
                          "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi",
                          "Missouri", "Montana", "Nebraska", "Nevada",
                          ["New Hampshire", "NHampsh"], ["New Jersey", "NJersey"],
                          ["New Mexico", "NMexico"], ["New York", "NYork"],
                          ["North Carolina", "NCaroli"], ["North Dakota", "NDakota"], "Ohio",
                          "Oklahoma", "Oregon", "Pennsylvania", ["Rhode Island", "RhodeIs"],
                          ["South Carolina", "SCaroli"], ["South Dakota", "SDakota"], "Tennessee",
                          "Texas", "Utah", "Vermont", "Virginia", "Washington",
                          ["West Virginia", "W_Virgi"], "Wisconsin", "Wyoming", "Alberta",
                          ["British Columbia", "BritCol"], "Manitoba", ["New Brunswick", "NBrunsw"],
                          ["Northwest Territories", "NWTerri"], "Newfoundland",
                          ["Nova Scotia", "NovaSco"], "Nunavut", "Ontario",
                          ["Prince Edward Island", "PEI"], "Quebec", "Saskatchewan", "Yukon"]

            state = ""
            shortstate = ""

            engine.table = table
            engine.create_table()

            for state in stateslist:
                try:
                    if isinstance(state, list):
                        state, shortstate = state[0], state[1]
                    else:
                        shortstate = state[0:7]

                    print("Inserting data from " + state + "...")
                    try:
                        engine.table.cleanup = Cleanup()
                        engine.insert_data_from_archive(
                            self.urls["counts"] + shortstate + ".zip", [shortstate + ".csv"])
                    except Exception:
                        print(
                            "Failed bulk insert on " +
                            state +
                            ", inserting manually.")
                        engine.connection.rollback()
                        engine.table.cleanup = self.cleanup_func_clean
                        engine.insert_data_from_archive(
                            self.urls["counts"] + shortstate + ".zip",
                            [shortstate + ".csv"])

                except Exception:
                    print("There was an error in " + state + ".")
                    raise

        except zipfile.BadZipfile:
            print("There was an unexpected error in the Breeding Bird Survey archives.")
            raise

        return engine
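
The states list above mixes plain names with [name, short-name] pairs, and the unpacking branch reduces to a small helper (a sketch, not part of the original):

def normalize_state(entry):
    """Return (full_name, short_name) for either entry form."""
    # entries are either "StateName" or ["State Name", "ShortName"];
    # plain names fall back to their first seven characters
    if isinstance(entry, list):
        return entry[0], entry[1]
    return entry, entry[:7]

For example, normalize_state(["New Hampshire", "NHampsh"]) returns ("New Hampshire", "NHampsh"), while normalize_state("Wisconsin") returns ("Wisconsin", "Wiscons").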
Example #42
0
# sys removes the setdefaultencoding method at startup; reload to get it back
reload(sys)
if hasattr(sys, 'setdefaultencoding'):
    # set default encoding to latin-1 to decode source text
    sys.setdefaultencoding('latin-1')


def to_str(object, object_encoding=encoding):
    # NOTE: the object_encoding parameter is accepted but unused; the
    # module-level encoding is applied on the decode instead.
    if sys.version_info >= (3, 0, 0):
        return str(object).encode('UTF-8').decode(encoding)
    return object


# Create the .rst file for the available datasets
datasetfile = open_fw("datasets_list.rst")
datasetfile_title = """==================
Datasets Available
==================


"""
check_for_updates()
reload_scripts()
script_list = SCRIPT_LIST()

# write the title of dataset rst file
# ref:http://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html
datasetfile.write(datasetfile_title)

# get info from the scripts using specified encoding
Example #43
0
    def download(self, engine=None, debug=False):
        Script.download(self, engine, debug)

        engine = self.engine
        csv_files = []
        request_src = "http://www.data-retriever.org/"
        base_url = "http://www.usanpn.org/npn_portal/observations/getObservations.xml?start_date={startYear}&end_date={endYear_date}&request_src={request_src}"
        header_values = [
            "observation_id", "update_datetime", "site_id", "latitude",
            "longitude", "elevation_in_meters", "state", "species_id", "genus",
            "species", "common_name", "kingdom", "individual_id",
            "phenophase_id", "phenophase_description", "observation_date",
            "day_of_year", "phenophase_status", "intensity_category_id",
            "intensity_value", "abundance_value"
        ]

        columns = [
            ("record_id", ("pk-auto", )),
            ("observation_id",
             ("int", )),  # subsequently refered to as "status record"
            ("update_datetime", ("char", )),
            ("site_id", ("int", )),
            ("latitude", ("double", )),
            ("longitude", ("double", )),
            ("elevation_in_meters", ("char", )),
            ("state", ("char", )),
            ("species_id", ("int", )),
            ("genus", ("char", )),
            ("species", ("char", )),
            ("common_name", ("char", )),
            ("kingdom", ("char", )),  # skip kingdom
            ("individual_id", ("char", )),
            ("phenophase_id", ("int", )),
            ("phenophase_description", ("char", )),
            ("observation_date", ("char", )),
            ("day_of_year", ("char", )),
            ("phenophase_status", ("char", )),
            ("intensity_category_id", ("char", )),
            ("intensity_value", ("char", )),
            ("abundance_value", ("char", ))
        ]

        start_date = datetime.date(2009, 1, 1)
        end_date = datetime.date.today()

        while start_date < end_date:
            to_date = start_date + datetime.timedelta(90)
            if to_date >= end_date:
                data_url = base_url.format(startYear=str(start_date),
                                           endYear_date=str(end_date),
                                           request_src=request_src)
            else:
                data_url = base_url.format(startYear=str(start_date),
                                           endYear_date=str(to_date),
                                           request_src=request_src)

            xml_file_name = '{}.xml'.format(start_date)
            engine.download_file(data_url, xml_file_name)

            # Create csv files for 3 months
            csv_observation = '{}.csv'.format(start_date)
            csv_files.append(csv_observation)
            csv_buff = open_fw(engine.format_filename(csv_observation))
            csv_writer = open_csvw(csv_buff)

            csv_writer.writerow(header_values)

            # Parse xml to read data
            file_read = ""
            fname = DATA_WRITE_PATH.strip('{dataset}') + 'NPN/' + xml_file_name
            with open(fname, 'r') as fp1:
                file_read = fp1.read()

            root = ET.fromstring(file_read)

            # header order is fixed, so build the column index map once
            index_map = {val: i for i, val in enumerate(header_values)}
            for elements in root:
                diction = sorted(elements.attrib.items(),
                                 key=lambda pair: index_map[pair[0]])
                csv_writer.writerow([x[1] for x in diction])

            csv_buff.close()
            start_date = to_date + datetime.timedelta(1)

        # Create table
        table = Table('observations',
                      delimiter=',',
                      pk='record_id',
                      contains_pk=True)
        table.columns = columns
        engine.table = table
        engine.create_table()
        for data_file in csv_files:
            engine.insert_data_from_file(engine.find_file(data_file))
        return engine
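
The request loop above pages through the API in 90-day windows from 2009-01-01 to today, clamping the final window to the end date. The date arithmetic alone, as a generator sketch:

import datetime

def date_windows(start, end, days=90):
    """Yield (window_start, window_end) pairs that tile [start, end]."""
    while start < end:
        to_date = start + datetime.timedelta(days)
        yield start, min(to_date, end)
        start = to_date + datetime.timedelta(1)

For example, date_windows(datetime.date(2009, 1, 1), datetime.date(2009, 5, 1)) yields two windows, the second ending exactly on 2009-05-01.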
Example #44
0
    def create_table(self):
        """Create the table by creating an empty json file"""
        self.output_file = open_fw(self.table_name())
        self.output_file.write("[")
        self.table_names.append((self.output_file, self.table_name()))
        self.auto_column_number = 1
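
create_table above opens a JSON array with "[" and records the file handle in self.table_names; the closing "]" must be written when the engine finishes. That counterpart is not shown in this excerpt, so the following is a hypothetical sketch of what it would look like:

    def disconnect(self):
        """Hypothetical sketch: close each JSON array opened above."""
        for output_file, _name in self.table_names:
            output_file.write("]")
            output_file.close()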