def _get_cleaner(self, table_name, manifest_row):
     """
     Returns the custom cleaner class that is to be used to clean the 
     specific data for use in database.
     
     :param table_name: the table name for that data being processed
     :param manifest_row: the row representing the data being loaded
     :return: instance of custom cleaner class
     """
     cleaner_class_name = self.meta[table_name]['cleaner']
     return ingestionfunctions.get_cleaner_from_name(
         meta=self.meta, manifest_row=manifest_row, name=cleaner_class_name)
Exemple #2
0
def main(database_choice, meta_path, manifest_path, keep_temp_files=True):
    """
    Main routine, calls all the other ones in this file as needed.

    Big picture:
        - Use manifest.csv to find out all the files we want to load
        - Compare current manifest.csv to the sql database manifest table, which
         reflects manifest.csv as of the last time this script was run, and
         tells us whether or not the file has already been loaded in this
         database.
        - Use meta.json to make sure the CSV file has all the fields we expect
        it to, and to decide the data type of each field.
        - Load the csv into the database

    If there is an error loading a file (either flagged error from fields not
    matching, or parsing error if data type isn't right):
    - skip loading this file,
    - report the error to SQL using update_sql_manifest(status="error")
    - use logging.warning() to write the specific error encountered to the
    log file
    - at the end of loading print an error message to the console
    """

    # load given meta.json and manifest.csv files into memory
    meta = ingestionfunctions.load_meta_data(meta_path)
    manifest = ManifestReader(manifest_path)

    # load given database choice and check/create sql manifest
    engine = dbtools.get_database_engine(database_choice)
    sql_manifest_exists = \
        ingestionfunctions.check_or_create_sql_manifest(engine=engine)
    logging.info("sql_manifest_exists: {}".format(sql_manifest_exists))

    #TODO should this be moved into the __init__ of ManifestReader? Do we ever want to use ManifestReader if it has duplicate rows?
    if not manifest.has_unique_ids():
        raise ValueError('Manifest has duplicate unique_data_id!')

    # Iterate through each row in the manifest then clean and validate. If it
    # passes validation, then write to temp psv file for loading into database.
    for manifest_row in manifest:
        #Incompletely filled out rows in the manifest can break the other code
        # TODO: figure out a way to flag this issue early in loading of manifest

        # only clean and validate data files flagged for use in database
        if manifest_row['include_flag'] == 'use':
            logging.info(
                "{}: preparing to load row {} from the manifest".format(
                    manifest_row['unique_data_id'], len(manifest)))

            temp_filepath = os.path.abspath(
                os.path.join(
                    logging_path,
                    'temp_{}.psv'.format(manifest_row['unique_data_id'])))
            # prepare csv reader and writer objects
            csv_reader = DataReader(meta=meta,
                                    manifest_row=manifest_row,
                                    load_from="file")
            csv_writer = CSVWriter(meta=meta,
                                   manifest_row=manifest_row,
                                   filename=temp_filepath)

            # prepare objects for interfacing with database and then get
            # the equivalent manifest row from the database
            sql_interface = HISql(meta=meta,
                                  manifest_row=manifest_row,
                                  engine=engine,
                                  filename=temp_filepath)
            sql_manifest_row = sql_interface.get_sql_manifest_row()

            #Assign an appropriate testing cleaner
            tablename = manifest_row['destination_table']
            cleaner_class_name = meta[tablename]['cleaner']
            cleaner = ingestionfunctions.get_cleaner_from_name(
                meta=meta, manifest_row=manifest_row, name=cleaner_class_name)

            # Identify fields that exist in meta.json but not CSV
            # so we can add them to the row as it is cleaned and loaded.
            meta_only_fields = {}
            for field in meta[tablename]['fields']:
                if field['source_name'] not in csv_reader.keys:
                    # adds 'sql_name',None as key,value pairs in dict
                    meta_only_fields[field['sql_name']] = None

            #clean the file and save the output to a local pipe-delimited file
            if csv_reader.should_file_be_loaded(
                    sql_manifest_row=sql_manifest_row):
                print("  Cleaning...")
                for idx, data_row in enumerate(csv_reader):
                    data_row.update(
                        meta_only_fields)  # insert other field dict
                    clean_data_row = cleaner.clean(data_row, idx)
                    if clean_data_row != None:
                        csv_writer.write(clean_data_row)

                csv_writer.close()
                print("  Loading...")

                #Decide whether to append or replace the table
                if meta[tablename]["replace_table"] == True:
                    logging.info("  replacing existing table")
                    sql_interface.drop_table()

                #Appends to table; if dropped, it recreates
                sql_interface.create_table_if_necessary()
                try:
                    sql_interface.write_file_to_sql()
                except TableWritingError:
                    #TODO tell user total count of errors.
                    #currently write_file_to_sql() just writes in log that file failed
                    pass
                if keep_temp_files == False:
                    csv_writer.remove_file()
def main(database_choice, meta_path, manifest_path, keep_temp_files = True):
    """
    Main routine, calls all the other ones in this file as needed.

    Big picture:
        - Use manifest.csv to find out all the files we want to load
        - Compare current manifest.csv to the sql database manifest table, which reflects manifest.csv as
          of the last time this script was run, and tells us whether or not the file has already been loaded in this database.
        - Use meta.json to make sure the CSV file has all the fields we expect it to, and to decide the data type of each field
        - Load the csv into the database

    If there is an error loading a file (either flagged error from fields not matching, or parsing error if data type isn't right):
    - skip loading this file,
    - report the error to SQL using update_sql_manifest(status="error")
    - use logging.warning() to write the specific error encountered to the log file
    - at the end of loading print an error message to the console
    """
    meta = ingestionfunctions.load_meta_data(meta_path)
    engine = dbtools.get_database_engine(database_choice)
    manifest = ManifestReader(manifest_path)

    sql_manifest_exists = ingestionfunctions.check_or_create_sql_manifest(engine=engine)
    logging.info("sql_manifest_exists: {}".format(sql_manifest_exists))

    #TODO should this be moved into the __init__ of ManifestReader? Do we ever want to use ManifestReader if it has duplicate rows?
    if not manifest.has_unique_ids():
        raise ValueError('Manifest has duplicate unique_data_id!')

    for manifest_row in manifest:
        #Incompletely filled out rows in the manifest can break the other code
        if manifest_row['include_flag'] == 'use':
            logging.info("{}: preparing to load row {} from the manifest".format(manifest_row['unique_data_id'],len(manifest)))

            temp_filepath = os.path.abspath(
                                os.path.join(
                                    logging_path,
                                    'temp_{}.psv'.format(manifest_row['unique_data_id'])
                                ))
            csv_reader = DataReader(meta = meta, manifest_row=manifest_row, load_from="file")
            csv_writer = CSVWriter(meta = meta, manifest_row = manifest_row, filename = temp_filepath)
            sql_interface = HISql(meta = meta, manifest_row = manifest_row, engine = engine, filename=temp_filepath)
            sql_manifest_row = sql_interface.get_sql_manifest_row()

            #Assign an appropriate testing cleaner
            tablename = manifest_row['destination_table']
            cleaner_class_name = meta[tablename]['cleaner']
            cleaner = ingestionfunctions.get_cleaner_from_name(
                                        meta=meta, 
                                        manifest_row=manifest_row, 
                                        name= cleaner_class_name)

            #clean the file and save the output to a local pipe-delimited file
            if csv_reader.should_file_be_loaded(sql_manifest_row=sql_manifest_row):
                print("  Ready to clean {}".format(csv_reader.destination_table))
                for idx, data_row in enumerate(csv_reader):
                    clean_data_row = cleaner.clean(data_row, idx)
                    if clean_data_row != None:
                        csv_writer.write(clean_data_row)

                csv_writer.close()
                print("  Ready to load")
                
                #Decide whether to append or replace the table
                if meta[tablename]["replace_table"] == True:
                    logging.info("  replacing existing table")
                    sql_interface.drop_table()
                
                #Appends to table; if dropped, it recreates
                sql_interface.create_table()
                try:
                    sql_interface.write_file_to_sql()
                except TableWritingError:
                    #TODO tell user total count of errors. 
                    #currently write_file_to_sql() just writes in log that file failed
                    pass
                if keep_temp_files == False:
                    csv_writer.remove_file()