Code example #1
0
    def test_file_checking(self):
        """
        Various quality assurance checks on the DataReader's validation
        functions:
        - include_flag in manifest used properly to determine loading or
          skipping
        - CSV file being loaded has the expected number of fields as found
          in meta.json
        - convenience function should_file_be_loaded performs properly
        """
        META_FILENAME = os.path.join(os.path.dirname(__file__),
                                     'test_data/meta_sample.json')
        meta = load_meta_data(META_FILENAME)

        MANIFEST_FILENAME = os.path.join(os.path.dirname(__file__),
                                         'test_data/manifest_sample.csv')
        manifest = ManifestReader(MANIFEST_FILENAME)

        for manifest_row in manifest:
            print(manifest_row)
            csv_reader = DataReader(meta=meta, manifest_row=manifest_row)
            # Use a unittest assertion rather than a bare assert(), which is
            # silently stripped when Python runs with -O.
            self.assertTrue(csv_reader._do_fields_match())

            # Assume the sql manifest row exactly matches the manifest row
            # for the purpose of this test, except as modified below.
            sql_manifest_row = manifest_row
            # sql_manifest_row has an extra field to indicate whether it has
            # been successfully loaded in previous runs or not.
            sql_manifest_row['status'] = 'failed'

            if manifest_row['include_flag'] == "skip":
                self.assertFalse(
                    csv_reader._check_include_flag(sql_manifest_row))
                self.assertFalse(
                    csv_reader.should_file_be_loaded(sql_manifest_row))
            elif manifest_row['include_flag'] == "use":
                self.assertTrue(
                    csv_reader._check_include_flag(sql_manifest_row))
                self.assertTrue(
                    csv_reader.should_file_be_loaded(sql_manifest_row))

            # If the file is already in the SQL manifest with status 'loaded',
            # skip it even when the manifest says 'use'.
            sql_manifest_row['status'] = 'loaded'
            if manifest_row['include_flag'] == "use":
                self.assertFalse(
                    csv_reader._check_include_flag(sql_manifest_row))
Code example #2
0
    def __init__(self,
                 database_choice=None,
                 meta_path=None,
                 manifest_path=None,
                 keep_temp_files=True,
                 drop_tables=False):
        """
        Initializes the class with optional arguments. The default behaviour
        is to load the docker database with data tracked from meta.json
        and manifest.csv within the 'python/scripts' folder.

        :param database_choice: choice of 'local_database',
        'docker_database', and 'remote_database'; defaults to
        'docker_database' when None.
        :param meta_path: the path of the meta.json to be used
        :param manifest_path: the path of the manifest.csv to be used
        :param keep_temp_files: if True, temp clean pipe-delimited files will be
        archived in the 'python/logs' folder
        :param drop_tables: stored on the instance as self.drop_tables;
        presumably controls dropping existing tables during loading — the
        consumer is not visible here, confirm against the rest of the class.
        """

        # load defaults if no arguments passed
        _scripts_path = os.path.abspath(os.path.join(PYTHON_PATH, 'scripts'))
        if database_choice is None:
            self.database_choice = 'docker_database'
        else:
            self.database_choice = database_choice
        if meta_path is None:
            meta_path = os.path.abspath(
                os.path.join(_scripts_path, 'meta.json'))
        if manifest_path is None:
            manifest_path = os.path.abspath(
                os.path.join(_scripts_path, 'manifest.csv'))
        self._keep_temp_files = keep_temp_files

        # load given meta.json and manifest.csv files into memory
        self.meta = ingestionfunctions.load_meta_data(meta_path)
        self.manifest = Manifest(manifest_path)

        # setup engine for database_choice
        self.engine = dbtools.get_database_engine(self.database_choice)

        # write the meta.json to the database (must happen after the engine
        # is created, since it talks to the database)
        self._meta_json_to_database()

        # running count of tables that failed to load
        self._failed_table_count = 0

        self.drop_tables = drop_tables
Code example #3
0
def main(database_choice, meta_path, manifest_path, keep_temp_files=True):
    """
    Main routine, calls all the other ones in this file as needed.

    Big picture:
        - Use manifest.csv to find out all the files we want to load
        - Compare current manifest.csv to the sql database manifest table, which
         reflects manifest.csv as of the last time this script was run, and
         tells us whether or not the file has already been loaded in this
         database.
        - Use meta.json to make sure the CSV file has all the fields we expect
        it to, and to decide the data type of each field.
        - Load the csv into the database

    If there is an error loading a file (either flagged error from fields not
    matching, or parsing error if data type isn't right):
    - skip loading this file,
    - report the error to SQL using update_sql_manifest(status="error")
    - use logging.warning() to write the specific error encountered to the
    log file
    - at the end of loading print an error message to the console

    :param database_choice: database connection name, passed through to
    dbtools.get_database_engine.
    :param meta_path: path of the meta.json describing each table's fields.
    :param manifest_path: path of the manifest.csv listing files to load.
    :param keep_temp_files: if False, each table's temporary pipe-delimited
    file is deleted after it has been written to SQL.
    :raises ValueError: if the manifest contains duplicate unique_data_id
    values.
    """

    # load given meta.json and manifest.csv files into memory
    meta = ingestionfunctions.load_meta_data(meta_path)
    manifest = ManifestReader(manifest_path)

    # load given database choice and check/create sql manifest
    engine = dbtools.get_database_engine(database_choice)
    sql_manifest_exists = \
        ingestionfunctions.check_or_create_sql_manifest(engine=engine)
    logging.info("sql_manifest_exists: {}".format(sql_manifest_exists))

    #TODO should this be moved into the __init__ of ManifestReader? Do we ever want to use ManifestReader if it has duplicate rows?
    if not manifest.has_unique_ids():
        raise ValueError('Manifest has duplicate unique_data_id!')

    # Iterate through each row in the manifest then clean and validate. If it
    # passes validation, then write to temp psv file for loading into database.
    for manifest_row in manifest:
        # Incompletely filled out rows in the manifest can break the other code
        # TODO: figure out a way to flag this issue early in loading of manifest

        # only clean and validate data files flagged for use in database
        if manifest_row['include_flag'] == 'use':
            logging.info(
                "{}: preparing to load row {} from the manifest".format(
                    manifest_row['unique_data_id'], len(manifest)))

            temp_filepath = os.path.abspath(
                os.path.join(
                    logging_path,
                    'temp_{}.psv'.format(manifest_row['unique_data_id'])))
            # prepare csv reader and writer objects
            csv_reader = DataReader(meta=meta,
                                    manifest_row=manifest_row,
                                    load_from="file")
            csv_writer = CSVWriter(meta=meta,
                                   manifest_row=manifest_row,
                                   filename=temp_filepath)

            # prepare objects for interfacing with database and then get
            # the equivalent manifest row from the database
            sql_interface = HISql(meta=meta,
                                  manifest_row=manifest_row,
                                  engine=engine,
                                  filename=temp_filepath)
            sql_manifest_row = sql_interface.get_sql_manifest_row()

            # Assign an appropriate testing cleaner
            tablename = manifest_row['destination_table']
            cleaner_class_name = meta[tablename]['cleaner']
            cleaner = ingestionfunctions.get_cleaner_from_name(
                meta=meta, manifest_row=manifest_row, name=cleaner_class_name)

            # Identify fields that exist in meta.json but not CSV
            # so we can add them to the row as it is cleaned and loaded.
            meta_only_fields = {}
            for field in meta[tablename]['fields']:
                if field['source_name'] not in csv_reader.keys:
                    # adds 'sql_name',None as key,value pairs in dict
                    meta_only_fields[field['sql_name']] = None

            # clean the file and save the output to a local pipe-delimited file
            if csv_reader.should_file_be_loaded(
                    sql_manifest_row=sql_manifest_row):
                print("  Cleaning...")
                for idx, data_row in enumerate(csv_reader):
                    # insert the fields declared only in meta.json
                    data_row.update(meta_only_fields)
                    clean_data_row = cleaner.clean(data_row, idx)
                    # the cleaner returns None for rows that should be dropped
                    if clean_data_row is not None:
                        csv_writer.write(clean_data_row)

                csv_writer.close()
                print("  Loading...")

                # Decide whether to append or replace the table
                # (replace_table comes from meta.json, where JSON true/false
                # parse to Python booleans)
                if meta[tablename]["replace_table"]:
                    logging.info("  replacing existing table")
                    sql_interface.drop_table()

                # Appends to table; if dropped, it recreates
                sql_interface.create_table_if_necessary()
                try:
                    sql_interface.write_file_to_sql()
                except TableWritingError:
                    # TODO tell user total count of errors.
                    # currently write_file_to_sql() just writes in log that
                    # the file failed
                    pass
                if not keep_temp_files:
                    csv_writer.remove_file()
Code example #4
0
from housinginsights.ingestion import functions as ingestionfunctions

# SETUP - Mostly taken from load_data.py
this_file = os.path.realpath(__file__)
this_dir = os.path.dirname(this_file)

# Sample fixtures used by these tests in place of the production files.
meta_path = os.path.abspath(
    this_dir +
    '/test_data/meta_sample.json')  # TODO Update when meta.json is updated.
manifest_path = os.path.abspath(
    this_dir +
    '/test_data/manifest_sample.csv')  # TODO Consider updating manifest_sample
database_choice = 'docker_database'
# NOTE(review): relative path — resolved against the current working
# directory, not against this file's location; confirm this is intended.
logging_path = os.path.abspath("../logs")

# Shared objects used by the tests below; building them at import time means
# this module hits the filesystem and the database engine on import.
meta = ingestionfunctions.load_meta_data(meta_path)
engine = dbtools.get_database_engine(database_choice)
manifest = ManifestReader(manifest_path)

# A fabricated meta.json field entry that is required_in_source — presumably
# its source_name is absent from the sample CSV so validation should fail;
# confirm against the tests that use it.
FAKE_REQUIRED_FIELD = {
    'sql_name': 'foo',
    'type': 'decimal',
    'source_name': 'FAKE_REQUIRED',
    'display_text': 'Blah',
    'display_name': 'x',
    'required_in_source': True
}
FAKE_OPTIONAL_FIELD = {
    'sql_name': 'foo',
    'type': 'decimal',
    'source_name': 'FAKE_OPTIONAL',
Code example #5
0
def main(database_choice, meta_path, manifest_path, keep_temp_files=True):
    """
    Main routine, calls all the other ones in this file as needed.

    Big picture:
        - Use manifest.csv to find out all the files we want to load
        - Compare current manifest.csv to the sql database manifest table, which reflects manifest.csv as
          of the last time this script was run, and tells us whether or not the file has already been loaded in this database.
        - Use meta.json to make sure the CSV file has all the fields we expect it to, and to decide the data type of each field
        - Load the csv into the database

    If there is an error loading a file (either flagged error from fields not matching, or parsing error if data type isn't right):
    - skip loading this file,
    - report the error to SQL using update_sql_manifest(status="error")
    - use logging.warning() to write the specific error encountered to the log file
    - at the end of loading print an error message to the console

    :param database_choice: database connection name, passed through to
    dbtools.get_database_engine.
    :param meta_path: path of the meta.json describing each table's fields.
    :param manifest_path: path of the manifest.csv listing files to load.
    :param keep_temp_files: if False, each table's temporary pipe-delimited
    file is deleted after it has been written to SQL.
    :raises ValueError: if the manifest contains duplicate unique_data_id
    values.
    """
    # load given meta.json and manifest.csv files into memory
    meta = ingestionfunctions.load_meta_data(meta_path)
    engine = dbtools.get_database_engine(database_choice)
    manifest = ManifestReader(manifest_path)

    # check/create the sql manifest table that tracks prior loads
    sql_manifest_exists = ingestionfunctions.check_or_create_sql_manifest(engine=engine)
    logging.info("sql_manifest_exists: {}".format(sql_manifest_exists))

    #TODO should this be moved into the __init__ of ManifestReader? Do we ever want to use ManifestReader if it has duplicate rows?
    if not manifest.has_unique_ids():
        raise ValueError('Manifest has duplicate unique_data_id!')

    for manifest_row in manifest:
        # Incompletely filled out rows in the manifest can break the other code
        if manifest_row['include_flag'] == 'use':
            logging.info("{}: preparing to load row {} from the manifest".format(manifest_row['unique_data_id'], len(manifest)))

            temp_filepath = os.path.abspath(
                os.path.join(
                    logging_path,
                    'temp_{}.psv'.format(manifest_row['unique_data_id'])))
            # reader/writer for the CSV, and the SQL interface for this table
            csv_reader = DataReader(meta=meta, manifest_row=manifest_row, load_from="file")
            csv_writer = CSVWriter(meta=meta, manifest_row=manifest_row, filename=temp_filepath)
            sql_interface = HISql(meta=meta, manifest_row=manifest_row, engine=engine, filename=temp_filepath)
            sql_manifest_row = sql_interface.get_sql_manifest_row()

            # Assign an appropriate testing cleaner
            tablename = manifest_row['destination_table']
            cleaner_class_name = meta[tablename]['cleaner']
            cleaner = ingestionfunctions.get_cleaner_from_name(
                meta=meta,
                manifest_row=manifest_row,
                name=cleaner_class_name)

            # clean the file and save the output to a local pipe-delimited file
            if csv_reader.should_file_be_loaded(sql_manifest_row=sql_manifest_row):
                print("  Ready to clean {}".format(csv_reader.destination_table))
                for idx, data_row in enumerate(csv_reader):
                    clean_data_row = cleaner.clean(data_row, idx)
                    # the cleaner returns None for rows that should be dropped
                    if clean_data_row is not None:
                        csv_writer.write(clean_data_row)

                csv_writer.close()
                print("  Ready to load")

                # Decide whether to append or replace the table
                # (replace_table comes from meta.json, where JSON true/false
                # parse to Python booleans)
                if meta[tablename]["replace_table"]:
                    logging.info("  replacing existing table")
                    sql_interface.drop_table()

                # Appends to table; if dropped, it recreates
                sql_interface.create_table()
                try:
                    sql_interface.write_file_to_sql()
                except TableWritingError:
                    # TODO tell user total count of errors.
                    # currently write_file_to_sql() just writes in log that
                    # the file failed
                    pass
                if not keep_temp_files:
                    csv_writer.remove_file()