Example #1
0
def test_add_to_import_history_second_file(db_transact):
    """Importing a second, distinct file yields id 2 and records both rows.

    The second file gets content written to it so its md5 differs from the
    first (empty) file's hash.
    """
    helper_create_import_history_table(db_transact)
    # Context managers close (and thereby delete) the temp files even when an
    # assertion fails; the bare NamedTemporaryFile calls leaked open handles.
    with NamedTemporaryFile(suffix=".csv") as f, \
            NamedTemporaryFile(suffix=".csv") as f2:
        new_id = dbu.add_to_import_history(
            filepath=pathlib.Path(f.name),
            db_connection=db_transact,
            filetype="test1"
        )
        assert new_id == 1
        f2.write(b"Hello, World")
        # Rewind the file head pointer to the beginning, to simulate reading
        # the file fresh.
        f2.seek(0)
        new_id = dbu.add_to_import_history(
            filepath=pathlib.Path(f2.name),
            db_connection=db_transact,
            filetype="test2"
        )
        assert new_id == 2
        import_history = pd.read_sql(
            sql=f"""
                SELECT *
                from {dbu.IMPORT_HISTORY_SCHEMA}.{dbu.IMPORT_HISTORY_TABLE}
            """,
            con=db_transact
        )
        npt.assert_array_equal(
            import_history.columns,
            import_history_cols
        )
        npt.assert_array_equal(
            import_history,
            pd.DataFrame([
                [
                    1,  # id
                    datetime(1970, 1, 2, 3, 4, 5),
                    'test1',  # filetype
                    pathlib.Path(f.name).name,  # filename
                    # d41d8cd98f00b204e9800998ecf8427e = md5 of an empty file
                    'd41d8cd98f00b204e9800998ecf8427e'  # filehash
                ],
                [
                    2,  # id
                    datetime(1970, 1, 2, 3, 4, 20),  # add 15 sec for tick()
                    'test2',  # filetype
                    pathlib.Path(f2.name).name,  # filename
                    '82bb413746aee42f89dea2b59614f9ef'  # md5 of b"Hello, World"
                ],
            ])
        )
Example #2
0
def test_add_to_import_history_simple(db_transact):
    """A first import into an empty history table gets id 1 and one row."""
    helper_create_import_history_table(db_transact)
    # Context manager closes (and deletes) the temp file even on assertion
    # failure; the bare NamedTemporaryFile call leaked the open handle.
    with NamedTemporaryFile(suffix=".csv") as f:
        new_id = dbu.add_to_import_history(
            filepath=pathlib.Path(f.name),
            db_connection=db_transact,
            filetype="test1"
        )
        import_history = pd.read_sql(
            sql=f"""
                SELECT *
                from {dbu.IMPORT_HISTORY_SCHEMA}.{dbu.IMPORT_HISTORY_TABLE}
            """,
            con=db_transact
        )
        assert new_id == 1
        npt.assert_array_equal(
            import_history.columns,
            import_history_cols
        )
        npt.assert_array_equal(
            import_history,
            pd.DataFrame([
                [
                    1,  # id
                    datetime(1970, 1, 2, 3, 4, 5),
                    'test1',  # filetype
                    pathlib.Path(f.name).name,  # filename
                    # d41d8cd98f00b204e9800998ecf8427e = md5 of an empty file
                    'd41d8cd98f00b204e9800998ecf8427e'  # filehash
                ],
            ])
        )
Example #3
0
def test_add_to_import_history_reimport_same_file(db_transact):
    """Re-importing a file with an unchanged hash raises (unique filehash).

    The error message asserted below matches a PostgreSQL unique-constraint
    violation on the filehash column.
    """
    helper_create_import_history_table(db_transact)
    # Context manager closes (and deletes) the temp file even on assertion
    # failure; the bare NamedTemporaryFile call leaked the open handle.
    with NamedTemporaryFile(suffix=".csv") as f:
        new_id = dbu.add_to_import_history(
            filepath=pathlib.Path(f.name),
            db_connection=db_transact,
            filetype="test1"
        )
        assert new_id == 1
        with pytest.raises(Exception) as excinfo:
            # Return value intentionally ignored: the call must raise before
            # producing an id.
            dbu.add_to_import_history(
                filepath=pathlib.Path(f.name),
                db_connection=db_transact,
                filetype="test1"
            )
        assert "Key (filehash)=(d41d8cd98f00b204e9800998ecf8427e) already exists" \
            in str(excinfo.value)
Example #4
0
def test_check_if_file_imported_reimport_altered_file(db_transact):
    """A file whose contents changed (new hash) can be imported again, and
    check_if_file_imported then reports the latest import id."""
    helper_create_import_history_table(db_transact)
    # Context manager closes (and deletes) the temp file even on assertion
    # failure; the bare NamedTemporaryFile call leaked the open handle.
    with NamedTemporaryFile(suffix=".csv") as f:
        filepath = pathlib.Path(f.name)
        new_id = dbu.add_to_import_history(
            filepath=filepath,
            db_connection=db_transact,
            filetype="test1"
        )
        assert new_id == 1
        # Alter the file so its md5 differs from the first import's hash.
        f.write(b"Hello, World")
        f.seek(0)
        new_id = dbu.add_to_import_history(
            filepath=filepath,
            db_connection=db_transact,
            filetype="test1"
        )
        assert new_id == 2
        is_imported = dbu.check_if_file_imported(filepath, db_transact)
        assert is_imported == 2
Example #5
0
def test_check_if_file_imported_simple(db_transact):
    """check_if_file_imported returns the import id of an imported file."""
    helper_create_import_history_table(db_transact)
    # Context manager closes (and deletes) the temp file even on assertion
    # failure; the bare NamedTemporaryFile call leaked the open handle.
    with NamedTemporaryFile(suffix=".csv") as f:
        filepath = pathlib.Path(f.name)
        new_id = dbu.add_to_import_history(
            filepath=filepath,
            db_connection=db_transact,
            filetype="test1"
        )
        assert new_id == 1
        is_imported = dbu.check_if_file_imported(filepath, db_transact)
        assert is_imported == 1
Example #6
0
def test_check_if_file_imported_second_file_same_hash_not(db_transact):
    """A never-imported file is reported as not imported (None), even though
    its content hash matches the already-imported file (both are empty).

    NOTE(review): this implies the lookup is not purely hash-based —
    presumably the filename is also matched; confirm against
    check_if_file_imported.
    """
    helper_create_import_history_table(db_transact)
    # Context managers close (and delete) the temp files even on assertion
    # failure; the bare NamedTemporaryFile calls leaked the open handles.
    with NamedTemporaryFile(suffix=".csv") as f, \
            NamedTemporaryFile(suffix=".csv") as f2:
        new_id = dbu.add_to_import_history(
            filepath=pathlib.Path(f.name),
            db_connection=db_transact,
            filetype="test1"
        )
        assert new_id == 1
        filepath2 = pathlib.Path(f2.name)
        is_imported = dbu.check_if_file_imported(filepath2, db_transact)
        assert is_imported is None
def import_single_file(
    filepath,
    db_engine,
    data_files_path=pathlib.PurePosixPath('/')
        ):
    """Orchestrate reading and importing a single file.

    Skips files already present in the import history. Otherwise determines
    the file's type/parser, reads it into a DataFrame, and writes the import
    history row plus the data in a single transaction.

    Parameters
    ----------
    filepath : pathlib.Path
        Path of the file to import. Must be located under ``data_files_path``
        (``relative_to`` raises ValueError otherwise).
    db_engine :
        SQLAlchemy-style engine; ``begin()`` is used to open transactions.
    data_files_path : pathlib.PurePath, optional
        Root of the data directory; used only to shorten log messages.

    Returns
    -------
    int or None
        The new import_history id, or None if the file was already imported.
    """
    # Compute the display path once; both log messages below use it.
    relative_path = filepath.relative_to(data_files_path)
    if dbu.check_if_file_imported(filepath, db_engine):
        # Lazy %-style args avoid formatting when the level is disabled.
        logging.info("Already imported: %s", relative_path)
        return None
    logging.info("Importing: %s", relative_path)
    logging.debug("Absolute path: %s", filepath)
    file_info = _determine_file_type(filepath=filepath)
    schemaname = 'rawdata'

    # Separate short transaction: only reads the target table's column names.
    with db_engine.begin() as db_con:
        columns_info = dbu.get_db_column_info(
            db_connection=db_con,
            tablename=file_info['tablename'],
            schemaname=schemaname
        )
    columns_name_list = list(columns_info['column_name'])
    df = file_info['parser'](filepath, columns_name_list)

    # Using the context manager allows the adding to import history and writing
    # to DB to be in the same transaction, and it will rollback if it fails.
    with db_engine.begin() as db_con:
        import_id = dbu.add_to_import_history(
            filepath=filepath,
            db_connection=db_con,
            filetype=file_info['filetype']
        )
        # Tag every row with the import it came from.
        df['import_history_id'] = import_id
        dbu.write_df_to_db(
            df=df,
            db_connection=db_con,
            tablename=file_info['tablename'],
            schemaname=schemaname
        )
    return import_id