Example #1
0
def test_archived_file_creates_a_new_artifact_when_custom_fields_are_different(
    dbdiskrepo, ):
    """Archiving two byte-identical files with different ``custom_fields``
    must yield distinct artifacts that share the same stored value."""
    assert p.get_default_repo() is not None

    scratch = tempfile.mkdtemp('prov_integration_archive_test')
    first_path = os.path.join(scratch, 'data.csv')
    frame = pd.DataFrame({'a': [0, 1, 2], 'b': [10, 11, 12]})
    frame.to_csv(first_path, index=False)

    # Second file is an exact copy, so only the metadata differs.
    second_path = os.path.join(scratch, 'data2.csv')
    shutil.copyfile(first_path, second_path)

    first = p.archive_file(first_path,
                           delete_original=True,
                           custom_fields={'data_source': 'provider one'})
    second = p.archive_file(second_path,
                            delete_original=True,
                            custom_fields={'data_source': 'provider two'})

    # Different custom fields -> different artifact identities ...
    assert first.artifact.id != second.artifact.id
    # ... but identical bytes -> one shared value.
    assert first.artifact.value_id == second.artifact.value_id
    assert first.artifact.custom_fields == {'data_source': 'provider one'}
    assert second.artifact.custom_fields == {'data_source': 'provider two'}
    def write_entry():
        # NOTE(review): this closure reads free variables (`directory`,
        # `demographics`, `matrix`, `id`) from an enclosing scope that is not
        # visible in this chunk -- confirm they are bound before it is called.
        # It persists the demographics object as JSON and the matrix rows as
        # CSV, then archives both files under id-prefixed artifact names,
        # deleting the on-disk originals.
        with open(os.path.join(directory, 'demographic.json'), 'w') as demof:
            json.dump(demographics, demof)

        with open(os.path.join(directory, 'matrix.csv'), 'w') as matrixf:
            writer = csv.writer(matrixf)
            writer.writerows(matrix)
        # Archive both files; delete_original=True removes the local copies.
        p.archive_file(os.path.join(directory, 'demographic.json'),
                       name=id + '/demographic',
                       delete_original=True)
        p.archive_file(os.path.join(directory, 'matrix.csv'),
                       name=id + '/matrix',
                       delete_original=True)
Example #3
0
def test_archived_file_becoming_loaded_value_while_persisting_artifact_info(
        dbdiskrepo):
    """Loading an archived file via transform_value keeps its artifact id,
    and that id is what downstream provenance records as the input."""
    tmp_dir = tempfile.mkdtemp('prov_integration_archive_test')
    data_filename = os.path.join(tmp_dir, 'data.csv')
    source = pd.DataFrame({'a': [0, 1, 2], 'b': [10, 11, 12]})
    source.to_csv(data_filename, index=False)
    archived_file = p.archive_file(data_filename, delete_original=True)

    @p.provenance(archive_file=True, delete_original_file=True)
    def add_col_c_ret_df(df):
        df['c'] = df['a'] + df['b']
        out_path = os.path.join(tmp_dir, 'data2.csv')
        df.to_csv(out_path, index=False)
        return out_path

    def read_csv(af):
        return pd.read_csv(str(af))

    loaded = archived_file.transform_value(read_csv)
    # Loading the value must not mint a new artifact.
    assert loaded.artifact.id == archived_file.artifact.id

    result = add_col_c_ret_df(loaded).transform_value(read_csv)
    assert list(result['c'].values) == [10, 12, 14]
    # The original archived file is recorded as the call's `df` input.
    recorded = result.artifact.inputs['kargs']['df']
    assert recorded.artifact.id == archived_file.artifact.id
Example #4
0
def test_archived_file_used_in_input(dbdiskrepo):
    """An archived file passed to a provenance-tracked function is readable
    inside it and is recorded verbatim among the call's inputs."""
    assert p.get_default_repo() is not None

    workdir = tempfile.mkdtemp('prov_integration_archive_test')
    csv_path = os.path.join(workdir, 'data.csv')
    pd.DataFrame({
        'a': [0, 1, 2],
        'b': [10, 11, 12],
    }).to_csv(csv_path, index=False)

    assert os.path.exists(csv_path)
    archived_file = p.archive_file(csv_path,
                                   delete_original=True,
                                   custom_fields={'foo': 'bar'})
    # delete_original=True removes the source file after archiving.
    assert not os.path.exists(csv_path)
    assert archived_file.artifact.custom_fields == {'foo': 'bar'}

    @p.provenance()
    def add_col_c_ret_df(filename):
        df = pd.read_csv(str(filename))
        df['c'] = df['a'] + df['b']
        return df

    ret = add_col_c_ret_df(archived_file)
    assert list(ret['c'].values) == [10, 12, 14]
    # The archived file object itself is the recorded input value.
    assert ret.artifact.inputs['kargs']['filename'] == archived_file
Example #5
0
def test_archived_file_cache_hits_when_filename_is_different(dbdiskrepo):
    """Two files with identical contents but different names must hash to
    the same artifact, i.e. archiving is content-addressed, not name-based.

    Fix: dropped the unused local ``repo = dbdiskrepo`` -- the fixture is
    only needed for its side effect of configuring the default repo.
    """
    assert p.get_default_repo() is not None
    tmp_dir = tempfile.mkdtemp('prov_integration_archive_test')
    data_filename = os.path.join(tmp_dir, 'data.csv')
    pd.DataFrame({'a': [0, 1, 2], 'b': [10, 11, 12]}).\
        to_csv(data_filename, index=False)

    # Same bytes under a different filename.
    data_filename2 = os.path.join(tmp_dir, 'data2.csv')
    shutil.copyfile(data_filename, data_filename2)

    archived_file = p.archive_file(data_filename, delete_original=True)
    assert not os.path.exists(data_filename)
    archived_file2 = p.archive_file(data_filename2, delete_original=True)
    assert not os.path.exists(data_filename2)

    # Identical contents -> cache hit -> identical artifact id.
    assert archived_file.artifact.id == archived_file2.artifact.id
Example #6
0
def test_archived_file_canonicalizes_file_extenstions(dbdiskrepo):
    # NOTE(review): "extenstions" typo in the test name is kept to preserve
    # the public identifier; renaming would change the collected test id.
    """With preserve_ext=True the stored value id keeps a file extension,
    canonicalized to its standard form ('.MPEG' -> '.mpg')."""
    assert p.get_default_repo() is not None

    workdir = tempfile.mkdtemp('prov_integration_archive_test')
    movie_path = os.path.join(workdir, 'foo.MPEG')
    spit(movie_path, 'blah')

    archived = p.archive_file(movie_path,
                              delete_original=True,
                              preserve_ext=True)

    assert archived.artifact.value_id.endswith('.mpg')
Example #7
0
def test_archived_file_allows_extensions_to_be_ignored(dbdiskrepo):
    """With preserve_ext=False the stored value id does not carry the
    source file's extension.

    Fix: dropped the unused local ``repo = dbdiskrepo`` -- the fixture is
    only needed for its side effect of configuring the default repo.
    """
    assert p.get_default_repo() is not None
    tmp_dir = tempfile.mkdtemp('prov_integration_archive_test')
    # Oddball extension on purpose; it must not leak into the value id.
    data_filename = os.path.join(tmp_dir, 'data.csv00')
    pd.DataFrame({'a': [0, 1, 2], 'b': [10, 11, 12]}).\
        to_csv(data_filename, index=False)

    archived_file = p.archive_file(data_filename, delete_original=True,
                                   preserve_ext=False)

    assert not archived_file.artifact.value_id.endswith('.csv')
Example #8
0
def test_output_is_archived_as_file(dbdiskrepo):
    """A provenance-tracked function returning a filename, decorated with
    archive_file=True, yields an archived result that can be read back.

    Fix: dropped the unused local ``repo = dbdiskrepo`` -- the fixture is
    only needed for its side effect of configuring the default repo.
    """
    tmp_dir = tempfile.mkdtemp('prov_integration_archive_test')
    data_filename = os.path.join(tmp_dir, 'data.csv')
    pd.DataFrame({'a': [0, 1, 2], 'b': [10, 11, 12]}).\
        to_csv(data_filename, index=False)
    archived_file = p.archive_file(data_filename, delete_original=True)

    @p.provenance(archive_file=True, delete_original_file=True)
    def add_col_c_ret_df(filename):
        df = pd.read_csv(str(filename))
        df['c'] = df['a'] + df['b']
        # Return a path; the decorator archives the file it points at.
        out_path = os.path.join(tmp_dir, 'data2.csv')
        df.to_csv(out_path, index=False)
        return out_path

    ret_file = add_col_c_ret_df(archived_file)
    # str() on the archived result resolves to a readable local path.
    ret = pd.read_csv(str(ret_file))
    assert list(ret['c'].values) == [10, 12, 14]