Example 1
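Searches a DocumentIndex built from a folder of documents: queries by internal_id are resolved in query order, extra query columns are ignored (or kept when drop=False is passed), and documents can also be looked up by file_path.

The examples on this page rely on module-level imports and fixtures defined elsewhere in the test module. Below is a minimal sketch of that shared setup; the project-specific names (DocumentIndex, FeatureVectorizer, NotFound, check_cache) and the concrete values of data_dir, cache_dir and fnames_in come from the package under test and are only stubbed here.

import os

import numpy as np
import pandas as pd
import pytest
from numpy.testing import assert_equal, assert_array_equal

# DocumentIndex, FeatureVectorizer, NotFound and check_cache are
# imported from the package under test (import paths omitted here).
data_dir = "..."   # folder of raw text documents indexed by the tests
cache_dir = "..."  # scratch directory used as the vectorizer cache
fnames_in = [...]  # expected relative file paths inside data_dir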
def test_search_2fields():
    dbi = DocumentIndex.from_folder(data_dir)
    dbi._make_relative_paths()

    query = pd.DataFrame([{'internal_id': 3},
                          {'internal_id': 1},
                          {'internal_id': 2}])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [3, 1, 2])
    assert_array_equal(sorted(sres.columns),
                       sorted(['internal_id', 'file_path', 'document_id']))

    # make sure that if we have some additional field,
    # we still use the internal_id
    query = pd.DataFrame([{'internal_id': 1, 'a': 2},
                          {'internal_id': 2, 'b': 4},
                          {'internal_id': 1, 'a': 3}])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [1, 2, 1])
    assert_array_equal(sorted(sres.columns),
                       sorted(['internal_id', 'file_path', 'document_id']))

    sres = dbi.search(query, drop=False)
    assert_equal(sres.internal_id.values, [1, 2, 1])
    assert_array_equal(sorted(sres.columns),
                       sorted(['internal_id', 'file_path', 'document_id',
                               'a', 'b']))

    # search by file_path
    query = pd.DataFrame([{'file_path': "0.7.6.28637.txt"},
                          {'file_path': "0.7.47.117435.txt"}])
    sres = dbi.search(query)
    query_res = [dbi.data.file_path.values.tolist().index(el)
                 for el in query.file_path.values]
    assert_array_equal(query_res, sres.internal_id)
    # the file_path column was only needed to resolve the queries above
    del dbi.data['file_path']
Example 2
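Removes two documents (selected by document_id) from an ingested corpus and checks that the feature matrix, the document index and the filename list all shrink by two, that the remaining document_id values are preserved, and that internal_id is renumbered contiguously.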
def test_remove_documents():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup()
    fe.ingest(data_dir)

    X = fe._load_features(uuid)
    db = fe.db_.data
    filenames = fe.filenames_
    n_samples = len(fe.filenames_)

    docs = DocumentIndex.from_folder(data_dir).data
    dataset_definition = docs[['document_id']].to_dict(orient='records')
    fe.remove([dataset_definition[2], dataset_definition[4]])
    X_new = fe._load_features(uuid)
    assert X_new.shape[0] == X.shape[0] - 2
    assert fe.db_.data.shape[0] == db.shape[0] - 2
    assert len(fe.filenames_) == len(filenames) - 2

    dbn = fe.db_.data
    assert_equal(db.iloc[[0, 1, 3, 5]]['document_id'].values,
                 dbn['document_id'].values)
    # check that internal id is contiguous
    assert (np.diff(dbn.internal_id.values) == 1).all()

    # check that the number of samples stays consistent once the cached
    # parameters are dropped and n_samples_ has to be recomputed
    del fe._pars
    assert fe.n_samples_ == n_samples - 2

    fe.delete()
Example 3
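Appends the same folder a second time with document_id values shifted by 10 and checks that the feature matrix, the index and the filename list double in size while internal_id stays contiguous.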
def test_append_documents():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup()
    fe.ingest(data_dir)

    X = fe._load_features(uuid)
    db = fe.db_
    filenames = fe.filenames_
    n_samples = len(fe.filenames_)

    docs = DocumentIndex.from_folder(data_dir).data
    docs['document_id'] += 10
    dataset_definition = docs[['file_path', 'document_id']].to_dict(orient='records')
    for row in dataset_definition:
        row['file_path'] = os.path.join(data_dir, row['file_path'])
    fe.append(dataset_definition)
    X_new = fe._load_features(uuid)
    assert X_new.shape[0] == X.shape[0]*2
    assert fe.db_.data.shape[0] == db.data.shape[0]*2
    assert len(fe.filenames_) == len(filenames)*2

    dbn = fe.db_.data
    assert_equal(dbn.iloc[:n_samples]['document_id'].values,
                 dbn.iloc[n_samples:]['document_id'].values - 10)
    # check that internal id is contiguous
    assert (np.diff(dbn.internal_id.values) == 1).all()

    # check that the number of samples stays consistent once the cached
    # parameters are dropped and n_samples_ has to be recomputed
    del fe._pars
    assert fe.n_samples_ == n_samples * 2

    fe.delete()
Example 4
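Checks that a DocumentIndex is picklable: it is dumped into the cache directory with joblib and loaded back.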
def test_ingestion_pickling():
    import joblib  # 'sklearn.externals.joblib' was removed in recent scikit-learn
    db = DocumentIndex.from_folder(data_dir)
    fname = os.path.join(cache_dir, 'document_index')
    # check that db is picklable
    joblib.dump(db, fname)
    db2 = joblib.load(fname)
    os.remove(fname)
Example 5
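Checks that searching for a file_path that is missing from the index raises NotFound.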
def test_search_not_found():
    dbi = DocumentIndex.from_folder(data_dir)
    query = pd.DataFrame([{'file_path': "DOES_NOT_EXISTS"},
                          {'file_path': "0.7.6.28637.txt"}])
    with pytest.raises(NotFound):
        dbi.search(query)
Example 6
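Checks basic folder ingestion: the base directory is normalized, the index exposes file_path and internal_id columns matching the expected fnames_in, and the stored filenames are the absolute counterparts of the relative paths in the index.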
def test_ingestion_base_dir():
    dbi = DocumentIndex.from_folder(data_dir)
    data_dir_res, filenames, db = dbi.data_dir, dbi.filenames, dbi.data
    assert data_dir_res == os.path.normpath(data_dir)
    assert_array_equal(db.columns.values, ['file_path', 'internal_id'])
    assert_array_equal(db.file_path.values, fnames_in)
    assert_array_equal(
        [os.path.normpath(el) for el in filenames],
        [os.path.join(data_dir_res, el) for el in db.file_path.values])