def test_search_2fields():
    """Search by internal_id (with extra query columns) and by file_path."""
    dbi = DocumentIndex.from_folder(data_dir)
    dbi._make_relative_paths()

    query = pd.DataFrame([{'internal_id': iid} for iid in (3, 1, 2)])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [3, 1, 2])
    base_columns = sorted(['internal_id', 'file_path', 'document_id'])
    assert_array_equal(sorted(sres.columns), base_columns)

    # extra columns in the query must not prevent indexing on internal_id
    query = pd.DataFrame([{'internal_id': 1, 'a': 2},
                          {'internal_id': 2, 'b': 4},
                          {'internal_id': 1, 'a': 3}])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [1, 2, 1])
    assert_array_equal(sorted(sres.columns), base_columns)

    # with drop=False the extra query columns are carried through to the result
    sres = dbi.search(query, drop=False)
    assert_equal(sres.internal_id.values, [1, 2, 1])
    assert_array_equal(sorted(sres.columns),
                       sorted(['internal_id', 'file_path', 'document_id',
                               'a', 'b']))

    query = pd.DataFrame([{'file_path': "0.7.6.28637.txt"},
                          {'file_path': "0.7.47.117435.txt"}])
    # NOTE(review): the file_path column is dropped *before* the search, yet
    # dbi.data.file_path is read again below — search presumably restores it;
    # confirm against DocumentIndex.search.
    del dbi.data['file_path']
    sres = dbi.search(query)
    query_res = [dbi.data.file_path.values.tolist().index(el)
                 for el in query.file_path.values]
    assert_array_equal(query_res, sres.internal_id)
def test_search_document_id():
    """internal_id takes precedence in queries; document_id works alone too."""
    records = [('/test', 2), ('/test2', 1), ('/test3', 7),
               ('/test8', 9), ('/test9', 4)]
    md = [{'file_path': path, 'document_id': doc_id, 'internal_id': pos}
          for pos, (path, doc_id) in enumerate(records)]

    dbi = DocumentIndex.from_list(md)
    dbi._make_relative_paths()

    query = pd.DataFrame([{'internal_id': iid} for iid in (1, 2, 1)])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [1, 2, 1])
    assert_array_equal(sorted(sres.columns),
                       sorted(['internal_id', 'file_path', 'document_id']))

    # internal_id wins even when a (conflicting) document_id is also given
    query = pd.DataFrame([{'internal_id': iid, 'document_id': 2}
                          for iid in (1, 2, 1)])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [1, 2, 1])

    # a pure document_id lookup maps back to the positional internal ids
    query = pd.DataFrame([{'document_id': did} for did in (4, 9, 2)])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [4, 3, 0])
def test_ingestion_metadata(n_fields):
    """Index built from 1-3 metadata fields exposes the expected columns."""
    metadata = []
    for pos, fname in enumerate(fnames_in_abs):
        entry = {'file_path': fname}
        if n_fields >= 2:
            entry['document_id'] = 'a' + str(pos + 100)
        if n_fields >= 3:
            entry['rendition_id'] = 1
        metadata.append(entry)

    dbi = DocumentIndex.from_list(metadata)
    dbi._make_relative_paths()
    data_dir_res = dbi.data_dir
    filenames = dbi.filenames_
    db = dbi.data

    # document_id is expected even for n_fields == 1 (presumably
    # auto-generated by from_list — confirm); rendition_id only appears
    # when three fields are supplied
    if n_fields in [1, 2]:
        columns_ref = sorted(['file_path', 'document_id', 'internal_id'])
    elif n_fields == 3:
        columns_ref = sorted(['file_path', 'document_id', 'internal_id',
                              'rendition_id'])

    assert_array_equal(sorted(db.columns.values), columns_ref)
    # the stored relative paths must reconstruct the normalized filenames
    assert_array_equal([os.path.normpath(os.path.join(data_dir_res, el))
                        for el in filenames],
                       [os.path.join(data_dir_res, el)
                        for el in db.file_path.values])
# Beispiel #4 (0) — stray scrape artifact, commented out to keep the module importable
def test_remove_documents():
    """Removing documents shrinks features, index and filenames consistently."""
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup()
    fe.ingest(data_dir)

    X_before = fe._load_features(uuid)
    db_before = fe.db_.data
    filenames_before = fe.filenames_
    n_samples = len(filenames_before)

    docs = DocumentIndex.from_folder(data_dir).data
    dataset_definition = docs[['document_id']].to_dict(orient='records')
    # remove the 3rd and 5th documents by their document_id
    fe.remove([dataset_definition[2], dataset_definition[4]])

    X_after = fe._load_features(uuid)
    assert X_after.shape[0] == X_before.shape[0] - 2
    assert fe.db_.data.shape[0] == db_before.shape[0] - 2
    assert len(fe.filenames_) == len(filenames_before) - 2

    db_after = fe.db_.data
    assert_equal(db_before.iloc[[0, 1, 3, 5]]['document_id'].values,
                 db_after['document_id'].values)
    # check that internal id is contiguous after removal
    assert (np.diff(db_after.internal_id.values) == 1).all()

    # drop the cached parameters and re-check the sample count
    del fe._pars
    assert fe.n_samples_ == n_samples - 2

    fe.delete()
# Beispiel #5 (0) — stray scrape artifact, commented out to keep the module importable
def test_append_documents():
    """Appending a shifted copy of the corpus doubles every container."""
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup()
    fe.ingest(data_dir)

    X_before = fe._load_features(uuid)
    db_before = fe.db_
    filenames_before = fe.filenames_
    n_samples = len(filenames_before)

    docs = DocumentIndex.from_folder(data_dir).data
    # shift the ids so the appended copy does not collide with the original
    docs['document_id'] += 10
    dataset_definition = docs[['file_path', 'document_id']].to_dict(orient='records')
    for row in dataset_definition:
        row['file_path'] = os.path.join(data_dir, row['file_path'])
    fe.append(dataset_definition)

    X_after = fe._load_features(uuid)
    assert X_after.shape[0] == 2 * X_before.shape[0]
    assert fe.db_.data.shape[0] == 2 * db_before.data.shape[0]
    assert len(fe.filenames_) == 2 * len(filenames_before)

    db_after = fe.db_.data
    # the second half is the first half with document_id shifted by 10
    assert_equal(db_after.iloc[:n_samples]['document_id'].values,
                 db_after.iloc[n_samples:]['document_id'].values - 10)
    # check that internal id is contiguous after the append
    assert (np.diff(db_after.internal_id.values) == 1).all()

    # drop the cached parameters and re-check the sample count
    del fe._pars
    assert fe.n_samples_ == n_samples * 2

    fe.delete()
def test_ingestion_pickling():
    """DocumentIndex must survive a joblib dump/load round-trip."""
    # ``sklearn.externals.joblib`` was deprecated in scikit-learn 0.21 and
    # removed in 0.23; prefer the standalone package when it is available.
    try:
        import joblib
    except ImportError:
        from sklearn.externals import joblib
    db = DocumentIndex.from_folder(data_dir)
    fname = os.path.join(cache_dir, 'document_index')
    # check that db is picklable
    joblib.dump(db, fname)
    try:
        db2 = joblib.load(fname)
        # the round-trip must preserve the indexed metadata
        assert_frame_equal(db.data, db2.data)
    finally:
        # clean up even when the load/compare above fails
        os.remove(fname)
# Beispiel #7 (0) — stray scrape artifact, commented out to keep the module importable
def test_search_not_found():
    """Searching for a non-existent file path raises NotFound."""
    dbi = DocumentIndex.from_folder(data_dir)
    # one bogus path alongside a valid one: the query must still fail
    paths = ["DOES_NOT_EXISTS", "0.7.6.28637.txt"]
    query = pd.DataFrame({'file_path': paths})
    with pytest.raises(NotFound):
        dbi.search(query)
# Beispiel #8 (0) — stray scrape artifact, commented out to keep the module importable
def test_ingestion_base_dir():
    """from_folder resolves the base dir and indexes relative file paths."""
    dbi = DocumentIndex.from_folder(data_dir)
    base = dbi.data_dir
    filenames = dbi.filenames
    db = dbi.data

    assert base == os.path.normpath(data_dir)
    assert_array_equal(db.columns.values, ['file_path', 'internal_id'])
    assert_array_equal(db.file_path.values, fnames_in)
    # joining the base dir with each stored relative path must reproduce
    # the normalized absolute filenames
    normalized = [os.path.normpath(el) for el in filenames]
    rebuilt = [os.path.join(base, el) for el in db.file_path.values]
    assert_array_equal(normalized, rebuilt)
def test_bad_search_document_rendition_id():
    """A mixed internal_id/document_id query fails with renditions present."""
    records = [('/test', 0, 0), ('/test2', 0, 1), ('/test3', 1, 0),
               ('/test8', 2, 0), ('/test9', 3, 0)]
    md = [{'file_path': path, 'document_id': did, 'rendition_id': rid,
           'internal_id': pos}
          for pos, (path, did, rid) in enumerate(records)]

    # can always index with internal_id
    dbi = DocumentIndex.from_list(md)
    # mixing internal_id rows with a document_id row must not resolve
    query = pd.DataFrame([{'internal_id': 1},
                          {'internal_id': 2},
                          {'document_id': 1}])
    with pytest.raises(NotFound):
        dbi.search(query)
def test_ingestion_render(return_file_path):
    """Check render_dict / render_list output, with and without file paths."""

    # the parametrized fixture passes a flag string; turn it into a boolean
    # (the helper below reads it lazily, so converting first is equivalent)
    return_file_path = (return_file_path == 'return_file_path')

    def _as_frame(rendered):
        # normalize to a DataFrame and strip the optional file_path column
        rendered = pd.DataFrame(rendered)
        if return_file_path:
            assert 'file_path' in rendered.columns
            del rendered['file_path']
        return rendered

    md = [{'file_path': '/test',  'document_id': 2},
          {'file_path': '/test2', 'document_id': 1},
          {'file_path': '/test3', 'document_id': 7},
          {'file_path': '/test8', 'document_id': 9},
          {'file_path': '/test9', 'document_id': 4}]
    for pos, el in enumerate(md):
        el['internal_id'] = pos

    dbi = DocumentIndex.from_list(md)
    query = pd.DataFrame([{'a': 2, 'internal_id': 3},
                          {'a': 4, 'internal_id': 1}])
    res = pd.DataFrame([{'a': 2, 'internal_id': 3, 'document_id': 9},
                        {'a': 4, 'internal_id': 1, 'document_id': 1}])

    rendered = _as_frame(dbi.render_dict(query,
                                         return_file_path=return_file_path))
    assert_frame_equal(rendered, res)

    # without a query, render_dict returns one row per indexed document
    rendered = _as_frame(dbi.render_dict(return_file_path=return_file_path))
    assert_frame_equal(rendered.loc[[0]],
                       pd.DataFrame([{'internal_id': 0, 'document_id': 2}]))
    assert len(rendered) == len(md)

    rendered = _as_frame(dbi.render_list(res,
                                         return_file_path=return_file_path))
    assert sorted(rendered.keys()) == sorted(['internal_id', 'document_id', 'a'])
    assert_frame_equal(pd.DataFrame(rendered),
                       pd.DataFrame([{'a': 2, 'internal_id': 3, 'document_id': 9},
                                     {'a': 4, 'internal_id': 1, 'document_id': 1}]))

    # render_list with no arguments exposes only the index's own fields
    rendered = dbi.render_list()
    assert sorted(rendered.keys()) == sorted(['internal_id', 'document_id'])
def test_search_document_rendition_id():
    """Indexing with rendition_id when document_id alone is not unique."""
    records = [('/test', 0, 0), ('/test2', 0, 1), ('/test3', 1, 0),
               ('/test8', 2, 0), ('/test9', 3, 0)]
    md = [{'file_path': path, 'document_id': did, 'rendition_id': rid,
           'internal_id': pos}
          for pos, (path, did, rid) in enumerate(records)]

    # can always index with internal_id
    dbi = DocumentIndex.from_list(md)
    dbi._make_relative_paths()

    query = pd.DataFrame([{'internal_id': iid} for iid in (1, 2, 1)])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [1, 2, 1])
    assert_array_equal(sorted(sres.columns),
                       sorted(['internal_id', 'file_path',
                               'document_id', 'rendition_id']))

    # document_id alone is ambiguous here (document 0 has two renditions)
    query = pd.DataFrame([{'document_id': did} for did in (0, 1, 2)])
    with pytest.raises(ValueError):
        dbi.search(query)

    # adding rendition_id makes the lookup unambiguous
    query = pd.DataFrame([{'document_id': did, 'rendition_id': 0}
                          for did in (0, 1, 2)])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [0, 2, 3])