Exemple #1
0
def test_append_documents():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    X = fe._load_features(uuid)
    db = fe.db_
    filenames = fe.filenames_
    n_samples = len(fe.filenames_)

    docs = DocumentIndex.from_folder(data_dir).data
    docs['document_id'] += 10
    dataset_definition = docs[['file_path',
                               'document_id']].to_dict(orient='records')
    for row in dataset_definition:
        row['file_path'] = os.path.join(data_dir, row['file_path'])
    fe.append(dataset_definition)
    X_new = fe._load_features(uuid)
    assert X_new.shape[0] == X.shape[0] * 2
    assert fe.db_.data.shape[0] == db.data.shape[0] * 2
    assert len(fe.filenames_) == len(filenames) * 2

    dbn = fe.db_.data
    assert_equal(dbn.iloc[:n_samples]['document_id'].values,
                 dbn.iloc[n_samples:]['document_id'].values - 10)
    # check that internal id is contiguous
    assert (np.diff(dbn.internal_id.values) == 1).all()

    # check the number of samples is consistent
    del fe._pars
    assert fe.n_samples_ == n_samples * 2

    fe.delete()
def test_lsi_append_documents():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=2, alpha=1.0)
    X_lsi = lsi._load_features()
    n_samples = fe.n_samples_

    docs = DocumentIndex.from_folder(data_dir).data
    docs['document_id'] += 10
    dataset_definition = docs[['file_path', 'document_id']].to_dict(orient='records')
    for row in dataset_definition:
        row['file_path'] = os.path.join(data_dir, row['file_path'])
    fe.append(dataset_definition)

    X_lsi_new = lsi._load_features()
    assert X_lsi_new.shape[0] == X_lsi.shape[0]*2
    assert_equal(X_lsi_new[:n_samples], X_lsi_new[:n_samples])