def test_api_dupdetection(app, kind, options):
    if kind == 'simhash':
        try:
            import simhash  # noqa
        except ImportError:
            raise SkipTest

    dsid, pars, _ = get_features_cached(app, hashed=False)

    method = V01 + "/feature-extraction/{}".format(dsid)
    data = app.get_check(method)

    url = V01 + "/duplicate-detection"
    pars = {'parent_id': dsid, 'method': kind}
    data = app.post_check(url, json=pars)
    assert dict2type(data) == {'id': 'str'}
    mid = data['id']

    url += '/{}'.format(mid)
    data = app.get_check(url, query_string=options)
    assert dict2type(data, max_depth=1) == {'data': 'list'}
    for row in data['data']:
        assert dict2type(row, max_depth=1) == {'cluster_id': 'int',
                                               'cluster_similarity': 'float',
                                               'documents': 'list'}

    app.delete_check(url)
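
# Background for the `simhash` branch above: near-duplicate detection groups
# documents whose bit fingerprints differ in only a few positions. The toy
# sketch below is independent of the `simhash` package and is not the code
# under test; it only illustrates how such a fingerprint is typically built
# (each token votes on every bit, and the sign of the tally decides the bit).
def _simhash_sketch(tokens, n_bits=64):
    import hashlib
    weights = [0] * n_bits
    for tok in tokens:
        # stable 64-bit hash per token
        h = int.from_bytes(hashlib.md5(tok.encode('utf-8')).digest()[:8],
                           'big')
        for i in range(n_bits):
            weights[i] += 1 if (h >> i) & 1 else -1
    # the fingerprint keeps the bits with a positive tally
    return sum(1 << i for i, w in enumerate(weights) if w > 0)
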

def test_dictkey2type():
    from freediscovery.utils import dict2type

    assert dict2type('djsk') == 'str'
    assert dict2type(['t', 1]) == ['str', 'int']
    assert dict2type({'t': {'b': 0.1}}) == {'t': {'b': 'float'}}
    # make sure we don't change the original object
    x = {'x': {'a': 3}}
    dict2type(x)
    assert x == {'x': {'a': 3}}
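
# The assertions above pin down dict2type's contract: scalars map to their
# type name, and lists/dicts are mapped element-wise without mutating the
# input. A minimal sketch consistent with those assertions (the real
# implementation lives in freediscovery.utils and additionally supports the
# collapse_lists/max_depth options used by other tests in this module):
def _dict2type_sketch(obj):
    if isinstance(obj, dict):
        # a new dict is built, so the input object is left untouched
        return {key: _dict2type_sketch(val) for key, val in obj.items()}
    if isinstance(obj, list):
        return [_dict2type_sketch(val) for val in obj]
    return type(obj).__name__
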

def test_stop_words_integration(app, hashed):
    url = V01 + '/stop-words/'
    sw_name = 'test1w'
    pars = {'name': sw_name,
            'stop_words': ['and', 'or', 'in']}
    res = app.post_check(url, json=pars)
    assert dict2type(res, collapse_lists=True) == {'name': 'str'}
    assert res['name'] == sw_name

    res = app.get_check(url + sw_name)
    assert dict2type(res, collapse_lists=True) == {'name': 'str',
                                                   'stop_words': ['str']}
    assert res['name'] == sw_name
    assert res['stop_words'] == pars['stop_words']

    dsid, pars, _ = get_features(app, hashed=hashed, stop_words=sw_name)

def test_get_search_filenames(app):
    dsid, _, _ = get_features_cached(app)

    method = V01 + "/feature-extraction/{}/id-mapping".format(dsid)

    def _filter_dict(x, filter_field):
        return {key: val for key, val in x.items() if key == filter_field}

    response_ref = {'internal_id': 'int',
                    'file_path': 'str',
                    'document_id': 'int'}

    # Query 1: look up documents by file_path
    file_path_obj = [{'file_path': val} for val in ['00401.txt', '00506.txt']]
    data = app.post_check(method, json={'data': file_path_obj})
    data = data['data']
    for idx in range(len(data)):
        assert dict2type(data[idx]) == response_ref
    assert [_filter_dict(row, 'file_path') for row in data] == file_path_obj
    # document_id == internal_id**2 is a property of the cached fixture
    # dataset, not of the API itself
    assert_equal(np.asarray([row['internal_id'] for row in data])**2,
                 [row['document_id'] for row in data])

    # unknown file paths yield a 404
    with pytest.raises(NotFound):
        app.post(method, json={'data': [{'file_path': '00400.txt'}]})

    # Query 2: look up documents by document_id
    file_path_obj = [{'document_id': 4}, {'document_id': 9}]
    data = app.post_check(method, json={'data': file_path_obj})
    data = data['data']
    for idx in range(len(data)):
        assert dict2type(data[idx]) == response_ref
    assert [_filter_dict(row, 'document_id') for row in data] == file_path_obj
    assert_equal(np.asarray([row['internal_id'] for row in data])**2,
                 [row['document_id'] for row in data])

def test_append_documents(app, ingestion_method):
    method = V01 + "/feature-extraction/"
    data = app.post_check(method)
    dsid = data['id']
    method += dsid
    app.post_check(method, json={'data_dir': data_dir})
    data = app.get_check(method)

    # check that the file_path is correctly returned by the id-mapping
    data = app.post_check(method + '/id-mapping',
                          json={'return_file_path': False})
    assert dict2type(data['data'][0]) == {'document_id': 'int',
                                          'internal_id': 'int'}
    data = app.post_check(method + '/id-mapping',
                          json={'return_file_path': True})
    assert dict2type(data['data'][0]) == {'document_id': 'int',
                                          'file_path': 'str',
                                          'internal_id': 'int'}
    db_old = data['data']

    dataset_definition = []
    for idx, row in enumerate(db_old):
        row_out = {'document_id': idx + 10}
        if ingestion_method == 'file_path':
            row_out['file_path'] = os.path.join(data_dir, row['file_path'])
        elif ingestion_method == 'content':
            with Path(data_dir, row['file_path']).open(
                    'rt', encoding='utf-8') as fh:
                row_out['content'] = fh.read()
        dataset_definition.append(row_out)

    app.post_check(method + '/append',
                   json={'dataset_definition': dataset_definition})

    data = app.post_check(method + '/id-mapping',
                          json={'return_file_path': True})
    db_old = pd.DataFrame(db_old)
    db_new = pd.DataFrame(data['data'])
    assert db_old.shape[0] * 2 == db_new.shape[0]
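
# For reference, the /append payload built above accepts one of two row
# shapes per document, depending on the ingestion method (the values below
# are hypothetical):
#   {'document_id': 10, 'file_path': '/path/to/00401.txt'}  # ingest by path
#   {'document_id': 10, 'content': 'raw document text'}     # ingest inline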

# Note: a second test_get_feature_extraction below targets the newer
# weighting-based response schema; this older variant carries a distinct
# name so that pytest collects both.
def test_get_feature_extraction_legacy(app, hashed):
    dsid, _, _ = get_features_cached(app, hashed=hashed)
    method = V01 + "/feature-extraction/{}".format(dsid)
    data = app.get_check(method)
    assert dict2type(data, collapse_lists=True) == {
        'analyzer': 'str', 'ngram_range': ['int'], 'stop_words': 'NoneType',
        'n_jobs': 'int', 'chunk_size': 'int', 'norm': 'str',
        'data_dir': 'str', 'n_samples': 'int', 'n_features': 'int',
        'use_idf': 'bool', 'binary': 'bool', 'sublinear_tf': 'bool',
        'use_hashing': 'bool', 'filenames': ['str'], 'max_df': 'float',
        'min_df': 'float', 'parse_email_headers': 'bool',
        'n_samples_processed': 'int'}

def test_stop_words(app):
    name = "test_acstw"
    tested_stop_words = ['one', 'two', 'three', 'foure', 'five', 'six']

    method = V01 + "/stop-words/"
    pars = dict(name=name, stop_words=tested_stop_words)
    data = app.post_check(method, json=pars)

    method = V01 + "/stop-words/{}".format(name)
    data = app.get_check(method)
    assert dict2type(data, collapse_lists=True) == {'name': 'str',
                                                    'stop_words': ['str']}
    assert data["stop_words"] == tested_stop_words

    method = V01 + "/stop-words/{}".format(name)
    app.delete_check(method)
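
# Outside the test client, the same stop-words round-trip could be driven
# with `requests` against a running server. The base URL below is an
# assumption for illustration only; V01 is the API prefix used throughout
# this module. The leading underscore keeps pytest from collecting it.
def _stop_words_roundtrip_example():
    import requests
    base = 'http://localhost:5001' + V01  # hypothetical deployment
    requests.post(base + '/stop-words/', json={'name': 'test_acstw',
                                               'stop_words': ['one', 'two']})
    print(requests.get(base + '/stop-words/test_acstw').json())
    requests.delete(base + '/stop-words/test_acstw')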

def test_get_feature_extraction(app, hashed, weighting):
    norm_alpha = 0.5
    dsid, _, _ = get_features_cached(app, hashed=hashed, weighting=weighting,
                                     norm_alpha=norm_alpha)
    method = V01 + "/feature-extraction/{}".format(dsid)
    data = app.get_check(method)
    assert dict2type(data, collapse_lists=True) == {
        'analyzer': 'str', 'ngram_range': ['int'], 'stop_words': 'str',
        'n_jobs': 'int', 'chunk_size': 'int', 'data_dir': 'str',
        'n_samples': 'int', 'n_features': 'int', 'weighting': 'str',
        'norm_alpha': 'float', 'use_hashing': 'bool', 'filenames': ['str'],
        'max_df': 'float', 'min_df': 'float', 'parse_email_headers': 'bool',
        'n_samples_processed': 'int', 'preprocess': [],
        'column_ids': 'NoneType', 'column_separator': 'str'}
    assert data['use_hashing'] == hashed
    assert data['weighting'] == weighting
    assert data['norm_alpha'] == norm_alpha

    vect = joblib.load(os.path.join(CACHE_DIR, 'ediscovery_cache',
                                    dsid, 'vectorizer'))
    assert (data['use_hashing'] is True) == \
        ('hashing' in type(vect).__name__.lower())
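
# The final assertion above relies only on scikit-learn class names: a hashed
# pipeline persists a HashingVectorizer, while a non-hashed one stores a
# vocabulary-based vectorizer. A standalone illustration of that naming
# check (assuming scikit-learn is installed; the underscore keeps pytest
# from collecting it):
def _hashing_name_check_example():
    from sklearn.feature_extraction.text import (HashingVectorizer,
                                                 TfidfVectorizer)
    for vect in (HashingVectorizer(), TfidfVectorizer()):
        print(type(vect).__name__,
              'hashing' in type(vect).__name__.lower())
    # -> HashingVectorizer True
    #    TfidfVectorizer False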