def test_incorrect_param(self, simil_df):
    """Check that unsupported option values make ``fit`` raise ValueError.

    Two cases are exercised: an unknown ``similarity`` value, and an unknown
    ``projected_norm`` value combined with the 'projection' similarity.
    """
    blocks = simil_df['blocks']
    ingredients = simil_df['Ingrédients']

    # Construction stays inside the context manager: the error is accepted
    # whether it comes from the constructor or from ``fit``.
    with pytest.raises(ValueError):
        SimilaritySelector(similarity='incorrect input').fit(
            blocks, ingredients)

    # Here the selector is built first, so only ``fit`` may raise.
    bad_norm_selector = SimilaritySelector(
        similarity='projection',
        projected_norm='incorrect input',
    )
    with pytest.raises(ValueError):
        bad_norm_selector.fit(blocks, ingredients)
def test_text_diff(self):
    """Compare ``compute_diff`` output with a hand-built sparse matrix.

    Both arguments deliberately mix token lists and plain strings.
    """
    vocabulary = ['aa bb cc']
    documents = [['aa', 'aa', 'bb'], 'cc aa aa']
    subtracted = pd.Series(['bb bb', ['aa']])

    selector = SimilaritySelector().fit(
        pd.Series([vocabulary]),
        pd.Series(vocabulary),
    )
    computed = selector.compute_diff(documents, subtracted)

    expected = csr_matrix([[1, 0, 0], [1, 0, 1]])
    matches = computed.todense() == expected.todense()
    assert matches.all()
def test_non_sparse_norm_type(self, simil_df):
    """Check that ``fit`` raises ValueError for a ``norm``-based projected norm.

    The projected norm is a ``partial`` over ``norm`` (as opposed to the
    sparse norm used by the other projection tests).
    """
    dense_l1_norm = partial(norm, axis=1, ord=1)
    selector = SimilaritySelector(
        similarity='projection',
        projected_norm=dense_l1_norm,
    )

    with pytest.raises(ValueError):
        selector.fit(simil_df['blocks'], simil_df['Ingrédients'])
def test_hashing_type(self, simil_df):
    """Check predictions when ``count_vect_type`` is 'HashingVectorizer'.

    The selector is fitted on the fixture and must return the expected
    ingredient block for every fixture row.
    """
    selector = SimilaritySelector(count_vect_type='HashingVectorizer')
    selector.fit(simil_df['blocks'], simil_df['Ingrédients'])

    predicted = selector.predict(simil_df['blocks'])

    expected = pd.Series(
        ['100% sucre', 'E110, farine', 'haricots'],
        index=simil_df.index,
    )
    assert pd.Series(predicted).equals(expected)
def test_predict_cosine(self, simil_df):
    """Check predictions on the fixture when ``similarity`` is 'cosine'."""
    selector = SimilaritySelector(similarity='cosine')
    selector.fit(simil_df['blocks'], simil_df['Ingrédients'])

    expected = pd.Series(
        ['100% sucre', 'E110, farine', 'haricots'],
        index=simil_df.index,
    )
    predicted = pd.Series(selector.predict(simil_df['blocks']))
    assert predicted.equals(expected)
def test_predict(self, simil_df):
    """Check ``predict`` on a single, previously unseen list of blocks.

    A default selector fitted on the fixture must pick the
    'sucre, eau et betteraves' block out of the three candidates.
    """
    selector = SimilaritySelector().fit(
        simil_df['blocks'],
        simil_df['Ingrédients'],
    )
    candidate_blocks = [[
        'fabriqué en Italie',
        'mélange de nougat',
        'sucre, eau et betteraves',
    ]]

    predicted = selector.predict(candidate_blocks)
    expected = pd.Series(['sucre, eau et betteraves'])
    assert all(predicted == expected)
def test_l_norm_values(self, simil_df):
    """Check projection-similarity predictions with explicit norm settings.

    The selector uses ``source_norm='l3'`` and a callable projected norm
    (``sparse_norm`` with ``axis=1, ord=2``).
    """
    projected_l2 = partial(sparse_norm, axis=1, ord=2)
    selector = SimilaritySelector(
        similarity='projection',
        source_norm='l3',
        projected_norm=projected_l2,
    )
    selector.fit(simil_df['blocks'], simil_df['Ingrédients'])

    predicted = selector.predict(simil_df['blocks'])

    expected = pd.Series(
        ['100% sucre', 'E110, farine', 'haricots'],
        index=simil_df.index,
    )
    assert pd.Series(predicted).equals(expected)
def test_transform(self, simil_df):
    """Check the chained ``fit(...).predict(...)`` call with default options."""
    fitted = SimilaritySelector().fit(
        simil_df['blocks'],
        simil_df['Ingrédients'],
    )
    predicted = fitted.predict(simil_df['blocks'])

    expected = pd.Series(
        ['100% sucre', 'E110, farine', 'haricots'],
        index=simil_df.index,
    )
    assert pd.Series(predicted).equals(expected)
def test_empty_blocks(self, simil_df):
    """Check that a row whose only block is the empty string predicts ''.

    The second fixture row is replaced by ``['']`` before fitting. The
    prediction for that row must be the empty string, and predicting on a
    standalone ``[['']]`` input must also yield an empty string.
    """
    X = simil_df.copy()

    # Edit a standalone copy of the column and assign it back instead of
    # writing through ``X['blocks'].iloc[1]``: that chained assignment is
    # flagged by pandas and does not update ``X`` under Copy-on-Write, which
    # would leave the empty-block case untested.
    blocks = X['blocks'].copy()
    blocks.iloc[1] = ['']
    X['blocks'] = blocks

    predicted = (
        SimilaritySelector()
        .fit(X['blocks'], X['Ingrédients'])
        .predict(X['blocks'])
    )
    assert predicted[1] == ''

    model = SimilaritySelector().fit(X['blocks'], X['Ingrédients'])
    # Smoke check: predicting on a single regular row must not raise.
    model.predict([X['blocks'].iloc[0]])
    assert (model.predict([['']]) == np.array([''])).all()
def test_cosine_with_score(self, simil_df):
    """Check cosine-similarity predictions under both ``scoring`` modes.

    The same expected output is required for 'absolute_score' and for
    'relative_score'.
    """
    # absolute scoring first, then relative scoring
    for scoring_mode in ('absolute_score', 'relative_score'):
        selector = SimilaritySelector(
            similarity='cosine',
            scoring=scoring_mode,
        )
        selector.fit(simil_df['blocks'], simil_df['Ingrédients'])

        predicted = selector.predict(simil_df['blocks'])

        expected = pd.Series(
            ['100% sucre', 'E110, farine', 'haricots'],
            index=simil_df.index,
        )
        assert pd.Series(predicted).equals(expected)
def test_score(self):
    """Check ``compute_score`` against hand-computed expectations.

    Three scenarios are covered: the default score, the 'relative' score,
    and the 'relative' score when the selector uses a HashingVectorizer
    (where the expectation is built with a matching ``FeatureHasher``).
    """
    # test absolute score
    vocabulary = ['aa bb cc']
    documents = [['aa', 'aa', 'bb'], 'cc aa aa']
    references = pd.Series(['bb bb', ['bb']])
    selector = SimilaritySelector().fit(
        pd.Series([vocabulary]),
        pd.Series(vocabulary),
    )
    expected = np.array([[0., 1., 0.]])
    absolute = np.asarray(selector.compute_score(documents, references))
    assert (absolute == expected).all()

    # test relative score with diff
    vocabulary = ['aa bb cc dd ee']
    documents = pd.Series([['dd', 'dd', 'bb'], 'bb dd ee ee'])
    references = pd.Series(['aa dd dd', ['aa dd ee']])
    selector = SimilaritySelector().fit(
        pd.Series([vocabulary]),
        pd.Series(vocabulary),
    )
    expected = np.array([[1., -1, 0., 1., 0.]])
    relative = np.asarray(
        selector.compute_score(documents, references, kind='relative'))
    assert (expected == relative).all()

    # test relative score with diff and HashingVectorizer
    vocabulary = ['aa bb cc dd ee']
    documents = pd.Series([['dd', 'dd', 'bb'], 'bb dd ee ee'])
    references = pd.Series(['aa dd dd', ['aa dd ee']])
    selector = SimilaritySelector(count_vect_type='HashingVectorizer')
    selector.fit(pd.Series([vocabulary]), pd.Series(vocabulary))
    relative = np.asarray(
        selector.compute_score(documents, references, kind='relative'))

    # Reuse the vectorizer's hashing settings so the expected vector is
    # hashed with the same settings as the selector's output.
    vectorizer_params = selector.source_count_vect.get_params()
    hasher_kwargs = {
        name: vectorizer_params[name]
        for name in ('n_features', 'alternate_sign')
        if name in vectorizer_params
    }
    hasher = FeatureHasher(**hasher_kwargs, input_type='dict')
    token_scores = {'aa': 1., 'bb': -1., 'cc': 0., 'dd': 1., 'ee': 0.}
    expected = np.asarray(hasher.fit_transform([token_scores]).todense())
    assert (expected == relative).all()
def test_fit_predict(self, simil_df):
    """Smoke test: ``fit_predict`` runs on the fixture columns without raising."""
    selector = SimilaritySelector()
    blocks, ingredients = simil_df['blocks'], simil_df['Ingrédients']
    selector.fit_predict(blocks, ingredients)
def test_predict_no_transform(self, simil_df):
    """Check ``predict`` on a two-block input after fitting on the fixture.

    'haricot' must be selected over 'exploité en Inde'.
    """
    selector = SimilaritySelector().fit(
        simil_df['blocks'],
        simil_df['Ingrédients'],
    )

    predicted = selector.predict([['haricot', 'exploité en Inde']])
    expected = pd.Series(['haricot'])
    assert all(predicted == expected)
def test_empty_ingred(self, simil_df):
    """Smoke test: fit and predict run when one target value is NaN.

    The second fixture row's 'Ingrédients' value is replaced by ``np.nan``
    before fitting; the test only requires that nothing raises.
    """
    X = simil_df.copy()

    # Edit a standalone copy of the column and assign it back instead of
    # writing through ``X['Ingrédients'].iloc[1]``: that chained assignment
    # is flagged by pandas and does not update ``X`` under Copy-on-Write,
    # which would leave the missing-value case untested.
    ingredients = X['Ingrédients'].copy()
    ingredients.iloc[1] = np.nan
    X['Ingrédients'] = ingredients

    (SimilaritySelector()
     .fit(X['blocks'], X['Ingrédients'])
     .predict(X['blocks']))
def test_predict_not_fitted(self):
    """Check that predicting with an unfitted selector raises NotFittedError."""
    unseen_blocks = [['1', '2']]
    with pytest.raises(NotFittedError):
        SimilaritySelector().predict(unseen_blocks)
def test_count_vect_kwargs(self, simil_df):
    """Check how ``count_vect_kwargs`` is handled at fit time.

    A valid option (``binary=True``) must fit without error, while an
    unknown keyword and an invalid ``strip_accents`` value must each make
    ``fit`` raise ValueError.
    """
    blocks = simil_df['blocks']
    ingredients = simil_df['Ingrédients']

    accepted = SimilaritySelector(count_vect_kwargs={'binary': True})
    accepted.fit(blocks, ingredients)

    rejected_kwargs = (
        {'incorrect': True},
        {'strip_accents': 'incorrect'},
    )
    for bad_kwargs in rejected_kwargs:
        # The selector is built outside the context manager so that only
        # ``fit`` may satisfy the expected ValueError.
        selector = SimilaritySelector(count_vect_kwargs=bad_kwargs)
        with pytest.raises(ValueError):
            selector.fit(blocks, ingredients)
def test_embedding(self, simil_df):
    """Check the ``embedding_method`` option.

    An unknown method must raise ValueError; 'Word2Vec' (with a custom
    stop-word set) and 'tSVD' (with ``n_components=5``) must both give the
    expected predictions on the fixture.
    """
    # Construction stays inside the context manager: the error is accepted
    # whether it comes from the constructor or from ``fit``.
    with pytest.raises(ValueError):
        SimilaritySelector(embedding_method='incorrect').fit(
            simil_df['blocks'], simil_df['Ingrédients'])

    expected = pd.Series(
        ['100% sucre', 'E110, farine', 'haricots'],
        index=simil_df.index,
    )

    word2vec_selector = SimilaritySelector(
        embedding_method='Word2Vec',
        count_vect_kwargs={'stop_words': {'de'}},
    )
    word2vec_selector.fit(simil_df['blocks'], simil_df['Ingrédients'])
    word2vec_predicted = word2vec_selector.predict(simil_df['blocks'])
    assert pd.Series(word2vec_predicted).equals(expected)

    svd_selector = SimilaritySelector(
        embedding_method='tSVD',
        embedding_parms={'n_components': 5},
    )
    svd_selector.fit(simil_df['blocks'], simil_df['Ingrédients'])
    svd_predicted = svd_selector.predict(simil_df['blocks'])
    assert pd.Series(svd_predicted).equals(expected)
def test_base(self, simil_df):
    """Smoke test: a default selector fits the fixture columns without raising."""
    selector = SimilaritySelector()
    blocks, ingredients = simil_df['blocks'], simil_df['Ingrédients']
    selector.fit(blocks, ingredients)