Esempio n. 1
0
 def test_incorrect_param(self, simil_df):
     """Check that unsupported option values end in a ``ValueError``.

     Two cases are exercised: an unknown ``similarity`` string, and an
     unknown ``projected_norm`` string used with
     ``similarity='projection'``.
     """
     blocks = simil_df['blocks']
     ingredients = simil_df['Ingrédients']

     # Construction stays inside the context so that an error raised by
     # either the constructor or ``fit`` satisfies the expectation.
     with pytest.raises(ValueError):
         selector = SimilaritySelector(similarity='incorrect input')
         selector.fit(blocks, ingredients)

     selector = SimilaritySelector(
         similarity='projection',
         projected_norm='incorrect input',
     )
     with pytest.raises(ValueError):
         selector.fit(blocks, ingredients)
Esempio n. 2
0
 def test_text_diff(self):
     """Compare ``compute_diff`` output with a hand-written matrix.

     The inputs mix token lists and plain strings; the dense form of the
     result must equal ``[[1, 0, 0], [1, 0, 1]]`` element-wise.
     """
     vocabulary = ['aa bb cc']
     documents = [['aa', 'aa', 'bb'], 'cc aa aa']
     subtracted = pd.Series(['bb bb', ['aa']])

     selector = SimilaritySelector().fit(
         pd.Series([vocabulary]),
         pd.Series(vocabulary),
     )
     result = selector.compute_diff(documents, subtracted)

     expected = csr_matrix([[1, 0, 0], [1, 0, 1]])
     assert (result.todense() == expected.todense()).all()
Esempio n. 3
0
 def test_non_sparse_norm_type(self, simil_df):
     """Expect ``ValueError`` when ``projected_norm`` wraps ``norm``.

     The callable is ``partial(norm, axis=1, ord=1)``; judging by the test
     name, ``norm`` is the non-sparse implementation (import not visible
     here -- confirm against the module imports).
     """
     dense_l1_norm = partial(norm, axis=1, ord=1)
     selector = SimilaritySelector(
         projected_norm=dense_l1_norm,
         similarity='projection',
     )
     blocks = simil_df['blocks']
     ingredients = simil_df['Ingrédients']
     with pytest.raises(ValueError):
         selector.fit(blocks, ingredients)
Esempio n. 4
0
 def test_hashing_type(self, simil_df):
     """Predict with ``count_vect_type='HashingVectorizer'``.

     After fitting on the fixture, predictions for the same blocks must
     equal the expected strings, indexed like ``simil_df``.
     """
     blocks = simil_df['blocks']
     selector = SimilaritySelector(count_vect_type='HashingVectorizer')
     selector.fit(blocks, simil_df['Ingrédients'])

     predicted = pd.Series(selector.predict(blocks))
     expected = pd.Series(
         ['100% sucre', 'E110, farine', 'haricots'],
         index=simil_df.index,
     )
     assert predicted.equals(expected)
Esempio n. 5
0
 def test_predict_cosine(self, simil_df):
     """Predict with ``similarity='cosine'``.

     After fitting on the fixture, predictions for the same blocks must
     equal the expected strings, indexed like ``simil_df``.
     """
     blocks = simil_df['blocks']
     selector = SimilaritySelector(similarity='cosine')
     selector.fit(blocks, simil_df['Ingrédients'])

     predicted = pd.Series(selector.predict(blocks))
     expected = pd.Series(
         ['100% sucre', 'E110, farine', 'haricots'],
         index=simil_df.index,
     )
     assert predicted.equals(expected)
Esempio n. 6
0
 def test_predict(self, simil_df):
     """Predict on a single, previously unseen list of blocks.

     The selector is fitted on the fixture; for the three candidate
     blocks, the prediction must be ``'sucre, eau et betteraves'``.
     """
     fitted = SimilaritySelector().fit(
         simil_df['blocks'],
         simil_df['Ingrédients'],
     )
     candidates = [
         'fabriqué en Italie',
         'mélange de nougat',
         'sucre, eau et betteraves',
     ]
     # ``predict`` takes a collection of documents, so wrap the one doc.
     matches = fitted.predict([candidates]) == pd.Series(
         ['sucre, eau et betteraves'])
     assert all(matches)
Esempio n. 7
0
 def test_l_norm_values(self, simil_df):
     """Predict with projection similarity and explicit norm settings.

     ``source_norm`` is the string ``'l3'`` and ``projected_norm`` is
     ``partial(sparse_norm, axis=1, ord=2)``. Predictions for the fixture
     blocks must equal the expected strings, indexed like ``simil_df``.
     """
     blocks = simil_df['blocks']
     selector = SimilaritySelector(
         similarity='projection',
         source_norm='l3',
         projected_norm=partial(sparse_norm, axis=1, ord=2),
     )
     selector.fit(blocks, simil_df['Ingrédients'])

     predicted = pd.Series(selector.predict(blocks))
     expected = pd.Series(
         ['100% sucre', 'E110, farine', 'haricots'],
         index=simil_df.index,
     )
     assert predicted.equals(expected)
Esempio n. 8
0
 def test_transform(self, simil_df):
     """Fit with default settings, then predict on the training blocks.

     Predictions must equal the expected strings, indexed like
     ``simil_df``.
     """
     blocks = simil_df['blocks']
     fitted = SimilaritySelector().fit(blocks, simil_df['Ingrédients'])

     predicted = pd.Series(fitted.predict(blocks))
     expected = pd.Series(
         ['100% sucre', 'E110, farine', 'haricots'],
         index=simil_df.index,
     )
     assert predicted.equals(expected)
Esempio n. 9
0
 def test_empty_blocks(self, simil_df):
     """Check handling of a document whose only block is an empty string.

     The second row of a copy of the fixture gets ``['']`` as its blocks.
     After fitting on that data, the prediction at label ``1`` must be
     ``''``, predicting on the first row's blocks must not raise, and
     predicting on ``[['']]`` must yield only ``''``.
     """
     X = simil_df.copy()
     # Modify a standalone copy of the column and assign it back. The
     # chained form ``X['blocks'].iloc[1] = ...`` writes to a temporary
     # object under pandas copy-on-write and would leave ``X`` unchanged,
     # so the empty-block case would silently not be exercised.
     blocks = X['blocks'].copy()
     blocks.iloc[1] = ['']
     X['blocks'] = blocks
     assert (SimilaritySelector().fit(
         X['blocks'], X['Ingrédients']).predict(X['blocks'])[1] == '')
     model = SimilaritySelector().fit(X['blocks'], X['Ingrédients'])
     # Smoke check: a regular document still predicts without raising.
     model.predict([X['blocks'].iloc[0]])
     assert (model.predict([['']]) == np.array([''])).all()
Esempio n. 10
0
 def test_cosine_with_score(self, simil_df):
     """Predict with cosine similarity under both scoring modes.

     ``scoring='absolute_score'`` is checked first, then
     ``scoring='relative_score'``. In each case, predictions for the
     fixture blocks must equal the expected strings, indexed like
     ``simil_df``.
     """
     blocks = simil_df['blocks']
     ingredients = simil_df['Ingrédients']

     for scoring_mode in ('absolute_score', 'relative_score'):
         selector = SimilaritySelector(
             similarity='cosine',
             scoring=scoring_mode,
         )
         selector.fit(blocks, ingredients)
         predicted = pd.Series(selector.predict(blocks))
         expected = pd.Series(
             ['100% sucre', 'E110, farine', 'haricots'],
             index=simil_df.index,
         )
         assert predicted.equals(expected)
Esempio n. 11
0
 def test_score(self):
     """Check ``compute_score`` against hand-computed expectations.

     Three scenarios are covered in order:

     1. the default score kind on a three-token vocabulary,
     2. ``kind='relative'`` on a five-token vocabulary,
     3. ``kind='relative'`` with ``count_vect_type='HashingVectorizer'``,
        where the expected row is produced by a ``FeatureHasher``
        configured from the fitted vectorizer's parameters.
     """
     # 1. Absolute score.
     small_voc = ['aa bb cc']
     documents = [['aa', 'aa', 'bb'], 'cc aa aa']
     references = pd.Series(['bb bb', ['bb']])
     selector = SimilaritySelector().fit(
         pd.Series([small_voc]),
         pd.Series(small_voc),
     )
     absolute = np.asarray(selector.compute_score(documents, references))
     expected = np.array([[0., 1., 0.]])
     assert (absolute == expected).all()

     # 2. Relative score with diff.
     large_voc = ['aa bb cc dd ee']
     documents = pd.Series([['dd', 'dd', 'bb'], 'bb dd ee ee'])
     references = pd.Series(['aa dd dd', ['aa dd ee']])
     selector = SimilaritySelector().fit(
         pd.Series([large_voc]),
         pd.Series(large_voc),
     )
     expected = np.array([[1., -1, 0., 1., 0.]])
     relative = np.asarray(
         selector.compute_score(documents, references, kind='relative'))
     assert (expected == relative).all()

     # 3. Relative score with diff and HashingVectorizer.
     large_voc = ['aa bb cc dd ee']
     documents = pd.Series([['dd', 'dd', 'bb'], 'bb dd ee ee'])
     references = pd.Series(['aa dd dd', ['aa dd ee']])
     selector = SimilaritySelector(count_vect_type='HashingVectorizer')
     selector.fit(pd.Series([large_voc]), pd.Series(large_voc))
     relative = np.asarray(
         selector.compute_score(documents, references, kind='relative'))

     # Build a hasher from the fitted vectorizer's own settings so the
     # expected scores are hashed the same way as the computed ones.
     vect_params = selector.source_count_vect.get_params()
     shared_keys = {'n_features', 'alternate_sign'}
     hasher_kwargs = {}
     for name, value in vect_params.items():
         if name in shared_keys:
             hasher_kwargs[name] = value
     hasher = FeatureHasher(**hasher_kwargs, input_type='dict')

     token_scores = {'aa': 1., 'bb': -1., 'cc': 0., 'dd': 1., 'ee': 0.}
     expected = np.asarray(hasher.fit_transform([token_scores]).todense())
     assert (expected == relative).all()
Esempio n. 12
0
 def test_fit_predict(self, simil_df):
     """Smoke test: ``fit_predict`` runs on the fixture without raising.

     The returned value is not inspected.
     """
     blocks = simil_df['blocks']
     ingredients = simil_df['Ingrédients']
     selector = SimilaritySelector()
     selector.fit_predict(blocks, ingredients)
Esempio n. 13
0
 def test_predict_no_transform(self, simil_df):
     """Predict on one two-block document after fitting on the fixture.

     For the blocks ``'haricot'`` and ``'exploité en Inde'``, the
     prediction must be ``'haricot'``.
     """
     fitted = SimilaritySelector().fit(
         simil_df['blocks'],
         simil_df['Ingrédients'],
     )
     document = ['haricot', 'exploité en Inde']
     matches = fitted.predict([document]) == pd.Series(['haricot'])
     assert all(matches)
Esempio n. 14
0
 def test_empty_ingred(self, simil_df):
     """Smoke test: fit and predict when one target value is missing.

     The second target of a copy of the fixture is set to ``np.nan``
     before fitting. The test only checks that fitting and predicting do
     not raise; the predictions are not inspected.
     """
     X = simil_df.copy()
     # Modify a standalone copy of the column and assign it back. The
     # chained form ``X['Ingrédients'].iloc[1] = ...`` writes to a
     # temporary object under pandas copy-on-write and would leave ``X``
     # unchanged, so the missing-target case would not be exercised.
     ingredients = X['Ingrédients'].copy()
     ingredients.iloc[1] = np.nan
     X['Ingrédients'] = ingredients
     (SimilaritySelector().fit(X['blocks'],
                               X['Ingrédients']).predict(X['blocks']))
Esempio n. 15
0
 def test_predict_not_fitted(self):
     """Predicting with an unfitted selector raises ``NotFittedError``."""
     single_document = [['1', '2']]
     with pytest.raises(NotFittedError):
         unfitted = SimilaritySelector()
         unfitted.predict(single_document)
Esempio n. 16
0
 def test_count_vect_kwargs(self, simil_df):
     """Check how ``count_vect_kwargs`` values are handled at fit time.

     ``{'binary': True}`` must fit without raising. An unknown key
     (``'incorrect'``) and an invalid ``strip_accents`` value must each
     make ``fit`` raise ``ValueError``.
     """
     blocks = simil_df['blocks']
     ingredients = simil_df['Ingrédients']

     accepted = SimilaritySelector(count_vect_kwargs={'binary': True})
     accepted.fit(blocks, ingredients)

     rejected_kwargs = (
         {'incorrect': True},
         {'strip_accents': 'incorrect'},
     )
     for bad_kwargs in rejected_kwargs:
         # Built outside the context: only ``fit`` is expected to raise.
         rejected = SimilaritySelector(count_vect_kwargs=bad_kwargs)
         with pytest.raises(ValueError):
             rejected.fit(blocks, ingredients)
Esempio n. 17
0
 def test_embedding(self, simil_df):
     """Exercise the ``embedding_method`` option.

     An unknown method must raise ``ValueError``. With ``'Word2Vec'``
     (plus a ``stop_words`` vectorizer option) and with ``'tSVD'`` (using
     ``n_components=5``), predictions for the fixture blocks must equal
     the expected strings, indexed like ``simil_df``.
     """
     blocks = simil_df['blocks']
     ingredients = simil_df['Ingrédients']

     # Construction stays inside the context so that an error raised by
     # either the constructor or ``fit`` satisfies the expectation.
     with pytest.raises(ValueError):
         invalid = SimilaritySelector(embedding_method='incorrect')
         invalid.fit(blocks, ingredients)

     expected = pd.Series(
         ['100% sucre', 'E110, farine', 'haricots'],
         index=simil_df.index,
     )

     word2vec = SimilaritySelector(
         embedding_method='Word2Vec',
         count_vect_kwargs={'stop_words': {'de'}},
     )
     word2vec.fit(blocks, ingredients)
     assert pd.Series(word2vec.predict(blocks)).equals(expected)

     tsvd = SimilaritySelector(
         embedding_method='tSVD',
         embedding_parms={'n_components': 5},
     )
     tsvd.fit(blocks, ingredients)
     assert pd.Series(tsvd.predict(blocks)).equals(expected)
Esempio n. 18
0
 def test_base(self, simil_df):
     """Smoke test: a default selector fits on the fixture without raising."""
     blocks = simil_df['blocks']
     ingredients = simil_df['Ingrédients']
     selector = SimilaritySelector()
     selector.fit(blocks, ingredients)