Esempio n. 1
0
 def test_score(self):
     """Check ``compute_score`` against hand-written expected arrays.

     Three scenarios are run in sequence:

     1. ``compute_score`` called without ``kind`` (the absolute score),
        on a mix of token lists and raw strings.
     2. ``compute_score`` with ``kind='relative'``.
     3. The same relative score on a selector built with
        ``count_vect_type='HashingVectorizer'``; the expected row is
        produced by hashing a token -> score mapping with a
        ``FeatureHasher`` configured from the fitted vectorizer's
        ``n_features`` / ``alternate_sign`` parameters.
     """
     # --- scenario 1: absolute score ---
     vocabulary = ['aa bb cc']
     full_texts = [['aa', 'aa', 'bb'], 'cc aa aa']
     sub_texts = pd.Series(['bb bb', ['bb']])
     selector = SimilaritySelector().fit(
         pd.Series([vocabulary]),
         pd.Series(vocabulary),
     )
     expected = np.array([[0., 1., 0.]])
     computed = np.asarray(selector.compute_score(full_texts, sub_texts))
     assert (computed == expected).all()

     # --- scenario 2: relative score ---
     vocabulary = ['aa bb cc dd ee']
     documents = pd.Series([['dd', 'dd', 'bb'], 'bb dd ee ee'])
     targets = pd.Series(['aa dd dd', ['aa dd ee']])
     selector = SimilaritySelector().fit(
         pd.Series([vocabulary]),
         pd.Series(vocabulary),
     )
     expected = np.array([[1., -1, 0., 1., 0.]])
     computed = np.asarray(
         selector.compute_score(documents, targets, kind='relative')
     )
     assert (expected == computed).all()

     # --- scenario 3: relative score with a HashingVectorizer ---
     vocabulary = ['aa bb cc dd ee']
     documents = pd.Series([['dd', 'dd', 'bb'], 'bb dd ee ee'])
     targets = pd.Series(['aa dd dd', ['aa dd ee']])
     selector = SimilaritySelector(count_vect_type='HashingVectorizer')
     selector.fit(pd.Series([vocabulary]), pd.Series(vocabulary))
     computed = np.asarray(
         selector.compute_score(documents, targets, kind='relative')
     )
     # Reuse only the hashing-related settings of the fitted vectorizer
     # so the expected row lands in the same hashed columns.
     vect_params = selector.source_count_vect.get_params()
     hasher_kwargs = {}
     for name, value in vect_params.items():
         if name in {'n_features', 'alternate_sign'}:
             hasher_kwargs[name] = value
     hasher = FeatureHasher(input_type='dict', **hasher_kwargs)
     token_scores = {'aa': 1., 'bb': -1., 'cc': 0., 'dd': 1., 'ee': 0.}
     expected = np.asarray(hasher.fit_transform([token_scores]).todense())
     assert (expected == computed).all()
Esempio n. 2
0
 def test_non_sparse_norm_type(self, simil_df):
     """``fit`` must raise ``ValueError`` for a non-sparse projected norm.

     The callable handed to ``projected_norm`` is ``norm`` partially
     applied with ``axis=1, ord=1`` (as opposed to the ``sparse_norm``
     used by other tests in this file).
     """
     blocks = simil_df['blocks']
     ingredients = simil_df['Ingrédients']
     dense_l1_norm = partial(norm, axis=1, ord=1)
     selector = SimilaritySelector(similarity='projection',
                                   projected_norm=dense_l1_norm)
     with pytest.raises(ValueError):
         selector.fit(blocks, ingredients)
Esempio n. 3
0
 def test_incorrect_param(self, simil_df):
     """Invalid ``similarity`` / ``projected_norm`` values raise.

     Both an unknown ``similarity`` name and an unknown string for
     ``projected_norm`` are expected to end in a ``ValueError``.
     """
     # Unknown similarity: construction and fit both happen inside the
     # ``raises`` context, so either one may be the source of the error.
     with pytest.raises(ValueError):
         selector = SimilaritySelector(similarity='incorrect input')
         selector.fit(simil_df['blocks'], simil_df['Ingrédients'])

     # Unknown projected norm: only ``fit`` is expected to raise.
     selector = SimilaritySelector(
         similarity='projection',
         projected_norm='incorrect input',
     )
     with pytest.raises(ValueError):
         selector.fit(simil_df['blocks'], simil_df['Ingrédients'])
Esempio n. 4
0
 def test_hashing_type(self, simil_df):
     """Predict with ``count_vect_type='HashingVectorizer'``.

     The predictions for the fixture's ``blocks`` column must equal the
     three expected ingredient strings, indexed like the fixture.
     """
     blocks = simil_df['blocks']
     ingredients = simil_df['Ingrédients']
     selector = SimilaritySelector(count_vect_type='HashingVectorizer')
     selector.fit(blocks, ingredients)
     predicted = pd.Series(selector.predict(blocks))
     expected = pd.Series(
         ['100% sucre', 'E110, farine', 'haricots'],
         index=simil_df.index,
     )
     assert predicted.equals(expected)
Esempio n. 5
0
 def test_predict_cosine(self, simil_df):
     """Predict with ``similarity='cosine'``.

     The predictions for the fixture's ``blocks`` column must equal the
     three expected ingredient strings, indexed like the fixture.
     """
     blocks = simil_df['blocks']
     ingredients = simil_df['Ingrédients']
     selector = SimilaritySelector(similarity='cosine')
     selector.fit(blocks, ingredients)
     predicted = pd.Series(selector.predict(blocks))
     expected = pd.Series(
         ['100% sucre', 'E110, farine', 'haricots'],
         index=simil_df.index,
     )
     assert predicted.equals(expected)
Esempio n. 6
0
 def test_l_norm_values(self, simil_df):
     """Predict with non-default norms in projection similarity.

     The selector is built with ``source_norm='l3'`` and a projected
     norm that is ``sparse_norm`` partially applied with
     ``axis=1, ord=2``; predictions must still match the expected
     ingredient strings.
     """
     blocks = simil_df['blocks']
     ingredients = simil_df['Ingrédients']
     sparse_l2_norm = partial(sparse_norm, axis=1, ord=2)
     selector = SimilaritySelector(similarity='projection',
                                   source_norm='l3',
                                   projected_norm=sparse_l2_norm)
     selector.fit(blocks, ingredients)
     predicted = pd.Series(selector.predict(blocks))
     expected = pd.Series(
         ['100% sucre', 'E110, farine', 'haricots'],
         index=simil_df.index,
     )
     assert predicted.equals(expected)
Esempio n. 7
0
 def test_embedding(self, simil_df):
     """Exercise the ``embedding_method`` option.

     An unknown method name must end in a ``ValueError``; the
     ``'Word2Vec'`` and ``'tSVD'`` methods must both yield the expected
     ingredient strings.
     """
     # Unknown embedding: construction and fit are both inside the
     # ``raises`` context.
     with pytest.raises(ValueError):
         selector = SimilaritySelector(embedding_method='incorrect')
         selector.fit(simil_df['blocks'], simil_df['Ingrédients'])

     # Word2Vec embedding, with 'de' passed as a stop word.
     selector = SimilaritySelector(
         embedding_method='Word2Vec',
         count_vect_kwargs={'stop_words': {'de'}},
     )
     selector.fit(simil_df['blocks'], simil_df['Ingrédients'])
     predicted = pd.Series(selector.predict(simil_df['blocks']))
     expected = pd.Series(
         ['100% sucre', 'E110, farine', 'haricots'],
         index=simil_df.index,
     )
     assert predicted.equals(expected)

     # tSVD embedding with 5 components; same expectation as above.
     selector = SimilaritySelector(
         embedding_method='tSVD',
         embedding_parms={'n_components': 5},
     )
     selector.fit(simil_df['blocks'], simil_df['Ingrédients'])
     predicted = pd.Series(selector.predict(simil_df['blocks']))
     assert predicted.equals(expected)
Esempio n. 8
0
 def test_cosine_with_score(self, simil_df):
     """Cosine similarity combined with each scoring mode.

     For ``scoring='absolute_score'`` and then
     ``scoring='relative_score'``, a fresh selector is fitted and its
     predictions must equal the expected ingredient strings.
     """
     for scoring_mode in ('absolute_score', 'relative_score'):
         selector = SimilaritySelector(similarity='cosine',
                                       scoring=scoring_mode)
         selector.fit(simil_df['blocks'], simil_df['Ingrédients'])
         predicted = pd.Series(selector.predict(simil_df['blocks']))
         expected = pd.Series(
             ['100% sucre', 'E110, farine', 'haricots'],
             index=simil_df.index,
         )
         assert predicted.equals(expected)
Esempio n. 9
0
 def test_count_vect_kwargs(self, simil_df):
     """Check how ``count_vect_kwargs`` is handled by ``fit``.

     ``{'binary': True}`` must fit without error, while an unknown
     keyword and an invalid ``strip_accents`` value must each make
     ``fit`` raise ``ValueError``.
     """
     # Accepted keyword: fitting is expected to succeed.
     selector = SimilaritySelector(count_vect_kwargs={'binary': True})
     selector.fit(simil_df['blocks'], simil_df['Ingrédients'])

     # Rejected keywords: the selector is built outside the ``raises``
     # context, so only ``fit`` is allowed to raise.
     rejected_kwargs = (
         {'incorrect': True},
         {'strip_accents': 'incorrect'},
     )
     for bad_kwargs in rejected_kwargs:
         selector = SimilaritySelector(count_vect_kwargs=bad_kwargs)
         with pytest.raises(ValueError):
             selector.fit(simil_df['blocks'], simil_df['Ingrédients'])
Esempio n. 10
0
 def test_base(self, simil_df):
     """Smoke test: a default selector fits the fixture data."""
     blocks = simil_df['blocks']
     ingredients = simil_df['Ingrédients']
     selector = SimilaritySelector()
     selector.fit(blocks, ingredients)