def test_score(self):
    """Compare ``compute_score`` output with hand-written expected arrays.

    Three scenarios are exercised in turn: the default call (absolute
    score), ``kind='relative'``, and ``kind='relative'`` on a model built
    with ``count_vect_type='HashingVectorizer'``.
    """
    # --- absolute score ---
    vocabulary = ['aa bb cc']
    full_texts = [['aa', 'aa', 'bb'], 'cc aa aa']
    sub_texts = pd.Series(['bb bb', ['bb']])
    selector = SimilaritySelector()
    model = selector.fit(pd.Series([vocabulary]), pd.Series(vocabulary))
    expected = np.array([[0., 1., 0.]])
    absolute = np.asarray(model.compute_score(full_texts, sub_texts))
    assert (absolute == expected).all()

    # --- relative score with diff ---
    vocabulary = ['aa bb cc dd ee']
    documents = pd.Series([['dd', 'dd', 'bb'], 'bb dd ee ee'])
    targets = pd.Series(['aa dd dd', ['aa dd ee']])
    selector = SimilaritySelector()
    model = selector.fit(pd.Series([vocabulary]), pd.Series(vocabulary))
    expected = np.array([[1., -1, 0., 1., 0.]])
    relative = np.asarray(
        model.compute_score(documents, targets, kind='relative')
    )
    assert (expected == relative).all()

    # --- relative score with diff and HashingVectorizer ---
    vocabulary = ['aa bb cc dd ee']
    documents = pd.Series([['dd', 'dd', 'bb'], 'bb dd ee ee'])
    targets = pd.Series(['aa dd dd', ['aa dd ee']])
    model = SimilaritySelector(count_vect_type='HashingVectorizer')
    model.fit(pd.Series([vocabulary]), pd.Series(vocabulary))
    relative = np.asarray(
        model.compute_score(documents, targets, kind='relative')
    )
    # Build the expected row with a FeatureHasher that reuses the
    # 'n_features' / 'alternate_sign' settings of the fitted vectorizer.
    fitted_params = model.source_count_vect.get_params()
    shared_keys = {'n_features', 'alternate_sign'}
    hasher_kwargs = {
        name: value
        for name, value in fitted_params.items()
        if name in shared_keys
    }
    hasher = FeatureHasher(**hasher_kwargs, input_type='dict')
    expected_weights = {'aa': 1., 'bb': -1., 'cc': 0., 'dd': 1., 'ee': 0.}
    hashed = hasher.fit_transform([expected_weights])
    expected = np.asarray(hashed.todense())
    assert (expected == relative).all()
def test_non_sparse_norm_type(self, simil_df):
    """Expect ``fit`` to raise ValueError for a ``norm``-based projected norm.

    The projected norm is a ``partial`` over ``norm`` (axis=1, ord=1)
    rather than over ``sparse_norm``.
    """
    dense_l1_norm = partial(norm, axis=1, ord=1)
    selector_kwargs = {
        'similarity': 'projection',
        'projected_norm': dense_l1_norm,
    }
    selector = SimilaritySelector(**selector_kwargs)
    with pytest.raises(ValueError):
        selector.fit(simil_df['blocks'], simil_df['Ingrédients'])
def test_incorrect_param(self, simil_df):
    """Expect ValueError for an unknown similarity or projected norm."""
    # Unknown similarity: construction and fit both happen inside the
    # ``raises`` context.
    with pytest.raises(ValueError):
        bad_similarity = SimilaritySelector(similarity='incorrect input')
        bad_similarity.fit(simil_df['blocks'], simil_df['Ingrédients'])

    # Unknown projected norm: only the fit call is inside the context.
    bad_norm = SimilaritySelector(
        similarity='projection',
        projected_norm='incorrect input',
    )
    with pytest.raises(ValueError):
        bad_norm.fit(simil_df['blocks'], simil_df['Ingrédients'])
def test_hashing_type(self, simil_df):
    """Fit and predict with ``count_vect_type='HashingVectorizer'``.

    The prediction on the 'blocks' column must equal the expected
    ingredient strings, indexed like ``simil_df``.
    """
    selector = SimilaritySelector(count_vect_type='HashingVectorizer')
    selector.fit(simil_df['blocks'], simil_df['Ingrédients'])
    predicted = pd.Series(selector.predict(simil_df['blocks']))
    expected = pd.Series(
        ['100% sucre', 'E110, farine', 'haricots'],
        simil_df.index,
    )
    assert predicted.equals(expected)
def test_predict_cosine(self, simil_df):
    """Fit and predict with ``similarity='cosine'``.

    The prediction on the 'blocks' column must equal the expected
    ingredient strings, indexed like ``simil_df``.
    """
    selector = SimilaritySelector(similarity='cosine')
    selector.fit(simil_df['blocks'], simil_df['Ingrédients'])
    predicted = pd.Series(selector.predict(simil_df['blocks']))
    expected = pd.Series(
        ['100% sucre', 'E110, farine', 'haricots'],
        simil_df.index,
    )
    assert predicted.equals(expected)
def test_l_norm_values(self, simil_df):
    """Fit and predict with explicit norm settings.

    Uses projection similarity with ``source_norm='l3'`` and a projected
    norm built as ``sparse_norm`` with axis=1, ord=2. The prediction on
    the 'blocks' column must equal the expected ingredient strings.
    """
    sparse_l2 = partial(sparse_norm, axis=1, ord=2)
    selector_kwargs = {
        'similarity': 'projection',
        'source_norm': 'l3',
        'projected_norm': sparse_l2,
    }
    selector = SimilaritySelector(**selector_kwargs)
    selector.fit(simil_df['blocks'], simil_df['Ingrédients'])
    predicted = pd.Series(selector.predict(simil_df['blocks']))
    expected = pd.Series(
        ['100% sucre', 'E110, farine', 'haricots'],
        simil_df.index,
    )
    assert predicted.equals(expected)
def test_embedding(self, simil_df):
    """Exercise the ``embedding_method`` option.

    An unknown method must raise ValueError on fit. The 'Word2Vec' and
    'tSVD' methods must both predict the expected ingredient strings.
    """
    # Unknown embedding method: construction and fit both happen inside
    # the ``raises`` context.
    with pytest.raises(ValueError):
        invalid = SimilaritySelector(embedding_method='incorrect')
        invalid.fit(simil_df['blocks'], simil_df['Ingrédients'])

    # Word2Vec embedding, with a stop word passed to the count vectorizer.
    word2vec_selector = SimilaritySelector(
        embedding_method='Word2Vec',
        count_vect_kwargs={'stop_words': {'de'}},
    )
    word2vec_selector.fit(simil_df['blocks'], simil_df['Ingrédients'])
    word2vec_pred = word2vec_selector.predict(simil_df['blocks'])
    expected = pd.Series(
        ['100% sucre', 'E110, farine', 'haricots'],
        simil_df.index,
    )
    assert pd.Series(word2vec_pred).equals(expected)

    # tSVD embedding with 5 components, checked against the same target.
    svd_settings = {'n_components': 5}
    svd_selector = SimilaritySelector(
        embedding_method='tSVD',
        embedding_parms=svd_settings,
    )
    svd_selector.fit(simil_df['blocks'], simil_df['Ingrédients'])
    svd_pred = svd_selector.predict(simil_df['blocks'])
    assert pd.Series(svd_pred).equals(expected)
def test_cosine_with_score(self, simil_df):
    """Fit and predict with cosine similarity under both scoring modes.

    'absolute_score' is run first, then 'relative_score'. Each must
    predict the same expected ingredient strings.
    """
    for scoring_mode in ('absolute_score', 'relative_score'):
        selector = SimilaritySelector(
            similarity='cosine',
            scoring=scoring_mode,
        )
        selector.fit(simil_df['blocks'], simil_df['Ingrédients'])
        predicted = pd.Series(selector.predict(simil_df['blocks']))
        expected = pd.Series(
            ['100% sucre', 'E110, farine', 'haricots'],
            simil_df.index,
        )
        assert predicted.equals(expected)
def test_count_vect_kwargs(self, simil_df):
    """Check how ``count_vect_kwargs`` are handled at fit time.

    ``{'binary': True}`` must fit without raising. An unknown key and an
    invalid 'strip_accents' value must each raise ValueError on fit.
    """
    accepted = SimilaritySelector(count_vect_kwargs={'binary': True})
    accepted.fit(simil_df['blocks'], simil_df['Ingrédients'])

    rejected_kwargs = (
        {'incorrect': True},
        {'strip_accents': 'incorrect'},
    )
    for bad_kwargs in rejected_kwargs:
        # Construction stays outside the context: only fit is expected
        # to raise.
        selector = SimilaritySelector(count_vect_kwargs=bad_kwargs)
        with pytest.raises(ValueError):
            selector.fit(simil_df['blocks'], simil_df['Ingrédients'])
def test_base(self, simil_df):
    """Smoke test: a default ``SimilaritySelector`` fits without raising."""
    blocks = simil_df['blocks']
    ingredients = simil_df['Ingrédients']
    SimilaritySelector().fit(blocks, ingredients)