Esempio n. 1
0
 def test_get_co(self):
     """Check that 'skilled' is present after ``get_co`` on three corpora.

     ``get_co`` is run with a window argument of 20 on three corpora built
     from ``self.sample_text``: the raw text repeated three times, a
     preprocessed copy repeated three times and a preprocessed copy
     repeated five times.  For each run, the second value returned by
     ``get_co`` must contain ``'skilled'``.
     """
     window = 20

     # Raw, unprocessed sample text.
     _, t2i, _ = get_co([self.sample_text] * 3, window)

     # NOTE(review): the two preprocess calls below use identical options
     # although the nf_/fn_ prefixes suggest they were meant to differ --
     # confirm the intended settings.
     nf_texts, _ = preproc.preprocess(texts=[self.sample_text],
                                      remove_shortwords=True,
                                      remove_stopwords=True)
     _, nft2i, _ = get_co([nf_texts[0]] * 3, window)

     fn_texts, _ = preproc.preprocess(texts=[self.sample_text],
                                      remove_shortwords=True,
                                      remove_stopwords=True)
     fn_cos, fnt2i, _ = get_co([fn_texts[0]] * 5, window)
     print('Sample english co terms: {}'.format(fn_cos))

     for vocab in (t2i, nft2i, fnt2i):
         assert 'skilled' in vocab
Esempio n. 2
0
 def test_unbiased_dice_co_symmetry(self):
     """Scores obtained with method 'unbiased_dice' must be symmetric.

     For every term that ``get_co_tokens`` returns for 'powder', the value
     looked up from that term back to 'powder' has to be close (as judged
     by ``math.isclose``) to the value in the opposite direction.
     """
     target = r'powder'
     window = 20
     min_score = 0.0
     corpus, _ = preproc.preprocess([self.cinnamon_text.lower()],
                                    remove_shortwords=True)
     cos, t2i, i2t = get_co(corpus,
                            window,
                            method='unbiased_dice',
                            threshold=min_score)
     scores_from_target = get_co_tokens(cos, t2i, i2t, target)
     for other in scores_from_target:
         scores_from_other = get_co_tokens(cos, t2i, i2t, other)
         forward = scores_from_target[other]
         backward = scores_from_other[target]
         assert math.isclose(forward, backward)
Esempio n. 3
0
 def test_relevant_words_proximity(self):
     """Bound the number of entries reported in proximity of 'powder'.

     The entry for 'powder' in the dictionary returned by
     ``get_t2t_proximities`` may hold at most ``2 * w`` items per
     occurrence of 'powder' in the preprocessed text (``w`` being the
     window argument, 20 here).
     """
     target = 'powder'
     window = 20
     corpus, _ = preproc.preprocess(texts=[self.cinnamon_text.lower()],
                                    remove_shortwords=True)
     text = corpus[0]
     # Only the token iterator and the token index are needed below.
     tokens_iter, token2ind, _counter, _n_texts = texts2tokens(
         texts_or_path=[text])
     proximities = get_t2t_proximities(next(tokens_iter),
                                       token2ind,
                                       window,
                                       return_dict=True)
     neighbours = proximities[target]
     occurrences = text.count(target)
     upper_bound = 2 * window * occurrences
     assert len(neighbours) <= upper_bound, (len(neighbours), occurrences)
Esempio n. 4
0
 def test_relevant_words_symmetry(self):
     """Proximity scores involving 'title' must be exactly symmetric.

     ``get_t2t_proximities`` is called with a custom proximity function
     and ``return_dict=True``; for every key found under 'title', the
     score stored in one direction has to equal the score stored in the
     other direction.
     """
     target = 'title'
     window = 20

     def linear_proximity(x):
         # Same formula as before: depends only on abs(x) and the window.
         return (window - abs(x) + 0.5) * 2 / window

     corpus, _ = preproc.preprocess(texts=[self.sample_text.lower()],
                                    remove_shortwords=True)
     text = corpus[0]
     # Only the token iterator and the token index are needed below.
     tokens_iter, token2ind, _counter, _n_texts = texts2tokens(
         texts_or_path=[text])
     scores = get_t2t_proximities(next(tokens_iter),
                                  token2ind,
                                  window,
                                  return_dict=True,
                                  proximity_func=linear_proximity)
     for neighbour in scores[target]:
         forward = scores[target][neighbour]
         backward = scores[neighbour][target]
         assert forward == backward
Esempio n. 5
0
 def test_unbiased_dice_co_triplicate_docs(self):
     """Repeating the corpus must leave 'unbiased_dice' scores nearly unchanged.

     ``get_co`` is run once on the preprocessed cinnamon text and once on
     that same corpus repeated ten times.  For every term returned by
     ``get_co_tokens`` for 'powder' on the single copy, the ratio between
     the repeated-corpus score and the single-copy score must lie strictly
     between 0.8 and 1.2.

     NOTE(review): the method name says "triplicate" but the corpus is
     repeated ten times -- confirm which is intended.
     """
     entity = 'powder'
     w = 20
     crp, _ = preproc.preprocess([self.cinnamon_text.lower()],
                                 remove_shortwords=True)

     cos, t2i, i2t = get_co(crp, w, method='unbiased_dice', threshold=0.0)
     co_terms1 = get_co_tokens(cos, t2i, i2t, entity)

     cos, t2i, i2t = get_co(crp * 10,
                            w,
                            method='unbiased_dice',
                            threshold=0.0)
     co_terms10 = get_co_tokens(cos, t2i, i2t, entity)

     for term in co_terms1:
         score10 = co_terms10[term]
         score1 = co_terms1[term]
         # Compute the ratio once and reuse it for the log line and check.
         ratio = score10 / score1
         print(term, score10, score1, ratio)
         assert 0.8 < ratio < 1.2, (term, ratio)