def test_retrieve_also_unigrams(self):
    """
    Passing min_length=1, the unigrams should be returned in addition to
    the ngrams returned by default.
    """
    sample = ("Une phrase avec un mot dingue. "
              "Une autre phrase avec le même mot dingue.")
    tagger = SemanticalTagger(sample)
    # Expected: the same 5 ngrams as with the default arguments, plus the
    # non stop words as unigrams [phrase, mot, dingue], so 8 in total.
    found = tagger.ngrams(min_length=1)
    self.assertEqual(len(found), 8)
def test_should_not_return_ngrams_longer_than_max_length(self):
    """
    Passing max_length=4, no ngram longer than four words should be
    returned.
    """
    sample = ("Une phrase avec un mot dingue. "
              "Une autre phrase avec le même mot dingue.")
    tagger = SemanticalTagger(sample)
    # Only two ngrams are expected:
    # - phrase, avec, un, mot
    # - mot, dingue
    found = tagger.ngrams(max_length=4)
    self.assertEqual(len(found), 2)
def test_default(self):
    """
    Check the ngrams returned when ngrams() is called without arguments.
    """
    sample = ("Une phrase avec un mot dingue. "
              "Une autre phrase avec le même mot dingue.")
    tagger = SemanticalTagger(sample)
    # Stop words at the beginning or at the end of an ngram are skipped,
    # so exactly these five ngrams are expected:
    expected_ngrams = set((
        (u"phrase", u"avec", u"un", u"mot", u"dingue"),
        (u"phrase", u"avec", u"un", u"mot"),
        (u"phrase", u"avec", u"le", u"même", u"mot", u"dingue"),
        (u"phrase", u"avec", u"le", u"même", u"mot"),
        (u"mot", u"dingue"),
    ))
    found = tagger.ngrams()
    self.assertEqual(len(found), 5)
    # Reduce every ngram to the tuple of the lemmes of its stemms, so it
    # can be compared with the plain tuples above.
    flat_ngrams = set(
        tuple(stemm.main_occurrence.lemme for stemm in ngram)
        for ngram in found
    )
    self.assertEqual(expected_ngrams, flat_ngrams)
def train(self, inst):
    """
    Add the ngrams found in the content of `inst` to the global PMI.

    `inst` is either a content instance or an int/str, in which case it is
    handed to `config.content_model_getter` to fetch the instance
    (presumably it is a primary key).

    Returns silently when a ValueError is raised while building or
    deduplicating the tagger (SemanticalTagger raises it on empty text).
    """
    if isinstance(inst, (int, str)):
        # Not an instance: we guess we have a pk here, so fetch the content.
        inst = config.content_model_getter(inst)
    content = getattr(inst, config.SULCI_CONTENT_PROPERTY)
    try:
        tagger = SemanticalTagger(
            content,
            thesaurus=self.thesaurus,
            pos_tagger=self.pos_tagger,
            lexicon=self.pos_tagger.lexicon
        )
        tagger.deduplicate_keyentities()  # During learning, try to filter
    except ValueError:
        # SemanticalTagger raises ValueError if the text is empty
        return
    else:
        # min_length=1 because we also want the unigrams.
        # Note that the stopwords will not be returned.
        found = tagger.ngrams(min_length=1, max_length=5)
        for _key, data in found.iteritems():
            self.global_pmi.add_ngram(data['stemms'], amount=data['count'])