Example #1
0
 def test_retrieve_also_unigrams(self):
     """
     Passing min_length=1, the unigrams should be returned as well.
     """
     tagger = SemanticalTagger("Une phrase avec un mot dingue. "
                               "Une autre phrase avec le même mot dingue.")
     # Expected: the 5 ngrams returned by default, plus the three
     # non stop words taken alone (phrase, mot, dingue), so 8 in total.
     found = tagger.ngrams(min_length=1)
     found_count = len(found)
     self.assertEqual(found_count, 8)
Example #2
0
 def test_should_not_return_ngrams_longer_than_max_length(self):
     """
     Passing max_length=4, ngrams longer than that should be left out.
     """
     tagger = SemanticalTagger("Une phrase avec un mot dingue. "
                               "Une autre phrase avec le même mot dingue.")
     # Only two ngrams are expected to remain:
     # - (phrase, avec, un, mot)
     # - (mot, dingue)
     found = tagger.ngrams(max_length=4)
     found_count = len(found)
     self.assertEqual(found_count, 2)
Example #3
0
 def test_default(self):
     """
     Called without arguments, ngrams() should return the five expected
     ngrams for the sample text.
     """
     tagger = SemanticalTagger("Une phrase avec un mot dingue. "
                               "Une autre phrase avec le même mot dingue.")
     found = tagger.ngrams()
     self.assertEqual(len(found), 5)
     # Reduce every ngram to the tuple of the main lemmes of its stemms,
     # so that it can be compared with plain unicode strings.
     found_lemmes = set(
         tuple(stemm.main_occurrence.lemme for stemm in ngram)
         for ngram in found
     )
     # What we should get (the original author notes that stop words at
     # the beginning or at the end of an ngram are skipped):
     wanted_lemmes = set([
         (u"phrase", u"avec", u"un", u"mot", u"dingue"),
         (u"phrase", u"avec", u"un", u"mot"),
         (u"phrase", u"avec", u"le", u"même", u"mot", u"dingue"),
         (u"phrase", u"avec", u"le", u"même", u"mot"),
         (u"mot", u"dingue"),
     ])
     self.assertEqual(wanted_lemmes, found_lemmes)
 def train(self, inst):
     """
     Feed the ngrams of one content instance into ``self.global_pmi``.

     ``inst`` is either the content instance itself or, when it is an
     ``int`` or a ``str``, a value (presumably a primary key) handed to
     ``config.content_model_getter`` to fetch the instance.

     Returns without recording anything when a ``ValueError`` is raised
     while the tagger is built or its key entities are deduplicated.
     """
     if isinstance(inst, (int, str)):
         # Not an instance yet: we guess this is a pk and let the
         # configured getter resolve it.
         inst = config.content_model_getter(inst)
     content = getattr(inst, config.SULCI_CONTENT_PROPERTY)
     try:
         tagger = SemanticalTagger(
             content,
             thesaurus=self.thesaurus,
             pos_tagger=self.pos_tagger,
             lexicon=self.pos_tagger.lexicon
         )
         # During learning, try to filter out duplicated key entities.
         tagger.deduplicate_keyentities()
     except ValueError:
         # Per the original note, SemanticalTagger raises ValueError
         # when the text is empty: nothing to learn from.
         return
     # min_length=1 because we also want the unigrams (the original
     # author notes that stop words will not be returned).
     counted_ngrams = tagger.ngrams(min_length=1, max_length=5)
     for _key, entry in counted_ngrams.iteritems():
         stemms, occurrences = entry['stemms'], entry['count']
         self.global_pmi.add_ngram(stemms, amount=occurrences)