Example #1
 def test_default(self):
     text = SemanticalTagger("Une phrase avec un mot dingue. "
                             "Une autre phrase avec le même mot dingue.")
     # We should get "mot dingue", which occurs twice:
     ngrams = text.filtered_ngrams()
     self.assertEqual(len(ngrams), 1)
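     # filtered_ngrams() appears to yield (stemms, score) pairs, hence
     # the ngrams[0][0] indexing below (inferred from this usage).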
     self.assertEqual(
         tuple(stemm.main_occurrence.lemme for stemm in ngrams[0][0]),
         (u"mot", u"dingue"))
Example #2
 def test_retrieve_also_unigrams(self):
     """
     Passing min_length=1, unigrams are returned in addition to the longer ngrams.
     """
     text = SemanticalTagger("Une phrase avec un mot dingue. "
                             "Une autre phrase avec le même mot dingue.")
     # We should get the same ngrams as by default, plus the
     # non-stop-word unigrams, so 5 + [phrase, mot, dingue] = 8
     ngrams = text.ngrams(min_length=1)
     self.assertEqual(len(ngrams), 8)
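For reference, a minimal sketch of how the three extra unigrams could be checked, assuming ngrams() yields iterables of stemms as in Example #7 (the assertions here are inferred, not part of the test suite):

     flat = set(tuple(stemm.main_occurrence.lemme for stemm in ngram)
                for ngram in ngrams)
     # The five default ngrams plus the three unigram lemmas:
     for unigram in [(u"phrase",), (u"mot",), (u"dingue",)]:
         assert unigram in flat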
Example #3
 def test_should_not_return_ngrams_longer_than_max_length(self):
     """
     Passing max_length=4, no ngram longer than 4 stemms should be returned.
     """
     text = SemanticalTagger("Une phrase avec un mot dingue. "
                             "Une autre phrase avec le même mot dingue.")
     # We should only get:
     # - phrase, avec, un, mot
     # - mot, dingue
     ngrams = text.ngrams(max_length=4)
     self.assertEqual(len(ngrams), 2)
Example #4
 def __call__(self, request, *args, **kwargs):
     c = {}
     if request.method == "POST":
         form = SulciForm(request.POST)
         if form.is_valid():
             db_name = form.cleaned_data["corpus"]
             with UseDB(db_name):
                 t1 = time.time()
                 content = form.cleaned_data["content"]
                 limit = form.cleaned_data["limit"]
                 min_score = form.cleaned_data["min_score"]
                 if form.cleaned_data["debug"]:
                     debug = []
                     handler = MemoryStorageHandler(10, target=debug)
                     formatter = HTMLColorFormatter("%(message)s")
                     handler.setFormatter(formatter)
                     sulci_logger.addHandler(handler)
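                     # Buffer log records into the `debug` list (the 10
                     # is presumably a capacity) and format them with
                     # HTML colors for the c["debug"] payload below.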
                 S = SemanticalTagger(content)
                 descriptors = [
                     (unicode(d), round(score, 2))
                     for d, score in S.get_descriptors(min_score)[:limit]
                 ]
                 if form.cleaned_data['keyentities']:
                     sorted_ke = sorted(
                         S.keyentities,
                         key=lambda k: k.frequency_relative_pmi_confidence,
                         reverse=True)
                     keyentities = [
                         (unicode(k),
                          round(k.frequency_relative_pmi_confidence * 100,
                                2)) for k in sorted_ke
                     ]
                 else:
                     keyentities = None
                 c = {
                     "descriptors": descriptors,
                     "keyentities": keyentities,
                 }
                 if form.cleaned_data["debug"]:
                     S.debug()
                     handler.flush()
                     c["debug"] = [handler.format(d) for d in debug]
                 t2 = time.time()
                 c['time'] = round(t2 - t1, 2)
         else:
             c = {'errors': form.errors}
     else:
         form = SulciForm()
         for field_name, field in form.fields.iteritems():
             c[field_name] = field.help_text
     return HttpResponse(json.dumps(c), content_type="application/json")
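A minimal client sketch for this view (Python 2, like the view itself); the URL is hypothetical, but the field names match the SulciForm fields read above:

 import json
 import urllib
 import urllib2

 payload = urllib.urlencode({
     "corpus": "default",  # assumed corpus name
     "content": "Une phrase avec un mot dingue.",
     "limit": 10,
     "min_score": 0,
     "keyentities": "on",
 })
 # A GET (no POST body) returns each field's help text instead.
 response = urllib2.urlopen("http://localhost:8000/sulci/", payload)
 result = json.loads(response.read())
 print result.get("descriptors"), result.get("time")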
Example #5
    def train(self, inst):
        """
        For the moment, human-defined descriptors are a string with a "," separator.
        """
        if isinstance(inst, (int, str)):
            # We assume we have a pk here
            inst = config.content_model_getter(inst)
        text = getattr(inst, config.SULCI_CONTENT_PROPERTY)
        descriptors = config.descriptors_getter(inst)
        if not descriptors or not text:
            sulci_logger.info(u"Skipping item without data")
            return
        validated_descriptors = set()
        # Retrieve descriptors
        for d in descriptors:
            if not d:
                continue
            # d = d.strip().replace(u"’", u"'")
            # For now we also create descriptors that are not in the
            # thesaurus, because descriptors in articles and in the
            # thesaurus do not always match. Will be improved.
            dsc, created = Descriptor.get_or_connect(name=d)
            dsc.count.hincrby(1)
            # Retrieve the primeval value
#                dsc = dsc.primeval
            validated_descriptors.add(dsc)
            if created:
            sulci_logger.info(u"Learning descriptor not in thesaurus: %s" % unicode(dsc), "RED")
        # Retrieve keyentities:
        try:
            S = SemanticalTagger(
                text,
                thesaurus=self.thesaurus,
                pos_tagger=self.pos_tagger,
                lexicon=self.pos_tagger.lexicon
            )
            S.deduplicate_keyentities()  # During learning, try to filter
        except ValueError:
            # SemanticalTagger raises ValueError if the text is empty
            return
        current_triggers = set()
        for ke in S.keyentities:
            # Retrieve or create triggers
            t, created = Trigger.get_or_connect(original=unicode(ke))
            current_triggers.add(t)
            t.count.hincrby(1)
#            t.current_score = ke.trigger_score
        # For now, only create all the relations
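        # Connect every trigger extracted from this text to every
        # validated descriptor; the 1 below looks like an initial weight.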
        for d in validated_descriptors:
            for t in current_triggers:
                t.connect(d, 1)
Example #6
 def validate_file(self, filepath):
     raw_output, text_content = self.split_file_content(filepath)
     S = SemanticalTagger(text_content)
     flat_output = []
     for ke in S.keyentities:
         flat_output.append(" ".join(stemm.main_occurrence.lemme
                                     for stemm in ke))
     return self.compare_lists(raw_output, flat_output)
Example #7
 def test_default(self):
     text = SemanticalTagger("Une phrase avec un mot dingue. "
                             "Une autre phrase avec le même mot dingue.")
     # We should get (stop words at end or beginning are skipped):
     expected_ngrams = set([
         (u"phrase", u"avec", u"un", u"mot", u"dingue"),
         (u"phrase", u"avec", u"un", u"mot"),
         (u"phrase", u"avec", u"le", u"même", u"mot", u"dingue"),
         (u"phrase", u"avec", u"le", u"même", u"mot"),
         (u"mot", u"dingue"),
     ])
     ngrams = text.ngrams()
     self.assertEqual(len(ngrams), 5)
     flat_ngrams = set()
     for ngram in ngrams:
         flat_ngrams.add(
             tuple(stemm.main_occurrence.lemme for stemm in ngram))
     self.assertEqual(expected_ngrams, flat_ngrams)
Example #8
 def handle(self, *args):
     if not self.PK:
         sulci_logger.info(u"A PK is needed. Use -k xxx", "RED")
     else:
         C = Corpus()
         L = Lexicon()
         P = PosTagger(lexicon=L)
         M = Lemmatizer(L)
         a = config.content_model_getter(self.PK)
         t = getattr(a, config.SULCI_CONTENT_PROPERTY)
         T = Thesaurus()
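         # Wire up the pipeline: the lexicon feeds the POS tagger and
         # the lemmatizer, and the thesaurus provides the descriptors
         # that get scored below.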
         S = SemanticalTagger(t, T, P, lexicon=L)
         if __debug__:
             S.debug()
         sulci_logger.info(u"Scored descriptors", "YELLOW", True)
         for d, value in S.descriptors:
             sulci_logger.info(u"%s %f" % (unicode(d), value), "BLUE")
         
     if self.IPDB:
         import ipdb; ipdb.set_trace()
Example #9
 def train(self, inst):
     if isinstance(inst, (int, str)):
        # We assume we have a pk here
         inst = config.content_model_getter(inst)
     text = getattr(inst, config.SULCI_CONTENT_PROPERTY)
     try:
         S = SemanticalTagger(
             text,
             thesaurus=self.thesaurus,
             pos_tagger=self.pos_tagger,
             lexicon=self.pos_tagger.lexicon
         )
         S.deduplicate_keyentities()  # During learning, try to filter
     except ValueError:
         # SemanticalTagger raises ValueError if the text is empty
         return
     # We also want the unigrams
     # Note that stop words will not be returned
     ngrams = S.ngrams(min_length=1, max_length=5)
     for key, values in ngrams.iteritems():
         self.global_pmi.add_ngram(values['stemms'], amount=values['count'])
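The loop above feeds each ngram's stemms into a global PMI accumulator, weighted by its occurrence count. A toy stand-in illustrating the same bookkeeping with a plain Counter (illustrative only, not the sulci API):

 from collections import Counter

 class ToyPMI(object):
     """Toy stand-in for the global PMI accumulator."""
     def __init__(self):
         self.counts = Counter()

     def add_ngram(self, stemms, amount=1):
         # Key on the stemm tuple, weighted by the occurrence count.
         self.counts[tuple(stemms)] += amount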