def train(self, inst):
        """
        Learn the relations between the descriptors of one content item and
        the triggers found in its text.

        `inst` is either a content instance or, when it is an int or a str,
        a primary key that is resolved through `config.content_model_getter`.

        The text is read from the attribute named by
        `config.SULCI_CONTENT_PROPERTY` and the descriptors come from
        `config.descriptors_getter(inst)` (iterated one by one here; empty
        entries are ignored -- NOTE(review): the historical docstring spoke of
        a single ","-separated string, confirm what the getter returns).

        Each descriptor and each trigger is fetched or created and has its
        count incremented, then every trigger is connected to every
        descriptor with a weight of 1.

        Returns None in all cases. The item is skipped when it has no text or
        no descriptors, and training stops silently when the tagger raises
        ValueError (descriptor counts already incremented are kept).
        """
        if isinstance(inst, (int, str)):
            # Not an instance: assume we were handed a primary key.
            inst = config.content_model_getter(inst)
        text = getattr(inst, config.SULCI_CONTENT_PROPERTY)
        descriptors = config.descriptors_getter(inst)
        if not (descriptors and text):
            sulci_logger.info(u"Skipping item without data")
            return

        # Step 1: fetch (or create) every descriptor attached to the item.
        # Names are used as provided: no stripping or quote normalization is
        # applied, and the primeval descriptor is not resolved for now.
        learned_descriptors = set()
        for name in descriptors:
            if name:
                # Descriptors missing from the thesaurus are created anyway,
                # because article and thesaurus descriptors do not always
                # match. To be improved.
                descriptor, is_new = Descriptor.get_or_connect(name=name)
                descriptor.count.hincrby(1)
                learned_descriptors.add(descriptor)
                if is_new:
                    message = u"Lairning descriptor not in thesaurus : %s" % unicode(descriptor)
                    sulci_logger.info(message, "RED")

        # Step 2: extract the keyentities of the text.
        try:
            tagger = SemanticalTagger(
                text,
                thesaurus=self.thesaurus,
                pos_tagger=self.pos_tagger,
                lexicon=self.pos_tagger.lexicon
            )
            # While training, try to filter out duplicated keyentities.
            tagger.deduplicate_keyentities()
        except ValueError:
            # SemanticalTagger raises ValueError when the text is empty.
            return

        # Step 3: fetch (or create) one trigger per keyentity.
        # The trigger score of the keyentity is not stored for now.
        triggers = set()
        for keyentity in tagger.keyentities:
            trigger, _ = Trigger.get_or_connect(original=unicode(keyentity))
            triggers.add(trigger)
            trigger.count.hincrby(1)

        # Step 4: for now, simply relate every trigger to every descriptor.
        for descriptor in learned_descriptors:
            for trigger in triggers:
                trigger.connect(descriptor, 1)
 def train(self, inst):
     """
     Feed the global PMI data with the ngrams of one content item.

     `inst` is either a content instance or, when it is an int or a str,
     a primary key that is resolved through `config.content_model_getter`.
     The text is read from the attribute named by
     `config.SULCI_CONTENT_PROPERTY`.

     Every ngram of 1 to 5 elements reported by the tagger is passed to
     `self.global_pmi.add_ngram` with its stemms and its count.

     Returns None; nothing is recorded when the tagger raises ValueError.
     """
     if isinstance(inst, (int, str)):
         # Not an instance: assume we were handed a primary key.
         inst = config.content_model_getter(inst)
     content = getattr(inst, config.SULCI_CONTENT_PROPERTY)
     try:
         tagger = SemanticalTagger(
             content,
             thesaurus=self.thesaurus,
             pos_tagger=self.pos_tagger,
             lexicon=self.pos_tagger.lexicon
         )
         # While training, try to filter out duplicated keyentities.
         tagger.deduplicate_keyentities()
     except ValueError:
         # SemanticalTagger raises ValueError when the text is empty.
         return
     # min_length=1 so that unigrams are included too. According to the
     # original author's note, stopwords are not returned by ngrams().
     for _, data in tagger.ngrams(min_length=1, max_length=5).iteritems():
         self.global_pmi.add_ngram(data['stemms'], amount=data['count'])