Code example #1
File: thesaurus.py Project: liberation/sulci
 def load_triggers(self):
     sulci_logger.debug("Loading triggers...", "YELLOW", True)
     f = codecs.open(get_dir() + "corpus/triggers.trg", "r", "utf-8")
     for idx, line in enumerate(f.readlines()):
         # TODO: check line validity
         t, created = Trigger.get_or_create(line, self, parent=self, original=line)
         self._triggers.add(t)
     f.close()
Code example #2
 def load_triggers(self):
     sulci_logger.debug("Loading triggers...", "YELLOW", True)
     f = codecs.open(get_dir() + "corpus/triggers.trg", "r", "utf-8")
     for idx, line in enumerate(f.readlines()):
         # TODO: check line validity
         t, created = Trigger.get_or_create(line,
                                            self,
                                            parent=self,
                                            original=line)
         self._triggers.add(t)
     f.close()
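Trigger.get_or_create itself is not shown on this page. As a hedged sketch only (the registry, constructor, and argument names below are assumptions, not sulci's real implementation), a get_or_create helper of this shape returns an (instance, created) pair backed by a class-level cache, so repeated corpus lines reuse a single Trigger:

class Trigger(object):
    _registry = {}  # hypothetical class-level cache: key -> instance

    def __init__(self, key, parent=None, original=None):
        self.key = key
        self.parent = parent
        self.original = original

    @classmethod
    def get_or_create(cls, key, owner, **kwargs):
        # Return (instance, created); create and cache on first use.
        if key in cls._registry:
            return cls._registry[key], False
        instance = cls(key, **kwargs)
        cls._registry[key] = instance
        return instance, True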
Code example #3
File: textmining.py Project: jmvanel/sulci
 def create_stemm(self):
     self._stemms = set()  # A set because order doesn't matter
                           # and we want no duplicates
     for tkn in self.tokens:
         self.lemmatizer.do(tkn)
         # We don't take the sg or pl in the tag name
         stm, created = Stemm.get_or_create((unicode(tkn.lemme), tkn.tag.split(u":")[0]), self, original=unicode(tkn.lemme), text=self)
         stm.occurrences.append(tkn)
         tkn.stemm = stm
         self._stemms.add(stm)
     sulci_logger.debug("Initial stemms", "BLUE", highlight=True)
     sulci_logger.debug([unicode(s) for s in self._stemms], "CYAN")
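The (lemme, tag prefix) key above collapses inflection variants: assuming sulci tags carry a ":sg"/":pl" suffix (an assumption inferred from the split on u":"), singular and plural forms share one Stemm key:

tag = u"SBC:pl"                         # hypothetical sulci tag
key = (u"maison", tag.split(u":")[0])   # -> (u"maison", u"SBC")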
Code example #4
File: textmining.py Project: jmvanel/sulci
 def make_keyentities(self, min_length=2, max_length=10, min_count=2):
     # From ngrams
     keyentities = []
     candidates = self.ngrams()
     # Candidates are tuples: (ngram, ngram_score)
     sulci_logger.debug("Ngrams candidates", "BLUE", highlight=True)
     for c in candidates:
         sulci_logger.debug([unicode(s) for s in c[0]], "CYAN")
     for candidate in candidates:
         kp, created = KeyEntity.get_or_create([unicode(s.main_occurrence) for s in candidate[0]],
                                               self,
                                               stemms=candidate[0], 
                                               count=candidate[1],
                                               text=self)
         keyentities.append(kp)
     # From frequency
     candidates = self.keystems()
     sulci_logger.debug("Frequent stems candidates", "BLUE", highlight=True)
     for c in candidates:
         sulci_logger.debug(unicode(c), "CYAN")
     for candidate in candidates:
         kp, created = KeyEntity.get_or_create([unicode(candidate.main_occurrence)], 
                                               self,
                                               stemms=[candidate], 
                                               count=candidate.count,
                                               text=self)
         keyentities.append(kp)
     self.keyentities = keyentities
Code example #5
File: textmining.py Project: yohanboniface/sulci
 def make_keyentities(self, min_length=2, max_length=10, min_count=2):
     # From ngrams
     keyentities = []
     candidates = self.filtered_ngrams()
     # Candidates are tuples: (ngram, ngram_score)
     sulci_logger.debug("Ngrams candidates", "BLUE", highlight=True)
     for c in candidates:
         sulci_logger.debug([unicode(s) for s in c[0]], "CYAN")
     for candidate in candidates:
         kp, created = KeyEntity.get_or_create(
             [unicode(s.main_occurrence) for s in candidate[0]],
             self.text,
             stemms=candidate[0],
             count=candidate[1],
             text=self.text)
         keyentities.append(kp)
     # From frequency
     candidates = self.keystems()
     sulci_logger.debug("Frequent stems candidates", "BLUE", highlight=True)
     for c in candidates:
         sulci_logger.debug(unicode(c), "CYAN")
     for candidate in candidates:
         kp, created = KeyEntity.get_or_create(
             [unicode(candidate.main_occurrence)],
             self.text,
             stemms=[candidate],
             count=candidate.count,
             text=self.text)
         keyentities.append(kp)
     self.keyentities = keyentities
     self.deduplicate_keyentities()
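Both passes above build KeyEntity objects from the same candidate shape: an (ngram, score) tuple in which the ngram is a sequence of stemms, hence the candidate[0] / candidate[1] indexing. A toy candidate to make that concrete (plain strings stand in for Stemm objects):

ngram = [u"ministère", u"Culture"]  # stand-ins for Stemm objects
candidate = (ngram, 4)              # (ngram, ngram_score)
stemms, count = candidate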
Code example #6
File: lexicon.py Project: liberation/sulci
    def loaded(self):
        """
        Load lexicon in RAM, from file.

        The representation will be a dict {"word1": [{tag1 : lemme1}]}
        """
        if self.PATH not in self._loaded:  # Caching and lazy loading
            sulci_logger.debug("Loading lexicon...", "RED", True)
            lx = load_file("%s/lexicon.lxc" % self.PATH)
            self._loaded[self.PATH] = {}
            for line in lx.split("\n"):
                if line:
                    lexicon_entity = LexiconEntity(line)
                    self.add_factors(lexicon_entity.word)
                    self._loaded[self.PATH][lexicon_entity.word] = lexicon_entity
        return self._loaded[self.PATH]
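Because _loaded is stored on the class and keyed by PATH, the lexicon file is parsed once per path and then shared by every instance. A minimal standalone sketch of the same lazy-loading cache, with the parsing reduced to a stand-in (the tab-separated line format is an assumption, not sulci's real .lxc format):

class LazyLexicon(object):
    PATH = "corpus"
    _loaded = {}  # class-level, so all instances share the cache

    def loaded(self):
        if self.PATH not in self._loaded:  # parse on first access only
            entries = {}
            with open("%s/lexicon.lxc" % self.PATH) as f:
                for line in f:
                    line = line.strip()
                    if line:
                        word = line.split("\t")[0]  # assumed line format
                        entries[word] = line
            self._loaded[self.PATH] = entries
        return self._loaded[self.PATH]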
Code example #7
File: textmining.py Project: yohanboniface/sulci
 def create_stemm(self):
     self._stemms = set()  # A set because order doesn't matter
     # and we want no duplicates
     self.lemmatizer.do(self.tokens)
     for tkn in self.tokens:
         # We don't take the sg or pl in the tag name
         stm, created = Stemm.get_or_create(
             (unicode(tkn.lemme), tkn.tag.split(u":")[0]),
             self,
             original=unicode(tkn.lemme),
             text=self)
         stm.occurrences.append(tkn)
         tkn.stemm = stm
         self._stemms.add(stm)
     sulci_logger.debug("Initial stemms", "BLUE", highlight=True)
     sulci_logger.debug([unicode(s) for s in self._stemms], "CYAN")
Code example #8
    def loaded(self):
        """
        Load lexicon in RAM, from file.

        The representation will be a dict {"word1": [{tag1 : lemme1}]}
        """
        if self.PATH not in self._loaded:  # Caching and lazy loading
            sulci_logger.debug("Loading lexicon...", "RED", True)
            lx = load_file("%s/lexicon.lxc" % self.PATH)
            self._loaded[self.PATH] = {}
            for line in lx.split("\n"):
                if line:
                    lexicon_entity = LexiconEntity(line)
                    self.add_factors(lexicon_entity.word)
                    self._loaded[self.PATH][
                        lexicon_entity.word] = lexicon_entity
        return self._loaded[self.PATH]
Code example #9
File: textmining.py Project: yohanboniface/sulci
 def __gt__(self, other):
     """
     We try here to define which from two keyentities competitor is the
     best concentrate of information.
     (Remember that if an expression A is included in B, A is mathematicaly
     almost frequent than B.)
     Examples :
     - Ernesto Che Guevara, is more informative than "Che" or "Che
     Guevara", even if "Che Guevara" is very more frequent.
     - "loi Création et Internet" is more concentrate, and so more informative,
     than "le projet de loi Création et Internet"
     - "ministère de la Culture" is more informative than "Culture" and "ministère"
     """
     # First of all, we check that both are competitors
     if self not in other and other not in self:
         raise ValueError("keyentities must be parent for this comparison.")
     sulci_logger.debug(
         u"Comparing '%s' and '%s'" % (unicode(self), unicode(other)),
         "GRAY")
     sulci_logger.debug(self._confidences, "WHITE")
     sulci_logger.debug(other._confidences, "WHITE")
     # If there is a title in the comparison, make a special case
     if self.istitle() and other.istitle():
         # If both are title, use PMI
         return self.statistical_mutual_information_confidence(
         ) > other.statistical_mutual_information_confidence()
     elif self.istitle() or other.istitle():
         # If just one is a title: do nothing
         # Idea: prevent deleting "Laurent Gbagbo" because "président
         # sortant Laurent Gbagbo" occurs many times in a text (as an example),
         # and at the same time prevent deleting "Forces républicaines"
         # because "Forces" is a title and "Forces républicaines" is not.
         # This makes more false positive cases, but also more true positives.
         # More false positives also mean more noise, so maybe the training
         # scenario should be different here, to create as few noisy
         # relations as possible.
         return False
     else:  # No title in the comparison
         if not self.statistical_mutual_information_confidence(
         ) == other.statistical_mutual_information_confidence():
             return self.statistical_mutual_information_confidence(
             ) > other.statistical_mutual_information_confidence()
         elif not self.heuristical_mutual_information_confidence(
         ) == other.heuristical_mutual_information_confidence():
             return self.heuristical_mutual_information_confidence(
             ) > other.heuristical_mutual_information_confidence()
         elif not self.confidence == other.confidence:
             return self.confidence > other.confidence
         elif not len(self) == len(other):
             return len(self) > len(other)
         else:
             return False
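In the no-title branch, the elif cascade is exactly a lexicographic comparison: each criterion is consulted only when all the previous ones tie, and the result is False when everything ties. A sketch of that equivalence (the method names are taken from the code above):

def score_vector(ke):
    # Precedence order: statistical PMI, then heuristic PMI,
    # then confidence, then length.
    return (ke.statistical_mutual_information_confidence(),
            ke.heuristical_mutual_information_confidence(),
            ke.confidence,
            len(ke))

# For the no-title branch, self > other reduces to
#     score_vector(self) > score_vector(other)
# because Python compares tuples element by element.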
Code example #10
File: textmining.py Project: liberation/sulci
 def __gt__(self, other):
     """
     We try here to define which from two keyentities competitor is the
     best concentrate of information.
     (Remember that if an expression A is included in B, A is mathematicaly
     almost frequent than B.)
     Examples :
     - Ernesto Che Guevara, is more informative than "Che" or "Che
     Guevara", even if "Che Guevara" is very more frequent.
     - "loi Création et Internet" is more concentrate, and so more informative,
     than "le projet de loi Création et Internet"
     - "ministère de la Culture" is more informative than "Culture" and "ministère"
     """
     # First of all, we check that both are competitors
     if self not in other and other not in self:
         raise ValueError("keyentities must be parent for this comparison.")
     sulci_logger.debug(u"Comparing '%s' and '%s'" % (unicode(self), unicode(other)), "GRAY")
     sulci_logger.debug(self._confidences, "WHITE")
     sulci_logger.debug(other._confidences, "WHITE")
     # If there is a title in the comparison, make a special case
     if self.istitle() and other.istitle():
         # If both are title, use PMI
         return self.statistical_mutual_information_confidence() > other.statistical_mutual_information_confidence()
     elif self.istitle() or other.istitle():
         # If just one is a title: do nothing
         # Idea: prevent deleting "Laurent Gbagbo" because "président
         # sortant Laurent Gbagbo" occurs many times in a text (as an example),
         # and at the same time prevent deleting "Forces républicaines"
         # because "Forces" is a title and "Forces républicaines" is not.
         # This makes more false positive cases, but also more true positives.
         # More false positives also mean more noise, so maybe the training
         # scenario should be different here, to create as few noisy
         # relations as possible.
         return False
     else:  # No title in the comparison
         if not self.statistical_mutual_information_confidence() == other.statistical_mutual_information_confidence():
             return self.statistical_mutual_information_confidence() > other.statistical_mutual_information_confidence()
         elif not self.heuristical_mutual_information_confidence() == other.heuristical_mutual_information_confidence():
             return self.heuristical_mutual_information_confidence() > other.heuristical_mutual_information_confidence()
         elif not self.confidence == other.confidence:
             return self.confidence > other.confidence
         elif not len(self) == len(other):
             return len(self) > len(other)
         else:
             return False
Code example #11
def log(s, color=None, highlight=False, mode=None):
    sulci_logger.debug(s, color, highlight)
Code example #12
File: utils.py Project: jmvanel/sulci
def log(s, color=None, highlight=False, mode=None):
    sulci_logger.debug(s, color, highlight)    
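Every example on this page calls sulci_logger.debug(message, color, highlight=False); the two wrappers above document that call shape. The real logger's module is not shown here, so a minimal stand-in (the ANSI color codes are an assumption, not sulci's actual palette) can be used to run the snippets in isolation:

import logging

_ANSI = {"RED": "31", "YELLOW": "33", "BLUE": "34", "MAGENTA": "35",
         "CYAN": "36", "WHITE": "37", "GRAY": "90"}

class SulciLoggerStandIn(object):
    def __init__(self):
        self._logger = logging.getLogger("sulci")

    def debug(self, s, color=None, highlight=False):
        # Wrap the message in an ANSI escape; bold when highlighted.
        style = "1;" if highlight else ""
        code = _ANSI.get(color, "0")
        self._logger.debug("\033[%s%sm%s\033[0m", style, code, s)

sulci_logger = SulciLoggerStandIn()
# logging.basicConfig(level=logging.DEBUG) makes the output visible.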
Code example #13
File: textmining.py Project: liberation/sulci
 def debug(self):
     sulci_logger.debug("Normalized text", "WHITE")
     sulci_logger.debug(self.text.normalized_text, "WHITE")
     sulci_logger.debug("Number of words", "WHITE")
     sulci_logger.debug(self.text.words_count(), "GRAY")
     sulci_logger.debug("Number of meaning words", "WHITE")
     sulci_logger.debug(self.text.meaning_words_count(), "GRAY")
     sulci_logger.debug("Number of differents words", "WHITE")
     sulci_logger.debug(len(self.text.distinct_words()), "GRAY")
     sulci_logger.debug("Frequents stemms", "WHITE")
     sulci_logger.debug([(unicode(s), s.count) for s in self.keystems()], "GRAY")
     sulci_logger.debug("Lexical diversity", "WHITE")
     sulci_logger.debug(1.0 * len(self.text.words) / len(set(self.text.distinct_words())), "GRAY")
     sulci_logger.debug("Tagged words", "WHITE")
     sulci_logger.debug([(unicode(t), t.tag) for t in self.text.tokens], "GRAY")
     sulci_logger.debug("Sentences", "WHITE")
     for sample in self.text.samples:
         sulci_logger.debug(sample, "GRAY")
     sulci_logger.debug("Final keyentities", "WHITE")
     for kp in sorted(self.keyentities, key=lambda kp: kp.keyconcept_confidence):
         sulci_logger.debug(u"%s (%f)" % (unicode(kp), kp.confidence), "YELLOW")
         sulci_logger.debug(kp._confidences, "GRAY")
     sulci_logger.debug(u"Keyentities by keyconcept_confidence", "BLUE", True)
     for kp in sorted(self.keyentities, key=lambda kp: kp.keyconcept_confidence, reverse=True)[:10]:
         sulci_logger.debug(u"%s (%f)" % (unicode(kp), kp.keyconcept_confidence), "YELLOW")
     sulci_logger.debug(u"Keyentities by statistical_mutual_information_confidence", "BLUE", True)
     for kp in sorted(self.keyentities, key=lambda kp: kp._confidences["statistical_mutual_information"], reverse=True)[:10]:
         sulci_logger.debug(u"%s (%f)" % (unicode(kp), kp._confidences["statistical_mutual_information"]), "YELLOW")
     sulci_logger.debug(u"Keyentities by pos_confidence", "BLUE", True)
     for kp in sorted(self.keyentities, key=lambda kp: kp._confidences["pos"], reverse=True)[:10]:
         sulci_logger.debug(u"%s (%f)" % (unicode(kp), kp._confidences["pos"]), "YELLOW")
     sulci_logger.debug(u"Keyentities by frequency_relative_pmi_confidence", "BLUE", True)
     for kp in sorted(self.keyentities, key=lambda kp: kp.frequency_relative_pmi_confidence, reverse=True)[:10]:
         sulci_logger.debug(u"%s (%f)" % (unicode(kp), kp.frequency_relative_pmi_confidence), "YELLOW")
     sulci_logger.debug(u"Keyentities by keyconcept_confidence * pos confidence", "BLUE", True)
     for kp in sorted(self.keyentities, key=lambda kp: kp.keyconcept_confidence * kp._confidences["pos"], reverse=True)[:10]:
         sulci_logger.debug(u"%s (%f)" % (unicode(kp), kp.keyconcept_confidence * kp._confidences["pos"]), "YELLOW")
     sulci_logger.debug(u"Keyentities by nrelative * pos confidence", "BLUE", True)
     for kp in sorted(self.keyentities, key=lambda kp: kp.trigger_score, reverse=True)[:20]:
         sulci_logger.debug(u"%s (%f)" % (unicode(kp), kp.trigger_score), "YELLOW")
     sulci_logger.debug(u"Keyentities by gobal pmi confidence", "BLUE", True)
     # for kp in sorted(self.keyentities, key=lambda kp: kp._confidences['global_mutual_information'], reverse=True)[:20]:
     #     sulci_logger.debug(u"%s (%f)" % (unicode(kp), kp._confidences['global_mutual_information']), "YELLOW")
     sulci_logger.debug(u"Triggers and relation with descriptors", "BLUE", True)
     for t, score in self.triggers:
         if len(t._synapses) > 0:
             sulci_logger.debug(u"%s (Local score : %f)" % (unicode(t), score), "GRAY", highlight=True)
             sulci_logger.debug(u"Trigger total count : %s" % t.count.hget(), "GRAY")
             for d in sorted(t, key=lambda t2d: t2d.weight, reverse=True):
                 sulci_logger.debug(u"%s %f" % (unicode(d), d.pondered_weight), "CYAN")
Code example #14
File: textmining.py Project: jmvanel/sulci
    def debug(self):
        sulci_logger.debug("Normalized text", "WHITE")
        sulci_logger.debug(self.normalized_text, "WHITE")
        sulci_logger.debug("Number of words", "WHITE")
        sulci_logger.debug(self.words_count(), "GRAY")
        sulci_logger.debug("Number of meaning words", "WHITE")
        sulci_logger.debug(self.meaning_words_count(), "GRAY")
        sulci_logger.debug("Number of differents words", "WHITE")
        sulci_logger.debug(len(self.distinct_words()), "GRAY")
        sulci_logger.debug("Frequents stemms", "WHITE")
        sulci_logger.debug([(unicode(s), s.count) for s in self.keystems()], "GRAY")
        sulci_logger.debug("Lexical diversity", "WHITE")
        sulci_logger.debug(1.0 * len(self.words) / len(set(self.distinct_words())), "GRAY")
        sulci_logger.debug("Tagged words", "WHITE")
        sulci_logger.debug([(unicode(t), t.tag) for t in self.tokens], "GRAY")
        sulci_logger.debug("Sentences", "WHITE")
        for sample in self.samples:
            sulci_logger.debug(sample, "GRAY")
#        sulci_logger.debug("Ngrams", "WHITE")
#        sulci_logger.debug(self.ngrams(), GRAY)
#        sulci_logger.debug("Thesaurus", "WHITE")
#        for kp in self.keyentities:
#            if kp in self.thesaurus:
#                sulci_logger.debug(u"%s in thesaurus => %s" % (unicode(kp), self.thesaurus[kp]), "BLUE")
        sulci_logger.debug("Final keyentities", "WHITE")
        for kp in sorted(self.keyentities, key=lambda kp: kp.keyconcept_confidence):
            sulci_logger.debug(u"%s (%f)" % (unicode(kp), kp.confidence), "YELLOW")
#            if kp.collocation_confidence > 1:
#                sulci_logger.debug(u"Collocation confidence => %f" % kp.collocation_confidence, "BLUE")
##                    print self.thesaurus[kp].id, self.thesaurus[kp].line
#            if kp.keyconcept_confidence > 0.01:
#                sulci_logger.debug(u"Keyconcept confidence (%f)" % kp.keyconcept_confidence, "CYAN")
#            if kp.descriptor is not None:
#                sulci_logger.debug(u"%s in thesaurus => %s" % (unicode(kp), unicode(kp.descriptor)), "MAGENTA")
            sulci_logger.debug(kp._confidences, "GRAY")
        sulci_logger.debug(u"Keyentities by keyconcept_confidence", "BLUE", True)
        for kp in sorted(self.keyentities, key=lambda kp: kp.keyconcept_confidence, reverse=True)[:10]:
            sulci_logger.debug(u"%s (%f)" % (unicode(kp), kp.keyconcept_confidence), "YELLOW")
        sulci_logger.debug(u"Keyentities by statistical_mutual_information_confidence", "BLUE", True)
        for kp in sorted(self.keyentities, key=lambda kp: kp._confidences["statistical_mutual_information"], reverse=True)[:10]:
            sulci_logger.debug(u"%s (%f)" % (unicode(kp), kp._confidences["statistical_mutual_information"]), "YELLOW")
        sulci_logger.debug(u"Keyentities by pos_confidence", "BLUE", True)
        for kp in sorted(self.keyentities, key=lambda kp: kp._confidences["pos"], reverse=True)[:10]:
            sulci_logger.debug(u"%s (%f)" % (unicode(kp), kp._confidences["pos"]), "YELLOW")
#        sulci_logger.debug(u"Keyentities by thesaurus_confidence", "BLUE", True)
#        for kp in sorted((kp for kp in self.keyentities if kp.descriptor is not None), key=lambda kp: kp._confidences["thesaurus"], reverse=True):
#            sulci_logger.debug(u"%s (%s)" % (unicode(kp), unicode(kp.descriptor)), "YELLOW")
        sulci_logger.debug(u"Keyentities by frequency_relative_pmi_confidence", "BLUE", True)
        for kp in sorted(self.keyentities, key=lambda kp: kp.frequency_relative_pmi_confidence, reverse=True)[:10]:
            sulci_logger.debug(u"%s (%f)" % (unicode(kp), kp.frequency_relative_pmi_confidence), "YELLOW")
        sulci_logger.debug(u"Keyentities by keyconcept_confidence * pos confidence", "BLUE", True)
        for kp in sorted(self.keyentities, key=lambda kp: kp.keyconcept_confidence * kp._confidences["pos"], reverse=True)[:10]:
            sulci_logger.debug(u"%s (%f)" % (unicode(kp), kp.keyconcept_confidence * kp._confidences["pos"]), "YELLOW")
        sulci_logger.debug(u"Keyentities by nrelative * pos confidence", "BLUE", True)
        for kp in sorted(self.keyentities, key=lambda kp: kp.trigger_score, reverse=True)[:20]:
            sulci_logger.debug(u"%s (%f)" % (unicode(kp), kp.trigger_score), "YELLOW")
        sulci_logger.debug(u"Triggers and relation with descriptors", "BLUE", True)
        for t, score in self.triggers:
            if len(t._synapses) > 0:
                sulci_logger.debug(u"%s (Local score : %f)" % (unicode(t), score), "GRAY", highlight=True)
                sulci_logger.debug(u"Trigger total count : %d" % t.count, "GRAY")
                for d in sorted(t, key=lambda t2d: t2d.weight, reverse=True):
                    sulci_logger.debug(u"%s %f" % (unicode(d), d.pondered_weight), "CYAN")
Code example #15
File: textmining.py Project: jmvanel/sulci
 def deduplicate_keyentities(self):
     """
     If a KeyEntity is contained in an other (same stemms in same place) longuer
     delete the one with the smaller confidence, or the shortest if same confidence
     We have to begin from the shortest ones.
     """
     sulci_logger.debug(u"Deduplicating keyentities...", "BLUE", highlight=True)
     tmp_keyentities = sorted(self.keyentities, key=lambda kp: len(kp))
     sulci_logger.debug([unicode(kp) for kp in tmp_keyentities], "GRAY")
     for idx, one in enumerate(tmp_keyentities):
         for two in tmp_keyentities[idx+1:]:
             if one in self.keyentities and two in self.keyentities:
                 if one.is_duplicate(two):
                     sulci_logger.debug(u"%s is duplicate %s" % (unicode(one), unicode(two)), "MAGENTA")
                     if one > two:  # and not two.is_strong()
                         sulci_logger.debug(u"%s will be deleted" % unicode(two), "RED")
                         self.keyentities.remove(two)
                     elif two > one:
                         sulci_logger.debug(u"%s will be deleted" % unicode(one), "RED")
                         self.keyentities.remove(one)
                     else:
                         sulci_logger.debug(u"No deletion")
     sulci_logger.debug(u"... keyentities deduplicated", "BLUE", highlight=True)
Code example #16
File: textmining.py Project: yohanboniface/sulci
    def deduplicate_keyentities(self):
        """
        If a KeyEntity is contained in an other (same stemms in same place) longuer
        delete the one with the smaller confidence, or the shortest if same confidence
        We have to begin from the shortest ones.
        """
        sulci_logger.debug(u"Deduplicating keyentities...",
                           "BLUE",
                           highlight=True)
        # It's important to start from the longer ones, to use longer as group
        # keys
        tmp_keyentities = sorted(self.keyentities,
                                 key=lambda kp: len(kp),
                                 reverse=True)
        sulci_logger.debug([unicode(kp) for kp in tmp_keyentities], "GRAY")
        groups = {}
        # Try to group the keyentities to avoid exponential loops
        # (comparing every ke with every other one does not sound like a
        # good idea...)
        # Just remember that a ke can be linked to more than one group
        # ("Phase" linked to "essai de Phase I" and "essai de Phase II",
        # for example)
        for ke_candidate in tmp_keyentities:
            group_found = False
            for ke_parent in groups.iterkeys():
                if ke_candidate.is_duplicate(ke_parent):
                    groups[ke_parent].append(ke_candidate)
                    group_found = True
            if not group_found:
                groups[ke_candidate] = []
                groups[ke_candidate].append(ke_candidate)

        # Deduplicate "group by group":
        # - less looping than comparing the whole dataset
        # - some ke can be removed in one group but kept in another
        for group in groups.itervalues():
            for idx, one in enumerate(group[:]):
                for two in group[idx + 1:]:
                    if one in group and two in group:
                        if one.is_duplicate(two):
                            sulci_logger.debug(
                                u"%s is duplicate %s" %
                                (unicode(one), unicode(two)), "MAGENTA")
                            if one > two:  # and not two.is_strong()
                                sulci_logger.debug(
                                    u"%s will be deleted" % unicode(two),
                                    "RED")
                                group.remove(two)
                            elif two > one:
                                sulci_logger.debug(
                                    u"%s will be deleted" % unicode(one),
                                    "RED")
                                group.remove(one)
                            else:
                                sulci_logger.debug(u"No deletion")

        # Finally, add the kept ke to the property
        tmp_keyentities = []
        for group in groups.itervalues():
            for ke in group:
                if ke not in tmp_keyentities:
                    tmp_keyentities.append(ke)
        self.keyentities = tmp_keyentities
        sulci_logger.debug(u"... keyentities deduplicated",
                           "BLUE",
                           highlight=True)
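The comments above describe a two-stage deduplication: first bucket each candidate under every longer entity it duplicates (iterating longest first), then deduplicate inside each bucket. A condensed sketch of the grouping stage, written for Python 3 (iterkeys/itervalues above are Python 2 only), with is_duplicate and the len() ordering assumed from the examples:

def group_candidates(keyentities):
    groups = {}
    # Longest first, so longer entities become the group keys.
    for candidate in sorted(keyentities, key=len, reverse=True):
        parents = [p for p in groups if candidate.is_duplicate(p)]
        for parent in parents:
            groups[parent].append(candidate)
        if not parents:
            # No longer entity matched: start a new group.
            groups[candidate] = [candidate]
    return groups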
Code example #17
File: textmining.py Project: yohanboniface/sulci
 def debug(self):
     sulci_logger.debug("Normalized text", "WHITE")
     sulci_logger.debug(self.text.normalized_text, "WHITE")
     sulci_logger.debug("Number of words", "WHITE")
     sulci_logger.debug(self.text.words_count(), "GRAY")
     sulci_logger.debug("Number of meaning words", "WHITE")
     sulci_logger.debug(self.text.meaning_words_count(), "GRAY")
     sulci_logger.debug("Number of differents words", "WHITE")
     sulci_logger.debug(len(self.text.distinct_words()), "GRAY")
     sulci_logger.debug("Frequents stemms", "WHITE")
     sulci_logger.debug([(unicode(s), s.count) for s in self.keystems()],
                        "GRAY")
     sulci_logger.debug("Lexical diversity", "WHITE")
     sulci_logger.debug(
         1.0 * len(self.text.words) / len(set(self.text.distinct_words())),
         "GRAY")
     sulci_logger.debug("Tagged words", "WHITE")
     sulci_logger.debug([(unicode(t), t.tag) for t in self.text.tokens],
                        "GRAY")
     sulci_logger.debug("Sentences", "WHITE")
     for sample in self.text.samples:
         sulci_logger.debug(sample, "GRAY")
     sulci_logger.debug("Final keyentities", "WHITE")
     for kp in sorted(self.keyentities,
                      key=lambda kp: kp.keyconcept_confidence):
         sulci_logger.debug(u"%s (%f)" % (unicode(kp), kp.confidence),
                            "YELLOW")
         sulci_logger.debug(kp._confidences, "GRAY")
     sulci_logger.debug(u"Keyentities by keyconcept_confidence", "BLUE",
                        True)
     for kp in sorted(self.keyentities,
                      key=lambda kp: kp.keyconcept_confidence,
                      reverse=True)[:10]:
         sulci_logger.debug(
             u"%s (%f)" % (unicode(kp), kp.keyconcept_confidence), "YELLOW")
     sulci_logger.debug(
         u"Keyentities by statistical_mutual_information_confidence",
         "BLUE", True)
     for kp in sorted(self.keyentities,
                      key=lambda kp: kp._confidences[
                          "statistical_mutual_information"],
                      reverse=True)[:10]:
         sulci_logger.debug(
             u"%s (%f)" %
             (unicode(kp),
              kp._confidences["statistical_mutual_information"]), "YELLOW")
     sulci_logger.debug(u"Keyentities by pos_confidence", "BLUE", True)
     for kp in sorted(self.keyentities,
                      key=lambda kp: kp._confidences["pos"],
                      reverse=True)[:10]:
         sulci_logger.debug(
             u"%s (%f)" % (unicode(kp), kp._confidences["pos"]), "YELLOW")
     sulci_logger.debug(u"Keyentities by frequency_relative_pmi_confidence",
                        "BLUE", True)
     for kp in sorted(self.keyentities,
                      key=lambda kp: kp.frequency_relative_pmi_confidence,
                      reverse=True)[:10]:
         sulci_logger.debug(
             u"%s (%f)" %
             (unicode(kp), kp.frequency_relative_pmi_confidence), "YELLOW")
     sulci_logger.debug(
         u"Keyentities by keyconcept_confidence * pos confidence", "BLUE",
         True)
     for kp in sorted(self.keyentities,
                      key=lambda kp: kp.keyconcept_confidence * kp.
                      _confidences["pos"],
                      reverse=True)[:10]:
         sulci_logger.debug(
             u"%s (%f)" % (unicode(kp), kp.keyconcept_confidence *
                           kp._confidences["pos"]), "YELLOW")
     sulci_logger.debug(u"Keyentities by nrelative * pos confidence",
                        "BLUE", True)
     for kp in sorted(self.keyentities,
                      key=lambda kp: kp.trigger_score,
                      reverse=True)[:20]:
         sulci_logger.debug(u"%s (%f)" % (unicode(kp), kp.trigger_score),
                            "YELLOW")
     sulci_logger.debug(u"Keyentities by gobal pmi confidence", "BLUE",
                        True)
     # for kp in sorted(self.keyentities, key=lambda kp: kp._confidences['global_mutual_information'], reverse=True)[:20]:
     #     sulci_logger.debug(u"%s (%f)" % (unicode(kp), kp._confidences['global_mutual_information']), "YELLOW")
     sulci_logger.debug(u"Triggers and relation with descriptors", "BLUE",
                        True)
     for t, score in self.triggers:
         if len(t._synapses) > 0:
             sulci_logger.debug(u"%s (Local score : %f)" %
                                (unicode(t), score),
                                "GRAY",
                                highlight=True)
             sulci_logger.debug(
                 u"Trigger total count : %s" % t.count.hget(), "GRAY")
             for d in sorted(t, key=lambda t2d: t2d.weight, reverse=True):
                 sulci_logger.debug(
                     u"%s %f" % (unicode(d), d.pondered_weight), "CYAN")
Code example #18
File: textmining.py Project: liberation/sulci
    def deduplicate_keyentities(self):
        """
        If a KeyEntity is contained in an other (same stemms in same place) longuer
        delete the one with the smaller confidence, or the shortest if same confidence
        We have to begin from the shortest ones.
        """
        sulci_logger.debug(u"Deduplicating keyentities...", "BLUE", highlight=True)
        # It's important to start from the longer ones, to use longer as group
        # keys
        tmp_keyentities = sorted(self.keyentities, key=lambda kp: len(kp), reverse=True)
        sulci_logger.debug([unicode(kp) for kp in tmp_keyentities], "GRAY")
        groups = {}
        # Try to group the keyentities to avoid exponential loops
        # (comparing every ke with every other one does not sound like a
        # good idea...)
        # Just remember that a ke can be linked to more than one group
        # ("Phase" linked to "essai de Phase I" and "essai de Phase II",
        # for example)
        for ke_candidate in tmp_keyentities:
            group_found = False
            for ke_parent in groups.iterkeys():
                if ke_candidate.is_duplicate(ke_parent):
                    groups[ke_parent].append(ke_candidate)
                    group_found = True
            if not group_found:
                groups[ke_candidate] = []
                groups[ke_candidate].append(ke_candidate)

        # Deduplicate "group by group":
        # - less looping than comparing the whole dataset
        # - some ke can be removed in one group but kept in another
        for group in groups.itervalues():
            for idx, one in enumerate(group[:]):
                for two in group[idx + 1:]:
                    if one in group and two in group:
                        if one.is_duplicate(two):
                            sulci_logger.debug(u"%s is duplicate %s" % (unicode(one), unicode(two)), "MAGENTA")
                            if one > two:  # and not two.is_strong()
                                sulci_logger.debug(u"%s will be deleted" % unicode(two), "RED")
                                group.remove(two)
                            elif two > one:
                                sulci_logger.debug(u"%s will be deleted" % unicode(one), "RED")
                                group.remove(one)
                            else:
                                sulci_logger.debug(u"No deletion")

        # Finally, add the kept ke to the property
        tmp_keyentities = []
        for group in groups.itervalues():
            for ke in group:
                if ke not in tmp_keyentities:
                    tmp_keyentities.append(ke)
        self.keyentities = tmp_keyentities
        sulci_logger.debug(u"... keyentities deduplicated", "BLUE", highlight=True)