Example #1
0
def merge_descriptors(origin, destination, force=False):
    """
    Transfer every relation from descriptor `origin` onto descriptor
    `destination`, then register `origin` as an alias of `destination`.

    Each origin relation weight is added to (or created on) the matching
    destination relation, and the origin relations are deleted.
    """
    if origin == destination:
        raise ValueError("Origin and destination can't be equal !")
    # Sanity check: merging a bigger descriptor into a smaller one is
    # suspicious, so it requires the `force` flag.
    if not force and len(origin.synapses) > len(destination.synapses):
        raise ValueError(
            "Origin has more relations than destination, use force.")

    for rel in origin.synapses.instances():
        sulci_logger.info(u"Handle relation %s" % unicode(rel))
        # Recreate (or reinforce) the link on the destination side...
        rel.trigger.connect(destination, rel.weight.hget())
        # ...then drop the now-redundant origin relation.
        rel.delete()

    # Keep origin resolvable by aliasing it to destination.
    origin.is_alias_of_id.hset(destination.pk.get())
Example #2
0
 def remove_orphans(cls):
     """
     Delete every trigger left without any connection after the
     connections have been cleaned up.
     """
     for orphan_candidate in cls.instances().sort():
         if not len(orphan_candidate._synapses):
             sulci_logger.info(u"Removing trigger %s" % orphan_candidate)
             orphan_candidate.delete()
Example #3
0
 def remove_orphans(cls):
     """
     After connection cleanup some triggers may have no connection
     left; find and delete those orphans.
     """
     for trigger in cls.instances().sort():
         connection_count = len(trigger._synapses)
         if connection_count == 0:
             sulci_logger.info(u"Removing trigger %s" % trigger)
             trigger.delete()
Example #4
0
 def check(self):
     """
     Help spotting errors in the Lexicon: entries carrying several
     tags are displayed, as they may be wrong duplicates.
     """
     for entry_key, entry in self.items():
         tags_count = len(entry.tags)
         if tags_count > 1:
             sulci_logger.info(u"%s tags for %s" % (tags_count, entry_key), "RED")
             sulci_logger.info(entry.tags, "WHITE")
 def display_errors(self):
     """
     Display errors in current step.

     Logs the error count, its percentage of the total token count,
     then each remaining error in detail.
     """
     remaining_errors = self.get_errors()
     errors_count = len(remaining_errors)
     total_words = len(self.tokens)
     # Guard against an empty corpus: the original raised
     # ZeroDivisionError when there was no token at all.
     ratio = 100.0 * errors_count / total_words if total_words else 0.0
     sulci_logger.info(u"Remaining %d errors (%f %% of %d total words)" %
        (errors_count, ratio, total_words), "RED")
     for r_error in remaining_errors:
         self.log_error(r_error)
 def check(self):
     """
     Utility to spot possible errors in the Lexicon: display entries
     having more than one tag, since they may be wrong duplicates.
     """
     for key, entity in self.items():
         # Single-tag entries are assumed fine; skip them early.
         if len(entity.tags) <= 1:
             continue
         sulci_logger.info(u"%s tags for %s" % (len(entity.tags), key),
                           "RED")
         sulci_logger.info(entity.tags, "WHITE")
Example #7
0
    def remove_useless_connections(self, min=0.01):
        """
        Delete every connection of this descriptor whose pondered
        weight is below `min`.

        Iterating one descriptor at a time keeps RAM usage low.
        """
        # NOTE: `min` shadows the builtin, but renaming the parameter
        # would break keyword callers, so the historical name is kept.
        for connection in TriggerToDescriptor.instances(descriptor_id=self.pk.get()):
            current_weight = connection.pondered_weight
            if current_weight < min:
                sulci_logger.info("Removing TriggerToDescriptor %s, between Trigger %s and Descriptor %s (weight: %f)" % (connection.pk.get(), connection.trigger_id.hget(), connection.descriptor_id.hget(), current_weight))
                connection.delete()
 def test_rule(self, rule):
     """
     Evaluate `rule` against every token and return the rule together
     with its good and bad application counts.
     """
     template = self.get_template_instance(rule)
     good, bad = 0, 0
     for token in self.tokens:
         outcome = template.test_rule(token, rule)
         if outcome == 1:
             good += 1
         elif outcome == -1:
             bad += 1
     sulci_logger.info(u"%s g: %d b : %d" % (rule, good, bad), "GRAY")
     return rule, good, bad
    def train(self, inst):
        """
        Train the semantical tagger from one content item.

        `inst` may be a pk (int or str) or the content object itself.
        Creates/updates the descriptors and triggers found in the item,
        then connects every trigger to every validated descriptor.
        For the moment, human defined descriptors are a string with "," separator.
        """
        if isinstance(inst, (int, str)):
            # We guess we have a pk here
            inst = config.content_model_getter(inst)
        text = getattr(inst, config.SULCI_CONTENT_PROPERTY)
        descriptors = config.descriptors_getter(inst)
        # Nothing to learn from items lacking text or descriptors.
        if not descriptors or not text:
            sulci_logger.info(u"Skipping item without data")
            return
        validated_descriptors = set()
        # Retrieve descriptors
        for d in descriptors:
            if not d:
                continue
            # d = d.strip().replace(u"’", u"'")
            # We create the descriptor not in thesaurus for now
            # because descriptors in article and thesaurus are not
            # always matching. Will be improved.
            dsc, created = Descriptor.get_or_connect(name=d)
            dsc.count.hincrby(1)
            # Retrieve the primeval value
#                dsc = dsc.primeval
            validated_descriptors.add(dsc)
            if created:
                sulci_logger.info(u"Lairning descriptor not in thesaurus : %s" % unicode(dsc), "RED")
        # Retrieve keytentities :
        try:
            S = SemanticalTagger(
                text,
                thesaurus=self.thesaurus,
                pos_tagger=self.pos_tagger,
                lexicon=self.pos_tagger.lexicon
            )
            S.deduplicate_keyentities()  # During lairning, try to filter
        except ValueError:
            # SemanticalTagger raise ValueError if text is empty
            return
        current_triggers = set()
        for ke in S.keyentities:
            # Retrieve or create triggers
            t, created = Trigger.get_or_connect(original=unicode(ke))
            current_triggers.add(t)
            t.count.hincrby(1)
#            t.current_score = ke.trigger_score
        # For now, only create all the relations
        # Every trigger found in the text is connected to every
        # validated descriptor with an initial weight of 1.
        for d in validated_descriptors:
            for t in current_triggers:
                t.connect(d, 1)
Example #10
0
    def remove_useless_connections(self, min=0.01):
        """
        Drop every connection of this descriptor whose pondered weight
        falls below `min`.

        Looping descriptor by descriptor keeps RAM usage low.
        """
        for inst in TriggerToDescriptor.instances(descriptor_id=self.pk.get()):
            weight = inst.pondered_weight
            # Keep connections at or above the threshold.
            if weight >= min:
                continue
            sulci_logger.info(
                "Removing TriggerToDescriptor %s, between Trigger %s and Descriptor %s (weight: %f)"
                % (inst.pk.get(), inst.trigger_id.hget(),
                   inst.descriptor_id.hget(), weight))
            inst.delete()
 def slave(self):
     """
     Run the slave event loop.

     Polls two sockets: the SUB one for rules to apply to the local
     tokens (or a "stop" order), and the REP one for rules to test,
     whose good/bad counts are sent back to the master.
     """
     self.setup_socket_slave()
     while True:
         if self.subpoller.poll(0):
             # Strip the first byte: the master publishes " %s"
             # (leading-space envelope) — see the master's pubsocket.send.
             rule = self.subsocket.recv()[1:]
             if rule == "stop":
                 return
             rule = rule.decode("utf-8")
             template = self.get_template_instance(rule)
             #Apply the rule to the tokens
             sulci_logger.info(u"Applying rule %s" % rule, "RED")
             template.apply_rule(self.tokens, rule)
         if self.reppoller.poll(0):
             idx, action, rule = self.repsocket.recv_multipart()
             # Score the rule locally and report the counts back.
             _, good, bad = self.test_rule(rule.decode("utf-8"))
             self.repsocket.send_multipart([idx, rule, str(good), str(bad)])
 def test_rules(self, rules_candidates):
     """
     Score each candidate rule and return a list of
     (rule, good_count, bad_count) tuples.

     In "master" mode scoring is delegated to the slave processes via
     the REQ socket; otherwise the rules are tested locally.
     """
     pondered_rules = []
     if self.mode == "master":
         # Send order
         for rule in rules_candidates:
             self.reqsocket.send_multipart(["check", rule.encode("utf-8")])
         #Receive results
         for rule in rules_candidates:
             resp = self.reqsocket.recv_multipart()
             r, good, bad = resp
             pondered_rules.append((r.decode("utf-8"), int(good), int(bad)))
             sulci_logger.info(u"Received rule %s" % r.decode("utf-8"), "MAGENTA")
         sulci_logger.info(u"All rules are received from slaves")
     else:
         for rule in rules_candidates:
             pondered_rules.append(self.test_rule(rule))
     return pondered_rules
Example #13
0
    def remove_unique_connections(cls):
        """
        Delete all the connections which occurred only once during
        training, plus any connection left without a weight.

        First loop on all the descriptors to consume less RAM.
        """
        for descriptor_id in Descriptor.collection():
            instances = cls.instances(descriptor_id=descriptor_id)
            for inst in instances:
                try:
                    weight = int(inst.weight.hget())
                except TypeError:
                    # hget() returned None: the connection has no weight
                    # at all, so it is unusable — remove it.
                    sulci_logger.info("Removing TriggerToDescriptor %s without weight, between Trigger %s and Descriptor %s" % (inst.pk.get(), inst.trigger_id.hget(), inst.descriptor_id.hget()), "RED")
                    inst.delete()
                    continue
                # A weight of 1 means the pair co-occurred only once.
                if weight <= 1:
                    sulci_logger.info("Removing TriggerToDescriptor %s, between Trigger %s and Descriptor %s" % (inst.pk.get(), inst.trigger_id.hget(), inst.descriptor_id.hget()))
                    inst.delete()
Example #14
0
 def check_usage(self, word=None, tag=None, lemme=None, case_insensitive=False):
     """
     Find occurrences of a word or tag or both in the corpus loaded.

     Raises ValueError when no criterion at all is given.
     """
     if not any((word, tag, lemme)):
         raise ValueError("You must specify at least a word, a tag or a lemme")
     found = False
     for t in self:
         # If a specific word is asked
         if word:
             original = t.original
             if case_insensitive:
                 word = word.lower()
                 original = original.lower()
             if not word == original:
                 continue
         # If a specific tag is asked
         if tag and not tag == t.verified_tag:
             continue
         # don't care about texts without lemmes, when a lemme is asked
         if lemme:
             if not t.sample.parent.has_verified_lemmes:
                 continue
             if not lemme == t.verified_lemme:
                 continue
         sulci_logger.info("%s :" % unicode(t.sample.parent), "YELLOW")
         sulci_logger.info(t.show_context(), "WHITE")
         found = True
     if not found:
         not_found = u'No occurrence found for'
         if word:
             not_found += " %s" % word
         if tag:
             not_found += " %s" % tag
         # Bug fix: the lemme criterion was missing from the message,
         # yielding a bare "No occurrence found for" on lemme-only runs.
         if lemme:
             not_found += " %s" % lemme
         sulci_logger.info(not_found, "RED")
Example #15
0
 def handle(self, *args, **options):
     """
     Entry point of the check command: dispatch on the boolean options
     to check the lexicon, the corpus, or display trainer errors.
     """
     C = Corpus()
     L = Lexicon()
     P = PosTagger(lexicon=L)
     M = Lemmatizer(L)
     # Command-line values are decoded once up front.
     if self.WORD:
         self.WORD = self.WORD.decode("utf-8")
     if self.LEMME:
         self.LEMME = self.LEMME.decode("utf-8")
     if self.CHECK_LEXICON:
         if self.COUNT:
             sulci_logger.info(u"Words in lexicon : %d" % len(L), "WHITE")
         elif self.WORD:
             L.get_entry(self.WORD)
         else:
             L.check()
     elif self.CHECK_CORPUS:
         # A path means an ad hoc text corpus; default to the main one.
         if self.PATH:
             corpus = TextCorpus(self.PATH)
         else:
             corpus = C
         if self.COUNT:
             sulci_logger.info(u"Words in corpus : %d" % len(corpus),
                               "WHITE")
         elif self.TAGS_STATS:
             corpus.tags_stats(self.WORD, self.CASE_INSENSITIVE)
         elif self.WORD or self.TAG or self.LEMME:
             corpus.check_usage(word=self.WORD,
                                tag=self.TAG,
                                lemme=self.LEMME,
                                case_insensitive=self.CASE_INSENSITIVE)
         else:
             corpus.check(L, self.USE_LEMMES)
     if self.DISPLAY_ERRORS:
         T = POSTrainer(P, C)
         T.display_errors()
     if self.IPDB:
         # Optional drop into an interactive debugger.
         import ipdb
         ipdb.set_trace()
Example #16
0
 def handle(self, *args, **options):
     """
     Entry point of the check command: dispatch on the boolean options
     to check the lexicon, the corpus, or display trainer errors.
     """
     C = Corpus()
     L = Lexicon()
     P = PosTagger(lexicon=L)
     M = Lemmatizer(L)
     # Command-line values are decoded once up front.
     if self.WORD:
         self.WORD = self.WORD.decode("utf-8")
     if self.LEMME:
         self.LEMME = self.LEMME.decode("utf-8")
     if self.CHECK_LEXICON:
         if self.COUNT:
             sulci_logger.info(u"Words in lexicon : %d" % len(L), "WHITE")
         elif self.WORD:
             L.get_entry(self.WORD)
         else:
             L.check()
     elif self.CHECK_CORPUS:
         # A path means an ad hoc text corpus; default to the main one.
         if self.PATH:
             corpus = TextCorpus(self.PATH)
         else:
             corpus = C
         if self.COUNT:
             sulci_logger.info(u"Words in corpus : %d" % len(corpus), "WHITE")
         elif self.TAGS_STATS:
             corpus.tags_stats(self.WORD, self.CASE_INSENSITIVE)
         elif self.WORD or self.TAG or self.LEMME:
             corpus.check_usage(
                 word=self.WORD, 
                 tag=self.TAG, 
                 lemme=self.LEMME,
                 case_insensitive=self.CASE_INSENSITIVE
             )
         else:
             corpus.check(L, self.USE_LEMMES)
     if self.DISPLAY_ERRORS:
         T = POSTrainer(P,C)
         T.display_errors()
     if self.IPDB:
         # Optional drop into an interactive debugger.
         import ipdb; ipdb.set_trace()
Example #17
0
def merge_descriptors(origin, destination, force=False):
    """
    Move every relation of descriptor `origin` onto `destination`.

    Each origin relation weight is added to (or created on) the
    matching destination relation, then the origin relation is
    deleted.
    """
    if origin == destination:
        raise ValueError("Origin and destination can't be equal !")
    # Merging a bigger descriptor into a smaller one requires `force`.
    if (origin.triggertodescriptor_set.count()
            > destination.triggertodescriptor_set.count()) and not force:
        raise ValueError("Origin has more relations than destination, use force.")
    # We loop over the origin relations
    for rel in origin.triggertodescriptor_set.all():
        sulci_logger.info(u"Handle relation %s" % unicode(rel))
        # Recreate or reinforce the relation on the destination side,
        # then drop the original one.
        rel.trigger.connect(destination, rel.weight)
        rel.delete()
Example #18
0
 def do(self):
     """
     Validate every file with a valid extension and log the
     cumulated score.
     """
     total_score = 0
     for path in self.get_files(self.VALID_EXT):
         sulci_logger.info(" ******* File %s *******" % path, "CYAN", True)
         total_score += self.validate_file(path)
     sulci_logger.info(" ########## Final score ########## ", "CYAN", True)
     sulci_logger.info(total_score, "RED", True)
Example #19
0
 def do(self):
     """
     Run the validation over each candidate file, accumulating and
     finally displaying the global score.
     """
     score = 0
     candidate_files = self.get_files(self.VALID_EXT)
     for current_file in candidate_files:
         sulci_logger.info(" ******* File %s *******" % current_file, "CYAN", True)
         score = score + self.validate_file(current_file)
     sulci_logger.info(" ########## Final score ########## ", "CYAN", True)
     sulci_logger.info(score, "RED", True)
Example #20
0
    def remove_unique_connections(cls):
        """
        Delete all the connections which occurred only once during
        training, plus any connection left without a weight.

        First loop on all the descriptors to consume less RAM.
        """
        for descriptor_id in Descriptor.collection():
            instances = cls.instances(descriptor_id=descriptor_id)
            for inst in instances:
                try:
                    weight = int(inst.weight.hget())
                except TypeError:
                    # hget() returned None: the connection has no weight
                    # at all, so it is unusable — remove it.
                    sulci_logger.info(
                        "Removing TriggerToDescriptor %s without weight, between Trigger %s and Descriptor %s"
                        % (inst.pk.get(), inst.trigger_id.hget(),
                           inst.descriptor_id.hget()), "RED")
                    inst.delete()
                    continue
                # A weight of 1 means the pair co-occurred only once.
                if weight <= 1:
                    sulci_logger.info(
                        "Removing TriggerToDescriptor %s, between Trigger %s and Descriptor %s"
                        % (inst.pk.get(), inst.trigger_id.hget(),
                           inst.descriptor_id.hget()))
                    inst.delete()
Example #21
0
 def check_word(self, word):
     """
     Display every occurrence of `word` in the loaded corpus, or a
     "not found" message when there is none.
     """
     matched = False
     for token in self.tokens:
         if word == token:
             sulci_logger.info("%s :" % unicode(token.sample.parent), "YELLOW")
             sulci_logger.info(token.show_context(), "WHITE")
             matched = True
     if not matched:
         sulci_logger.info(u'No occurrence found for "%s"' % word, "RED")
Example #22
0
 def check_text(self, lexicon, check_lemmes=False):
     """
     Check the text of the corpus, and try to determine if there are some errors.
     Compare with lexicon.

     With `check_lemmes`, also compare verified lemmes against the
     lexicon's lemme for the verified tag.
     """
     for t in self:
         if t in lexicon:
             # Check that current tag is in lexicon
             # If not, it *could* be an error, we display it
             if not t.verified_tag in lexicon[t]:
                 sulci_logger.info(u"Word in lexicon, but not this tag for %s (%s)" \
                                   % (unicode(t), t.verified_tag), "RED")
                 sulci_logger.info(u"In Lexicon : %s" % lexicon[t])
             if check_lemmes:
                 # Only meaningful when the tag itself matches.
                 if t.verified_tag in lexicon[t] \
                          and t.verified_lemme != lexicon[t][t.verified_tag]:
                     sulci_logger.info(u"Word in lexicon, but not this lemme for %s (%s)" \
                                       % (unicode(t), t.verified_lemme), "BLUE")
                     sulci_logger.info(u"In Lexicon : %s" % lexicon[t][t.verified_tag], "GRAY")
Example #23
0
 def tags_stats(self):
     """
     Display tags usage stats.

     Counts each verified tag and logs the usage sorted by
     decreasing frequency.
     """
     d = defaultdict(int)
     for t in self:
         # Idiom fix: identity test `is None` instead of `== None`,
         # which could invoke a custom __eq__.
         if t.verified_tag is None:
             sulci_logger.info(u"No verified tag for %s" % unicode(t), "RED", True)
         d[t.verified_tag] += 1
     sulci_logger.info(u"Tag usage :", "WHITE")
     for k, v in sorted(d.iteritems(), key=itemgetter(1), reverse=True):
         sulci_logger.info(u"%s => %d" % (k, v), "CYAN")
 def train(self):
     """
     Main factorized train method.

     Loop while errors remain: for each error token, generate rule
     candidates from every template, score them, apply the best new
     rule to all tokens, and restart; stop when no rule applies.
     The selected rules are finally exported.
     """
     #We have to apply rules one after one to all objects
     sulci_logger.info("Begin of training session.", "WHITE", True)
     final_rules = []
     errors = self.get_errors()
     while errors:
         run_applied_rule = False
         sulci_logger.info("%d errors for now..." % len(errors), "RED", True)
         # Iterate over a copy: applying a rule refreshes `errors`.
         for token_with_error in errors[:]:
             rules_candidates = []
             self.log_error(token_with_error)
             # Make rules candidates
             for tpl, _ in self.template_generator.register.items():
 #                    print "tpl", tpl
                 template = self.get_template_instance(tpl)
                 rules_candidates += template.make_rules(token_with_error)
             # Test the rules
             pondered_rules = self.test_rules(rules_candidates)
             # Select one rule
             rule_candidate, score = self.select_one_rule(pondered_rules)
             # Maybe the test "rule_candidate in final_rules" have to be done before...
             if rule_candidate and not rule_candidate in final_rules:
                 # How to calculate the score min ?
                 template = self.get_template_instance(rule_candidate)
                 final_rules.append((rule_candidate, score))
                 # Apply the rule to the tokens
                 sulci_logger.info(u"Applying rule %s (%s)" % (rule_candidate, score), "RED")
                 template.apply_rule(self.tokens, rule_candidate)
                 if self.mode == "master":
                     # Send the rule to apply
                     self.pubsocket.send(" %s" % rule_candidate.encode("utf-8"))
                 run_applied_rule = True
                 # We have applied a rule, we can try another run
                 errors = self.get_errors()
                 break  # break the for
             else:  # No rule applied for this error
                 # We don't want to reprocess this error another time
                 # unless the sample (so the context) as changed.
                 token_with_error.sample.set_trained_position(token_with_error.position)
         if run_applied_rule:
             continue  # go back to while
         errors = None  # Nothing applied, we stop here.
     self.display_errors()
     self.template_generator.export(final_rules)
Example #25
0
 def handle(self, *args):
     """
     Entry point: semantically tag the content item referenced by
     self.PK and log its scored descriptors.
     """
     if not self.PK:
         sulci_logger.info(u"A PK is needed. Use -k xxx", "RED")
     else:
         C = Corpus()
         L = Lexicon()
         P = PosTagger(lexicon=L)
         M = Lemmatizer(L)
         # Fetch the content object and its text property.
         a = config.content_model_getter(self.PK)
         t = getattr(a, config.SULCI_CONTENT_PROPERTY)
         T = Thesaurus()
         S = SemanticalTagger(t, T, P, lexicon=L)
         if __debug__:
             S.debug()
         sulci_logger.info(u"Scored descriptors", "YELLOW", True)
         for d, value in S.descriptors:
             sulci_logger.info(u"%s %f" % (unicode(d), value), "BLUE")

     if self.IPDB:
         # Optional drop into an interactive debugger.
         import ipdb; ipdb.set_trace()
Example #26
0
 def check_usage(self,
                 word=None,
                 tag=None,
                 lemme=None,
                 case_insensitive=False):
     """
     Find occurrences of a word or tag or both in the corpus loaded.

     Raises ValueError when no criterion at all is given.
     """
     if not any((word, tag, lemme)):
         raise ValueError(
             "You must specify at least a word, a tag or a lemme")
     found = False
     for t in self:
         # If a specific word is asked
         if word:
             original = t.original
             if case_insensitive:
                 word = word.lower()
                 original = original.lower()
             if not word == original:
                 continue
         # If a specific tag is asked
         if tag and not tag == t.verified_tag:
             continue
         # don't care about texts without lemmes, when a lemme is asked
         if lemme:
             if not t.sample.parent.has_verified_lemmes:
                 continue
             if not lemme == t.verified_lemme:
                 continue
         sulci_logger.info("%s :" % unicode(t.sample.parent), "YELLOW")
         sulci_logger.info(t.show_context(), "WHITE")
         found = True
     if not found:
         not_found = u'No occurrence found for'
         if word:
             not_found += " %s" % word
         if tag:
             not_found += " %s" % tag
         # Bug fix: the lemme criterion was missing from the message,
         # yielding a bare "No occurrence found for" on lemme-only runs.
         if lemme:
             not_found += " %s" % lemme
         sulci_logger.info(not_found, "RED")
Example #27
0
 def tags_stats(self, word=None, case_insensitive=None):
     """
     Display tags usage stats, optionally restricted to `word`.
     """
     d = defaultdict(int)
     for t in self:
         if word:
             original = t.original
             if case_insensitive:
                 word = word.lower()
                 original = original.lower()
             if not word == original:
                 continue
         # Idiom fix: identity test `is None` instead of `== None`,
         # which could invoke a custom __eq__.
         if t.verified_tag is None:
             sulci_logger.info(u"No verified tag for %s" % unicode(t), "RED", True)
         d[t.verified_tag] += 1
     log = u"Tag usage :"
     if word:
         log = u"Tag usage for word '%s'" % word
     sulci_logger.info(log, "WHITE")
     for k, v in sorted(d.iteritems(), key=itemgetter(1), reverse=True):
         sulci_logger.info(u"%s => %d" % (k, v), "CYAN")
Example #28
0
 def tags_stats(self, word=None, case_insensitive=None):
     """
     Display tags usage stats, optionally restricted to `word`.
     """
     d = defaultdict(int)
     for t in self:
         if word:
             original = t.original
             if case_insensitive:
                 word = word.lower()
                 original = original.lower()
             if not word == original:
                 continue
         # Idiom fix: identity test `is None` instead of `== None`,
         # which could invoke a custom __eq__.
         if t.verified_tag is None:
             sulci_logger.info(u"No verified tag for %s" % unicode(t),
                               "RED", True)
         d[t.verified_tag] += 1
     log = u"Tag usage :"
     if word:
         log = u"Tag usage for word '%s'" % word
     sulci_logger.info(log, "WHITE")
     for k, v in sorted(d.iteritems(), key=itemgetter(1), reverse=True):
         sulci_logger.info(u"%s => %d" % (k, v), "CYAN")
 def get_entry(self, entry):
     """
     Log the lexicon content for `entry`, or a "no entry" message.
     """
     if entry not in self:
         sulci_logger.info(u'No entry for "%s"' % entry, "WHITE")
     else:
         sulci_logger.info(unicode(self[entry]), "WHITE")
Example #30
0
    def handle(self, *args, **options):
        """
        Entry point of the training command.

        Within the training database context: optionally (re)build the
        lexicon, optionally spawn slave subprocesses, then dispatch to
        the trainer selected by the boolean options (lexical,
        contextual, lemmatizer, PMI or semantical), and finally handle
        the corpus-candidate export and debugger options.
        """
        with UseDB(config.TRAINING_DATABASE):
            sulci_logger.info(u"STARTING TRAINING WITH DATABASE «%s»" % config.TRAINING_DATABASE, "RED", True)
            C = Corpus()
            L = Lexicon()
            M = Lemmatizer(L)
            P = PosTagger(lexicon=L)
            if self.LEXICON:
                L.make(self.FORCE)
            if self.SUBPROCESSES:
                import subprocess

                # Pick the CLI flag matching the selected training kind
                # (the and/or chain is a pre-ternary Python idiom).
                training_kind = (
                    self.LEXICAL
                    and "-e"
                    or self.LEMMATIZER
                    and "-r"
                    or self.SEMANTICAL
                    and "-n"
                    or self.PMI
                    and "-p"
                    or "-c"
                )  # CONTEXTUAL
                # Create slaves
                for i in xrange(0, self.SUBPROCESSES):
                    sulci_logger.info(u"Opening slave subprocess %d" % i, "BLUE", True)
                    sub_args = ["sulci_train.py", training_kind, "--mode=slave"]
                    if self.START is not None:
                        sub_args.append("--start=%s" % self.START)
                    subprocess.Popen(sub_args)
                # Set the mode to the trainer
                self.MODE = "master"
                # Wait to leave time to slave to launch
                time.sleep(1)
            if self.LEXICAL:
                T = LexicalTrainer(P, C, self.MODE)
                T.do()
            elif self.CONTEXTUAL:
                T = ContextualTrainer(P, C, self.MODE)
                T.do()
            elif self.LEMMATIZER:
                T = LemmatizerTrainer(M, self.MODE)
                T.do()
            elif self.PMI:
                T = Thesaurus()
                G = GlobalPMITrainer(T, P, self.MODE)
                G.do()
            elif self.SEMANTICAL:
                T = Thesaurus()
                S = SemanticalTrainer(T, P, self.MODE)
                if self.PK:
                    # Should not have PK in MODE == "master"
                    a = config.content_model_getter(self.PK)
                    S.train(a)
                else:
                    if self.FORCE:
                        S.begin()
                    S.do(start=self.START)
            #                if TRAINER_MODE == "master" and FORCE:
            #                    S.clean_connections()
            if self.ADD_CANDIDATE:
                if not self.PK:
                    print "A PK is needed. Use -k xxx"
                else:
                    # Prepare and export the given content item as a
                    # corpus candidate.
                    a = config.content_model_getter(self.PK)
                    t = getattr(a, config.SULCI_CONTENT_PROPERTY)
                    T = TextCorpus()
                    T.prepare(t, P, M)
                    T.export(self.PK, self.FORCE, self.ADD_LEMMES)
            if self.IPDB:
                # Optional drop into an interactive debugger.
                import ipdb

                ipdb.set_trace()
 def log_error(self, token):
     """Log a POS error: the tag applied versus the verified tag."""
     details = (unicode(token), token.tag, token.verified_tag)
     sulci_logger.info(u"Error : %s, tagged %s instead of %s" % details, "WHITE")
Example #32
0
 def check(self, lexicon, check_lemmes=False):
     """
     Check the text of the corpus, and try to determine if there are
     some errors, by comparing with the lexicon.
     """
     sulci_logger.info(u"Checking text %s" % self.path, "YELLOW")
     found = False
     for t in self:
         if t in lexicon:
             # Check that current tag is in lexicon
             # If not, it *could* be an error, we display it
             if not t.verified_tag in lexicon[t]:
                 sulci_logger.info(u"Word in lexicon, but not this tag for %s (%s)" \
                                   % (unicode(t), t.verified_tag), "RED")
                 sulci_logger.info(u"In Lexicon : %s" % lexicon[t])
                 sulci_logger.info(u"Context : %s" % t.show_context(),
                                   "MAGENTA")
                 found = True
             if check_lemmes:
                 if t.verified_tag in lexicon[t] \
                          and t.verified_lemme != lexicon[t][t.verified_tag]:
                     sulci_logger.info(u"Word in lexicon, but not this lemme for %s (%s)" \
                                       % (unicode(t), t.verified_lemme), "BLUE")
                     sulci_logger.info(
                         u"In Lexicon : %s" % lexicon[t][t.verified_tag],
                         "GRAY")
                     sulci_logger.info(u"Context : %s" % t.show_context(),
                                       "YELLOW")
                     # Bug fix: `found = True` was indented outside this
                     # branch, so with check_lemmes every lexicon word
                     # set it and "No error found" could never show.
                     found = True
     if not found:
         sulci_logger.info(u"No error found", "YELLOW")
Example #33
0
 def compare_lists(self, valids, candidates):
     """
     Compare the expected (`valids`) and produced (`candidates`) lists.

     Logs true positives, false positives and false negatives, and
     returns a negative score proportional to the error rate.
     """
     false_positives = []
     true_positives = []
     sulci_logger.info("Expected", "YELLOW", True)
     sulci_logger.info(valids)
     sulci_logger.info("Output", "YELLOW", True)
     sulci_logger.info(candidates)
     # Bug fix: `valids_copy = valids` only aliased the list, so the
     # caller's list was mutated and len(valids) in the score below
     # shrank as items were removed.
     valids_copy = list(valids)
     for e in candidates:
         if e in valids_copy:
             # One candidate was expected, good
             true_positives.append(e)
             valids_copy.remove(e)
         else:
             # This candidate was not expected
             false_positives.append(e)
     # Expected items not in candidates
     false_negatives = valids_copy
     sulci_logger.info("True positives", "YELLOW", True)
     sulci_logger.info(true_positives, "BLUE")
     sulci_logger.info("False positives", "YELLOW", True)
     sulci_logger.info(false_positives, "RED")
     sulci_logger.info("False negatives", "YELLOW", True)
     sulci_logger.info(false_negatives, "RED")
     score = 1.0 * (len(false_positives) + len(false_negatives)) / len(valids) * -1
     sulci_logger.info("Score", "YELLOW", True)
     sulci_logger.info(score, "RED", True)
     return score
 def log_error(self, token):
     """Log a lemmatization error: the lemme applied versus the verified one."""
     details = (unicode(token), token.lemme, token.verified_lemme)
     sulci_logger.info(u"Error : %s, lemmatized %s instead of %s" % details, "WHITE")
Example #35
0
 def get_entry(self, entry):
     """
     Display the lexicon content for `entry` when present, otherwise
     log that there is no such entry.
     """
     message = (unicode(self[entry]) if entry in self
                else u'No entry for "%s"' % entry)
     sulci_logger.info(message, "WHITE")
Example #36
0
 def tokens(self):
     """
     Lazily instantiate and cache the corpus tokens (and samples).
     """
     if self._tokens is None:
         sulci_logger.info("Loading Lemmatizer corpus...", "GREEN", True)
         words = self.content.split()
         self._samples, self._tokens = self.instantiate_text(words)
     return self._tokens
Example #37
0
 def tokens(self):
     """
     Return the corpus tokens, building and caching them on first
     access.
     """
     if self._tokens is not None:
         return self._tokens
     sulci_logger.info("Loading Lemmatizer corpus...", "GREEN", True)
     self._samples, self._tokens = self.instantiate_text(
         self.content.split())
     return self._tokens
Example #38
0
 def check(self, lexicon, check_lemmes=False):
     """
     Check the text of the corpus, and try to determine if there are
     some errors, by comparing with the lexicon.
     """
     sulci_logger.info(u"Checking text %s" % self.path, "YELLOW")
     found = False
     for t in self:
         if t in lexicon:
             # Check that current tag is in lexicon
             # If not, it *could* be an error, we display it
             if not t.verified_tag in lexicon[t]:
                 sulci_logger.info(u"Word in lexicon, but not this tag for %s (%s)" \
                                   % (unicode(t), t.verified_tag), "RED")
                 sulci_logger.info(u"In Lexicon : %s" % lexicon[t])
                 sulci_logger.info(u"Context : %s" % t.show_context(), "MAGENTA")
                 found = True
             if check_lemmes:
                 if t.verified_tag in lexicon[t] \
                          and t.verified_lemme != lexicon[t][t.verified_tag]:
                     sulci_logger.info(u"Word in lexicon, but not this lemme for %s (%s)" \
                                       % (unicode(t), t.verified_lemme), "BLUE")
                     sulci_logger.info(u"In Lexicon : %s" % lexicon[t][t.verified_tag], "GRAY")
                     sulci_logger.info(u"Context : %s" % t.show_context(), "YELLOW")
                     # Bug fix: `found = True` was indented outside this
                     # branch, so with check_lemmes every lexicon word
                     # set it and "No error found" could never show.
                     found = True
     if not found:
         sulci_logger.info(u"No error found", "YELLOW")
Example #39
0
 def compare_lists(self, valids, candidates):
     """
     Compare the expected (`valids`) and produced (`candidates`) lists.

     Logs true positives, false positives and false negatives, and
     returns a negative score proportional to the error rate.
     """
     false_positives = []
     true_positives = []
     sulci_logger.info("Expected", "YELLOW", True)
     sulci_logger.info(valids)
     sulci_logger.info("Output", "YELLOW", True)
     sulci_logger.info(candidates)
     # Bug fix: `valids_copy = valids` only aliased the list, so the
     # caller's list was mutated and len(valids) in the score below
     # shrank as items were removed.
     valids_copy = list(valids)
     for e in candidates:
         if e in valids_copy:
             # One candidate was expected, good
             true_positives.append(e)
             valids_copy.remove(e)
         else:
             # This candidate was not expected
             false_positives.append(e)
     # Expected items not in candidates
     false_negatives = valids_copy
     sulci_logger.info("True positives", "YELLOW", True)
     sulci_logger.info(true_positives, "BLUE")
     sulci_logger.info("False positives", "YELLOW", True)
     sulci_logger.info(false_positives, "RED")
     sulci_logger.info("False negatives", "YELLOW", True)
     sulci_logger.info(false_negatives, "RED")
     score = 1.0 * (len(false_positives) +
                    len(false_negatives)) / len(valids) * -1
     sulci_logger.info("Score", "YELLOW", True)
     sulci_logger.info(score, "RED", True)
     return score