def merge_descriptors(origin, destination, force=False):
    """
    Move all the relations of the descriptor `origin` to the descriptor
    `destination`, then make `origin` an alias of `destination`.

    Takes each relation of the origin descriptor, adds its weight to the
    corresponding destination relation (creating it if needed), and deletes
    the origin relation.
    """
    if origin == destination:
        raise ValueError("Origin and destination can't be equal!")
    origin_count = len(origin.synapses)
    destination_count = len(destination.synapses)
    if origin_count > destination_count and not force:
        raise ValueError(
            "Origin has more relations than destination, use force.")
    for relation in origin.synapses.instances():
        sulci_logger.info(u"Handling relation %s" % unicode(relation))
        trigger = relation.trigger
        score = relation.weight.hget()
        # Create or update the relation from the trigger to the destination
        # descriptor
        trigger.connect(destination, score)
        # Delete the original relation
        relation.delete()
    # Make origin an alias of destination
    origin.is_alias_of_id.hset(destination.pk.get())
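# A minimal usage sketch, not from the original codebase: merging a duplicate
# descriptor into its canonical form. `Descriptor.get_or_connect` is assumed
# to return an (instance, created) pair, as it does in `train` below.
def merge_duplicate_descriptors_example():
    misspelled, _ = Descriptor.get_or_connect(name=u"start up")
    canonical, _ = Descriptor.get_or_connect(name=u"start-up")
    # force=True bypasses the guard on relation counts above
    merge_descriptors(misspelled, canonical, force=True)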
def remove_orphans(cls):
    """
    After cleaning connections, some triggers may remain without any
    connection; delete them.
    """
    for trigger in cls.instances().sort():
        if len(trigger._synapses) == 0:
            sulci_logger.info(u"Removing trigger %s" % trigger)
            trigger.delete()
def check(self):
    """
    Utility method to try to spot errors in the Lexicon.

    For this, we display the entries with several tags, in case they are
    wrong duplicates.
    """
    for key, entity in self.items():
        if len(entity.tags) > 1:
            sulci_logger.info(u"%s tags for %s" % (len(entity.tags), key), "RED")
            sulci_logger.info(entity.tags, "WHITE")
def display_errors(self):
    """
    Display errors in current step.
    """
    remaining_errors = self.get_errors()
    errors_count = len(remaining_errors)
    total_words = len(self.tokens)
    sulci_logger.info(u"Remaining %d errors (%f %% of %d total words)" %
                      (errors_count, 100.0 * errors_count / total_words,
                       total_words), "RED")
    for r_error in remaining_errors:
        self.log_error(r_error)
def remove_useless_connections(self, min=0.01):
    """
    Delete all the useless connections of this descriptor, i.e. those whose
    pondered weight is under `min`. Meant to be called descriptor by
    descriptor, to consume less RAM.
    """
    instances = TriggerToDescriptor.instances(descriptor_id=self.pk.get())
    for inst in instances:
        weight = inst.pondered_weight
        if weight < min:
            sulci_logger.info(
                "Removing TriggerToDescriptor %s, between Trigger %s "
                "and Descriptor %s (weight: %f)" %
                (inst.pk.get(), inst.trigger_id.hget(),
                 inst.descriptor_id.hget(), weight))
            inst.delete()
def test_rule(self, rule):
    template = self.get_template_instance(rule)
    bad = 0
    good = 0
    for ttk in self.tokens:
        test = template.test_rule(ttk, rule)
        if test == 1:
            good += 1
        elif test == -1:
            bad += 1
    sulci_logger.info(u"%s g: %d b: %d" % (rule, good, bad), "GRAY")
    return rule, good, bad
def train(self, inst):
    """
    For the moment, human-defined descriptors are a string with a ","
    separator.
    """
    if isinstance(inst, (int, str)):
        # We assume we have a pk here
        inst = config.content_model_getter(inst)
    text = getattr(inst, config.SULCI_CONTENT_PROPERTY)
    descriptors = config.descriptors_getter(inst)
    if not descriptors or not text:
        sulci_logger.info(u"Skipping item without data")
        return
    validated_descriptors = set()
    # Retrieve descriptors
    for d in descriptors:
        if not d:
            continue
        # d = d.strip().replace(u"’", u"'")
        # For now, we create the descriptors that are not in the thesaurus,
        # because descriptors in articles and in the thesaurus are not
        # always matching. Will be improved.
        dsc, created = Descriptor.get_or_connect(name=d)
        dsc.count.hincrby(1)
        # Retrieve the primeval value
        # dsc = dsc.primeval
        validated_descriptors.add(dsc)
        if created:
            sulci_logger.info(u"Learning descriptor not in thesaurus : %s"
                              % unicode(dsc), "RED")
    # Retrieve the key entities:
    try:
        S = SemanticalTagger(
            text,
            thesaurus=self.thesaurus,
            pos_tagger=self.pos_tagger,
            lexicon=self.pos_tagger.lexicon
        )
        S.deduplicate_keyentities()  # During learning, try to filter
    except ValueError:
        # SemanticalTagger raises ValueError if the text is empty
        return
    current_triggers = set()
    for ke in S.keyentities:
        # Retrieve or create triggers
        t, created = Trigger.get_or_connect(original=unicode(ke))
        current_triggers.add(t)
        t.count.hincrby(1)
        # t.current_score = ke.trigger_score
    # For now, just create all the relations
    for d in validated_descriptors:
        for t in current_triggers:
            t.connect(d, 1)
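# Usage sketch with hypothetical pks: `train` accepts either a model instance
# or a pk (int or str), which it resolves through
# `config.content_model_getter`, so a batch can be fed directly with ids.
def train_batch_example(trainer, pks=(1, 2, 3)):
    for pk in pks:
        trainer.train(pk)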
def slave(self):
    self.setup_socket_slave()
    while True:
        if self.subpoller.poll(0):
            rule = self.subsocket.recv()[1:]
            if rule == "stop":
                return
            rule = rule.decode("utf-8")
            template = self.get_template_instance(rule)
            # Apply the rule to the tokens
            sulci_logger.info(u"Applying rule %s" % rule, "RED")
            template.apply_rule(self.tokens, rule)
        if self.reppoller.poll(0):
            idx, action, rule = self.repsocket.recv_multipart()
            _, good, bad = self.test_rule(rule.decode("utf-8"))
            self.repsocket.send_multipart([idx, rule, str(good), str(bad)])
def test_rules(self, rules_candidates):
    pondered_rules = []
    if self.mode == "master":
        # Send the orders
        for rule in rules_candidates:
            self.reqsocket.send_multipart(["check", rule.encode("utf-8")])
        # Receive the results
        for rule in rules_candidates:
            resp = self.reqsocket.recv_multipart()
            r, good, bad = resp
            pondered_rules.append((r.decode("utf-8"), int(good), int(bad)))
            sulci_logger.info(u"Received rule %s" % r.decode("utf-8"), "MAGENTA")
        sulci_logger.info(u"All rules received from the slaves")
    else:
        for rule in rules_candidates:
            pondered_rules.append(self.test_rule(rule))
    return pondered_rules
def remove_unique_connections(cls):
    """
    Delete all the connections which occurred only once during training.

    Loop on the descriptors one by one, to consume less RAM.
    """
    for descriptor_id in Descriptor.collection():
        instances = cls.instances(descriptor_id=descriptor_id)
        for inst in instances:
            try:
                weight = int(inst.weight.hget())
            except TypeError:
                sulci_logger.info(
                    "Removing TriggerToDescriptor %s without weight, "
                    "between Trigger %s and Descriptor %s" %
                    (inst.pk.get(), inst.trigger_id.hget(),
                     inst.descriptor_id.hget()), "RED")
                inst.delete()
                continue
            if weight <= 1:
                sulci_logger.info(
                    "Removing TriggerToDescriptor %s, between Trigger %s "
                    "and Descriptor %s" %
                    (inst.pk.get(), inst.trigger_id.hget(),
                     inst.descriptor_id.hget()))
                inst.delete()
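# A plausible maintenance sequence combining the cleaning helpers above, under
# the assumptions that `remove_unique_connections` is a classmethod of
# TriggerToDescriptor, `remove_orphans` a classmethod of Trigger, and that a
# Descriptor can be instantiated from its pk (limpyd style):
def clean_thesaurus_example():
    # Drop the relations seen only once during training
    TriggerToDescriptor.remove_unique_connections()
    # Then drop the relations whose pondered weight is negligible
    for pk in Descriptor.collection():
        Descriptor(pk).remove_useless_connections(min=0.01)
    # Finally, delete the triggers left without any connection
    Trigger.remove_orphans()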
def check_usage(self, word=None, tag=None, lemme=None, case_insensitive=False):
    """
    Find the occurrences of a word, a tag, a lemme, or any combination of
    them in the loaded corpus.
    """
    if not any((word, tag, lemme)):
        raise ValueError("You must specify at least a word, a tag or a lemme")
    found = False
    for t in self:
        # If a specific word is asked
        if word:
            original = t.original
            if case_insensitive:
                word = word.lower()
                original = original.lower()
            if not word == original:
                continue
        # If a specific tag is asked
        if tag and not tag == t.verified_tag:
            continue
        # Don't care about texts without lemmes, when a lemme is asked
        if lemme:
            if not t.sample.parent.has_verified_lemmes:
                continue
            if not lemme == t.verified_lemme:
                continue
        sulci_logger.info("%s :" % unicode(t.sample.parent), "YELLOW")
        sulci_logger.info(t.show_context(), "WHITE")
        found = True
    if not found:
        not_found = u"No occurrence found for"
        if word:
            not_found += " %s" % word
        if tag:
            not_found += " %s" % tag
        if lemme:
            not_found += " %s" % lemme
        sulci_logger.info(not_found, "RED")
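# Example call, assuming a loaded corpus instance: list every occurrence of
# "suis" verified with a given tag, ignoring case ("V" is a placeholder for
# whatever verb tag the tagset actually uses).
def check_usage_example(corpus):
    corpus.check_usage(word=u"suis", tag="V", case_insensitive=True)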
def handle(self, *args, **options):
    C = Corpus()
    L = Lexicon()
    P = PosTagger(lexicon=L)
    M = Lemmatizer(L)
    if self.WORD:
        self.WORD = self.WORD.decode("utf-8")
    if self.LEMME:
        self.LEMME = self.LEMME.decode("utf-8")
    if self.CHECK_LEXICON:
        if self.COUNT:
            sulci_logger.info(u"Words in lexicon : %d" % len(L), "WHITE")
        elif self.WORD:
            L.get_entry(self.WORD)
        else:
            L.check()
    elif self.CHECK_CORPUS:
        if self.PATH:
            corpus = TextCorpus(self.PATH)
        else:
            corpus = C
        if self.COUNT:
            sulci_logger.info(u"Words in corpus : %d" % len(corpus), "WHITE")
        elif self.TAGS_STATS:
            corpus.tags_stats(self.WORD, self.CASE_INSENSITIVE)
        elif self.WORD or self.TAG or self.LEMME:
            corpus.check_usage(word=self.WORD,
                               tag=self.TAG,
                               lemme=self.LEMME,
                               case_insensitive=self.CASE_INSENSITIVE)
        else:
            corpus.check(L, self.USE_LEMMES)
    if self.DISPLAY_ERRORS:
        T = POSTrainer(P, C)
        T.display_errors()
    if self.IPDB:
        import ipdb
        ipdb.set_trace()
def merge_descriptors(origin, destination, force=False):
    """
    This facility takes all the relations of a descriptor, adds their
    weights to the corresponding destination descriptor relations (creating
    them if needed), and deletes the origin descriptor relations.
    """
    if origin == destination:
        raise ValueError("Origin and destination can't be equal!")
    origin_count = origin.triggertodescriptor_set.count()
    destination_count = destination.triggertodescriptor_set.count()
    if origin_count > destination_count and not force:
        raise ValueError("Origin has more relations than destination, use force.")
    # We loop over the origin relations
    for relation in origin.triggertodescriptor_set.all():
        sulci_logger.info(u"Handling relation %s" % unicode(relation))
        trigger = relation.trigger
        score = relation.weight
        # Create or update the relation from the trigger to the destination
        # descriptor
        trigger.connect(destination, score)
        # Delete the original relation
        relation.delete()
def do(self):
    files = self.get_files(self.VALID_EXT)
    score = 0
    for f in files:
        sulci_logger.info(" ******* File %s *******" % f, "CYAN", True)
        score += self.validate_file(f)
    sulci_logger.info(" ########## Final score ########## ", "CYAN", True)
    sulci_logger.info(score, "RED", True)
def check_word(self, word):
    """
    Find the occurrences of a word in the loaded corpus.
    """
    found = False
    for t in self.tokens:
        if word == t:
            sulci_logger.info("%s :" % unicode(t.sample.parent), "YELLOW")
            sulci_logger.info(t.show_context(), "WHITE")
            found = True
    if not found:
        sulci_logger.info(u'No occurrence found for "%s"' % word, "RED")
def check_text(self, lexicon, check_lemmes=False):
    """
    Check the text of the corpus, and try to determine if there are some
    errors. Compare with the lexicon.
    """
    for t in self:
        if t in lexicon:
            # Check that the current tag is in the lexicon.
            # If not, it *could* be an error, so we display it.
            if t.verified_tag not in lexicon[t]:
                sulci_logger.info(u"Word in lexicon, but not this tag for %s (%s)"
                                  % (unicode(t), t.verified_tag), "RED")
                sulci_logger.info(u"In Lexicon : %s" % lexicon[t])
            if check_lemmes:
                if t.verified_tag in lexicon[t] \
                   and t.verified_lemme != lexicon[t][t.verified_tag]:
                    sulci_logger.info(u"Word in lexicon, but not this lemme for %s (%s)"
                                      % (unicode(t), t.verified_lemme), "BLUE")
                    sulci_logger.info(u"In Lexicon : %s" % lexicon[t][t.verified_tag], "GRAY")
def tags_stats(self):
    """
    Display tag usage stats.
    """
    d = defaultdict(int)
    for t in self:
        if t.verified_tag is None:
            sulci_logger.info(u"No verified tag for %s" % unicode(t), "RED", True)
        d[t.verified_tag] += 1
    sulci_logger.info(u"Tag usage :", "WHITE")
    for k, v in sorted(d.iteritems(), key=itemgetter(1), reverse=True):
        sulci_logger.info(u"%s => %d" % (k, v), "CYAN")
def train(self):
    """
    Main factorized train method.
    """
    # We have to apply the rules one by one to all the objects
    sulci_logger.info("Begin of training session.", "WHITE", True)
    final_rules = []
    errors = self.get_errors()
    while errors:
        run_applied_rule = False
        sulci_logger.info("%d errors for now..." % len(errors), "RED", True)
        for token_with_error in errors[:]:
            rules_candidates = []
            self.log_error(token_with_error)
            # Make the rules candidates
            for tpl, _ in self.template_generator.register.items():
                template = self.get_template_instance(tpl)
                rules_candidates += template.make_rules(token_with_error)
            # Test the rules
            pondered_rules = self.test_rules(rules_candidates)
            # Select one rule
            rule_candidate, score = self.select_one_rule(pondered_rules)
            # Maybe this test has to be done before testing the rules...
            # final_rules holds (rule, score) tuples, so compare on the
            # rule itself
            if rule_candidate and rule_candidate not in [r for r, _ in final_rules]:
                # How should the minimum score be calculated?
                template = self.get_template_instance(rule_candidate)
                final_rules.append((rule_candidate, score))
                # Apply the rule to the tokens
                sulci_logger.info(u"Applying rule %s (%s)" % (rule_candidate, score), "RED")
                template.apply_rule(self.tokens, rule_candidate)
                if self.mode == "master":
                    # Send the rule to apply; the slave strips the leading
                    # byte (see slave() above)
                    self.pubsocket.send(" %s" % rule_candidate.encode("utf-8"))
                run_applied_rule = True
                # We have applied a rule, we can try another run
                errors = self.get_errors()
                break  # break the for
            else:
                # No rule was applied for this error.
                # We don't want to reprocess this error another time,
                # unless the sample (so the context) has changed.
                token_with_error.sample.set_trained_position(token_with_error.position)
        if run_applied_rule:
            continue  # go back to the while
        errors = None  # Nothing applied, we stop here
    self.display_errors()
    self.template_generator.export(final_rules)
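# A plausible sketch of the rule-selection step used above, not the original
# implementation: given the (rule, good, bad) triples returned by
# `test_rules`, keep the rule with the highest net benefit, and return None
# when no rule fixes more errors than it introduces.
def select_one_rule_sketch(pondered_rules):
    best_rule, best_score = None, 0
    for rule, good, bad in pondered_rules:
        score = good - bad
        if score > best_score:
            best_rule, best_score = rule, score
    return best_rule, best_score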
def handle(self, *args):
    if not self.PK:
        sulci_logger.info(u"A PK is needed. Use -k xxx", "RED")
    else:
        C = Corpus()
        L = Lexicon()
        P = PosTagger(lexicon=L)
        M = Lemmatizer(L)
        a = config.content_model_getter(self.PK)
        t = getattr(a, config.SULCI_CONTENT_PROPERTY)
        T = Thesaurus()
        S = SemanticalTagger(t, T, P, lexicon=L)
        if __debug__:
            S.debug()
        sulci_logger.info(u"Scored descriptors", "YELLOW", True)
        for d, value in S.descriptors:
            sulci_logger.info(u"%s %f" % (unicode(d), value), "BLUE")
    if self.IPDB:
        import ipdb
        ipdb.set_trace()
def tags_stats(self, word=None, case_insensitive=None):
    """
    Display tag usage stats, optionally for one word only.
    """
    d = defaultdict(int)
    for t in self:
        if word:
            original = t.original
            if case_insensitive:
                word = word.lower()
                original = original.lower()
            if not word == original:
                continue
        if t.verified_tag is None:
            sulci_logger.info(u"No verified tag for %s" % unicode(t), "RED", True)
        d[t.verified_tag] += 1
    log = u"Tag usage :"
    if word:
        log = u"Tag usage for word '%s'" % word
    sulci_logger.info(log, "WHITE")
    for k, v in sorted(d.iteritems(), key=itemgetter(1), reverse=True):
        sulci_logger.info(u"%s => %d" % (k, v), "CYAN")
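# Example call, assuming a loaded corpus instance: tag distribution for the
# ambiguous French word "la" (article or pronoun), ignoring case.
def tags_stats_example(corpus):
    corpus.tags_stats(word=u"la", case_insensitive=True)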
def get_entry(self, entry):
    if entry in self:
        sulci_logger.info(unicode(self[entry]), "WHITE")
    else:
        sulci_logger.info(u'No entry for "%s"' % entry, "WHITE")
def handle(self, *args, **options):
    with UseDB(config.TRAINING_DATABASE):
        sulci_logger.info(u"STARTING TRAINING WITH DATABASE «%s»"
                          % config.TRAINING_DATABASE, "RED", True)
        C = Corpus()
        L = Lexicon()
        M = Lemmatizer(L)
        P = PosTagger(lexicon=L)
        if self.LEXICON:
            L.make(self.FORCE)
        if self.SUBPROCESSES:
            import subprocess
            training_kind = (self.LEXICAL and "-e"
                             or self.LEMMATIZER and "-r"
                             or self.SEMANTICAL and "-n"
                             or self.PMI and "-p"
                             or "-c")  # CONTEXTUAL
            # Create the slaves
            for i in xrange(0, self.SUBPROCESSES):
                sulci_logger.info(u"Opening slave subprocess %d" % i, "BLUE", True)
                sub_args = ["sulci_train.py", training_kind, "--mode=slave"]
                if self.START is not None:
                    sub_args.append("--start=%s" % self.START)
                subprocess.Popen(sub_args)
            # Set the master mode on the trainer
            self.MODE = "master"
            # Wait, to give the slaves time to launch
            time.sleep(1)
        if self.LEXICAL:
            T = LexicalTrainer(P, C, self.MODE)
            T.do()
        elif self.CONTEXTUAL:
            T = ContextualTrainer(P, C, self.MODE)
            T.do()
        elif self.LEMMATIZER:
            T = LemmatizerTrainer(M, self.MODE)
            T.do()
        elif self.PMI:
            T = Thesaurus()
            G = GlobalPMITrainer(T, P, self.MODE)
            G.do()
        elif self.SEMANTICAL:
            T = Thesaurus()
            S = SemanticalTrainer(T, P, self.MODE)
            if self.PK:
                # Should not have a PK when MODE == "master"
                a = config.content_model_getter(self.PK)
                S.train(a)
            else:
                if self.FORCE:
                    S.begin()
                S.do(start=self.START)
                # if TRAINER_MODE == "master" and FORCE:
                #     S.clean_connections()
        if self.ADD_CANDIDATE:
            if not self.PK:
                sulci_logger.info(u"A PK is needed. Use -k xxx", "RED")
            else:
                a = config.content_model_getter(self.PK)
                t = getattr(a, config.SULCI_CONTENT_PROPERTY)
                T = TextCorpus()
                T.prepare(t, P, M)
                T.export(self.PK, self.FORCE, self.ADD_LEMMES)
        if self.IPDB:
            import ipdb
            ipdb.set_trace()
def log_error(self, token):
    sulci_logger.info(u"Error : %s, tagged %s instead of %s"
                      % (unicode(token), token.tag, token.verified_tag), "WHITE")
def check(self, lexicon, check_lemmes=False):
    """
    Check the text of the corpus, and try to determine if there are some
    errors. Compare with the lexicon.
    """
    sulci_logger.info(u"Checking text %s" % self.path, "YELLOW")
    found = False
    for t in self:
        if t in lexicon:
            # Check that the current tag is in the lexicon.
            # If not, it *could* be an error, so we display it.
            if t.verified_tag not in lexicon[t]:
                sulci_logger.info(u"Word in lexicon, but not this tag for %s (%s)"
                                  % (unicode(t), t.verified_tag), "RED")
                sulci_logger.info(u"In Lexicon : %s" % lexicon[t])
                sulci_logger.info(u"Context : %s" % t.show_context(), "MAGENTA")
                found = True
            if check_lemmes:
                if t.verified_tag in lexicon[t] \
                   and t.verified_lemme != lexicon[t][t.verified_tag]:
                    sulci_logger.info(u"Word in lexicon, but not this lemme for %s (%s)"
                                      % (unicode(t), t.verified_lemme), "BLUE")
                    sulci_logger.info(u"In Lexicon : %s" % lexicon[t][t.verified_tag], "GRAY")
                    sulci_logger.info(u"Context : %s" % t.show_context(), "YELLOW")
                    found = True
    if not found:
        sulci_logger.info(u"No error found", "YELLOW")
def compare_lists(self, valids, candidates):
    false_negatives = []
    false_positives = []
    true_positives = []
    sulci_logger.info("Expected", "YELLOW", True)
    sulci_logger.info(valids)
    sulci_logger.info("Output", "YELLOW", True)
    sulci_logger.info(candidates)
    # Work on a real copy, not an alias, so that `valids` (and thus the
    # denominator of the score below) is not mutated while matched items
    # are removed
    valids_copy = list(valids)
    for e in candidates:
        if e in valids:
            # This candidate was expected, good
            true_positives.append(e)
            valids_copy.remove(e)
        else:
            # This candidate was not expected
            false_positives.append(e)
    # Expected items not in the candidates
    false_negatives = valids_copy
    sulci_logger.info("True positives", "YELLOW", True)
    sulci_logger.info(true_positives, "BLUE")
    sulci_logger.info("False positives", "YELLOW", True)
    sulci_logger.info(false_positives, "RED")
    sulci_logger.info("False negatives", "YELLOW", True)
    sulci_logger.info(false_negatives, "RED")
    score = 1.0 * (len(false_positives) + len(false_negatives)) / len(valids) * -1
    sulci_logger.info("Score", "YELLOW", True)
    sulci_logger.info(score, "RED", True)
    return score
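# A worked example of the scoring above, with made-up descriptors: 2 true
# positives, 1 false positive and 2 false negatives over 4 expected items
# give score = -(1 + 2) / 4.0 = -0.75. A perfect output scores 0, and every
# missed or extra descriptor pushes the score further below zero.
def compare_lists_score_example():
    valids = [u"chat", u"chien", u"cheval", u"vache"]
    candidates = [u"chat", u"chien", u"poule"]
    # false positives: [u"poule"]; false negatives: [u"cheval", u"vache"]
    score = 1.0 * (1 + 2) / len(valids) * -1
    assert score == -0.75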
def log_error(self, token):
    sulci_logger.info(u"Error : %s, lemmatized %s instead of %s"
                      % (unicode(token), token.lemme, token.verified_lemme), "WHITE")
def tokens(self):
    if self._tokens is None:
        sulci_logger.info("Loading Lemmatizer corpus...", "GREEN", True)
        self._samples, self._tokens = self.instantiate_text(self.content.split())
    return self._tokens