def train(self, inst):
    """
    Learn descriptor/trigger relations from one content item.

    `inst` is either a content instance or an `int`/`str`, which is taken
    to be a primary key and resolved with `config.content_model_getter`.

    Each non-empty descriptor returned by `config.descriptors_getter` is
    fetched or created and has its count incremented. Each key entity
    found in the text is turned into a trigger whose count is incremented.
    Finally every trigger is connected to every descriptor with a value
    of 1.

    Returns None. Nothing is learned when the item has no descriptors or
    no text, or when the tagger raises `ValueError`.
    """
    if isinstance(inst, (int, str)):
        # A bare int/str is assumed to be a primary key.
        inst = config.content_model_getter(inst)
    text = getattr(inst, config.SULCI_CONTENT_PROPERTY)
    descriptors = config.descriptors_getter(inst)
    if not (descriptors and text):
        sulci_logger.info(u"Skipping item without data")
        return

    # Collect the descriptors attached to the item.
    known_descriptors = set()
    for name in descriptors:
        if not name:
            continue
        # Descriptors missing from the thesaurus are created on the fly,
        # because the descriptors found on items and those of the
        # thesaurus do not always match.
        descriptor, is_new = Descriptor.get_or_connect(name=name)
        descriptor.count.hincrby(1)
        known_descriptors.add(descriptor)
        if is_new:
            message = u"Lairning descriptor not in thesaurus : %s" % unicode(descriptor)
            sulci_logger.info(message, "RED")

    # Extract the key entities of the text.
    try:
        tagger = SemanticalTagger(
            text,
            thesaurus=self.thesaurus,
            pos_tagger=self.pos_tagger,
            lexicon=self.pos_tagger.lexicon
        )
        # While learning, try to filter out duplicated key entities.
        tagger.deduplicate_keyentities()
    except ValueError:
        # Per the original note, SemanticalTagger raises ValueError when
        # the text is empty.
        return

    # Fetch or create one trigger per key entity.
    triggers = set()
    for keyentity in tagger.keyentities:
        trigger, _ = Trigger.get_or_connect(original=unicode(keyentity))
        triggers.add(trigger)
        trigger.count.hincrby(1)

    # For now, simply relate every trigger to every descriptor.
    for descriptor in known_descriptors:
        for trigger in triggers:
            trigger.connect(descriptor, 1)
def handle(self, *args):
    """
    Tag the content identified by `self.PK` and log its scored descriptors.

    When `self.PK` is falsy, a message asking for a PK is logged instead.
    In both cases, an ipdb session is opened at the end if `self.IPDB`
    is truthy.
    """
    if self.PK:
        # C and M are not used below but are still constructed, exactly as
        # before (presumably for their side effects or so that the ipdb
        # session can inspect them -- TODO confirm).
        C = Corpus()
        L = Lexicon()
        P = PosTagger(lexicon=L)
        M = Lemmatizer(L)
        content = config.content_model_getter(self.PK)
        text = getattr(content, config.SULCI_CONTENT_PROPERTY)
        T = Thesaurus()
        S = SemanticalTagger(text, T, P, lexicon=L)
        if __debug__:
            S.debug()
        sulci_logger.info(u"Scored descriptors", "YELLOW", True)
        for descriptor, score in S.descriptors:
            line = u"%s %f" % (unicode(descriptor), score)
            sulci_logger.info(line, "BLUE")
    else:
        sulci_logger.info(u"A PK is needed. Use -k xxx", "RED")

    if self.IPDB:
        # Imported lazily: ipdb is only needed when debugging is requested.
        import ipdb
        ipdb.set_trace()
def train(self, inst):
    """
    Feed the n-grams of one content item to `self.global_pmi`.

    `inst` is either a content instance or an `int`/`str`, which is taken
    to be a primary key and resolved with `config.content_model_getter`.

    Returns None. Nothing is added when the tagger raises `ValueError`.
    """
    if isinstance(inst, (int, str)):
        # A bare int/str is assumed to be a primary key.
        inst = config.content_model_getter(inst)
    text = getattr(inst, config.SULCI_CONTENT_PROPERTY)

    try:
        tagger = SemanticalTagger(
            text,
            thesaurus=self.thesaurus,
            pos_tagger=self.pos_tagger,
            lexicon=self.pos_tagger.lexicon
        )
        # While learning, try to filter out duplicated key entities.
        tagger.deduplicate_keyentities()
    except ValueError:
        # Per the original note, SemanticalTagger raises ValueError when
        # the text is empty.
        return

    # min_length=1 so that unigrams are included as well. Per the original
    # note, stopwords are not returned.
    found = tagger.ngrams(min_length=1, max_length=5)
    for _, details in found.iteritems():
        stems = details['stemms']
        occurrences = details['count']
        self.global_pmi.add_ngram(stems, amount=occurrences)
def handle(self, *args, **options):
    """
    Run the training selected by the command flags.

    Everything happens inside `UseDB(config.TRAINING_DATABASE)`:

    1. Optionally rebuild the lexicon (`self.LEXICON`).
    2. Optionally spawn `self.SUBPROCESSES` slave processes running
       `sulci_train.py`, then switch this process to "master" mode.
    3. Run one trainer: lexical, contextual, lemmatizer, PMI or
       semantical, whichever flag matches first in that order.
    4. Optionally export a candidate text corpus (`self.ADD_CANDIDATE`),
       which needs `self.PK`.
    5. Optionally open an ipdb session (`self.IPDB`).
    """
    with UseDB(config.TRAINING_DATABASE):
        banner = u"STARTING TRAINING WITH DATABASE «%s»" % config.TRAINING_DATABASE
        sulci_logger.info(banner, "RED", True)
        C = Corpus()
        L = Lexicon()
        M = Lemmatizer(L)
        P = PosTagger(lexicon=L)
        if self.LEXICON:
            L.make(self.FORCE)

        if self.SUBPROCESSES:
            import subprocess

            # Pick the flag handed to the slaves.
            # NOTE(review): the precedence here (lexical, lemmatizer,
            # semantical, PMI, else contextual) is not the same as the
            # dispatch order used for this process further down (lexical,
            # contextual, lemmatizer, PMI, semantical). With several
            # training flags set, slaves and master may run different
            # trainings -- confirm whether that is intended.
            if self.LEXICAL:
                training_kind = "-e"
            elif self.LEMMATIZER:
                training_kind = "-r"
            elif self.SEMANTICAL:
                training_kind = "-n"
            elif self.PMI:
                training_kind = "-p"
            else:
                training_kind = "-c"  # CONTEXTUAL

            # Create the slaves.
            for i in xrange(self.SUBPROCESSES):
                sulci_logger.info(u"Opening slave subprocess %d" % i, "BLUE", True)
                sub_args = ["sulci_train.py", training_kind, "--mode=slave"]
                if self.START is not None:
                    sub_args.append("--start=%s" % self.START)
                subprocess.Popen(sub_args)
            # This process now drives the trainer as the master.
            self.MODE = "master"
            # Leave the slaves some time to start up.
            time.sleep(1)

        if self.LEXICAL:
            T = LexicalTrainer(P, C, self.MODE)
            T.do()
        elif self.CONTEXTUAL:
            T = ContextualTrainer(P, C, self.MODE)
            T.do()
        elif self.LEMMATIZER:
            T = LemmatizerTrainer(M, self.MODE)
            T.do()
        elif self.PMI:
            T = Thesaurus()
            G = GlobalPMITrainer(T, P, self.MODE)
            G.do()
        elif self.SEMANTICAL:
            T = Thesaurus()
            S = SemanticalTrainer(T, P, self.MODE)
            if self.PK:
                # A PK should not be given when MODE == "master".
                content = config.content_model_getter(self.PK)
                S.train(content)
            else:
                if self.FORCE:
                    S.begin()
                S.do(start=self.START)

        if self.ADD_CANDIDATE:
            if self.PK:
                content = config.content_model_getter(self.PK)
                text = getattr(content, config.SULCI_CONTENT_PROPERTY)
                T = TextCorpus()
                T.prepare(text, P, M)
                T.export(self.PK, self.FORCE, self.ADD_LEMMES)
            else:
                # Parenthesised single argument: prints the same text under
                # the Python 2 print statement.
                print("A PK is needed. Use -k xxx")

        if self.IPDB:
            # Imported lazily: ipdb is only needed when debugging is requested.
            import ipdb
            ipdb.set_trace()