def handle(self, *args, **options):
    """Run the lexicon/corpus checking command selected by the CLI flags."""
    base_corpus = Corpus()
    lexicon = Lexicon()
    tagger = PosTagger(lexicon=lexicon)
    lemmatizer = Lemmatizer(lexicon)
    # CLI options arrive as byte strings under Python 2; decode them once.
    if self.WORD:
        self.WORD = self.WORD.decode("utf-8")
    if self.LEMME:
        self.LEMME = self.LEMME.decode("utf-8")
    if self.CHECK_LEXICON:
        if self.COUNT:
            sulci_logger.info(u"Words in lexicon : %d" % len(lexicon), "WHITE")
        elif self.WORD:
            lexicon.get_entry(self.WORD)
        else:
            lexicon.check()
    elif self.CHECK_CORPUS:
        # A path selects an explicit text corpus; otherwise use the default.
        target = TextCorpus(self.PATH) if self.PATH else base_corpus
        if self.COUNT:
            sulci_logger.info(u"Words in corpus : %d" % len(target), "WHITE")
        elif self.TAGS_STATS:
            target.tags_stats(self.WORD, self.CASE_INSENSITIVE)
        elif self.WORD or self.TAG or self.LEMME:
            target.check_usage(
                word=self.WORD,
                tag=self.TAG,
                lemme=self.LEMME,
                case_insensitive=self.CASE_INSENSITIVE,
            )
        else:
            target.check(lexicon, self.USE_LEMMES)
    if self.DISPLAY_ERRORS:
        POSTrainer(tagger, base_corpus).display_errors()
    if self.IPDB:
        import ipdb
        ipdb.set_trace()
def __init__(self, text, pos_tagger=None, lemmatizer=None, lexicon=None):
    """Build the processed text and its processing components.

    :param text: the raw text to process (must be non-empty once normalized)
    :param pos_tagger: optional PosTagger; a default one is built if omitted
    :param lemmatizer: optional Lemmatizer; a default one is built if omitted
    :param lexicon: optional Lexicon; a default one is built if omitted

    Raises ValueError when the normalized text is empty.
    """
    self._raw_text = text
    self.normalized_text = normalize_text(text)
    if not self.normalized_text:
        # An empty text causes problems downstream (zero division, etc.),
        # so refuse it up front.
        # TODO: make empty texts possible.
        raise ValueError("Can't process an empty text.")
    self.samples = []
    self.keyentities = []
    # Components default to fresh instances sharing one lexicon.
    self.lexicon = lexicon or Lexicon()
    self.postagger = pos_tagger or PosTagger(lexicon=self.lexicon)
    self.lemmatizer = lemmatizer or Lemmatizer(self.lexicon)
    self.make()
    # Lazy cache; populated on first access elsewhere — TODO confirm.
    self._stemms = None
def __init__(self, text, thesaurus=None, pos_tagger=None, lemmatizer=None, lexicon=None):
    """Build the keyentity extractor for *text*.

    :param text: a StemmedText, or raw text that will be wrapped in one
    :param thesaurus: optional Thesaurus; a default one is built if omitted
    :param pos_tagger: optional PosTagger; a default one is built if omitted
    :param lemmatizer: optional Lemmatizer; a default one is built if omitted
    :param lexicon: optional Lexicon; a default one is built if omitted
    """
    self.thesaurus = thesaurus or Thesaurus()
    # Resolve the shared components BEFORE building the StemmedText.
    # Previously, a raw text was wrapped with the (possibly None) raw
    # arguments, so StemmedText built its own default Lexicon/PosTagger/
    # Lemmatizer while this instance built a second, different set.
    # Sharing one resolved set avoids that duplicate construction.
    self.lexicon = lexicon or Lexicon()
    self.postagger = pos_tagger or PosTagger(lexicon=self.lexicon)
    self.lemmatizer = lemmatizer or Lemmatizer(self.lexicon)
    if isinstance(text, StemmedText):
        self.text = text
    else:
        self.text = StemmedText(text, self.postagger, self.lemmatizer, self.lexicon)
    self.keyentities = []
    self.make_keyentities()
    # Lazy caches; populated on first access elsewhere — TODO confirm.
    self._triggers = None
    self._stemms = None
def handle(self, *args, **options):
    """Entry point of the "check" command.

    Dispatches on the option attributes set on the command instance
    (CHECK_LEXICON, CHECK_CORPUS, COUNT, TAGS_STATS, ...) to run the
    requested lexicon or corpus verification.
    """
    C = Corpus()
    L = Lexicon()
    P = PosTagger(lexicon=L)
    # NOTE(review): the dead local `M = Lemmatizer(L)` was removed —
    # nothing in this method referenced it.
    # CLI options arrive as byte strings under Python 2; decode them once.
    if self.WORD:
        self.WORD = self.WORD.decode("utf-8")
    if self.LEMME:
        self.LEMME = self.LEMME.decode("utf-8")
    if self.CHECK_LEXICON:
        if self.COUNT:
            sulci_logger.info(u"Words in lexicon : %d" % len(L), "WHITE")
        elif self.WORD:
            L.get_entry(self.WORD)
        else:
            L.check()
    elif self.CHECK_CORPUS:
        # A path selects an explicit text corpus; otherwise use the default.
        if self.PATH:
            corpus = TextCorpus(self.PATH)
        else:
            corpus = C
        if self.COUNT:
            sulci_logger.info(u"Words in corpus : %d" % len(corpus), "WHITE")
        elif self.TAGS_STATS:
            corpus.tags_stats(self.WORD, self.CASE_INSENSITIVE)
        elif self.WORD or self.TAG or self.LEMME:
            corpus.check_usage(word=self.WORD, tag=self.TAG,
                               lemme=self.LEMME,
                               case_insensitive=self.CASE_INSENSITIVE)
        else:
            corpus.check(L, self.USE_LEMMES)
    if self.DISPLAY_ERRORS:
        T = POSTrainer(P, C)
        T.display_errors()
    if self.IPDB:
        import ipdb
        ipdb.set_trace()
def handle(self, *args, **options):
    """Entry point of the training command.

    Runs against the training database and dispatches to the trainer
    selected by the CLI flags (LEXICAL, CONTEXTUAL, LEMMATIZER, PMI,
    SEMANTICAL), optionally spawning slave subprocesses first.
    """
    with UseDB(config.TRAINING_DATABASE):
        sulci_logger.info(u"STARTING TRAINING WITH DATABASE «%s»" % config.TRAINING_DATABASE, "RED", True)
        C = Corpus()
        L = Lexicon()
        M = Lemmatizer(L)
        P = PosTagger(lexicon=L)
        if self.LEXICON:
            # Rebuild the lexicon (FORCE overwrites the existing one).
            L.make(self.FORCE)
        if self.SUBPROCESSES:
            import subprocess
            # Map the selected training kind to the CLI flag passed to
            # each slave; falls through to "-c" (contextual) by default.
            training_kind = (self.LEXICAL and "-e"
                             or self.LEMMATIZER and "-r"
                             or self.SEMANTICAL and "-n"
                             or self.PMI and "-p"
                             or "-c")  # CONTEXTUAL
            # Create slaves
            for i in xrange(0, self.SUBPROCESSES):
                sulci_logger.info(u"Opening slave subprocess %d" % i, "BLUE", True)
                sub_args = ["sulci_train.py", training_kind, "--mode=slave"]
                if self.START is not None:
                    sub_args.append("--start=%s" % self.START)
                subprocess.Popen(sub_args)
            # Set the mode to the trainer
            self.MODE = "master"
            # Wait to leave time to slave to launch
            time.sleep(1)
        if self.LEXICAL:
            T = LexicalTrainer(P, C, self.MODE)
            T.do()
        elif self.CONTEXTUAL:
            T = ContextualTrainer(P, C, self.MODE)
            T.do()
        elif self.LEMMATIZER:
            T = LemmatizerTrainer(M, self.MODE)
            T.do()
        elif self.PMI:
            T = Thesaurus()
            G = GlobalPMITrainer(T, P, self.MODE)
            G.do()
        elif self.SEMANTICAL:
            T = Thesaurus()
            S = SemanticalTrainer(T, P, self.MODE)
            if self.PK:
                # Should not have PK in MODE == "master"
                a = config.content_model_getter(self.PK)
                S.train(a)
            else:
                if self.FORCE:
                    # FORCE resets the semantical training state first.
                    S.begin()
                S.do(start=self.START)
#            if TRAINER_MODE == "master" and FORCE:
#                S.clean_connections()
        if self.ADD_CANDIDATE:
            if not self.PK:
                print "A PK is needed. Use -k xxx"
            else:
                # Export one content object as a training candidate.
                a = config.content_model_getter(self.PK)
                t = getattr(a, config.SULCI_CONTENT_PROPERTY)
                T = TextCorpus()
                T.prepare(t, P, M)
                T.export(self.PK, self.FORCE, self.ADD_LEMMES)
        if self.IPDB:
            import ipdb
            ipdb.set_trace()