def __init__(self, text, pos_tagger=None, lemmatizer=None, lexicon=None):
    """
    Build a processed text from raw input.

    text : the raw text to process.
    pos_tagger / lemmatizer / lexicon : optional collaborators; when not
    given, defaults are created (the tagger and lemmatizer share the
    same lexicon instance).

    Raises ValueError when the normalized text is empty.
    """
    self._raw_text = text
    self.normalized_text = normalize_text(text)
    # Idiomatic emptiness test (was: len(self.normalized_text) == 0).
    if not self.normalized_text:
        # For now, raise ValueError, because an empty text creates
        # too many problems here and there (zero division, etc.)
        # TODO: make empty texts possible.
        raise ValueError("Can't process an empty text.")
    self.samples = []
    self.keyentities = []
    self.lexicon = lexicon or Lexicon()
    self.postagger = pos_tagger or PosTagger(lexicon=self.lexicon)
    self.lemmatizer = lemmatizer or Lemmatizer(self.lexicon)
    self.make()
    # Lazy cache — presumably filled on first access to a stemms
    # property; confirm against the rest of the class.
    self._stemms = None
def __init__(self, text, thesaurus=None, pos_tagger=None, lemmatizer=None, lexicon=None):
    """
    Prepare the tagger: wrap ``text`` in a StemmedText when needed, set
    up the collaborators, then extract the key entities.
    """
    self.thesaurus = thesaurus if thesaurus else Thesaurus()
    # Accept either an already-processed text or raw input.
    already_processed = isinstance(text, StemmedText)
    if already_processed:
        self.text = text
    else:
        self.text = StemmedText(text, pos_tagger, lemmatizer, lexicon)
    self.keyentities = []
    self.lexicon = lexicon if lexicon else Lexicon()
    self.postagger = pos_tagger if pos_tagger else PosTagger(lexicon=self.lexicon)
    self.lemmatizer = lemmatizer if lemmatizer else Lemmatizer(self.lexicon)
    self.make_keyentities()
    # Lazy caches, reset after key-entity extraction (order preserved).
    self._triggers = None
    self._stemms = None
def handle(self, *args, **options):
    """
    Command entry point: dispatch on the option flags stored on
    ``self`` (lexicon checks, corpus checks, error display, optional
    debugger session).
    """
    main_corpus = Corpus()
    lexicon = Lexicon()
    tagger = PosTagger(lexicon=lexicon)
    # NOTE(review): this lemmatizer is never used below — kept in case
    # its construction has side effects; confirm before removing.
    lemmatizer = Lemmatizer(lexicon)
    # Option values arrive as byte strings (Python 2); force unicode.
    if self.WORD:
        self.WORD = self.WORD.decode("utf-8")
    if self.LEMME:
        self.LEMME = self.LEMME.decode("utf-8")
    if self.CHECK_LEXICON:
        if self.COUNT:
            sulci_logger.info(u"Words in lexicon : %d" % len(lexicon), "WHITE")
        elif self.WORD:
            lexicon.get_entry(self.WORD)
        else:
            lexicon.check()
    elif self.CHECK_CORPUS:
        # An explicit path overrides the default corpus.
        corpus = TextCorpus(self.PATH) if self.PATH else main_corpus
        if self.COUNT:
            sulci_logger.info(u"Words in corpus : %d" % len(corpus), "WHITE")
        elif self.TAGS_STATS:
            corpus.tags_stats(self.WORD, self.CASE_INSENSITIVE)
        elif self.WORD or self.TAG or self.LEMME:
            corpus.check_usage(word=self.WORD,
                               tag=self.TAG,
                               lemme=self.LEMME,
                               case_insensitive=self.CASE_INSENSITIVE)
        else:
            corpus.check(lexicon, self.USE_LEMMES)
    if self.DISPLAY_ERRORS:
        trainer = POSTrainer(tagger, main_corpus)
        trainer.display_errors()
    if self.IPDB:
        import ipdb
        ipdb.set_trace()