Ejemplo n.º 1
0
 def __init__(self, text, pos_tagger=None, lemmatizer=None, lexicon=None):
     """Store ``text``, normalize it, set up helpers and call ``make()``.

     The raw input is kept in ``_raw_text`` and the result of
     ``normalize_text(text)`` in ``normalized_text``.

     Args:
         text: The raw text to process.
         pos_tagger: Optional tagger. When ``None``, a ``PosTagger``
             bound to the resolved lexicon is created.
         lemmatizer: Optional lemmatizer. When ``None``, a ``Lemmatizer``
             built on the resolved lexicon is created.
         lexicon: Optional lexicon. When ``None``, a new ``Lexicon`` is
             created.

     Raises:
         ValueError: If the normalized text is empty.
     """
     self._raw_text = text
     self.normalized_text = normalize_text(text)
     if not self.normalized_text:
         # For now, raise ValueError, because an empty text creates
         # too many problems here and there (zero division, etc.)
         # TODO: make empty texts possible.
         raise ValueError("Can't process an empty text.")
     self.samples = []
     self.keyentities = []
     # Compare against None instead of relying on truthiness: a helper
     # object that happens to be falsy (e.g. a sized object that is
     # currently empty) must not be silently swapped for a new default.
     self.lexicon = Lexicon() if lexicon is None else lexicon
     self.postagger = (PosTagger(lexicon=self.lexicon)
                       if pos_tagger is None else pos_tagger)
     self.lemmatizer = (Lemmatizer(self.lexicon)
                        if lemmatizer is None else lemmatizer)
     self.make()
     # NOTE(review): assigned after make(), so anything make() may have
     # stored in _stemms is discarded -- presumably a lazily filled
     # cache; confirm before reordering.
     self._stemms = None
Ejemplo n.º 2
0
 def __init__(self,
              text,
              thesaurus=None,
              pos_tagger=None,
              lemmatizer=None,
              lexicon=None):
     """Set up helpers, wrap ``text`` if needed and build key entities.

     Args:
         text: Either a ``StemmedText`` instance, which is used as is,
             or a value handed to the ``StemmedText`` constructor.
         thesaurus: Optional thesaurus. When ``None``, a new
             ``Thesaurus`` is created.
         pos_tagger: Optional tagger. When ``None``, a ``PosTagger``
             bound to the resolved lexicon is created.
         lemmatizer: Optional lemmatizer. When ``None``, a ``Lemmatizer``
             built on the resolved lexicon is created.
         lexicon: Optional lexicon. When ``None``, a new ``Lexicon`` is
             created.
     """
     # Compare against None instead of relying on truthiness, so that a
     # caller-supplied helper that happens to be falsy is still honoured.
     self.thesaurus = Thesaurus() if thesaurus is None else thesaurus
     self.keyentities = []
     self.lexicon = Lexicon() if lexicon is None else lexicon
     self.postagger = (PosTagger(lexicon=self.lexicon)
                       if pos_tagger is None else pos_tagger)
     self.lemmatizer = (Lemmatizer(self.lexicon)
                        if lemmatizer is None else lemmatizer)
     if isinstance(text, StemmedText):
         self.text = text
     else:
         # Forward the helpers resolved above so the wrapped text reuses
         # them rather than building a second set of default instances.
         self.text = StemmedText(text,
                                 self.postagger,
                                 self.lemmatizer,
                                 self.lexicon)
     self.make_keyentities()
     # NOTE(review): both attributes are assigned after
     # make_keyentities(), so any value it may have stored in them is
     # discarded -- presumably lazily filled caches; confirm.
     self._triggers = None
     self._stemms = None
Ejemplo n.º 3
0
 def handle(self, *args, **options):
     """Run the checks selected by this object's flag attributes.

     Behaviour is driven by attributes read from ``self``:

     * ``CHECK_LEXICON``: log the lexicon size (``COUNT``), look up one
       entry (``WORD``), or otherwise run ``Lexicon.check()``.
     * ``CHECK_CORPUS`` (only when ``CHECK_LEXICON`` is falsy): work on
       ``TextCorpus(PATH)`` if ``PATH`` is set, else on a default
       ``Corpus``; log its size (``COUNT``), show tag statistics
       (``TAGS_STATS``), check usage of ``WORD``/``TAG``/``LEMME``, or
       otherwise run ``corpus.check()``.
     * ``DISPLAY_ERRORS``: independently of the above, display the
       errors of a ``POSTrainer`` built from the tagger and the default
       corpus.
     * ``IPDB``: finally drop into an ``ipdb`` session.

     Args:
         *args: Accepted but not read by this method.
         **options: Accepted but not read by this method.
     """
     C = Corpus()
     L = Lexicon()
     P = PosTagger(lexicon=L)
     # NOTE(review): M is not used below; presumably it is kept in scope
     # for the interactive ipdb session at the end -- confirm before
     # removing.
     M = Lemmatizer(L)
     # Decode byte strings only. The decoded value is written back to
     # the instance, so without the type check a second call (or a value
     # that is already text) would be decoded again and fail.
     if self.WORD and isinstance(self.WORD, bytes):
         self.WORD = self.WORD.decode("utf-8")
     if self.LEMME and isinstance(self.LEMME, bytes):
         self.LEMME = self.LEMME.decode("utf-8")
     if self.CHECK_LEXICON:
         if self.COUNT:
             sulci_logger.info(u"Words in lexicon : %d" % len(L), "WHITE")
         elif self.WORD:
             L.get_entry(self.WORD)
         else:
             L.check()
     elif self.CHECK_CORPUS:
         # An explicit PATH selects a specific text corpus; otherwise
         # the default corpus built above is used.
         if self.PATH:
             corpus = TextCorpus(self.PATH)
         else:
             corpus = C
         if self.COUNT:
             sulci_logger.info(u"Words in corpus : %d" % len(corpus),
                               "WHITE")
         elif self.TAGS_STATS:
             corpus.tags_stats(self.WORD, self.CASE_INSENSITIVE)
         elif self.WORD or self.TAG or self.LEMME:
             corpus.check_usage(word=self.WORD,
                                tag=self.TAG,
                                lemme=self.LEMME,
                                case_insensitive=self.CASE_INSENSITIVE)
         else:
             corpus.check(L, self.USE_LEMMES)
     # Deliberately a separate ``if``: error display can be combined
     # with either of the checks above.
     if self.DISPLAY_ERRORS:
         T = POSTrainer(P, C)
         T.display_errors()
     if self.IPDB:
         # Imported lazily so ipdb is only required when requested.
         import ipdb
         ipdb.set_trace()