Example #1
 def handle(self, *args, **options):
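     # Build the shared NLP resources: corpus, lexicon, POS tagger and lemmatizer.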
     C = Corpus()
     L = Lexicon()
     P = PosTagger(lexicon=L)
     M = Lemmatizer(L)
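     # Command-line options arrive as encoded byte strings; decode them to unicode.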
     if self.WORD:
         self.WORD = self.WORD.decode("utf-8")
     if self.LEMME:
         self.LEMME = self.LEMME.decode("utf-8")
     if self.CHECK_LEXICON:
         if self.COUNT:
             sulci_logger.info(u"Words in lexicon : %d" % len(L), "WHITE")
         elif self.WORD:
             L.get_entry(self.WORD)
         else:
             L.check()
     elif self.CHECK_CORPUS:
         if self.PATH:
             corpus = TextCorpus(self.PATH)
         else:
             corpus = C
         if self.COUNT:
             sulci_logger.info(u"Words in corpus : %d" % len(corpus), "WHITE")
         elif self.TAGS_STATS:
             corpus.tags_stats(self.WORD, self.CASE_INSENSITIVE)
         elif self.WORD or self.TAG or self.LEMME:
             corpus.check_usage(
                 word=self.WORD, 
                 tag=self.TAG, 
                 lemme=self.LEMME,
                 case_insensitive=self.CASE_INSENSITIVE
             )
         else:
             corpus.check(L, self.USE_LEMMES)
     if self.DISPLAY_ERRORS:
         T = POSTrainer(P,C)
         T.display_errors()
     if self.IPDB:
         import ipdb; ipdb.set_trace()
Example #2
 def __init__(self, text, pos_tagger=None, lemmatizer=None, lexicon=None):
     self._raw_text = text
     self.normalized_text = normalize_text(text)
     if len(self.normalized_text) == 0:
         # For now, raise a ValueError, because an empty text creates
         # too many problems here and there (zero division, etc.).
         # TODO: make empty texts possible.
         raise ValueError("Can't process an empty text.")
     self.samples = []
     self.keyentities = []
     self.lexicon = lexicon or Lexicon()
     self.postagger = pos_tagger or PosTagger(lexicon=self.lexicon)
     self.lemmatizer = lemmatizer or Lemmatizer(self.lexicon)
     self.make()
     self._stemms = None
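Because the defaults above fall back to fresh Lexicon, PosTagger and Lemmatizer instances, the simplest call only needs the raw text (an empty string raises ValueError, as the guard shows). A minimal usage sketch, assuming the import paths below, which are not shown in the snippet and are only a guess at the sulci package layout:

    # Assumed import locations; only the class names themselves appear in the
    # examples on this page.
    from sulci.textmining import StemmedText
    from sulci.lexicon import Lexicon
    from sulci.pos_tagger import PosTagger

    lexicon = Lexicon()
    tagger = PosTagger(lexicon=lexicon)

    # Default collaborators: the text builds its own lexicon, tagger and lemmatizer.
    simple = StemmedText(u"Le petit chat boit du lait.")

    # Or share already-built resources between several texts.
    shared = StemmedText(u"Un autre texte.", pos_tagger=tagger, lexicon=lexicon)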
Example #3
 def __init__(self,
              text,
              thesaurus=None,
              pos_tagger=None,
              lemmatizer=None,
              lexicon=None):
     self.thesaurus = thesaurus or Thesaurus()
     if isinstance(text, StemmedText):
         self.text = text
     else:
         self.text = StemmedText(text, pos_tagger, lemmatizer, lexicon)
     self.keyentities = []
     self.lexicon = lexicon or Lexicon()
     self.postagger = pos_tagger or PosTagger(lexicon=self.lexicon)
     self.lemmatizer = lemmatizer or Lemmatizer(self.lexicon)
     self.make_keyentities()
     self._triggers = None
     self._stemms = None
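This constructor accepts either a raw string or an already-built StemmedText, so the expensive normalisation and tagging pass can be shared between components. The owning class is not named in the snippet; SemanticalTagger below is only a placeholder for whichever class this __init__ belongs to. A sketch under that assumption:

    # "SemanticalTagger" stands in for the unnamed class owning the __init__
    # above; replace it with the real class name from the sulci package.
    lexicon = Lexicon()
    tagger = PosTagger(lexicon=lexicon)

    stemmed = StemmedText(u"Texte à analyser.", pos_tagger=tagger, lexicon=lexicon)

    # Passing a StemmedText avoids re-tokenising and re-tagging; a plain
    # string would be wrapped in a new StemmedText internally.
    extractor = SemanticalTagger(stemmed, pos_tagger=tagger, lexicon=lexicon)
    print extractor.keyentities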
Example #4
    def handle(self, *args, **options):
        with UseDB(config.TRAINING_DATABASE):
            sulci_logger.info(u"STARTING TRAINING WITH DATABASE «%s»" % config.TRAINING_DATABASE, "RED", True)
            C = Corpus()
            L = Lexicon()
            M = Lemmatizer(L)
            P = PosTagger(lexicon=L)
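            # Optionally (re)build the lexicon before any training starts.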
            if self.LEXICON:
                L.make(self.FORCE)
            if self.SUBPROCESSES:
                import subprocess

                training_kind = (
                    self.LEXICAL
                    and "-e"
                    or self.LEMMATIZER
                    and "-r"
                    or self.SEMANTICAL
                    and "-n"
                    or self.PMI
                    and "-p"
                    or "-c"
                )  # CONTEXTUAL
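                # The and/or chain above is the pre-"x if c else y" idiom: it
                # picks the flag of the first enabled training mode (-e lexical,
                # -r lemmatizer, -n semantical, -p PMI) and falls back to the
                # contextual training flag "-c" otherwise.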
                # Create slaves
                for i in xrange(0, self.SUBPROCESSES):
                    sulci_logger.info(u"Opening slave subprocess %d" % i, "BLUE", True)
                    sub_args = ["sulci_train.py", training_kind, "--mode=slave"]
                    if self.START is not None:
                        sub_args.append("--start=%s" % self.START)
                    subprocess.Popen(sub_args)
                # Set the mode to the trainer
                self.MODE = "master"
                # Wait to leave time to slave to launch
                time.sleep(1)
            if self.LEXICAL:
                T = LexicalTrainer(P, C, self.MODE)
                T.do()
            elif self.CONTEXTUAL:
                T = ContextualTrainer(P, C, self.MODE)
                T.do()
            elif self.LEMMATIZER:
                T = LemmatizerTrainer(M, self.MODE)
                T.do()
            elif self.PMI:
                T = Thesaurus()
                G = GlobalPMITrainer(T, P, self.MODE)
                G.do()
            elif self.SEMANTICAL:
                T = Thesaurus()
                S = SemanticalTrainer(T, P, self.MODE)
                if self.PK:
                    # Should not have PK in MODE == "master"
                    a = config.content_model_getter(self.PK)
                    S.train(a)
                else:
                    if self.FORCE:
                        S.begin()
                    S.do(start=self.START)
            #                if TRAINER_MODE == "master" and FORCE:
            #                    S.clean_connections()
            if self.ADD_CANDIDATE:
                if not self.PK:
                    print "A PK is needed. Use -k xxx"
                else:
                    a = config.content_model_getter(self.PK)
                    t = getattr(a, config.SULCI_CONTENT_PROPERTY)
                    T = TextCorpus()
                    T.prepare(t, P, M)
                    T.export(self.PK, self.FORCE, self.ADD_LEMMES)
            if self.IPDB:
                import ipdb

                ipdb.set_trace()
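When self.SUBPROCESSES is set, the command above acts as a master: it launches that many sulci_train.py slaves with the chosen training flag and --mode=slave (forwarding --start when given), sleeps one second so they can come up, then switches its own MODE to "master" and runs the selected trainer. The ADD_CANDIDATE branch instead takes a single content object fetched by primary key, prepares its text with the POS tagger and lemmatizer, and exports it into a TextCorpus.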