Exemple #1
0
def parse_test(params):
    """Run a trained parser over a test file and write the tagged output.

    Only ``params.test == "parser"`` is handled; any other value makes this
    function a no-op.

    @param params: parsed command line options. The attributes read here are
        ``test``, ``embeddings``, ``model_base``, ``decoder``, ``output_file``,
        ``config`` and ``test_file``.
    @return: None. One tab-separated line per entry is written to
        ``params.output_file``, with an empty line after every sequence.
    """
    if params.test != "parser":
        return

    print("Running " + params.test)
    print("==PARAMETERS==")
    print("EMBEDDINGS: " + params.embeddings)
    print("MODEL FILE: " + params.model_base)
    print("DECODER: " + params.decoder)
    print("OUTPUT: " + params.output_file)
    print("CONFIG FILE: " + str(params.config))
    print("==============\n")

    testset = Dataset(params.test_file)
    encodings = Encodings()
    encodings.load(params.model_base + ".encodings")
    encodings.update_wordlist(testset)
    print("Updated word list: " + str(len(encodings.word_list)))
    config = ParserConfig(filename=params.config)
    embeddings = WordEmbeddings()
    embeddings.read_from_file(params.embeddings, encodings.word_list)
    parser = BDRNNParser(config, encodings, embeddings)
    parser.load(params.model_base + ".bestUAS")
    if params.decoder == 'mst':
        print("!!!!!!!!!!!!!!!!!!!!!!!!!USING MST DECODER")
        from graph.decoders import MSTDecoder
        parser.decoder = MSTDecoder()

    total = len(testset.sequences)
    last_proc = 0
    # ``open`` (the original called the non-existent ``fopen``) inside a
    # ``with`` block so the output file is closed even if tagging fails.
    with open(params.output_file, "w") as f:
        for index, seq in enumerate(testset.sequences, 1):
            # Floor division keeps the percentage an int on Python 2 and 3.
            proc = index * 100 // total
            if proc % 5 == 0 and proc != last_proc:
                last_proc = proc
                sys.stdout.write(" " + str(proc))
                sys.stdout.flush()

            rez = parser.tag(seq)
            # Compound entries get no prediction of their own, so the
            # prediction cursor only advances on regular entries. Iterating
            # the sequence directly cannot run past its end, unlike the old
            # unbounded inner ``while`` that skipped compound entries.
            i_rez = 0
            for entry in seq:
                if entry.is_compound_entry:
                    continue
                predicted = rez[i_rez]
                i_rez += 1
                entry.xpos = predicted.xpos
                entry.upos = predicted.upos
                entry.attrs = predicted.attrs
                entry.head = predicted.head
                entry.label = predicted.label
                entry.lemma = predicted.lemma

            for entry in seq:
                fields = (entry.index, entry.word, entry.lemma, entry.upos,
                          entry.xpos, entry.attrs, entry.head, entry.label,
                          entry.deps, entry.space_after)
                f.write("\t".join(str(field) for field in fields) + "\n")
            f.write("\n")

    sys.stdout.write("\n")
Exemple #2
0
def _require_model_file(path, message):
    """Write *message* to stdout and exit with status 1 if *path* is not a file."""
    if not os.path.isfile(path):
        sys.stdout.write(message)
        sys.stdout.flush()
        sys.exit(1)


def _detect_space_separator(lines, sample_lines=7, min_space_ratio=0.02):
    """Return the string used to join input lines before tokenization.

    The first *sample_lines* lines are inspected; if fewer than
    *min_space_ratio* of their characters are spaces the text is treated as
    not space-delimited and ``""`` is returned, otherwise ``" "``.
    An empty sample (empty file) keeps the default ``" "``.
    """
    # Slicing never raises on short inputs, unlike indexing line by line.
    sample = "".join(lines[:sample_lines])
    if not sample:
        return " "
    if float(sample.count(' ')) / float(len(sample)) < min_space_ratio:
        return ""
    return " "


def parse_run(params):
    """Run the requested pipeline components over an input file.

    ``params.run`` is a comma separated list drawn from ``tokenizer``,
    ``compound``, ``lemmatizer``, ``tagger`` and ``parser``. Every requested
    component must have its model file inside ``params.models``; otherwise a
    message is written and the process exits with status 1.

    Execution order is: tokenize (or read ``params.input_file`` as a
    Dataset), compound expansion, parsing, tagging, lemmatization. The
    resulting sequences are written to ``params.output_file``.

    @param params: parsed command line options. The attributes read here are
        ``input_file``, ``output_file``, ``models``, ``run`` and
        ``embeddings``.
    @return: None
    """
    sys.stdout.write("\nINPUT FILE: " + params.input_file)
    sys.stdout.write("\nOUTPUT FILE: " + params.output_file)
    sys.stdout.write("\nMODELS FILE: " + params.models + "\n")
    sys.stdout.flush()

    components = params.run.split(",")
    tokenize = "tokenizer" in components
    compound = "compound" in components
    lemmatize = "lemmatizer" in components
    tag = "tagger" in components
    parse = "parser" in components

    def model_path(name):
        return os.path.join(params.models, name)

    def shared_encodings(current, file_name):
        # The first enabled component loads the encodings; later ones reuse them.
        if current is None:
            current = Encodings(verbose=False)
            current.load(model_path(file_name))
        return current

    # common elements load
    sys.stdout.write("\nLoading embeddings : " + params.embeddings + " ...\n")
    embeddings = WordEmbeddings()
    embeddings.read_from_file(params.embeddings, None)

    encodings = None
    if tokenize:
        tokenizer_model = model_path("tokenizer-tok.bestAcc")
        _require_model_file(
            tokenizer_model,
            "\n\tTokenizer model not found! (" + tokenizer_model + ")")
        sys.stdout.write("\n\tTokenization enabled.\n")
        tokenizer_encodings = Encodings(verbose=False)
        tokenizer_encodings.load(model_path("tokenizer.encodings"))
    if compound:
        _require_model_file(model_path("compound.bestAcc"),
                            "\n\tCompound word expander model not found!")
        sys.stdout.write("\n\tCompound word expander enabled.\n")
        # NOTE(review): loads the lemmatizer encodings for the compound
        # expander; kept as in the original - confirm this is intended.
        encodings = shared_encodings(encodings, "lemmatizer.encodings")
    if lemmatize:
        _require_model_file(model_path("lemmatizer.bestACC"),
                            "\n\tLemmatization model not found!")
        sys.stdout.write("\n\tLemmatization enabled.\n")
        encodings = shared_encodings(encodings, "lemmatizer.encodings")
    if tag:
        # NOTE(review): the check looks for tagger.bestOVERALL while the
        # models loaded below are bestUPOS/bestXPOS/bestATTRS - confirm.
        _require_model_file(model_path("tagger.bestOVERALL"),
                            "\n\tTagger model not found!")
        sys.stdout.write("\n\tTagger enabled.\n")
        encodings = shared_encodings(encodings, "tagger.encodings")
    if parse:
        _require_model_file(model_path("parser.bestUAS"),
                            "\n\tParser model not found!")
        sys.stdout.write("\n\tParser enabled.\n")
        encodings = shared_encodings(encodings, "parser.encodings")

    sequences = None
    if tokenize:
        sys.stdout.write("\nTokenizing " + params.input_file + " ... \n\t")
        sys.stdout.flush()

        from io_utils.config import TieredTokenizerConfig
        from generic_networks.tokenizers import TieredTokenizer
        config = TieredTokenizerConfig(model_path("tokenizer.conf"))
        tokenizer_object = TieredTokenizer(config,
                                           tokenizer_encodings,
                                           embeddings,
                                           runtime=True)
        tokenizer_object.load(model_path("tokenizer"))

        with open(params.input_file, 'r') as input_handle:
            lines = input_handle.readlines()

        # analyze use of spaces in first part of the file
        use_spaces = _detect_space_separator(lines)

        # Lines are accumulated into blocks separated by blank lines; each
        # non-empty block is tokenized on its own.
        input_string = ""
        sequences = []
        last_index = len(lines) - 1
        for i, line in enumerate(lines):
            input_string += line.replace("\r", "").replace(
                "\n", "").strip() + use_spaces
            if line.strip() == "" or i == last_index:  # end of block
                if input_string.strip() != "":
                    sequences += tokenizer_object.tokenize(input_string)
                input_string = ""

        del tokenizer_object  # free memory
    else:
        ds = Dataset(params.input_file)
        sequences = ds.sequences
    sys.stdout.write(" done\n")
    sys.stdout.flush()

    if compound:
        sys.stdout.write("\nCompound word expanding " + params.input_file +
                         " ... \n\t")
        sys.stdout.flush()
        from generic_networks.token_expanders import CompoundWordExpander
        from io_utils.config import CompoundWordConfig
        config = CompoundWordConfig(model_path("compound.conf"))
        compoundwordexpander_object = CompoundWordExpander(config,
                                                           encodings,
                                                           embeddings,
                                                           runtime=True)
        compoundwordexpander_object.load(model_path("compound.bestAcc"))
        sequences = compoundwordexpander_object.expand_sequences(sequences)
        del compoundwordexpander_object  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    if parse:
        sys.stdout.write("\nParsing " + params.input_file + " ... \n\t")
        sys.stdout.flush()
        from io_utils.config import ParserConfig
        from generic_networks.parsers import BDRNNParser
        config = ParserConfig(model_path("parser.conf"))
        parser_object = BDRNNParser(config,
                                    encodings,
                                    embeddings,
                                    runtime=True)
        parser_object.load(model_path("parser.bestUAS"))
        sequences = parser_object.parse_sequences(sequences)
        del parser_object  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    if tag:
        sys.stdout.write("\nTagging " + params.input_file + " ... \n\t")
        sys.stdout.flush()
        from io_utils.config import TaggerConfig
        from generic_networks.taggers import BDRNNTagger
        config = TaggerConfig(model_path("tagger.conf"))
        tagger_object_UPOS = BDRNNTagger(config,
                                         encodings,
                                         embeddings,
                                         runtime=True)
        tagger_object_UPOS.load(model_path("tagger.bestUPOS"))
        tagger_object_XPOS = BDRNNTagger(config,
                                         encodings,
                                         embeddings,
                                         runtime=True)
        tagger_object_XPOS.load(model_path("tagger.bestXPOS"))
        tagger_object_ATTRS = BDRNNTagger(config,
                                          encodings,
                                          embeddings,
                                          runtime=True)
        tagger_object_ATTRS.load(model_path("tagger.bestATTRS"))

        new_sequences = []
        for sequence in sequences:
            new_sequence = copy.deepcopy(sequence)
            predicted_tags_UPOS = tagger_object_UPOS.tag(new_sequence)
            predicted_tags_XPOS = tagger_object_XPOS.tag(new_sequence)
            predicted_tags_ATTRS = tagger_object_ATTRS.tag(new_sequence)
            # Each specialised tagger contributes only the field it was
            # selected for: index 0 = UPOS, 1 = XPOS, 2 = ATTRS.
            for entry_index in range(len(sequence)):
                entry = new_sequence[entry_index]
                entry.upos = predicted_tags_UPOS[entry_index][0]
                entry.xpos = predicted_tags_XPOS[entry_index][1]
                entry.attrs = predicted_tags_ATTRS[entry_index][2]
            new_sequences.append(new_sequence)
        # new_sequences already holds private deep copies, so a second
        # deepcopy of the whole list is unnecessary.
        sequences = new_sequences
        del tagger_object_UPOS  # free memory
        del tagger_object_XPOS  # free memory
        del tagger_object_ATTRS  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    if lemmatize:
        sys.stdout.write("\nLemmatizing " + params.input_file + " ... \n\t")
        sys.stdout.flush()
        from generic_networks.lemmatizers import FSTLemmatizer
        from io_utils.config import LemmatizerConfig
        config = LemmatizerConfig(model_path("lemmatizer.conf"))
        lemmatizer_object = FSTLemmatizer(config,
                                          encodings,
                                          embeddings,
                                          runtime=True)
        lemmatizer_object.load(model_path("lemmatizer.bestACC"))
        sequences = lemmatizer_object.lemmatize_sequences(sequences)
        del lemmatizer_object  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    output_dataset = Dataset()
    output_dataset.sequences = sequences
    output_dataset.write(params.output_file)
Exemple #3
0
def _print_train_parameters(params):
    """Print the parameter banner shared by the tagger, parser, lemmatizer
    and compound trainers."""
    print("Starting training for " + params.train)
    print("==PARAMETERS==")
    print("TRAIN FILE: " + params.train_file)
    print("DEV FILE: " + params.dev_file)
    if params.test_file is not None:
        print("TEST FILE: " + params.test_file)
    # str() so a missing embeddings file (not needed by the lemmatizer and
    # compound trainers) does not abort the banner with a TypeError.
    print("EMBEDDINGS FILE: " + str(params.embeddings))
    print("STOPPING CONDITION: " + str(params.itters))
    print("OUTPUT BASE: " + params.output_base)
    print("AUX SOFTMAX WEIGHT: " + str(params.aux_softmax_weight))
    print("CONFIG FILE: " + str(params.config))
    print("==============\n")


def _load_train_datasets(params):
    """Load the train and dev datasets, plus the test set when one is given."""
    trainset = Dataset(params.train_file)
    devset = Dataset(params.dev_file)
    testset = Dataset(params.test_file) if params.test_file else None
    return trainset, devset, testset


def _compute_train_encodings(params, trainset, devset, testset):
    """Compute encodings from train/dev and extend the word list with the
    test set when one was provided."""
    encodings = Encodings()
    encodings.compute(trainset, devset, 'label')
    # update wordlist if testset was provided
    if params.test_file:
        encodings.update_wordlist(testset)
    return encodings


def parse_train(params):
    """Train the model selected by ``params.train``.

    Supported values are ``mt``, ``tagger``, ``parser``, ``lemmatizer``,
    ``compound`` and ``tokenizer``; any other value is a no-op. For the
    tagger, parser and tokenizer the function returns early, without
    training, when the loaded config reports ``_valid`` as false.

    @param params: parsed command line options; which attributes are read
        depends on the selected trainer.
    @return: None
    """
    if params.train == 'mt':
        print("Starting training for " + params.train)
        print("==PARAMETERS==")
        print("SRC TRAIN FILE: " + params.mt_train_src)
        print("SRC DEV FILE: " + params.mt_dev_src)
        print("SRC TEST FILE: " + str(params.mt_test_src))
        print("SRC EMBEDDINGS FILE: " + params.mt_source_embeddings)
        print("DST TRAIN FILE: " + params.mt_train_dst)
        print("DST DEV FILE: " + params.mt_dev_dst)
        print("DST TEST FILE: " + str(params.mt_test_dst))
        print("DST EMBEDDINGS FILE: " + params.mt_destination_embeddings)
        print("STOPPING CONDITION: " + str(params.itters))
        print("OUTPUT BASE: " + params.output_base)
        print("AUX SOFTMAX WEIGHT: " + str(params.aux_softmax_weight))
        print("CONFIG FILE: " + str(params.config))
        print("==============\n")
        trainset = MTDataset(params.mt_train_src, params.mt_train_dst)
        devset = MTDataset(params.mt_dev_src, params.mt_dev_dst)
        if params.mt_test_src and params.mt_test_dst:
            testset = MTDataset(params.mt_test_src, params.mt_test_dst)
        else:
            testset = None

        config = NMTConfig(params.config)
        sys.stdout.write("--SOURCE--\n")
        sys.stdout.flush()
        src_enc = Encodings()
        src_enc.compute(trainset.to_conll_dataset('src'),
                        devset.to_conll_dataset('src'),
                        word_cutoff=5)
        sys.stdout.write("--DESTINATION--\n")
        sys.stdout.flush()
        dst_enc = Encodings()
        dst_enc.compute(trainset.to_conll_dataset('dst'),
                        devset.to_conll_dataset('dst'),
                        word_cutoff=5)
        sys.stdout.write("Reading source embeddings\n")
        src_we = WordEmbeddings()
        src_we.read_from_file(params.mt_source_embeddings,
                              'label',
                              full_load=False)
        sys.stdout.write("Reading destination embeddings\n")
        dst_we = WordEmbeddings()
        dst_we.read_from_file(params.mt_destination_embeddings,
                              'label',
                              full_load=False)
        nmt = BRNNMT(src_we, dst_we, src_enc, dst_enc, config)
        trainer = MTTrainer(nmt,
                            src_enc,
                            dst_enc,
                            src_we,
                            dst_we,
                            params.itters,
                            trainset,
                            devset,
                            testset=testset)
        trainer.start_training(params.output_base,
                               batch_size=params.batch_size)

    elif params.train == "tagger":
        _print_train_parameters(params)
        trainset, devset, testset = _load_train_datasets(params)
        config = TaggerConfig(params.config)
        if not config._valid:
            return

        encodings = _compute_train_encodings(params, trainset, devset,
                                             testset)
        embeddings = WordEmbeddings()
        embeddings.read_from_file(params.embeddings, encodings.word_list)
        tagger = BDRNNTagger(config,
                             encodings,
                             embeddings,
                             aux_softmax_weight=params.aux_softmax_weight)
        trainer = TaggerTrainer(tagger, encodings, params.itters, trainset,
                                devset, testset)
        trainer.start_training(params.output_base,
                               batch_size=params.batch_size)

    elif params.train == "parser":
        _print_train_parameters(params)
        trainset, devset, testset = _load_train_datasets(params)
        config = ParserConfig(params.config)
        if not config._valid:
            return
        # PARAM INJECTION
        if params.params is not None:
            for param in params.params.split(":"):
                # "name=expr": everything after the first "=" is the value.
                variable, _, value = param.partition("=")
                print("External param injection: " + variable + "=" + value)
                # SECURITY NOTE(review): the value is executed as Python
                # code. It comes from the command line, so it is only as
                # trustworthy as whoever launches training; never feed this
                # option from untrusted input.
                exec("config.__dict__[\"" + variable + "\"] = " + value)
        # END INJECTION
        encodings = _compute_train_encodings(params, trainset, devset,
                                             testset)
        embeddings = WordEmbeddings()
        embeddings.read_from_file(params.embeddings, encodings.word_list)
        parser = BDRNNParser(config,
                             encodings,
                             embeddings,
                             aux_softmax_weight=params.aux_softmax_weight)
        trainer = ParserTrainer(parser, encodings, params.itters, trainset,
                                devset, testset)
        trainer.start_training(params.output_base, params.batch_size)

    elif params.train == "lemmatizer":
        _print_train_parameters(params)
        trainset, devset, testset = _load_train_datasets(params)
        config = LemmatizerConfig(params.config)
        encodings = _compute_train_encodings(params, trainset, devset,
                                             testset)

        # The lemmatizer is trained without word embeddings.
        embeddings = None
        lemmatizer = FSTLemmatizer(config, encodings, embeddings)
        trainer = LemmatizerTrainer(lemmatizer, encodings, params.itters,
                                    trainset, devset, testset)
        trainer.start_training(params.output_base,
                               batch_size=params.batch_size)

    elif params.train == "compound":
        _print_train_parameters(params)
        trainset, devset, testset = _load_train_datasets(params)
        config = CompoundWordConfig(params.config)
        encodings = _compute_train_encodings(params, trainset, devset,
                                             testset)

        # The compound word expander is trained without word embeddings.
        embeddings = None
        expander = CompoundWordExpander(config, encodings, embeddings)
        trainer = CompoundWordTrainer(expander, encodings, params.itters,
                                      trainset, devset, testset)
        trainer.start_training(params.output_base,
                               batch_size=params.batch_size)

    elif params.train == "tokenizer":
        print("Starting training for " + params.train)
        print("==PARAMETERS==")
        print("TRAIN FILE: " + params.train_file)
        print("RAW TRAIN FILE: " + (params.raw_train_file if params.
                                    raw_train_file is not None else "n/a"))
        print("DEV FILE: " + params.dev_file)
        print("RAW DEV FILE: " + (params.raw_dev_file if params.raw_dev_file
                                  is not None else "n/a"))
        print("TEST FILE: " + (params.test_file
                               if params.test_file is not None else "n/a"))
        print("RAW TEST FILE: " + (params.raw_test_file
                                   if params.raw_test_file is not None
                                   else "n/a"))
        print("EMBEDDINGS FILE: " + params.embeddings)
        print("STOPPING CONDITION: " + str(params.itters))
        print("OUTPUT BASE: " + params.output_base)
        print("CONFIG FILE: " + str(params.config))
        print("==============\n")
        trainset, devset, testset = _load_train_datasets(params)
        from generic_networks.tokenizers import TieredTokenizer
        config = TieredTokenizerConfig(params.config)
        config.raw_test_file = params.raw_test_file
        config.base = params.output_base
        config.patience = params.itters
        if not config._valid:
            return

        encodings = _compute_train_encodings(params, trainset, devset,
                                             testset)
        embeddings = WordEmbeddings()
        # setting wordlist to None triggers Word Embeddings to act as
        # cache-only and load offsets for all words
        embeddings.read_from_file(params.embeddings, None)
        tokenizer = TieredTokenizer(config, encodings, embeddings)
        trainer = TokenizerTrainer(tokenizer,
                                   encodings,
                                   params.itters,
                                   trainset,
                                   devset,
                                   testset,
                                   raw_train_file=params.raw_train_file,
                                   raw_dev_file=params.raw_dev_file,
                                   raw_test_file=params.raw_test_file,
                                   gold_train_file=params.train_file,
                                   gold_dev_file=params.dev_file,
                                   gold_test_file=params.test_file)
        trainer.start_training(params.output_base,
                               batch_size=params.batch_size)
Exemple #4
0
    def load(self,
             language_code,
             version="latest",
             tokenization=True,
             compound_word_expanding=False,
             tagging=True,
             lemmatization=True,
             parsing=True):
        """
        Loads the pipeline components that are requested and present in the model folder.

        The model folder is resolved through ModelStore.find(); a component whose
        model file is missing from that folder is reported on stdout and left unloaded.

        @param language_code: Target language code. See http://opensource.adobe.com/NLP-Cube/ for available languages and their codes
        @param version: "latest" to get the latest version, or a specific version such as "1.0", "2.1", etc.
        @param tokenization: load the tokenizer
        @param compound_word_expanding: load the compound word expander
        @param tagging: load the taggers (they are also loaded whenever lemmatization is requested)
        @param lemmatization: load the lemmatizer; it is disabled again if the tagger model is missing
        @param parsing: load the parser
        """
        store = ModelStore(disk_path=self._model_repository)

        # Folder holding every model file for the requested language/version.
        model_dir = store.find(lang_code=language_code,
                               version=version,
                               verbose=self._verbose)

        def model_file(name):
            # Path of a file inside the resolved model folder.
            return os.path.join(model_dir, name)

        def report(message):
            # Progress messages are only shown in verbose mode.
            if self._verbose:
                sys.stdout.write(message)

        def read_encodings(name):
            # Fresh, quiet Encodings object populated from the model folder.
            loaded = Encodings(verbose=False)
            loaded.load(model_file(name))
            return loaded

        self.metadata.read(model_file("metadata.json"))

        # Embeddings are shared by every component loaded below.
        embeddings = WordEmbeddings(verbose=False)
        report('\tLoading embeddings... \n')
        embeddings_path = os.path.join(self._embeddings_repository,
                                       self.metadata.embeddings_file_name)
        embeddings.read_from_file(embeddings_path, None, full_load=False)

        # Tokenizer
        if tokenization:
            if os.path.isfile(model_file('tokenizer-tok.bestAcc')):
                report('\tLoading tokenization model ...\n')
                tokenizer_encodings = read_encodings('tokenizer.encodings')
                config = TieredTokenizerConfig(model_file('tokenizer.conf'))
                self._tokenizer = TieredTokenizer(config,
                                                  tokenizer_encodings,
                                                  embeddings,
                                                  runtime=True)
                self._tokenizer.load(model_file('tokenizer'))
            else:
                sys.stdout.write(
                    '\tTokenization is not available on this model. \n')

        # Compound word expander
        if compound_word_expanding:
            if os.path.isfile(model_file('compound.bestAcc')):
                report('\tLoading compound word expander model ...\n')
                compound_encodings = read_encodings('compound.encodings')
                config = CompoundWordConfig(model_file('compound.conf'))
                self._compound_word_expander = CompoundWordExpander(
                    config, compound_encodings, embeddings, runtime=True)
                self._compound_word_expander.load(
                    model_file('compound.bestAcc'))
            else:
                # Only warned about in verbose mode because many languages
                # do not have compound words.
                report(
                    '\tCompound word expansion is not available on this model. \n'
                )

        # Lemmatizer
        if lemmatization:
            if os.path.isfile(model_file('lemmatizer.bestACC')):
                report('\tLoading lemmatization model ...\n')
                lemmatizer_encodings = read_encodings('lemmatizer.encodings')
                config = LemmatizerConfig(model_file('lemmatizer.conf'))
                self._lemmatizer = FSTLemmatizer(config,
                                                 lemmatizer_encodings,
                                                 embeddings,
                                                 runtime=True)
                self._lemmatizer.load(model_file('lemmatizer.bestACC'))
            else:
                sys.stdout.write(
                    '\tLemmatizer is not available on this model. \n')

        # Taggers (also required by the lemmatizer)
        if tagging or lemmatization:
            if os.path.isfile(model_file('tagger.bestUPOS')):
                report('\tLoading tagger model ...\n')
                tagger_encodings = read_encodings('tagger.encodings')
                config = TaggerConfig(model_file('tagger.conf'))
                # One tagger per checkpoint: slot 0 = UPOS, 1 = XPOS, 2 = ATTRS.
                self._tagger = [None, None, None]
                checkpoints = ('bestUPOS', 'bestXPOS', 'bestATTRS')
                for slot, checkpoint in enumerate(checkpoints):
                    self._tagger[slot] = BDRNNTagger(config,
                                                     tagger_encodings,
                                                     embeddings,
                                                     runtime=True)
                    self._tagger[slot].load(
                        model_file('tagger.' + checkpoint))
            else:
                sys.stdout.write(
                    '\tTagging is not available on this model. \n')
                if lemmatization:
                    sys.stdout.write(
                        '\t\tDisabling the lemmatization model due to missing tagger. \n'
                    )
                    self._lemmatizer = None

        # Parser
        if parsing:
            if os.path.isfile(model_file('parser.bestUAS')):
                report('\tLoading parser model ...\n')
                parser_encodings = read_encodings('parser.encodings')
                config = ParserConfig(model_file('parser.conf'))
                self._parser = BDRNNParser(config,
                                           parser_encodings,
                                           embeddings,
                                           runtime=True)
                self._parser.load(model_file('parser.bestUAS'))
            else:
                sys.stdout.write(
                    '\tParsing is not available on this model... \n')

        self._loaded = True
        report('Model loading complete.\n\n')
Exemple #5
0
    def load(self, lang_code, base_path=None):
        """
        Loads the pipeline with all available models for the target language.

        The word embeddings are always read. Every other component (tokenizer, compound expander,
        lemmatizer, taggers, parser) is loaded only when its checkpoint file exists in the language
        folder; otherwise a "disabled" notice is written to stdout and the component is skipped.

        @param lang_code: Target language code. See http://opensource.adobe.com/NLP-Cube/ for available languages and their codes
        @param base_path: Base path for models. Only required for custom-trained models. Otherwise, just leave this parameter untouched to use the default model location
        @return: Nothing is returned explicitly. Results are stored in self.embeddings, self.models and the self.*_enabled flags
        """
        sys.stdout.write('Loading models for ' + lang_code + "\n")
        if base_path is None:
            global BASE_PATH
            base_path = BASE_PATH

        # Every file of a language lives in <base_path>/<lang_code>/.
        lang_path = os.path.join(base_path, lang_code)

        self.embeddings = WordEmbeddings()
        self.embeddings.read_from_file(os.path.join(lang_path, "wiki." + lang_code + ".vec"), None,
                                       full_load=False)

        if not os.path.isfile(os.path.join(lang_path, "tokenizer-tok.bestAcc")):
            sys.stdout.write(
                "\tTokenization disabled. \n")
        else:
            self.tokenizer_enabled = True
            sys.stdout.write("\tTokenization enabled.\n")
            tokenizer_encodings = Encodings(verbose=False)
            tokenizer_encodings.load(os.path.join(lang_path, "tokenizer.encodings"))
            from io_utils.config import TieredTokenizerConfig
            from generic_networks.tokenizers import TieredTokenizer
            config = TieredTokenizerConfig(os.path.join(lang_path, "tokenizer.conf"))
            tokenizer_object = TieredTokenizer(config, tokenizer_encodings, self.embeddings, runtime=True)
            # The tokenizer is given the bare "tokenizer" prefix rather than a single checkpoint file.
            tokenizer_object.load(os.path.join(lang_path, "tokenizer"))
            self.models[PipelineComponents.TOKENIZER] = tokenizer_object

        if not os.path.isfile(os.path.join(lang_path, "compound.bestAcc")):
            sys.stdout.write(
                "\tCompound disabled. \n")
        else:
            self.compound_enabled = True
            sys.stdout.write("\tCompound enabled.\n")
            compound_encodings = Encodings(verbose=False)
            compound_encodings.load(os.path.join(lang_path, "compound.encodings"))
            from io_utils.config import CompoundWordConfig
            from generic_networks.token_expanders import CompoundWordExpander
            config = CompoundWordConfig(os.path.join(lang_path, "compound.conf"))
            compound_object = CompoundWordExpander(config, compound_encodings, self.embeddings, runtime=True)
            compound_object.load(os.path.join(lang_path, "compound.bestAcc"))
            self.models[PipelineComponents.COMPOUND] = compound_object

        if not os.path.isfile(os.path.join(lang_path, "lemmatizer.bestACC")):
            sys.stdout.write(
                "\tLemmatizer disabled. \n")
        else:
            self.lemmatizer_enabled = True
            sys.stdout.write("\tLemmatizer enabled.\n")
            lemmatizer_encodings = Encodings(verbose=False)
            lemmatizer_encodings.load(os.path.join(lang_path, "lemmatizer.encodings"))
            from io_utils.config import LemmatizerConfig
            from generic_networks.lemmatizers import FSTLemmatizer
            config = LemmatizerConfig(os.path.join(lang_path, "lemmatizer.conf"))
            lemmatizer_object = FSTLemmatizer(config, lemmatizer_encodings, self.embeddings, runtime=True)
            lemmatizer_object.load(os.path.join(lang_path, "lemmatizer.bestACC"))
            self.models[PipelineComponents.LEMMATIZER] = lemmatizer_object

        if not os.path.isfile(os.path.join(lang_path, "tagger.bestUPOS")):
            sys.stdout.write(
                "\tTagger disabled. \n")
        else:
            self.tagger_enabled = True
            sys.stdout.write("\tTagger enabled.\n")
            tagger_encodings = Encodings(verbose=False)
            tagger_encodings.load(os.path.join(lang_path, "tagger.encodings"))
            from io_utils.config import TaggerConfig
            from generic_networks.taggers import BDRNNTagger
            config = TaggerConfig(os.path.join(lang_path, "tagger.conf"))

            # One tagger instance per checkpoint (UPOS, XPOS, ATTRS), all sharing config and encodings.
            tagger_upos_object = BDRNNTagger(config, tagger_encodings, self.embeddings, runtime=True)
            tagger_upos_object.load(os.path.join(lang_path, "tagger.bestUPOS"))
            tagger_xpos_object = BDRNNTagger(config, tagger_encodings, self.embeddings, runtime=True)
            # Fixed: the path used to be built from the Encodings object instead of the language folder.
            tagger_xpos_object.load(os.path.join(lang_path, "tagger.bestXPOS"))
            tagger_attrs_object = BDRNNTagger(config, tagger_encodings, self.embeddings, runtime=True)
            tagger_attrs_object.load(os.path.join(lang_path, "tagger.bestATTRS"))

            self.models[PipelineComponents.TAGGER] = [tagger_upos_object, tagger_xpos_object, tagger_attrs_object]

        if not os.path.isfile(os.path.join(lang_path, "parser.bestUAS")):
            sys.stdout.write(
                "\tParser disabled. \n")
        else:
            self.parser_enabled = True
            sys.stdout.write("\tParser enabled.\n")
            parser_encodings = Encodings(verbose=False)
            parser_encodings.load(os.path.join(lang_path, "parser.encodings"))
            from io_utils.config import ParserConfig
            from generic_networks.parsers import BDRNNParser
            config = ParserConfig(os.path.join(lang_path, "parser.conf"))
            parser_object = BDRNNParser(config, parser_encodings, self.embeddings, runtime=True)
            parser_object.load(os.path.join(lang_path, "parser.bestUAS"))
            self.models[PipelineComponents.PARSER] = parser_object
    def __init__(self, embeddings, port=80, tokenization=None, lemma=None, tagging=None, parsing=None):
        """
        Load the requested models, register this object as the module-level
        ``singletonServer`` and start the module-level ``app`` on ``port``.

        @param embeddings: embeddings object handed to every model constructor
        @param port: port passed to ``app.run``
        @param tokenization: path prefix of the tokenizer model; ``<prefix>.encodings`` and
            ``<prefix>.conf`` are read and the model itself is loaded from ``<prefix>``.
            ``None`` leaves ``self.tokenizer`` as ``None``
        @param lemma: path prefix of the lemmatizer model (``.encodings``, ``.conf``, ``.bestACC``).
            ``None`` leaves ``self.lemmatizer`` as ``None``
        @param tagging: accepted but not used by this constructor
        @param parsing: path prefix of the parser model (``.encodings``, ``.conf``, ``.bestUAS``).
            ``None`` leaves ``self.parser`` as ``None``
        """
        global singletonServer
        singletonServer = self

        # Set every attribute before the server starts, so that all of them
        # exist (possibly as None) by the time app.run() is entered.
        # NOTE(review): app.run() presumably blocks until shutdown - confirm.
        self.port = port
        self.tokenizer = None
        self.parser = None
        self.lemmatizer = None

        if tokenization is not None:
            sys.stdout.write("Loading tokenization model from " + tokenization)
            sys.stdout.flush()
            from generic_networks.tokenizers import TieredTokenizer
            from io_utils.config import TieredTokenizerConfig
            from io_utils.encodings import Encodings
            tok_encodings = Encodings()
            tok_encodings.load(tokenization + ".encodings")
            tok_config = TieredTokenizerConfig()
            tok_config.load(tokenization + ".conf")
            self.tokenizer = TieredTokenizer(tok_config, tok_encodings, embeddings, runtime=True)
            self.tokenizer.load(tokenization)

        if parsing is not None:
            sys.stdout.write("Loading parsing model from " + parsing)
            from generic_networks.parsers import BDRNNParser
            from io_utils.config import ParserConfig
            from io_utils.encodings import Encodings
            parse_encodings = Encodings()
            parse_encodings.load(parsing + ".encodings")
            parse_config = ParserConfig()
            parse_config.load(parsing + ".conf")
            self.parser = BDRNNParser(parse_config, parse_encodings, embeddings, runtime=True)
            self.parser.load(parsing + ".bestUAS")

        if lemma is not None:
            sys.stdout.write("Loading lemma model from " + lemma)
            from generic_networks.lemmatizers import BDRNNLemmatizer
            from io_utils.config import LemmatizerConfig
            from io_utils.encodings import Encodings
            lemma_encodings = Encodings()
            lemma_encodings.load(lemma + ".encodings")
            lemma_config = LemmatizerConfig()
            lemma_config.load(lemma + ".conf")
            self.lemmatizer = BDRNNLemmatizer(lemma_config, lemma_encodings, embeddings, runtime=True)
            self.lemmatizer.load(lemma + ".bestACC")

        global app
        app.run(port=port)
    def _load(self, lang_code):
        """
        Read every model available on disk for ``lang_code`` into this object.

        The word embeddings are always loaded. Each pipeline component is
        loaded, flagged as enabled and stored in ``self.models`` only when its
        checkpoint file is present under ``<self.disk_path>/<lang_code>``;
        otherwise a "disabled" line is written to stdout.
        """
        sys.stdout.write('Loading models for {}\n'.format(lang_code))
        lang_dir = os.path.join(self.disk_path, lang_code)

        def in_lang_dir(file_name):
            # Path of a file stored in the language folder.
            return os.path.join(lang_dir, file_name)

        # Word embeddings.
        self.embeddings = WordEmbeddings()
        vectors_path = in_lang_dir('wiki.{}.vec'.format(lang_code))
        self.embeddings.read_from_file(vectors_path, None, full_load=False)

        # Tokenizer.
        if os.path.isfile(in_lang_dir('tokenizer-tok.bestAcc')):
            self.tokenizer_enabled = True
            sys.stdout.write('\tTokenization enabled.\n')
            tok_enc = Encodings(verbose=False)
            tok_enc.load(in_lang_dir('tokenizer.encodings'))
            tok_conf = TieredTokenizerConfig(in_lang_dir('tokenizer.conf'))
            tokenizer = TieredTokenizer(
                tok_conf, tok_enc, self.embeddings, runtime=True)
            tokenizer.load(in_lang_dir('tokenizer'))
            self.models[PipelineComponents.TOKENIZER] = tokenizer
        else:
            sys.stdout.write('\tTokenization disabled. \n')

        # Compound word expander.
        if os.path.isfile(in_lang_dir('compound.bestAcc')):
            self.compound_enabled = True
            sys.stdout.write('\tCompound enabled.\n')
            comp_enc = Encodings(verbose=False)
            comp_enc.load(in_lang_dir('compound.encodings'))
            comp_conf = CompoundWordConfig(in_lang_dir('compound.conf'))
            expander = CompoundWordExpander(
                comp_conf, comp_enc, self.embeddings, runtime=True)
            expander.load(in_lang_dir('compound.bestAcc'))
            self.models[PipelineComponents.COMPOUND] = expander
        else:
            sys.stdout.write('\tCompound disabled. \n')

        # Lemmatizer.
        if os.path.isfile(in_lang_dir('lemmatizer.bestACC')):
            self.lemmatizer_enabled = True
            sys.stdout.write('\tLemmatizer enabled.\n')
            lemma_enc = Encodings(verbose=False)
            lemma_enc.load(in_lang_dir('lemmatizer.encodings'))
            lemma_conf = LemmatizerConfig(in_lang_dir('lemmatizer.conf'))
            lemmatizer = FSTLemmatizer(
                lemma_conf, lemma_enc, self.embeddings, runtime=True)
            lemmatizer.load(in_lang_dir('lemmatizer.bestACC'))
            self.models[PipelineComponents.LEMMATIZER] = lemmatizer
        else:
            sys.stdout.write('\tLemmatizer disabled. \n')

        # Taggers: one network per checkpoint, stored as [UPOS, XPOS, ATTRS].
        if os.path.isfile(in_lang_dir('tagger.bestUPOS')):
            self.tagger_enabled = True
            sys.stdout.write('\tTagger enabled.\n')
            tag_enc = Encodings(verbose=False)
            tag_enc.load(in_lang_dir('tagger.encodings'))
            tag_conf = TaggerConfig(in_lang_dir('tagger.conf'))

            taggers = []
            for checkpoint in ('tagger.bestUPOS',
                               'tagger.bestXPOS',
                               'tagger.bestATTRS'):
                tagger = BDRNNTagger(
                    tag_conf, tag_enc, self.embeddings, runtime=True)
                tagger.load(in_lang_dir(checkpoint))
                taggers.append(tagger)
            self.models[PipelineComponents.TAGGER] = taggers
        else:
            sys.stdout.write('\tTagger disabled. \n')

        # Parser.
        if os.path.isfile(in_lang_dir('parser.bestUAS')):
            self.parser_enabled = True
            sys.stdout.write('\tParser enabled.\n')
            parse_enc = Encodings(verbose=False)
            parse_enc.load(in_lang_dir('parser.encodings'))
            parse_conf = ParserConfig(in_lang_dir('parser.conf'))
            parser = BDRNNParser(
                parse_conf, parse_enc, self.embeddings, runtime=True)
            parser.load(in_lang_dir('parser.bestUAS'))
            self.models[PipelineComponents.PARSER] = parser
        else:
            sys.stdout.write('\tParser disabled. \n')
    def _load(self, lang_code, version):
        """
        Read the ``<lang_code>-<version>`` model from disk into this object.

        The model metadata is refreshed first because it names the embeddings
        file, which lives in the shared ``embeddings`` folder under
        ``self.disk_path``. Each pipeline component is then loaded, flagged as
        enabled and stored in ``self.model`` only when its checkpoint file
        exists in the model folder; otherwise a "disabled" line is written to
        stdout.
        """
        model_dir = os.path.join(self.disk_path,
                                 lang_code + "-" + str(version))

        def locate(file_name):
            # Path of a file stored in the versioned model folder.
            return os.path.join(model_dir, file_name)

        # Refresh metadata before anything that depends on it.
        self.metadata.read(locate("metadata.json"))
        vectors_path = os.path.join(self.disk_path, "embeddings",
                                    self.metadata.embeddings_file_name)

        # 1. Word embeddings
        self.embeddings = WordEmbeddings(verbose=False)
        sys.stdout.write('\tLoading embeddings... \n')
        self.embeddings.read_from_file(vectors_path, None, full_load=False)

        # 2. Tokenizer
        if os.path.isfile(locate('tokenizer-tok.bestAcc')):
            self.tokenizer_enabled = True
            sys.stdout.write('\tTokenization enabled.\n')
            tok_enc = Encodings(verbose=False)
            tok_enc.load(locate('tokenizer.encodings'))
            tok_conf = TieredTokenizerConfig(locate('tokenizer.conf'))
            tokenizer = TieredTokenizer(
                tok_conf, tok_enc, self.embeddings, runtime=True)
            tokenizer.load(locate('tokenizer'))
            self.model[PipelineComponents.TOKENIZER] = tokenizer
        else:
            sys.stdout.write('\tTokenization disabled. \n')

        # 3. Compound word expander
        if os.path.isfile(locate('compound.bestAcc')):
            self.compound_enabled = True
            sys.stdout.write('\tCompound enabled.\n')
            comp_enc = Encodings(verbose=False)
            comp_enc.load(locate('compound.encodings'))
            comp_conf = CompoundWordConfig(locate('compound.conf'))
            expander = CompoundWordExpander(
                comp_conf, comp_enc, self.embeddings, runtime=True)
            expander.load(locate('compound.bestAcc'))
            self.model[PipelineComponents.COMPOUND] = expander
        else:
            sys.stdout.write('\tCompound disabled. \n')

        # 4. Lemmatizer
        if os.path.isfile(locate('lemmatizer.bestACC')):
            self.lemmatizer_enabled = True
            sys.stdout.write('\tLemmatizer enabled.\n')
            lemma_enc = Encodings(verbose=False)
            lemma_enc.load(locate('lemmatizer.encodings'))
            lemma_conf = LemmatizerConfig(locate('lemmatizer.conf'))
            lemmatizer = FSTLemmatizer(
                lemma_conf, lemma_enc, self.embeddings, runtime=True)
            lemmatizer.load(locate('lemmatizer.bestACC'))
            self.model[PipelineComponents.LEMMATIZER] = lemmatizer
        else:
            sys.stdout.write('\tLemmatizer disabled. \n')

        # 5. Taggers: one network per checkpoint, kept as [UPOS, XPOS, ATTRS]
        if os.path.isfile(locate('tagger.bestUPOS')):
            self.tagger_enabled = True
            sys.stdout.write('\tTagger enabled.\n')
            tag_enc = Encodings(verbose=False)
            tag_enc.load(locate('tagger.encodings'))
            tag_conf = TaggerConfig(locate('tagger.conf'))

            taggers = []
            for checkpoint in ('tagger.bestUPOS',
                               'tagger.bestXPOS',
                               'tagger.bestATTRS'):
                tagger = BDRNNTagger(
                    tag_conf, tag_enc, self.embeddings, runtime=True)
                tagger.load(locate(checkpoint))
                taggers.append(tagger)
            self.model[PipelineComponents.TAGGER] = taggers
        else:
            sys.stdout.write('\tTagger disabled. \n')

        # 6. Parser
        if os.path.isfile(locate('parser.bestUAS')):
            self.parser_enabled = True
            sys.stdout.write('\tParser enabled.\n')
            parse_enc = Encodings(verbose=False)
            parse_enc.load(locate('parser.encodings'))
            parse_conf = ParserConfig(locate('parser.conf'))
            parser = BDRNNParser(
                parse_conf, parse_enc, self.embeddings, runtime=True)
            parser.load(locate('parser.bestUAS'))
            self.model[PipelineComponents.PARSER] = parser
        else:
            sys.stdout.write('\tParser disabled. \n')
Exemple #9
0
def train(train_file, dev_file, model_base, patience):
    """Train an FSTLemmatizer on Sigmorphon-format data with early stopping.

    The train and dev files are read with ``Sigmorphon2CONLL`` and converted
    via ``convert2conll()``. Encodings are computed from both sets and saved
    to ``model_base + ".encodings"``; a default ``LemmatizerConfig`` is saved
    to ``model_base + ".config"``. Epochs run until ``patience`` consecutive
    evaluations pass without the dev accuracy improving.

    Args:
        train_file: path handed to ``Sigmorphon2CONLL.read_from_file`` for the
            training set.
        dev_file: same, for the development set.
        model_base: path prefix for every file written here (``.encodings``,
            ``.config``, ``.log``, ``.bestAcc``, ``.last``).
        patience: number of non-improving epochs tolerated before stopping.

    NOTE(review): only the part of the function up to the end of the epoch
    loop is visible in this excerpt - confirm nothing follows it.
    """
    from io_utils.sigmorphon import Sigmorphon2CONLL
    # NOTE(review): Dataset is imported but not referenced in the visible code.
    from io_utils.conll import Dataset

    # Each reader object is replaced by the dataset it converts to.
    ds_train = Sigmorphon2CONLL()
    ds_train.read_from_file(train_file)
    ds_train = ds_train.convert2conll()

    ds_dev = Sigmorphon2CONLL()
    ds_dev.read_from_file(dev_file)
    ds_dev = ds_dev.convert2conll()

    sys.stdout.write("Train file has " + str(len(ds_train.sequences)) +
                     " sequences\n")
    sys.stdout.write("Dev file has " + str(len(ds_dev.sequences)) +
                     " sequences\n")

    from io_utils.encodings import Encodings
    encodings = Encodings()
    encodings.compute(ds_train, ds_dev)
    sys.stdout.write("Storing encodings in " + model_base + ".encodings\n")
    encodings.save(model_base + ".encodings")

    # Countdown of epochs left without improvement; reset on every new best.
    num_itt_no_improve = patience
    best_dev_acc = 0

    from models.lemmatizers import FSTLemmatizer
    # NOTE(review): BDRNNLemmatizer is imported but not referenced in the visible code.
    from models.lemmatizers import BDRNNLemmatizer
    from models.config import LemmatizerConfig

    config = LemmatizerConfig()

    config.save(model_base + ".config")

    # No embeddings are passed (None) and the model is built with runtime=False.
    lemmatizer = FSTLemmatizer(config, encodings, None, runtime=False)
    epoch = 0
    # A batch is closed once the summed len(seq) of its sequences reaches this value.
    batch_size = 10
    while num_itt_no_improve > 0:

        epoch += 1
        sys.stdout.write("Starting epoch " + str(epoch) + "\n")
        sys.stdout.flush()
        sys.stdout.write("\tshuffling training data... ")
        sys.stdout.flush()

        # Shuffles the training sequences in place.
        shuffle(ds_train.sequences)
        sys.stdout.write("done\n")
        sys.stdout.flush()
        last_proc = 0
        sys.stdout.write("\ttraining...")
        sys.stdout.flush()
        total_loss = 0
        start_time = time.time()
        current_batch_size = 0
        lemmatizer.start_batch()
        # NOTE(review): xrange exists only in Python 2; this loop needs range() on Python 3.
        for iSeq in xrange(len(ds_train.sequences)):
            seq = ds_train.sequences[iSeq]
            # Progress percentage, printed whenever it hits a new multiple of 5.
            # NOTE(review): relies on Python 2 integer division; under Python 3
            # "/" yields a float, so the check would rarely fire - confirm the
            # target interpreter.
            proc = (iSeq + 1) * 100 / len(ds_train.sequences)
            if proc % 5 == 0 and proc != last_proc:
                last_proc = proc
                sys.stdout.write(" " + str(proc))
                sys.stdout.flush()

            lemmatizer.learn(seq)
            current_batch_size += len(seq)
            if current_batch_size >= batch_size:
                # end_batch() returns a value that is accumulated as the loss.
                total_loss += lemmatizer.end_batch()
                lemmatizer.start_batch()
                current_batch_size = 0
        # Close the final (possibly partial or empty) batch.
        total_loss += lemmatizer.end_batch()

        stop_time = time.time()
        # The reported average divides the summed loss by the number of sequences.
        sys.stdout.write(" avg_loss=" +
                         str(total_loss / len(ds_train.sequences)) +
                         " execution_time=" + str(stop_time - start_time) +
                         "\n")
        sys.stdout.write("\tevaluating")
        # NOTE(review): eval is called with (model, dataset, log path), so it is
        # presumably a helper defined elsewhere in this file that shadows the
        # builtin - confirm.
        dev_acc = eval(lemmatizer, ds_dev, model_base + ".log")
        sys.stdout.write(" devset accuracy is " + str(dev_acc) + "\n")
        if dev_acc > best_dev_acc:
            best_dev_acc = dev_acc
            lemmatizer.save(model_base + ".bestAcc")
            num_itt_no_improve = patience
        # The latest model is saved every epoch; the countdown is decremented
        # even right after a reset, so it reads patience - 1 after an improvement.
        lemmatizer.save(model_base + ".last")
        num_itt_no_improve -= 1