Example 1
def parse_test(params):
    if params.test == "parser":
        print ("Running " + params.test)
        print ("==PARAMETERS==")
        print ("EMBEDDINGS: " + params.embeddings)
        print ("MODEL FILE: " + params.model_base)
        print ("DECODER: " + params.decoder)
        print ("OUTPUT: " + params.output_file)
        print ("CONFIG FILE: " + str(params.config))
        print ("==============\n")

        testset = Dataset(params.test_file)
        encodings = Encodings()
        encodings.load(params.model_base + ".encodings")
        encodings.update_wordlist(testset)
        print ("Updated word list: " + str(len(encodings.word_list)))
        config = ParserConfig(filename=params.config)
        embeddings = WordEmbeddings()
        embeddings.read_from_file(params.embeddings, encodings.word_list)
        parser = BDRNNParser(config, encodings, embeddings)
        parser.load(params.model_base + ".bestUAS")
        if params.decoder == 'mst':
            print ("!!!!!!!!!!!!!!!!!!!!!!!!!USING MST DECODER")
            from graph.decoders import MSTDecoder
            parser.decoder = MSTDecoder()
        # fopen: presumably a project-level helper around open()
        f = fopen(params.output_file, "w")
        last_proc = 0
        index = 0
        for seq in testset.sequences:
            index += 1
            proc = index * 100 // len(testset.sequences)  # integer division keeps the % 5 progress check meaningful
            if proc % 5 == 0 and proc != last_proc:
                last_proc = proc
                sys.stdout.write(" " + str(proc))
                sys.stdout.flush()

            rez = parser.tag(seq)
            iSeq = 0
            iRez = 0
            while iSeq < len(seq):
                # skip compound (multiword) entries; predictions apply only to
                # the underlying syntactic words
                while seq[iSeq].is_compound_entry:
                    iSeq += 1
                seq[iSeq].xpos = rez[iRez].xpos
                seq[iSeq].upos = rez[iRez].upos
                seq[iSeq].attrs = rez[iRez].attrs
                seq[iSeq].head = rez[iRez].head
                seq[iSeq].label = rez[iRez].label
                seq[iSeq].lemma = rez[iRez].lemma
                iSeq += 1
                iRez += 1

            for entry in seq:
                f.write("\t".join(str(x) for x in (
                    entry.index, entry.word, entry.lemma, entry.upos,
                    entry.xpos, entry.attrs, entry.head, entry.label,
                    entry.deps, entry.space_after)) + "\n")
            f.write("\n")

        f.close()
        sys.stdout.write("\n")
Example 2
def parse_train(params):
    if params.train == 'mt':
        print "Starting training for " + params.train
        print "==PARAMETERS=="
        print "SRC TRAIN FILE: " + params.mt_train_src
        print "SRC DEV FILE: " + params.mt_dev_src
        print "SRC TEST FILE: " + str(params.mt_test_src)
        print "SRC EMBEDDINGS FILE: " + params.mt_source_embeddings
        print "DST TRAIN FILE: " + params.mt_train_dst
        print "DST DEV FILE: " + params.mt_dev_dst
        print "DST TEST FILE: " + str(params.mt_test_dst)
        print "DST EMBEDDINGS FILE: " + params.mt_destination_embeddings
        print "STOPPING CONDITION: " + str(params.itters)
        print "OUTPUT BASE: " + params.output_base
        print "AUX SOFTMAX WEIGHT: " + str(params.aux_softmax_weight)
        print "CONFIG FILE: " + str(params.config)
        print "==============\n"
        trainset = MTDataset(params.mt_train_src, params.mt_train_dst)
        devset = MTDataset(params.mt_dev_src, params.mt_dev_dst)
        if params.mt_test_src and params.mt_test_dst:
            testset = MTDataset(params.mt_test_src, params.mt_test_dst)
        else:
            testset = None

        config = NMTConfig(params.config)
        sys.stdout.write("--SOURCE--\n")
        sys.stdout.flush()
        src_enc = Encodings()
        src_enc.compute(trainset.to_conll_dataset('src'),
                        devset.to_conll_dataset('src'),
                        word_cutoff=5)
        sys.stdout.write("--DESTINATION--\n")
        sys.stdout.flush()
        dst_enc = Encodings()
        dst_enc.compute(trainset.to_conll_dataset('dst'),
                        devset.to_conll_dataset('dst'),
                        word_cutoff=5)
        sys.stdout.write("Reading source embeddings\n")
        src_we = WordEmbeddings()
        src_we.read_from_file(params.mt_source_embeddings,
                              'label',
                              full_load=False)
        sys.stdout.write("Reading destination embeddings\n")
        dst_we = WordEmbeddings()
        dst_we.read_from_file(params.mt_destination_embeddings,
                              'label',
                              full_load=False)
        nmt = BRNNMT(src_we, dst_we, src_enc, dst_enc, config)
        trainer = MTTrainer(nmt,
                            src_enc,
                            dst_enc,
                            src_we,
                            dst_we,
                            params.itters,
                            trainset,
                            devset,
                            testset=testset)
        trainer.start_training(params.output_base,
                               batch_size=params.batch_size)

    elif params.train == "tagger":
        print "Starting training for " + params.train
        print "==PARAMETERS=="
        print "TRAIN FILE: " + params.train_file
        print "DEV FILE: " + params.dev_file
        if params.test_file is not None:
            print "TEST FILE: " + params.test_file
        print "EMBEDDINGS FILE: " + params.embeddings
        print "STOPPING CONDITION: " + str(params.itters)
        print "OUTPUT BASE: " + params.output_base
        print "AUX SOFTMAX WEIGHT: " + str(params.aux_softmax_weight)
        print "CONFIG FILE: " + str(params.config)
        print "==============\n"

        trainset = Dataset(params.train_file)
        devset = Dataset(params.dev_file)
        if params.test_file:
            testset = Dataset(params.test_file)
        else:
            testset = None
        config = TaggerConfig(params.config)
        if not config._valid:
            return

        encodings = Encodings()
        encodings.compute(trainset, devset, 'label')
        # update wordlist if testset was provided
        if params.test_file:
            encodings.update_wordlist(testset)
        embeddings = WordEmbeddings()
        embeddings.read_from_file(params.embeddings, encodings.word_list)
        tagger = BDRNNTagger(config,
                             encodings,
                             embeddings,
                             aux_softmax_weight=params.aux_softmax_weight)
        trainer = TaggerTrainer(tagger, encodings, params.itters, trainset,
                                devset, testset)
        trainer.start_training(params.output_base,
                               batch_size=params.batch_size)

    elif params.train == "parser":
        print "Starting training for " + params.train
        print "==PARAMETERS=="
        print "TRAIN FILE: " + params.train_file
        print "DEV FILE: " + params.dev_file
        if params.test_file is not None:
            print "TEST FILE: " + params.test_file
        print "EMBEDDINGS FILE: " + params.embeddings
        print "STOPPING CONDITION: " + str(params.itters)
        print "OUTPUT BASE: " + params.output_base
        print "AUX SOFTMAX WEIGHT: " + str(params.aux_softmax_weight)
        print "CONFIG FILE: " + str(params.config)
        print "==============\n"

        trainset = Dataset(params.train_file)
        devset = Dataset(params.dev_file)
        if params.test_file:
            testset = Dataset(params.test_file)
        else:
            testset = None
        config = ParserConfig(params.config)
        if not config._valid:
            return
        # PARAM INJECTION: override config fields from a colon-separated
        # "name=value:name=value" string
        if params.params is not None:
            parts = params.params.split(":")
            for param in parts:
                variable = param.split("=")[0]
                value = param[len(variable) + 1:]
                print("External param injection: " + variable + "=" + value)
                # evaluate the value so numeric overrides keep their type
                config.__dict__[variable] = eval(value)
        # END INJECTION
        encodings = Encodings()
        encodings.compute(trainset, devset, 'label')
        # update wordlist if testset was provided
        if params.test_file:
            encodings.update_wordlist(testset)
        embeddings = WordEmbeddings()
        embeddings.read_from_file(params.embeddings, encodings.word_list)
        parser = BDRNNParser(config,
                             encodings,
                             embeddings,
                             aux_softmax_weight=params.aux_softmax_weight)
        trainer = ParserTrainer(parser, encodings, params.itters, trainset,
                                devset, testset)
        trainer.start_training(params.output_base,
                               batch_size=params.batch_size)

    elif params.train == "lemmatizer":
        print "Starting training for " + params.train
        print "==PARAMETERS=="
        print "TRAIN FILE: " + params.train_file
        print "DEV FILE: " + params.dev_file
        if params.test_file is not None:
            print "TEST FILE: " + params.test_file
        print "EMBEDDINGS FILE: " + params.embeddings
        print "STOPPING CONDITION: " + str(params.itters)
        print "OUTPUT BASE: " + params.output_base
        print "AUX SOFTMAX WEIGHT: " + str(params.aux_softmax_weight)
        print "CONFIG FILE: " + str(params.config)
        print "==============\n"

        trainset = Dataset(params.train_file)
        devset = Dataset(params.dev_file)
        if params.test_file:
            testset = Dataset(params.test_file)
        else:
            testset = None
        config = LemmatizerConfig(params.config)
        encodings = Encodings()
        encodings.compute(trainset, devset, 'label')
        # update wordlist if testset was provided
        if params.test_file:
            encodings.update_wordlist(testset)

        embeddings = None
        lemmatizer = FSTLemmatizer(config, encodings, embeddings)
        trainer = LemmatizerTrainer(lemmatizer, encodings, params.itters,
                                    trainset, devset, testset)
        trainer.start_training(params.output_base,
                               batch_size=params.batch_size)

    elif params.train == "compound":
        print "Starting training for " + params.train
        print "==PARAMETERS=="
        print "TRAIN FILE: " + params.train_file
        print "DEV FILE: " + params.dev_file
        if params.test_file is not None:
            print "TEST FILE: " + params.test_file
        print "EMBEDDINGS FILE: " + params.embeddings
        print "STOPPING CONDITION: " + str(params.itters)
        print "OUTPUT BASE: " + params.output_base
        print "AUX SOFTMAX WEIGHT: " + str(params.aux_softmax_weight)
        print "CONFIG FILE: " + str(params.config)
        print "==============\n"

        trainset = Dataset(params.train_file)
        devset = Dataset(params.dev_file)
        if params.test_file:
            testset = Dataset(params.test_file)
        else:
            testset = None
        config = CompoundWordConfig(params.config)
        encodings = Encodings()
        encodings.compute(trainset, devset, 'label')
        # update wordlist if testset was provided
        if params.test_file:
            encodings.update_wordlist(testset)

        embeddings = None
        expander = CompoundWordExpander(config, encodings, embeddings)
        trainer = CompoundWordTrainer(expander, encodings, params.itters,
                                      trainset, devset, testset)
        trainer.start_training(params.output_base,
                               batch_size=params.batch_size)

    elif params.train == "tokenizer":
        print "Starting training for " + params.train
        print "==PARAMETERS=="
        print "TRAIN FILE: " + params.train_file
        print "RAW TRAIN FILE: " + (params.raw_train_file if params.
                                    raw_train_file is not None else "n/a")
        print "DEV FILE: " + params.dev_file
        print "RAW DEV FILE: " + (params.raw_dev_file if params.raw_dev_file
                                  is not None else "n/a")
        print "TEST FILE: " + (params.test_file
                               if params.test_file is not None else "n/a")
        print "RAW TEST FILE: " + (params.raw_test_file if params.raw_test_file
                                   is not None else "n/a")
        print "EMBEDDINGS FILE: " + params.embeddings
        print "STOPPING CONDITION: " + str(params.itters)
        print "OUTPUT BASE: " + params.output_base
        print "CONFIG FILE: " + str(params.config)
        print "==============\n"
        trainset = Dataset(params.train_file)
        devset = Dataset(params.dev_file)
        if params.test_file:
            testset = Dataset(params.test_file)
        else:
            testset = None
        from generic_networks.tokenizers import TieredTokenizer
        config = TieredTokenizerConfig(params.config)
        config.raw_test_file = params.raw_test_file
        config.base = params.output_base
        config.patience = params.itters
        if not config._valid:
            return

        encodings = Encodings()
        encodings.compute(trainset, devset, 'label')
        # update wordlist if testset was provided
        if params.test_file:
            encodings.update_wordlist(testset)
        embeddings = WordEmbeddings()
        # passing None as the wordlist makes WordEmbeddings act as a
        # cache-only store that loads offsets for all words
        embeddings.read_from_file(params.embeddings, None)
        tokenizer = TieredTokenizer(config, encodings, embeddings)
        trainer = TokenizerTrainer(tokenizer,
                                   encodings,
                                   params.itters,
                                   trainset,
                                   devset,
                                   testset,
                                   raw_train_file=params.raw_train_file,
                                   raw_dev_file=params.raw_dev_file,
                                   raw_test_file=params.raw_test_file,
                                   gold_train_file=params.train_file,
                                   gold_dev_file=params.dev_file,
                                   gold_test_file=params.test_file)
        trainer.start_training(params.output_base,
                               batch_size=params.batch_size)
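
The PARAM INJECTION branch in the parser case accepts colon-separated name=value pairs and evaluates each value before writing it into the config. Below is a standalone sketch of that override format, using SimpleNamespace as a stand-in for the real ParserConfig; the field names and values are illustrative, not project defaults.

from types import SimpleNamespace

# Standalone sketch of the "name=value:name=value" override format used by
# the parser branch; SimpleNamespace stands in for the real ParserConfig.
config = SimpleNamespace(layers=2, dropout=0.33)

override = "layers=3:dropout=0.5"  # illustrative names and values
for param in override.split(":"):
    variable = param.split("=")[0]
    value = param[len(variable) + 1:]
    # evaluate the value so numeric overrides keep their type
    config.__dict__[variable] = eval(value)

print(config.layers, config.dropout)  # -> 3 0.5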