Example #1
    def train_parser(cls, options, data_train=None, data_dev=None, data_test=None):
        if sys.platform.startswith("linux"):
            set_proc_name(options.title)
        ensure_dir(options.output)
        path = os.path.join(options.output, "{}_{}_train.log".format(options.title,
                                                                     int(time.time())))
        log_to_file(path)
        logger.name = options.title
        cls.options_hook(options)
        DataFormatClass = cls.get_data_formats()[options.data_format]

        if data_train is None:
            data_train = DataFormatClass.from_file(options.conll_train)

        if data_dev is None:
            data_dev = {i: DataFormatClass.from_file(i, False) for i in options.conll_dev}

        # only load test data when none was supplied and a test path is given;
        # a caller-supplied data_test is left untouched
        if data_test is None and options.conll_test is not None:
            data_test = DataFormatClass.from_file(options.conll_test, False)

        try:
            os.makedirs(options.output)
        except OSError:
            pass

        return cls.repeat_train_and_validate(data_train, data_dev, data_test, options)
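
A minimal invocation sketch for this classmethod. Parser here is a hypothetical subclass and the option values are placeholders; only the field names are taken from the body above:

    from argparse import Namespace

    options = Namespace(
        title="biaffine",            # process and logger name
        output="out/",               # directory for logs and checkpoints
        data_format="conllu",        # key into cls.get_data_formats()
        conll_train="train.conllu",  # training treebank path
        conll_dev=["dev.conllu"],    # iterated over, so a list of paths
        conll_test=None,             # optional held-out test file
    )
    Parser.train_parser(options)     # Parser: hypothetical subclass providing the hooks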
Example #2
    def train_parser(cls, options, data_train=None, data_dev=None, data_test=None):
        set_proc_name(options.title)
        ensure_dir(options.output)
        path = os.path.join(options.output, "{}_{}_train.log".format(options.title,
                                                                     int(time.time())))
        log_to_file(path)
        logger.name = options.title
        cls.options_hook(options)
        DataFormatClass = cls.get_data_formats()[options.data_format]

        if data_train is None:
            data_train = DataFormatClass.from_file(options.conll_train)

        if data_dev is None:
            data_dev = {i: DataFormatClass.from_file(i, False) for i in options.conll_dev}

        try:
            os.makedirs(options.output)
        except OSError:
            pass

        parser = cls(options, data_train)
        random_obj = random.Random(1)
        for epoch in range(options.epochs):
            logger.info('Starting epoch %d', epoch)
            random_obj.shuffle(data_train)
            options.is_train = True
            parser.train(data_train)

            # save model and delete old model
            for i in range(0, epoch - options.max_save):
                path = os.path.join(options.output, os.path.basename(options.model)) + str(i + 1)
                if os.path.exists(path):
                    os.remove(path)
            path = os.path.join(options.output, os.path.basename(options.model)) + str(epoch + 1)
            parser.save(path)

            def predict(sentences, gold_file, output_file):
                options.is_train = False
                with open(output_file, "w") as f_output:
                    if hasattr(DataFormatClass, "file_header"):
                        f_output.write(DataFormatClass.file_header + "\n")
                    for i in parser.predict(sentences):
                        f_output.write(i.to_string())
                # script_path = os.path.join(os.path.dirname(__file__), "main.py")
                # p = subprocess.Popen([sys.executable, script_path, "mst+empty", "predict", "--model", path,
                #                       "--test", gold_file,
                #                       "--output", output_file], stdout=sys.stdout)
                # p.wait()
                DataFormatClass.evaluate_with_external_program(gold_file, output_file)

            for file_name, file_content in data_dev.items():
                try:
                    prefix, suffix = os.path.basename(file_name).rsplit(".", 1)
                except ValueError:
                    prefix = os.path.basename(file_name)
                    suffix = ""

                dev_output = os.path.join(options.output, '{}_epoch_{}.{}'.format(prefix, epoch + 1, suffix))
                predict(file_content, file_name, dev_output)
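
The deletion loop above keeps only the most recent options.max_save checkpoints, with 1-based suffixes appended to the model file name. A standalone sketch of the same rotation logic (function name is hypothetical):

    import os

    def rotate_checkpoints(output_dir, model_name, epoch, max_save):
        # Remove checkpoints numbered 1 .. epoch - max_save, mirroring the
        # loop above; the checkpoint for the current epoch is saved afterwards.
        for i in range(0, epoch - max_save):
            path = os.path.join(output_dir, os.path.basename(model_name)) + str(i + 1)
            if os.path.exists(path):
                os.remove(path)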
Example #3
    def train_parser(cls, options, data_train=None, data_dev=None, data_test=None):
        set_proc_name(options.title)
        ensure_dir(options.output)
        path = os.path.join(options.output, "{}_{}_train.log".format(options.title,
                                                                     int(time.time())))
        log_to_file(path)
        logger.name = options.title

        logger.info('Options:\n%s', pformat(options.__dict__))
        if data_train is None:
            data_train = cls.DataType.from_file(options.conll_train)

        if data_dev is None:
            data_dev = {i: cls.DataType.from_file(i, False) for i in options.conll_dev}

        try:
            os.makedirs(options.output)
        except OSError:
            pass

        parser = cls(options, data_train)
        random_obj = random.Random(1)

        def do_predict(epoch):
            for file_name, dev_sentences in data_dev.items():
                try:
                    prefix, suffix = os.path.basename(file_name).rsplit(".", 1)
                except ValueError:
                    # match the rsplit branch: keep only the base name so the
                    # joined output path does not embed directory components
                    prefix = os.path.basename(file_name)
                    suffix = ""

                dev_output = os.path.join(options.output, '{}_epoch_{}.{}'.format(prefix, epoch, suffix))
                cls.predict_and_output(parser, options, dev_sentences, dev_output)

        if options.epochs == 0:
            logger.info('No training epochs requested; predicting directly.')
            do_predict(0)

        for epoch in range(options.epochs):
            logger.info('Starting epoch %d', epoch)
            random_obj.shuffle(data_train)
            parser.train(data_train)

            # save model and delete old model
            for i in range(0, epoch - options.max_save):
                path = os.path.join(options.output, os.path.basename(options.model)) + str(i + 1)
                if os.path.exists(path):
                    os.remove(path)
            path = os.path.join(options.output, os.path.basename(options.model)) + str(epoch + 1)
            parser.save(path)
            do_predict(epoch)
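
The dev-output naming splits each file name once from the right, so dev.conllu at epoch 3 becomes dev_epoch_3.conllu (extensionless names keep a trailing dot). A small illustration of that rsplit logic (helper name is hypothetical):

    import os

    def epoch_output_name(file_name, epoch):
        try:
            prefix, suffix = os.path.basename(file_name).rsplit(".", 1)
        except ValueError:  # no extension to split off
            prefix, suffix = os.path.basename(file_name), ""
        return '{}_epoch_{}.{}'.format(prefix, epoch, suffix)

    print(epoch_output_name("data/dev.conllu", 3))  # dev_epoch_3.conllu
    print(epoch_output_name("data/dev", 3))         # dev_epoch_3.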
Example #4
def train_parser(options,
                 sentences_train=None,
                 sentences_dev=None,
                 sentences_test=None):
    current_path = os.path.dirname(__file__)
    set_proc_name(options.title)
    if not (options.rlFlag or options.rlMostFlag or options.headFlag):
        print(
            'You must use either --userlmost or --userl or --usehead (you can use multiple)'
        )
        sys.exit()

    if not sentences_train:
        sentences_train = get_sentences(options.conll_train)
    if not sentences_dev:
        sentences_dev = get_sentences(options.conll_dev) \
            if options.conll_dev is not None else None
    if not sentences_test:
        sentences_test = get_sentences(options.conll_test) \
            if options.conll_test is not None else None

    print('Preparing vocab')
    words, w2i, pos, rels = tree_utils.vocab(sentences_train)
    if not os.path.exists(options.output):
        os.mkdir(options.output)
    with open(os.path.join(options.output, options.params), 'wb') as paramsfp:
        pickle.dump((words, w2i, pos, rels, options), paramsfp)
    print('Finished collecting vocab')
    print('Initializing blstm arc hybrid:')
    parser = ArcHybridLSTM(words, pos, rels, w2i, options)
    for epoch in range(options.epochs):
        print('Starting epoch', epoch)
        parser.Train(sentences_train)

        def predict(sentences, gold_file, output_file):

            with open(output_file, "w") as f:
                result = parser.Predict(sentences)
                for i in result:
                    f.write(i.to_string())

            # score the predictions with the CoNLL 2017 UD evaluation script
            eval_script = os.path.join(
                current_path, "utils/evaluation_script/conll17_ud_eval.py")
            weight_file = os.path.join(current_path,
                                       "utils/evaluation_script/weights.clas")
            eval_process = sh.python(eval_script,
                                     "-v",
                                     "-w",
                                     weight_file,
                                     gold_file,
                                     output_file,
                                     _out=output_file + '.txt')
            eval_process.wait()
            sh.cat(output_file + '.txt', _out=sys.stdout)

            print('Finished predicting {}'.format(gold_file))

        if sentences_dev:
            dev_output = os.path.join(
                options.output, 'dev_epoch_' + str(epoch + 1) + '.conllu')
            predict(sentences_dev, options.conll_dev, dev_output)

        if sentences_test:
            test_output = os.path.join(
                options.output, 'test_epoch_' + str(epoch + 1) + '.conllu')
            predict(sentences_test, options.conll_test, test_output)

        for i in range(epoch + 1 - options.max_model):
            filename = os.path.join(options.output, options.model + str(i))
            if os.path.exists(filename):
                os.remove(filename)
        parser.Save(
            os.path.join(options.output, options.model + str(epoch + 1)))
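
The evaluation shells out through the sh module to the CoNLL 2017 UD evaluation script and redirects its report to a text file. A trimmed sketch of that call pattern; the script, weights, and data paths are placeholders, and a python executable is assumed on PATH:

    import sys
    import sh

    # -v prints verbose per-metric scores; -w supplies the metric weights file
    sh.python("conll17_ud_eval.py", "-v", "-w", "weights.clas",
              "gold.conllu", "pred.conllu", _out="pred.conllu.txt")
    sh.cat("pred.conllu.txt", _out=sys.stdout)  # echo the report to the console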
Example #5
        for i in range(epoch + 1 - options.max_model):
            filename = os.path.join(options.output, options.model + str(i))
            if os.path.exists(filename):
                os.remove(filename)
        parser.Save(
            os.path.join(options.output, options.model + str(epoch + 1)))


if __name__ == '__main__':
    parser = get_parser()
    (options, args) = parser.parse_args()
    print('Using external embedding:', options.external_embedding)

    current_path = os.path.dirname(__file__)
    set_proc_name(options.title)

    if not options.predictFlag:
        train_parser(options)
    else:
        with open(options.params, 'rb') as paramsfp:  # pickle requires binary mode
            words, w2i, pos, rels, stored_opt = pickle.load(paramsfp)

        stored_opt.external_embedding = options.external_embedding

        parser = ArcHybridLSTM(words, pos, rels, w2i, stored_opt)
        parser.Load(options.model)
        conllu = (os.path.splitext(options.conll_test.lower())[1] == '.conllu')
        tespath = os.path.join(
            options.output,
            'test_pred.conll' if not conllu else 'test_pred.conllu')
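
Since Example #4 dumps the parameter tuple with 'wb', the load side must also open in binary mode. A self-contained round-trip sketch with placeholder values:

    import pickle

    params = ({"the": 1}, {"the": 0}, ["DET"], ["det"], None)  # placeholder vocab tuple

    with open("params.pickle", "wb") as f:  # matches the dump in Example #4
        pickle.dump(params, f)

    with open("params.pickle", "rb") as f:  # pickle requires binary mode
        words, w2i, pos, rels, stored_opt = pickle.load(f)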