Esempio n. 1
0
def do_training(args):
    """Train an ``NerBiLstmModel`` and write dev-set predictions to disk.

    The function seeds torch, creates the output directory, attaches a file
    handler to the root logger, loads data and embeddings, trains the model
    through ``Trainer``, then reloads the weights stored at
    ``config.model_output`` and writes predictions for the dev examples to
    ``model.config.conll_output`` and ``model.config.eval_output``.

    Args:
        args: Argument object forwarded to ``Config``, ``load_data`` and
            ``load_embeddings``.
    """
    # Fixed seed so that runs are repeatable.
    torch.manual_seed(133)

    # Set up configuration and output.
    config = Config(args)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(config.output_path, exist_ok=True)

    # Set up logging: mirror everything sent to the root logger into a file.
    handler = logging.FileHandler(config.log_output)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(
        logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
    logging.getLogger().addHandler(handler)

    # Load data
    helper, data = load_data(args)
    train_examples = data['train_examples']
    dev_examples = data['dev_examples']
    helper.save(config.output_path)

    # Load embeddings
    embeddings = load_embeddings(args, helper, config.device)

    # Initialize model
    logger.info("Initializing model...")
    model = NerBiLstmModel(helper, config, embeddings)
    model.to(config.device)

    # Preprocess data
    data_preprocessor = DataPreprocessor(model, config, helper)
    train_examples = data_preprocessor.preprocess_sequence_data(train_examples)
    dev_examples = data_preprocessor.preprocess_sequence_data(dev_examples)

    # Start training
    trainer = Trainer(model, config, helper, logger)
    logger.info("Starting training...")
    trainer.train(train_examples, dev_examples)

    # Save predictions of the best model
    logger.info(
        "Training completed, saving predictions of the best model...")
    with torch.no_grad():
        # map_location keeps the checkpoint tensors on the device the model
        # was moved to, regardless of where they were saved from.
        state_dict = torch.load(config.model_output,
                                map_location=config.device)
        model.load_state_dict(state_dict)
        model.eval()
        predictor = Predictor(model, config)
        output = predictor.predict(dev_examples, use_str_labels=True)
        sentences, labels, predictions = zip(*output)
        # Translate each predicted index into its label via LBLS.
        predictions = [[LBLS[label_idx] for label_idx in preds]
                       for preds in predictions]
        # A list (not a bare zip iterator) because it is traversed twice below.
        output = list(zip(sentences, labels, predictions))

        with open(model.config.conll_output, 'w') as f:
            write_conll(f, output)
        with open(model.config.eval_output, 'w') as f:
            for sentence, gold_labels, predicted_labels in output:
                print_sentence(f, sentence, gold_labels, predicted_labels)
Esempio n. 2
0
    def infer(self, conll):
        """
        Tag the given examples by running the external CRF test command.

        Every element of `conll` is written to `self.test_path` with
        `write_conll`, followed by a newline. The command `CRF_TEST` is then
        invoked with the model at `self.model_path` on that file, and its
        textual output is parsed with `read_conll_doc`.

        @param: conll is a sized collection of examples accepted by
                `write_conll` (presumably one list of token rows per
                sentence -- confirm against callers).
        @return: for each parsed example, the list made of the last field of
                 each of its tokens.
        """
        with open(self.test_path, "w") as test_file:
            for example in conll:
                write_conll(test_file, example)
                test_file.write("\n")

        command = [CRF_TEST, "-m", self.model_path, self.test_path]
        raw_output = check_output(command, universal_newlines=True)
        parsed_examples = read_conll_doc(raw_output)

        # The tagger must return exactly one example per input example.
        assert len(parsed_examples) == len(conll)

        # The last field of every token row is taken as its tag.
        return [[token[-1] for token in example] for example in parsed_examples]
Esempio n. 3
0
def do_train(args):
    """Train an ``RNNModel`` and write its dev-set predictions to disk.

    Loads and preprocesses the data, loads the embeddings, attaches a file
    handler to the root logger, builds the model inside a fresh TensorFlow
    graph, fits it, and finally writes the predictions for ``dev_raw`` to
    ``model.config.conll_output`` and ``model.config.eval_output``.

    Args:
        args: Argument object forwarded to ``Config``,
            ``load_and_preprocess_data`` and ``load_embeddings``.
    """
    # Set up some parameters.
    config = Config(args)
    helper, train, dev, train_raw, dev_raw = load_and_preprocess_data(args)
    embeddings = load_embeddings(args, helper)
    # The embedding width is taken from the loaded matrix, not from defaults.
    config.embed_size = embeddings.shape[1]
    helper.save(config.output_path)

    # Mirror everything sent to the root logger into the configured log file.
    handler = logging.FileHandler(config.log_output)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(
        logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
    logging.getLogger().addHandler(handler)

    # Reporting is disabled; set to Report(Config.eval_output) to enable it.
    report = None

    with tf.Graph().as_default():
        logger.info("Building model...")
        start = time.time()
        model = RNNModel(helper, config, embeddings)
        logger.info("took %.2f seconds", time.time() - start)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as session:
            session.run(init)
            model.fit(session, saver, train, dev)
            if report:
                report.log_output(model.output(session, dev_raw))
                report.save()
            else:
                # Save predictions in a text file.
                output = model.output(session, dev_raw)
                sentences, labels, predictions = zip(*output)
                # Translate each predicted index into its label via LBLS.
                predictions = [[LBLS[label_idx] for label_idx in preds]
                               for preds in predictions]
                # Must be a list: the result is traversed twice below, and a
                # bare zip() iterator would be exhausted by write_conll,
                # leaving the eval output file empty.
                output = list(zip(sentences, labels, predictions))

                with open(model.config.conll_output, 'w') as f:
                    write_conll(f, output)
                with open(model.config.eval_output, 'w') as f:
                    for sentence, gold_labels, predicted_labels in output:
                        print_sentence(f, sentence, gold_labels,
                                       predicted_labels)
Esempio n. 4
0
def do_train(args):
    """Train a ``WindowModel`` and write its dev-set predictions to disk.

    Loads and preprocesses the data, loads the embeddings, attaches a file
    handler to the root logger, builds the model inside a fresh TensorFlow
    graph, fits it, and finally writes the predictions for ``dev_raw`` to
    ``model.config.conll_output`` and ``model.config.eval_output``.

    Args:
        args: Argument object forwarded to ``load_and_preprocess_data`` and
            ``load_embeddings``.
    """
    # Set up some parameters.
    config = Config()
    helper, train, dev, train_raw, dev_raw = load_and_preprocess_data(args)
    embeddings = load_embeddings(args, helper)
    # The embedding width is taken from the loaded matrix, not from defaults.
    config.embed_size = embeddings.shape[1]
    helper.save(config.output_path)

    # Mirror everything sent to the root logger into the configured log file.
    handler = logging.FileHandler(config.log_output)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(
        logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
    logging.getLogger().addHandler(handler)

    # Reporting is disabled; set to Report(Config.eval_output) to enable it.
    report = None

    with tf.Graph().as_default():
        logger.info("Building model...")
        start = time.time()
        model = WindowModel(helper, config, embeddings)
        logger.info("took %.2f seconds", time.time() - start)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as session:
            # To debug numerical issues, wrap the session before running:
            #session = tf_debug.LocalCLIDebugWrapperSession(session)
            #session.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
            session.run(init)
            model.fit(session, saver, train, dev)
            if report:
                report.log_output(model.output(session, dev_raw))
                report.save()
            else:
                # Save predictions in a text file.
                output = model.output(session, dev_raw)
                sentences, labels, predictions = zip(*output)
                # Translate each predicted index into its label via LBLS.
                predictions = [[LBLS[label_idx] for label_idx in preds]
                               for preds in predictions]
                # Must be a list: the result is traversed twice below, and a
                # bare zip() iterator would be exhausted by write_conll,
                # leaving the eval output file empty.
                output = list(zip(sentences, labels, predictions))

                with open(model.config.conll_output, 'w') as f:
                    write_conll(f, output)
                with open(model.config.eval_output, 'w') as f:
                    for sentence, gold_labels, predicted_labels in output:
                        print_sentence(f, sentence, gold_labels,
                                       predicted_labels)
Esempio n. 5
0
    def update(self, conll, tags):
        """
        Store the labels chosen for the current example.

        Each row of `conll` is truncated at `self.TAG_LABEL` and paired with
        the corresponding entry of `tags`. If `self.cur_index` points at an
        example that is already in `self.labelled_data`, that entry is
        replaced and the whole training file is rewritten; otherwise the
        example is appended to both the in-memory list and the open file.
        """
        # Create labelled data: original features followed by the new tag.
        labelled_example = [
            features[:self.TAG_LABEL] + [tag]
            for features, tag in zip(conll, tags)
        ]

        if self.cur_index > len(self.labelled_data):
            # New example: extend the list and the already-open file.
            self.labelled_data.append(labelled_example)
            write_conll(self.labelled_data_file, labelled_example)
            return

        # We moved back to an earlier example, so replace it and rewrite the
        # whole labelled set from scratch.
        self.labelled_data[self.cur_index - 1] = labelled_example
        self.labelled_data_file.close()
        self.labelled_data_file = open(self.train_path, 'w')
        for example in self.labelled_data:
            write_conll(self.labelled_data_file, example)
        self.labelled_data_file.close()
        # Reopen in append mode so later new examples are added at the end.
        self.labelled_data_file = open(self.train_path, 'a')
Esempio n. 6
0
def do_train(args):
    """Train a ``WindowModel`` and write its dev-set predictions to disk.

    Loads and preprocesses the data, loads the embeddings, attaches a file
    handler to the root logger, builds the model inside a fresh TensorFlow
    graph, fits it, and finally writes the predictions for ``dev_raw`` to
    ``model.config.conll_output`` and ``model.config.eval_output``.

    Args:
        args: Argument object forwarded to ``load_and_preprocess_data`` and
            ``load_embeddings``.
    """
    # Set up some parameters.
    config = Config()
    helper, train, dev, train_raw, dev_raw = load_and_preprocess_data(args)
    embeddings = load_embeddings(args, helper)
    # The embedding width is taken from the loaded matrix, not from defaults.
    config.embed_size = embeddings.shape[1]
    helper.save(config.output_path)

    # Mirror everything sent to the root logger into the configured log file.
    handler = logging.FileHandler(config.log_output)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
    logging.getLogger().addHandler(handler)

    # Reporting is disabled; set to Report(Config.eval_output) to enable it.
    report = None

    with tf.Graph().as_default():
        logger.info("Building model...")
        start = time.time()
        model = WindowModel(helper, config, embeddings)
        logger.info("took %.2f seconds", time.time() - start)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as session:
            session.run(init)
            model.fit(session, saver, train, dev)
            if report:
                report.log_output(model.output(session, dev_raw))
                report.save()
            else:
                # Save predictions in a text file.
                output = model.output(session, dev_raw)
                sentences, labels, predictions = zip(*output)
                # Translate each predicted index into its label via LBLS.
                predictions = [[LBLS[label_idx] for label_idx in preds]
                               for preds in predictions]
                # Must be a list: the result is traversed twice below, and a
                # bare zip() iterator would be exhausted by write_conll,
                # leaving the eval output file empty.
                output = list(zip(sentences, labels, predictions))

                with open(model.config.conll_output, 'w') as f:
                    write_conll(f, output)
                with open(model.config.eval_output, 'w') as f:
                    for sentence, gold_labels, predicted_labels in output:
                        print_sentence(f, sentence, gold_labels,
                                       predicted_labels)
Esempio n. 7
0
def do_train(args):
    """Train a ``WindowModel`` and write its dev-set predictions to disk.

    Loads and preprocesses the data, loads the embeddings, attaches a file
    handler to the root logger, builds the model, applies ``init_weights``
    to it, fits it, and finally writes the predictions for ``dev_raw`` to
    ``model.config.conll_output`` and ``model.config.eval_output``.

    Args:
        args: Argument object forwarded to ``load_and_preprocess_data`` and
            ``load_embeddings``.
    """
    # Set up some parameters.
    config = Config()
    helper, train, dev, train_raw, dev_raw = load_and_preprocess_data(args)
    embeddings = load_embeddings(args, helper)
    # The embedding width is taken from the loaded matrix, not from defaults.
    config.embed_size = embeddings.shape[1]
    helper.save(config.output_path)

    # Mirror everything sent to the root logger into the configured log file.
    handler = logging.FileHandler(config.log_output)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(
        logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
    logging.getLogger().addHandler(handler)

    # Reporting is disabled; set to Report(Config.eval_output) to enable it.
    report = None

    logger.info("Building model...")
    start = time.time()
    model = WindowModel(helper, config, embeddings)
    logger.info("took %.2f seconds", time.time() - start)
    model.apply(init_weights)

    model.fit(train, dev)
    if report:
        report.log_output(model.output(dev_raw))
        report.save()
    else:
        # Save predictions in a text file.
        output = model.output(dev_raw)
        sentences, labels, predictions = zip(*output)
        # Translate each predicted index into its label via LBLS.
        predictions = [[LBLS[label_idx] for label_idx in preds]
                       for preds in predictions]
        # Must be a list: the result is traversed twice below, and a bare
        # zip() iterator would be exhausted by write_conll, leaving the eval
        # output file empty.
        output = list(zip(sentences, labels, predictions))

        with open(model.config.conll_output, 'w') as f:
            write_conll(f, output)
        with open(model.config.eval_output, 'w') as f:
            for sentence, gold_labels, predicted_labels in output:
                print_sentence(f, sentence, gold_labels, predicted_labels)