Example no. 1
import csv
import collections
import importlib

import tensorflow as tf
import texar as tx

# BERT helpers (checkpoint loading, WordPiece tokenization) from the Texar examples.
from texar_repo.examples.bert.utils import model_utils, tokenization
from texar_repo.examples.bert import config_classifier as config_downstream
from texar_repo.texar.utils import transformer_utils
# Transformer example helpers; data_utils provides the TFRecord input utilities used below.
from texar_repo.examples.transformer.utils import data_utils, utils
from texar_repo.examples.transformer.bleu_tool import bleu_wrapper

# Training dataset: an input_fn built from the serialized TFRecord file and
# immediately called with the batch size to obtain a tf.data dataset.
# file_based_input_fn_builder is defined elsewhere in this example.
train_dataset = file_based_input_fn_builder(
    input_file=train_out_file,
    max_seq_length_src=max_seq_length_src,
    max_seq_length_tgt=max_seq_length_tgt,
    is_training=True,
    drop_remainder=True,
    is_distributed=is_distributed)({
        'batch_size': batch_size
    })

# Evaluation dataset, built the same way from the eval TFRecord file
# (this example keeps is_training=True and drop_remainder=True here as well).
eval_dataset = file_based_input_fn_builder(
    input_file=eval_out_file,
    max_seq_length_src=max_seq_length_src,
    max_seq_length_tgt=max_seq_length_tgt,
    is_training=True,
    drop_remainder=True,
    is_distributed=is_distributed)({
        'batch_size': eval_batch_size
    })
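The builder called above is not shown in this example. Below is a minimal sketch of what a file_based_input_fn_builder for paired source/target TFRecords could look like; the feature names, shuffle buffer, and casting are assumptions, not the example's actual implementation.

def file_based_input_fn_builder(input_file, max_seq_length_src,
                                max_seq_length_tgt, is_training,
                                drop_remainder, is_distributed=False):
    # Fixed-length int64 features for the encoder inputs and decoder targets
    # (feature names are assumed for illustration).
    name_to_features = {
        "src_input_ids": tf.io.FixedLenFeature([max_seq_length_src], tf.int64),
        "src_segment_ids": tf.io.FixedLenFeature([max_seq_length_src], tf.int64),
        "tgt_input_ids": tf.io.FixedLenFeature([max_seq_length_tgt], tf.int64),
    }

    def _decode_record(record):
        # Parse one serialized tf.train.Example and cast int64 features to int32.
        example = tf.io.parse_single_example(record, name_to_features)
        return {name: tf.cast(t, tf.int32) for name, t in example.items()}

    def input_fn(params):
        batch_size = params["batch_size"]
        dataset = tf.data.TFRecordDataset(input_file)
        if is_training:
            # Repeat and shuffle only for training.
            dataset = dataset.repeat()
            dataset = dataset.shuffle(buffer_size=100)
        dataset = dataset.map(_decode_record)
        dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
        return dataset

    return input_fn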
Example no. 2
# Assumes module-level imports/objects not shown in this snippet: os, pickle,
# tf (TensorFlow), logging, modeling, tokenization, the preprocessing module pr,
# the parsed command-line args, model_fn_builder and Writer.
def main(_):
    logging.set_verbosity(logging.INFO)
    processors = {"ner": pr.NerProcessor}
    # Load the BERT model configuration and instantiate the NER data processor.
    bert_config = modeling.BertConfig.from_json_file(args.bert_config_file)
    processor = processors['ner']()
    label_list = processor.get_labels()
    # Cased WordPiece tokenizer (do_lower_case=False).
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=False)
    run_config = tf.estimator.RunConfig(
        model_dir=args.output_dir,
        save_checkpoints_steps=args.save_checkpoints_steps)
    # Training-specific quantities; only filled in when training.
    train_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if args.process == 'train':
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / args.batch_size * args.epoch)
        num_warmup_steps = int(num_train_steps * args.warmup_proportion)
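        # Illustrative arithmetic (numbers not from the source): with 10,000
        # training examples, batch_size=32 and epoch=3, num_train_steps =
        # int(10000 / 32 * 3) = 937, and with warmup_proportion=0.1,
        # num_warmup_steps = int(937 * 0.1) = 93.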
    # Build the Estimator model_fn; init_checkpoint points at the pre-trained
    # BERT weights to warm-start from.
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=args.init_checkpoint,
                                learning_rate=args.lr,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_one_hot_embeddings=False)
    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)

    if args.process == 'train':
        # Serialize the training examples to a TFRecord file of input features.
        train_file = os.path.join(args.output_dir, "train.tf_record")
        _, _ = pr.filed_based_convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer,
            train_file)
        logging.info("***** Running training *****")
        logging.info("  Num examples = %d", len(train_examples))
        logging.info("  Batch size = %d", args.batch_size)
        logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = pr.file_based_input_fn_builder(
            input_file=train_file,
            seq_length=args.max_seq_length,
            is_training=True,
            drop_remainder=True,
            batch_size=args.batch_size)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    if args.process == "eval":
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_file = os.path.join(args.output_dir, "eval.tf_record")
        batch_tokens, batch_labels = pr.filed_based_convert_examples_to_features(
            eval_examples, label_list, args.max_seq_length, tokenizer,
            eval_file)

        logging.info("***** Running evaluation *****")
        logging.info("  Num examples = %d", len(eval_examples))
        logging.info("  Batch size = %d", args.batch_size)
        eval_input_fn = pr.file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=False,
            batch_size=args.batch_size)
        result = estimator.evaluate(input_fn=eval_input_fn)
        # Log the evaluation metrics and persist them to eval_results.txt
        # (the keys are whatever eval_metric_ops the model_fn returns).
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as wf:
            logging.info("***** Eval results *****")
            f = result["acc"]
            r = result["rec"]
            p = result["pre"]
            logging.info("***********************************************")
            logging.info("********************P = %s*********************",
                         str(p))
            logging.info("********************R = %s*********************",
                         str(r))
            logging.info("********************F = %s*********************",
                         str(f))
            logging.info("***********************************************")
            wf.write("P = %s\nR = %s\nF = %s\n" % (str(p), str(r), str(f)))
    if args.process == "predict":
        with open(args.middle_output + '/label2id.pkl', 'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}

        predict_examples = processor.get_test_examples(args.data_dir)

        predict_file = os.path.join(args.output_dir, "predict.tf_record")
        batch_tokens, batch_labels = pr.filed_based_convert_examples_to_features(
            predict_examples, label_list, args.max_seq_length, tokenizer,
            predict_file)

        logging.info("***** Running prediction*****")
        logging.info("  Num examples = %d", len(predict_examples))
        logging.info("  Batch size = %d", args.batch_size)

        predict_input_fn = pr.file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=False,
            batch_size=args.batch_size)

        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(args.output_dir, "label_test.txt")
        Writer(output_predict_file, result, batch_tokens, batch_labels,
               id2label)
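main() reads its settings from a module-level args object that is not shown in this example. A rough argparse sketch covering the attributes referenced above is given below; the flag names come from those args.* references, while the defaults are illustrative guesses rather than the original script's values.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--process", choices=["train", "eval", "predict"], required=True)
parser.add_argument("--data_dir", required=True)
parser.add_argument("--output_dir", required=True)
parser.add_argument("--middle_output", default="middle_data")
parser.add_argument("--bert_config_file", required=True)
parser.add_argument("--vocab_file", required=True)
parser.add_argument("--init_checkpoint", default=None)
parser.add_argument("--max_seq_length", type=int, default=128)
parser.add_argument("--batch_size", type=int, default=32)
parser.add_argument("--lr", type=float, default=5e-5)
parser.add_argument("--epoch", type=float, default=3.0)
parser.add_argument("--warmup_proportion", type=float, default=0.1)
parser.add_argument("--save_checkpoints_steps", type=int, default=1000)
args = parser.parse_args()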