Example #1
def ner_eval(h_params: HParams, estimator, processor):
    """Evaluate the NER estimator on the dev set and write the metrics to eval_results.txt."""
    label_list = processor.get_labels()
    tokenizer = get_tokenizer(h_params)

    eval_examples = processor.get_dev_examples(h_params.data_dir)
    eval_file = os.path.join(h_params.output_dir, "eval.tf_record")
    filed_based_convert_examples_to_features(eval_examples, label_list,
                                             h_params.max_seq_length,
                                             tokenizer, eval_file, h_params)

    tf.logging.info("***** Running evaluation *****")
    tf.logging.info("  Num examples = %d", len(eval_examples))
    tf.logging.info("  Batch size = %d", h_params.eval_batch_size)
    eval_steps = None
    if h_params.use_tpu:
        eval_steps = int(len(eval_examples) / h_params.eval_batch_size)
    eval_drop_remainder = h_params.use_tpu
    eval_input_fn = file_based_input_fn_builder(
        input_file=eval_file,
        seq_length=h_params.max_seq_length,
        is_training=False,
        drop_remainder=eval_drop_remainder)
    result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
    output_eval_file = os.path.join(h_params.local_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        tf.logging.info("***** Eval results *****")
        for key in sorted(result.keys()):
            tf.logging.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
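
The helper file_based_input_fn_builder is used here but not shown. As a point of reference, the public BERT run_classifier.py builds it roughly as in the sketch below; the project's own version (in particular the per-token label shape used for NER) may differ.

# Condensed sketch of a BERT-style file_based_input_fn_builder, adapted from the
# public run_classifier.py. This is an illustration only; for NER the label
# feature is typically a [seq_length] vector rather than a single label_id.
def file_based_input_fn_builder(input_file, seq_length, is_training,
                                drop_remainder):
    name_to_features = {
        "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
        "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "label_ids": tf.FixedLenFeature([], tf.int64),
    }

    def input_fn(params):
        # TPUEstimator passes the batch size through params.
        batch_size = params["batch_size"]
        dataset = tf.data.TFRecordDataset(input_file)
        if is_training:
            dataset = dataset.repeat().shuffle(buffer_size=100)
        return dataset.apply(
            tf.contrib.data.map_and_batch(
                lambda record: tf.parse_single_example(record, name_to_features),
                batch_size=batch_size,
                drop_remainder=drop_remainder))

    return input_fn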
Example #2
def ner_train(h_params: HParams, estimator, processor):
    """Convert the training examples to a TFRecord file and train the NER estimator."""
    train_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if h_params.do_train:
        train_examples = processor.get_train_examples(h_params.data_dir)
        num_train_steps = h_params.num_train_steps
        num_warmup_steps = int(num_train_steps * h_params.warmup_proportion)

    tf.gfile.MakeDirs(h_params.output_dir)
    label_list = processor.get_labels()
    tokenizer = get_tokenizer(h_params)
    if h_params.do_train:
        train_file = os.path.join(h_params.output_dir, "train.tf_record")
        filed_based_convert_examples_to_features(train_examples, label_list,
                                                 h_params.max_seq_length,
                                                 tokenizer, train_file,
                                                 h_params)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", h_params.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)

        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=h_params.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
Example #3
def test_convert_examples_to_features():
    """Smoke test: build features for the debug data and print one example/feature pair."""
    hparams = get_debug_hparams()
    data_filename = get_data_filename(hparams)
    examples = get_examples(data_filename, SetType.train)
    features = convert_examples_to_features(
        examples=examples,
        label_list=get_unique_intents(data_filename),
        max_seq_length=hparams.max_seq_length,
        tokenizer=get_tokenizer(hparams))
    print(
        examples[2].__dict__
    )  # {'guid': 'train-2', 'text_a': 'Is it worth upgrading from 12.04 LTS to 13.04', 'text_b': None, 'label': 'Make Update'}
    print(
        features[2].__dict__
    )  # {'input_ids': [101, 2003, 2009, 4276, 25925, 2013, 2260, 1012, 5840, 8318, 2015, 2000, 2410, 1012, 5840, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'input_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'segment_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'label_id': 1}
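    # Hypothetical additional checks, not part of the original test: the padding
    # invariants visible in the printed feature. Ids 101 and 102 are [CLS] and
    # [SEP] in the standard uncased BERT vocabulary, and the mask should cover
    # exactly the non-padding (non-zero) token ids.
    assert len(features[2].input_ids) == hparams.max_seq_length
    assert len(features[2].input_mask) == hparams.max_seq_length
    assert sum(features[2].input_mask) == sum(
        1 for token_id in features[2].input_ids if token_id != 0)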
Example #4
def ner_pred(h_params: HParams, estimator, processor):
    """Run NER prediction on the test set and return the predicted label-id sequences."""
    h_params = h_params._replace(use_tpu=False)
    label_list = processor.get_labels()
    tokenizer = get_tokenizer(h_params)

    token_path = os.path.join(h_params.local_dir, "token_test.txt")
    with open(os.path.join(h_params.local_dir, 'label2id.pkl'), 'rb') as rf:
        label2id = pickle.load(rf)
        id2label = {value: key for key, value in label2id.items()}
    if os.path.exists(token_path):
        os.remove(token_path)
    predict_examples = processor.get_test_examples(h_params.data_dir)

    predict_file = os.path.join(h_params.output_dir, "predict.tf_record")
    filed_based_convert_examples_to_features(predict_examples,
                                             label_list,
                                             h_params.max_seq_length,
                                             tokenizer,
                                             predict_file,
                                             h_params,
                                             mode="test")

    tf.logging.info("***** Running prediction*****")
    tf.logging.info("  Num examples = %d", len(predict_examples))
    tf.logging.info("  Batch size = %d", h_params.predict_batch_size)

    if h_params.use_tpu:
        # Warning: according to tpu_estimator.py, prediction on TPU is an
        # experimental feature and hence not supported here.
        tf.logging.warning("Prediction on TPU is not supported")

    predict_drop_remainder = h_params.use_tpu
    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=h_params.max_seq_length,
        is_training=False,
        drop_remainder=predict_drop_remainder)

    result = estimator.predict(input_fn=predict_input_fn)

    result = list(result)
    result = [pred['predictions'] for pred in result]
    return result
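
The values returned by ner_pred are numeric label ids. A caller would typically map them back to tag strings using the same label2id.pkl mapping that the function loads; the helper below is a hypothetical sketch of that step, not code from this project.

# Hypothetical post-processing of ner_pred's output: map numeric predictions
# back to tag strings via the label2id.pkl mapping written during training.
# Assumes id 0 is the padding label, as in common BERT-NER setups.
def decode_predictions(h_params, predictions):
    with open(os.path.join(h_params.local_dir, 'label2id.pkl'), 'rb') as rf:
        label2id = pickle.load(rf)
    id2label = {value: key for key, value in label2id.items()}
    return [[id2label[label_id] for label_id in pred if label_id != 0]
            for pred in predictions]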
Example #5
def predict(hparams: HParams) -> List[str]:
    """Predict intents for the test set and print the micro-averaged F1 score."""
    from improv.my_classifier import get_model_fn_and_estimator, file_based_input_fn_builder
    from improv.run_classifier import file_based_convert_examples_to_features
    import os
    from typing import Iterable
    import numpy as np
    from improv.utils import convert_result_pred, get_rounded_f1

    data_filename = hparams.data_dir.parent / (hparams.task_name + '.tsv')
    params = hparams._replace(
        use_tpu=False)  # BERT code warns against using TPU for predictions.
    model_fn, estimator = get_model_fn_and_estimator(params)

    predict_examples = get_examples(data_filename, SetType.test)
    predict_file = os.path.join(params.output_dir, "predict.tf_record")
    file_based_convert_examples_to_features(predict_examples,
                                            get_unique_intents(data_filename),
                                            params.max_seq_length,
                                            get_tokenizer(params),
                                            predict_file)

    tf.logging.info("***** Running prediction*****")
    tf.logging.info("  Num examples = %d", len(predict_examples))
    tf.logging.info("  Batch size = %d", params.predict_batch_size)

    predict_drop_remainder = params.use_tpu
    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=params.max_seq_length,
        is_training=False,
        drop_remainder=predict_drop_remainder)

    result: Iterable[np.ndarray] = estimator.predict(input_fn=predict_input_fn)
    label_list = get_intents(
        data_filename)  # used as label_list[max_class]; this might be wrong
    y_pred = convert_result_pred(result, label_list)
    print('f1 score: {}'.format(
        get_rounded_f1(params.data_dir.parent / 'askubuntu.tsv',
                       y_pred,
                       average='micro')))
    return y_pred
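
convert_result_pred is imported above but not shown. Presumably it takes the argmax of each probability vector yielded by estimator.predict and looks up the corresponding intent; the sketch below is an assumption about its behaviour, not the project's actual implementation.

# Rough sketch of what convert_result_pred likely does: argmax over each
# probability vector, then look up the corresponding label string.
import numpy as np

def convert_result_pred(result, label_list):
    return [label_list[int(np.argmax(probabilities))] for probabilities in result]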
Example #6
def train_and_evaluate(hparams: HParams):
    """Train the intent classifier on TPU and evaluate it after each checkpoint interval."""
    tf.logging.set_verbosity(tf.logging.INFO)

    data_filename = hparams.data_dir.parent.parent / (hparams.task_name +
                                                      '.tsv')
    train_examples = get_examples(data_filename, SetType.train)
    num_train_steps = int(
        len(train_examples) / hparams.train_batch_size *
        hparams.num_train_epochs)
    steps_per_epoch = len(train_examples) // hparams.train_batch_size
    max_steps = hparams.num_train_epochs * steps_per_epoch
    tf.logging.info('train_batch_size=%d  eval_batch_size=%d  max_steps=%d',
                    hparams.train_batch_size, hparams.eval_batch_size,
                    max_steps)

    # TPU change 3
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        hparams.tpu_name, zone=hparams.tpu_zone, project=hparams.gcp_project)
    config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=hparams.output_dir,  # according to my_classifier
        save_checkpoints_steps=hparams.save_checkpoints_steps,
        save_summary_steps=hparams.save_summary_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=hparams.save_checkpoints_steps,
            per_host_input_for_training=True))

    model_fn = model_fn_builder(
        bert_config=BertConfig.from_json_file(str(hparams.bert_config_file)),
        num_labels=len(get_unique_intents(data_filename)),
        init_checkpoint=str(hparams.init_checkpoint),
        learning_rate=hparams.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=int(num_train_steps * hparams.warmup_proportion),
        use_tpu=hparams.use_tpu,
        use_one_hot_embeddings=True)

    estimator = tf.contrib.tpu.TPUEstimator(  # TPU change 4
        model_fn=model_fn,
        config=config,
        # params=hparams,
        # model_dir=hparams.data_dir,
        train_batch_size=hparams.train_batch_size,
        eval_batch_size=hparams.eval_batch_size,
        use_tpu=hparams.use_tpu)

    train_features = convert_examples_to_features(
        train_examples, get_unique_intents(data_filename),
        hparams.max_seq_length, get_tokenizer(hparams))
    tf.logging.info("  Num steps = %d", num_train_steps)

    train_input_fn = input_fn_builder(features=train_features,
                                      seq_length=hparams.max_seq_length,
                                      is_training=True,
                                      drop_remainder=True,
                                      use_tpu=hparams.use_tpu)

    eval_examples = get_examples(data_filename, SetType.dev)
    eval_features = convert_examples_to_features(
        eval_examples, get_unique_intents(data_filename),
        hparams.max_seq_length, get_tokenizer(hparams))

    # Eval will be slightly WRONG on the TPU because it will truncate
    # the last batch.
    eval_steps = int(len(eval_examples) / hparams.eval_batch_size)
    eval_input_fn = input_fn_builder(features=eval_features,
                                     seq_length=hparams.max_seq_length,
                                     is_training=False,
                                     drop_remainder=True,
                                     use_tpu=hparams.use_tpu)

    # set up training and evaluation in a loop
    # def input_fn_builder(features, seq_length, is_training, drop_remainder, use_tpu):

    # load last checkpoint and start from there
    current_step = load_global_step_from_checkpoint_dir(hparams.output_dir)
    tf.logging.info(
        'Training for %d steps (%.2f epochs in total). Current'
        ' step %d.', max_steps, max_steps / steps_per_epoch, current_step)

    start_timestamp = time.time()  # This time will include compilation time

    while current_step < max_steps:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + hparams.iterations_per_loop,
                              max_steps)  # possibly need to save checkpoints

        if hparams.do_train:
            estimator.train(input_fn=train_input_fn, max_steps=next_checkpoint)

        current_step = next_checkpoint
        tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                        next_checkpoint, int(time.time() - start_timestamp))

        # Evaluate the model on the most recent checkpoint in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some
        # examples may be excluded modulo the batch size. As long as the batch
        # size is consistent, the evaluated examples are also consistent.
        tf.logging.info('Starting to evaluate at step %d', next_checkpoint)
        eval_results = estimator.evaluate(input_fn=eval_input_fn,
                                          steps=eval_steps)
        tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                        eval_results)

    elapsed_time = int(time.time() - start_timestamp)
    tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                    max_steps, elapsed_time)
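
load_global_step_from_checkpoint_dir is not shown here either. The TPU example code this loop appears to follow reads the global step from the latest checkpoint, roughly as sketched below; the project's own helper may differ.

# Sketch of load_global_step_from_checkpoint_dir in the style of Google's TPU
# example code: return 0 until a checkpoint exists, then the saved global step.
def load_global_step_from_checkpoint_dir(checkpoint_dir):
    try:
        checkpoint_reader = tf.train.NewCheckpointReader(
            tf.train.latest_checkpoint(checkpoint_dir))
        return checkpoint_reader.get_tensor(tf.GraphKeys.GLOBAL_STEP)
    except:  # pylint: disable=bare-except
        return 0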