# Example 1
    def my_get_estimator(self):
        """Build and return a TPUEstimator for SQuAD from ``self.flags``.

        When ``self.flags.use_tpu`` is False the returned TPUEstimator
        behaves like a plain Estimator on CPU/GPU.
        """
        resolver = None
        if self.flags.use_tpu and self.flags.tpu_name:
            resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
                self.flags.tpu_name,
                zone=self.flags.tpu_zone,
                project=self.flags.gcp_project)

        # Prediction-oriented model_fn: no training schedule is supplied.
        squad_model_fn = run_squad.model_fn_builder(
            bert_config=self.bert_config,
            init_checkpoint=self.flags.init_checkpoint,
            learning_rate=self.flags.learning_rate,
            num_train_steps=None,
            num_warmup_steps=None,
            use_tpu=self.flags.use_tpu,
            use_one_hot_embeddings=self.flags.use_tpu)

        config = tf.contrib.tpu.RunConfig(
            cluster=resolver,
            master=self.flags.master,
            model_dir=self.flags.output_dir,
            save_checkpoints_steps=self.flags.save_checkpoints_steps,
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=self.flags.iterations_per_loop,
                num_shards=self.flags.num_tpu_cores,
                per_host_input_for_training=(
                    tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2)))

        # If TPU is not available, this will fall back to normal Estimator on CPU or GPU.
        return tf.contrib.tpu.TPUEstimator(
            use_tpu=self.flags.use_tpu,
            model_fn=squad_model_fn,
            config=config,
            train_batch_size=self.flags.train_batch_size,
            predict_batch_size=self.flags.predict_batch_size)
    def init_bert(self):
        """Initialize tokenizer and a CPU/GPU estimator from run_squad.FLAGS.

        Side effects: sets ``self.tokenizer`` and ``self.estimator`` and
        raises TF logging verbosity to DEBUG.
        """
        flags = run_squad.FLAGS

        config = modeling.BertConfig.from_json_file(flags.bert_config_file)
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=flags.vocab_file,
            do_lower_case=flags.do_lower_case)

        tf.logging.set_verbosity(tf.logging.DEBUG)

        # Prediction only: no training schedule is supplied.
        squad_model_fn = run_squad.model_fn_builder(
            bert_config=config,
            init_checkpoint=flags.init_checkpoint,
            learning_rate=flags.learning_rate,
            num_train_steps=None,
            num_warmup_steps=None,
            use_tpu=flags.use_tpu,
            use_one_hot_embeddings=flags.use_tpu)

        run_config = tf.contrib.tpu.RunConfig(
            cluster=None,  # no TPU cluster resolver is created here
            master=flags.master,
            model_dir=flags.output_dir,
            save_checkpoints_steps=flags.save_checkpoints_steps,
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=flags.iterations_per_loop,
                num_shards=flags.num_tpu_cores,
                per_host_input_for_training=(
                    tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2)))

        # use_tpu=False: TPUEstimator falls back to a normal Estimator on
        # CPU or GPU.
        self.estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=False,
            model_fn=squad_model_fn,
            config=run_config,
            train_batch_size=flags.train_batch_size,
            predict_batch_size=flags.predict_batch_size)
# Example 3
    def __init__(self, model):
        """Prepare a BERT question-answering model for prediction.

        Args:
            model: model key -- 'en' (English base), 'vi' (multilingual
                cased), or 'vi_uit' (multilingual cased with a separate
                UIT fine-tuned output directory).

        Raises:
            ValueError: if ``model`` is not a recognized key.  (The
                original if/elif chain had no else branch and crashed
                later with a NameError on ``bert_config_file``.)
        """
        # model key -> (self.model tag, checkpoint directory, output dir).
        # 'vi_uit' deliberately reuses the 'vi' tag and checkpoints but
        # keeps its own fine-tuned output directory.
        known_models = {
            'en': ('en', './model/base', './finetuned/eng'),
            'vi': ('vi', './model/multi_cased', './finetuned/vi'),
            'vi_uit': ('vi', './model/multi_cased', './finetuned/vi_uit'),
        }
        if model not in known_models:
            raise ValueError('Unknown model %r; expected one of %s'
                             % (model, sorted(known_models)))
        self.model, model_dir, self.output_dir = known_models[model]
        bert_config_file = model_dir + '/bert_config.json'
        vocab_file = model_dir + '/vocab.txt'
        init_checkpoint = model_dir + '/bert_model.ckpt'

        # Set up model for prediction
        tf.logging.set_verbosity(tf.logging.INFO)
        bert_config = modeling.BertConfig.from_json_file(bert_config_file)
        self.validate_flags_or_throw(bert_config)
        tf.gfile.MakeDirs(self.output_dir)

        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_file, do_lower_case=FLAGS.do_lower_case)

        tpu_cluster_resolver = None
        if FLAGS.use_tpu and FLAGS.tpu_name:
            tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
                FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

        is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
        run_config = tf.contrib.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            master=FLAGS.master,
            model_dir=self.output_dir,
            save_checkpoints_steps=FLAGS.save_checkpoints_steps,
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=FLAGS.iterations_per_loop,
                num_shards=FLAGS.num_tpu_cores,
                per_host_input_for_training=is_per_host))

        # Prediction only: no training schedule is supplied.
        model_fn = model_fn_builder(bert_config=bert_config,
                                    init_checkpoint=init_checkpoint,
                                    learning_rate=FLAGS.learning_rate,
                                    num_train_steps=None,
                                    num_warmup_steps=None,
                                    use_tpu=FLAGS.use_tpu,
                                    use_one_hot_embeddings=FLAGS.use_tpu)

        # If TPU is not available, this will fall back to normal Estimator on CPU
        # or GPU.
        self.estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=FLAGS.train_batch_size,
            predict_batch_size=FLAGS.predict_batch_size)
# Example 4
def main(_):
  """Fine-tune and/or predict BERT on SQuAD via the ``rs`` (run_squad) module.

  Behavior is driven entirely by FLAGS:
    * FLAGS.do_train   -- fine-tune on FLAGS.train_file.
    * FLAGS.do_predict -- run inference on FLAGS.predict_file, truncating each
      feature's start/end logits to its actual (un-padded) sequence length
      before writing prediction files to FLAGS.output_dir.
  """
  tf.logging.set_verbosity(tf.logging.INFO)

  bert_config = rs.modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  rs.validate_flags_or_throw(bert_config)

  tf.gfile.MakeDirs(FLAGS.output_dir)

  tokenizer = rs.tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  train_examples = None
  num_train_steps = None
  num_warmup_steps = None
  if FLAGS.do_train:
    train_examples = rs.read_squad_examples(
        input_file=FLAGS.train_file, is_training=True)
    num_train_steps = int(
        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    # Pre-shuffle the input to avoid having to make a very large shuffle
    # buffer in the `input_fn`.
    rng = random.Random(12345)
    rng.shuffle(train_examples)

  model_fn = rs.model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      predict_batch_size=FLAGS.predict_batch_size)

  if FLAGS.do_train:
    # We write to a temporary file to avoid storing very large constant tensors
    # in memory.
    train_writer = rs.FeatureWriter(
        filename=os.path.join(FLAGS.output_dir, "train.tf_record"),
        is_training=True)
    rs.convert_examples_to_features(
        examples=train_examples,
        tokenizer=tokenizer,
        max_seq_length=FLAGS.max_seq_length,
        doc_stride=FLAGS.doc_stride,
        max_query_length=FLAGS.max_query_length,
        is_training=True,
        output_fn=train_writer.process_feature)
    train_writer.close()

    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num orig examples = %d", len(train_examples))
    tf.logging.info("  Num split examples = %d", train_writer.num_features)
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)
    del train_examples  # release memory before building the training graph

    train_input_fn = rs.input_fn_builder(
        input_file=train_writer.filename,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

  if FLAGS.do_predict:
    eval_examples = rs.read_squad_examples(
        input_file=FLAGS.predict_file, is_training=False)

    # Actual (un-padded) sequence length per feature, used to trim the
    # padded logits below.
    act_seq_len = get_act_seq_len(eval_examples, tokenizer, FLAGS.max_seq_length,
                    FLAGS.doc_stride, FLAGS.max_query_length)

    eval_writer = rs.FeatureWriter(
        filename=os.path.join(FLAGS.output_dir, "eval.tf_record"),
        is_training=False)
    eval_features = []

    def append_feature(feature):
      # Keep the feature in memory (needed by write_predictions) while
      # also streaming it to the tf_record file.
      eval_features.append(feature)
      eval_writer.process_feature(feature)

    rs.convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=FLAGS.max_seq_length,
        doc_stride=FLAGS.doc_stride,
        max_query_length=FLAGS.max_query_length,
        is_training=False,
        output_fn=append_feature)
    eval_writer.close()

    tf.logging.info("***** Running predictions *****")
    tf.logging.info("  Num orig examples = %d", len(eval_examples))
    tf.logging.info("  Num split examples = %d", len(eval_features))
    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

    predict_input_fn = rs.input_fn_builder(
        input_file=eval_writer.filename,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=False)

    # If running eval on the TPU, you will need to specify the number of
    # steps.
    all_results = []  # (a second, dead initialization of this list was removed)
    for idx, result in enumerate(estimator.predict(
        predict_input_fn, yield_single_examples=True)):
      if len(all_results) % 1000 == 0:
        tf.logging.info("Processing example: %d" % (len(all_results)))
      unique_id = int(result["unique_ids"])
      start_logits = [float(x) for x in result["start_logits"].flat]
      end_logits = [float(x) for x in result["end_logits"].flat]
      # Trim padded positions so post-processing never selects a span
      # inside the padding.
      all_results.append(
          rs.RawResult(
              unique_id=unique_id,
              start_logits=start_logits[:act_seq_len[idx]],
              end_logits=end_logits[:act_seq_len[idx]]))

    output_prediction_file = os.path.join(FLAGS.output_dir, "predictions.json")
    output_nbest_file = os.path.join(FLAGS.output_dir, "nbest_predictions.json")
    output_null_log_odds_file = os.path.join(FLAGS.output_dir, "null_odds.json")

    rs.write_predictions(eval_examples, eval_features, all_results,
                         FLAGS.n_best_size, FLAGS.max_answer_length,
                         FLAGS.do_lower_case, output_prediction_file,
                         output_nbest_file, output_null_log_odds_file)
# Example 5
# Export a fine-tuned BERT TPUEstimator as a SavedModel for serving.
# NOTE(review): MODEL_FT, bert_config, init_ckpt_file, EXPORT_PATH and
# serving_input_receiver_fn are presumably defined elsewhere in this
# file -- confirm before running this snippet standalone.
is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

run_config = tf.contrib.tpu.RunConfig(
    cluster=None,
    master=None,
    model_dir=MODEL_FT,
    save_checkpoints_steps=5000,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=1000,
        num_shards=8,
        per_host_input_for_training=is_per_host))

# Inference-only model_fn: no learning rate or training schedule needed.
model_fn = model_fn_builder(bert_config=bert_config,
                            init_checkpoint=init_ckpt_file,
                            learning_rate=None,
                            num_train_steps=None,
                            num_warmup_steps=None,
                            use_tpu=False,
                            use_one_hot_embeddings=False)

# use_tpu=False makes TPUEstimator behave like a normal CPU/GPU Estimator.
estimator = tf.contrib.tpu.TPUEstimator(use_tpu=False,
                                        model_fn=model_fn,
                                        config=run_config,
                                        train_batch_size=6,
                                        predict_batch_size=8)

# Disable the TPU-specific export path before exporting on CPU/GPU.
estimator._export_to_tpu = False  ## !!important to add this

estimator.export_saved_model(
    export_dir_base=EXPORT_PATH,
    serving_input_receiver_fn=serving_input_receiver_fn)
# Example 6
def prev():
    """End-to-end IMDB sentiment fine-tune of BERT (TF-Hub, TF 1.x Estimator).

    Downloads the aclImdb dataset, fine-tunes an uncased BERT classifier,
    evaluates it, and prints predictions for a few hand-written sentences.
    NOTE(review): relies on a module-level OUTPUT_DIR for checkpoints --
    confirm it is defined before calling.
    """
    def load_directory_data(directory):
        # One review per file; the sentiment score is encoded in the
        # filename as <id>_<score>.txt.
        data = {}
        data['sentence'] = []
        data['sentiment'] = []
        for file_path in os.listdir(directory):
            with tf.gfile.GFile(os.path.join(directory, file_path), 'r') as f:
                data['sentence'].append(f.read())
                data['sentiment'].append(
                    re.match(r"\d+_(\d+)\.txt", file_path).group(1))
        return pd.DataFrame.from_dict(data)

    def load_dataset(directory):
        # Merge pos/neg folders, attach binary labels, shuffle row order.
        pos_df = load_directory_data(os.path.join(directory, 'pos'))
        neg_df = load_directory_data(os.path.join(directory, 'neg'))
        pos_df['polarity'] = 1
        neg_df['polarity'] = 0
        return pd.concat([pos_df,
                          neg_df]).sample(frac=1).reset_index(drop=True)

    def download_and_load_datasets(force_download=False):
        # Download (or reuse a cached copy of) the IMDB review archive.
        dataset = tf.keras.utils.get_file(
            fname='aclImdb.tar.gz',
            origin=
            "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
            extract=True)

        train_df = load_dataset(
            os.path.join(os.path.dirname(dataset), 'aclImdb', 'train'))
        test_df = load_dataset(
            os.path.join(os.path.dirname(dataset), 'aclImdb', 'test'))

        return train_df, test_df

    train, test = download_and_load_datasets()

    train = train.sample(5000)
    # BUG FIX: the test split was previously sampled from `train`
    # (test = train.sample(5000)), so the model was evaluated on its own
    # training data. Sample from the real test frame instead.
    test = test.sample(5000)

    DATA_COLUMN = 'sentence'
    LABEL_COLUMN = 'polarity'
    label_list = [0, 1]

    train_InputExamples = train.apply(
        lambda x: bert.run_classifier.InputExample(guid=None,
                                                   text_a=x[DATA_COLUMN],
                                                   text_b=None,
                                                   label=x[LABEL_COLUMN]),
        axis=1)

    test_InputExamples = test.apply(lambda x: bert.run_classifier.InputExample(
        guid=None, text_a=x[DATA_COLUMN], text_b=None, label=x[LABEL_COLUMN]),
                                    axis=1)

    # this is a path to an uncased (all lowercase) version of BERT
    BERT_MODEL_HUB = 'https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1'

    def create_tokenizer_from_hub_module():
        # Pull the vocab file and casing flag out of the hub module so the
        # tokenizer matches the pretrained checkpoint exactly.
        with tf.Graph().as_default():
            bert_module = hub.Module(BERT_MODEL_HUB)
            tokenization_info = bert_module(signature='tokenization_info',
                                            as_dict=True)
            with tf.Session() as sess:
                vocab_file, do_lower_case = sess.run([
                    tokenization_info['vocab_file'],
                    tokenization_info['do_lower_case']
                ])

        return bert.tokenization.FullTokenizer(vocab_file=vocab_file,
                                               do_lower_case=do_lower_case)

    tokenizer = create_tokenizer_from_hub_module()

    MAX_SEQ_LENGTH = 128
    train_features = bert.run_classifier.convert_examples_to_features(
        train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
    test_features = bert.run_classifier.convert_examples_to_features(
        test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

    def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                     num_labels):
        # BERT encoder + a single dense classification head on the pooled
        # [CLS] output.
        bert_module = hub.Module(BERT_MODEL_HUB, trainable=True)

        bert_inputs = dict(input_ids=input_ids,
                           input_mask=input_mask,
                           segment_ids=segment_ids)

        bert_outputs = bert_module(inputs=bert_inputs,
                                   signature='tokens',
                                   as_dict=True)

        output_layer = bert_outputs['pooled_output']
        hidden_size = output_layer.shape[-1].value

        output_weights = tf.get_variable(
            'output_weights', [num_labels, hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))

        output_bias = tf.get_variable('output_bias', [num_labels],
                                      initializer=tf.zeros_initializer())
        with tf.variable_scope('loss'):
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            log_probs = tf.nn.log_softmax(logits, axis=-1)

            one_hot_labels = tf.one_hot(labels,
                                        depth=num_labels,
                                        dtype=tf.float32)

            predicted_labels = tf.squeeze(
                tf.argmax(log_probs, axis=-1, output_type=tf.int32))
            if is_predicting:
                return (predicted_labels, log_probs)

            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs,
                                              axis=-1)
            loss = tf.reduce_mean(per_example_loss)
            return (loss, predicted_labels, log_probs)

    def model_fn_builder(num_labels, learning_rate, num_train_steps,
                         num_warmup_steps):
        # Returns an Estimator-compatible model_fn closed over the
        # hyperparameters above.
        def model_fn(features, labels, mode, params):
            input_ids = features['input_ids']
            input_mask = features['input_mask']
            segment_ids = features['segment_ids']
            label_ids = features['label_ids']

            is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)

            if not is_predicting:
                (loss, predicted_labels,
                 log_probs) = create_model(is_predicting, input_ids,
                                           input_mask, segment_ids, label_ids,
                                           num_labels)
                train_op = bert.optimization.create_optimizer(loss,
                                                              learning_rate,
                                                              num_train_steps,
                                                              num_warmup_steps,
                                                              use_tpu=False)

                def metric_fn(label_ids, predicted_labels):
                    accuracy = tf.metrics.accuracy(label_ids, predicted_labels)
                    f1_score = tf.contrib.metrics.f1_score(
                        label_ids, predicted_labels)
                    auc = tf.metrics.auc(label_ids, predicted_labels)
                    recall = tf.metrics.recall(label_ids, predicted_labels)
                    precision = tf.metrics.precision(label_ids,
                                                     predicted_labels)
                    true_pos = tf.metrics.true_positives(
                        label_ids, predicted_labels)
                    true_neg = tf.metrics.true_negatives(
                        label_ids, predicted_labels)
                    false_pos = tf.metrics.false_positives(
                        label_ids, predicted_labels)
                    false_neg = tf.metrics.false_negatives(
                        label_ids, predicted_labels)
                    return {
                        'eval_accuracy': accuracy,
                        'f1_score': f1_score,
                        'auc': auc,
                        'precision': precision,
                        'recall': recall,
                        'true_positives': true_pos,
                        'true_negatives': true_neg,
                        'false_positives': false_pos,
                        'false_negatives': false_neg
                    }

                eval_metrics = metric_fn(label_ids, predicted_labels)

                if mode == tf.estimator.ModeKeys.TRAIN:
                    return tf.estimator.EstimatorSpec(mode=mode,
                                                      loss=loss,
                                                      train_op=train_op)
                else:
                    return tf.estimator.EstimatorSpec(
                        mode=mode, loss=loss, eval_metric_ops=eval_metrics)
            else:
                (predicted_labels,
                 log_probs) = create_model(is_predicting, input_ids,
                                           input_mask, segment_ids, label_ids,
                                           num_labels)

                predictions = {
                    'probabilities': log_probs,
                    'labels': predicted_labels
                }
                return tf.estimator.EstimatorSpec(mode,
                                                  predictions=predictions)

        return model_fn

    BATCH_SIZE = 16
    LEARNING_RATE = 2e-5
    NUM_TRAIN_EPOCHS = 3.0
    WARMUP_PROPORTION = 0.1
    SAVE_CHECKPOINTS_STEPS = 500
    SAVE_SUMMARY_STEPS = 100

    num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
    num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

    run_config = tf.estimator.RunConfig(
        model_dir=OUTPUT_DIR,
        save_summary_steps=SAVE_SUMMARY_STEPS,
        save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

    model_fn = model_fn_builder(num_labels=len(label_list),
                                learning_rate=LEARNING_RATE,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps)

    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       config=run_config,
                                       params={"batch_size": BATCH_SIZE})

    train_input_fn = bert.run_classifier.input_fn_builder(
        features=train_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=True,
        drop_remainder=False)

    print('Beginning Training!')
    current_time = datetime.now()
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    print('Training took time ', datetime.now() - current_time)

    test_input_fn = run_classifier.input_fn_builder(features=test_features,
                                                    seq_length=MAX_SEQ_LENGTH,
                                                    is_training=False,
                                                    drop_remainder=False)

    estimator.evaluate(input_fn=test_input_fn, steps=None)

    def get_prediction(in_sentences):
        # Wrap raw sentences as InputExamples (dummy label 0) and map the
        # estimator's output back to human-readable labels.
        labels = ['Negative', 'Positive']
        input_examples = [
            run_classifier.InputExample(guid="",
                                        text_a=x,
                                        text_b=None,
                                        label=0) for x in in_sentences
        ]
        input_features = run_classifier.convert_examples_to_features(
            input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
        predict_input_fn = run_classifier.input_fn_builder(
            features=input_features,
            seq_length=MAX_SEQ_LENGTH,
            is_training=False,
            drop_remainder=False)
        predictions = estimator.predict(predict_input_fn)
        return [(sentence, prediction['probabilities'],
                 labels[prediction['labels']])
                for sentence, prediction in zip(in_sentences, predictions)]

    pred_sentences = [
        "That movie was absolutely awful", "The acting was a bit lacking",
        "The film was creative and surprising", "Absolutely fantastic!"
    ]

    predictions = get_prediction(pred_sentences)

    print(predictions)