def __init__(self, args):
        self._tokenizer = tokenization.FullTokenizer(
            vocab_file=args.vocab_path, do_lower_case=args.do_lower_case)
        self._max_seq_length = args.max_seq_len
        self._doc_stride = args.doc_stride
        self._max_query_length = args.max_query_length
        self._in_tokens = args.in_tokens

        self._train_file = args.train_file
        self._predict_file = args.predict_file
        self._batch_size = args.batch_size
        self._with_negative = args.with_negative
        self._epoch = args.epoch
        self._sample_rate = args.sample_rate

        self.vocab = self._tokenizer.vocab
        self.vocab_size = len(self.vocab)
        self.pad_id = self.vocab["[PAD]"]
        self.cls_id = self.vocab["[CLS]"]
        self.sep_id = self.vocab["[SEP]"]
        self.mask_id = self.vocab["[MASK]"]

        self.current_train_example = -1
        self.num_train_examples = -1
        self.current_train_epoch = -1

        self.train_examples = None
        self.predict_examples = None
        self.predict_features = None
        self.num_examples = {'train': -1, 'predict': -1}
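
A minimal, self-contained sketch (not part of the snippet above) of how cached special-token ids like pad_id, cls_id and sep_id are typically assembled into fixed-length inputs; the helper name and the toy ids below are illustrative assumptions.

def build_input(a_ids, b_ids, cls_id, sep_id, pad_id, max_seq_len):
    # [CLS] a [SEP] b [SEP], then pad every field out to max_seq_len
    # (assumes len(a_ids) + len(b_ids) + 3 <= max_seq_len)
    input_ids = [cls_id] + a_ids + [sep_id] + b_ids + [sep_id]
    segment_ids = [0] * (len(a_ids) + 2) + [1] * (len(b_ids) + 1)
    input_mask = [1] * len(input_ids)
    pad_len = max_seq_len - len(input_ids)
    input_ids += [pad_id] * pad_len
    segment_ids += [0] * pad_len
    input_mask += [0] * pad_len
    return input_ids, segment_ids, input_mask

# build_input([7, 8], [9], cls_id=2, sep_id=3, pad_id=0, max_seq_len=8)
# -> ([2, 7, 8, 3, 9, 3, 0, 0], [0, 0, 0, 0, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 0, 0])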
Example No. 2
    def __init__(self, cfg):
        self.cfg = cfg

        self.TaskDataset = dataset_class(cfg.task)
        self.pipeline = None
        if cfg.need_prepro:
            tokenizer = tokenization.FullTokenizer(
                vocab_file=cfg.vocab, do_lower_case=cfg.do_lower_case)
            self.pipeline = [
                Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
                AddSpecialTokensWithTruncation(cfg.max_seq_length),
                TokenIndexing(tokenizer.convert_tokens_to_ids,
                              self.TaskDataset.labels, cfg.max_seq_length)
            ]
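            # stages above: convert to unicode & tokenize -> add [CLS]/[SEP] with truncation to max_seq_length -> map tokens and labels to ids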

        if cfg.mode == 'train':
            self.sup_data_dir = cfg.sup_data_dir
            self.sup_batch_size = cfg.train_batch_size
            self.shuffle = True
        elif cfg.mode == 'train_eval':
            self.sup_data_dir = cfg.sup_data_dir
            self.eval_data_dir = cfg.eval_data_dir
            self.sup_batch_size = cfg.train_batch_size
            self.eval_batch_size = cfg.eval_batch_size
            self.shuffle = True
        elif cfg.mode == 'eval':
            self.sup_data_dir = cfg.eval_data_dir
            self.sup_batch_size = cfg.eval_batch_size
            self.shuffle = False  # do not shuffle in eval mode

        if cfg.uda_mode:  # Only uda_mode
            self.unsup_data_dir = cfg.unsup_data_dir
            self.unsup_batch_size = cfg.train_batch_size * cfg.unsup_ratio
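
For example, with train_batch_size = 8 and unsup_ratio = 3, the unsupervised batch size computed above is 24.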
Example No. 3
def predict_v2():
    """
    Load the frozen (constant) .pb model.
    :return:
    """
    VOCAB_PATH_HZ = '/home/recsys/jixiaozhan/sansu_detect_bert/modelParams/chinese_L-12_H-768_A-12/vocab.txt'
    title = "hide new secretions from the parental units"
    model_file = "/home/jixiaozhan/EasyTransfer/scripts/knowledge_distillation/vanilla_teacher_model/tmp_model/saved_model.pb"
    tokenizer_hz = tokenization.FullTokenizer(vocab_file=VOCAB_PATH_HZ,
                                              do_lower_case=True)
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    with gfile.FastGFile(model_file, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        sess.graph.as_default()
        tf.import_graph_def(graph_def, name='')

    input_ids = sess.graph.get_tensor_by_name("input_ids:0")
    input_mask = sess.graph.get_tensor_by_name("input_mask:0")
    segment_ids = sess.graph.get_tensor_by_name("segment_ids:0")
    predictions = sess.graph.get_tensor_by_name('app/ez_dense/BiasAdd:0')[0]
    example = bert_33.get_input_features(title, tokenizer_hz)
    ret = sess.run(predictions,
                   feed_dict={
                       input_ids: np.array(example['input_ids']),
                       input_mask: np.array(example['input_mask']),
                       segment_ids: np.array(example['segment_ids'])
                   })
Example No. 4
 def build_tokenizer(self, bert_layer):
     '''
     Encodes text into tokens, masks, and segment flags
     :return: tokenization wrapper
     '''
     vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
     do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
     return tokenization.FullTokenizer(vocab_file, do_lower_case)
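
A hedged usage sketch of the wrapper above; the hub URL and the surrounding object name are assumptions, not part of the snippet.

import tensorflow_hub as hub

bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2",
    trainable=False)
# `model` stands for whatever instance defines build_tokenizer (assumed name)
tokenizer = model.build_tokenizer(bert_layer)
tokens = tokenizer.tokenize("hide new secretions from the parental units")
ids = tokenizer.convert_tokens_to_ids(tokens)
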
def main():
    """ Starts the data preparation
    """
    # Loads data
    logging.info("Loading data")

    task_datasets_rename = {
        "COLA": "CoLA",
        "SST": "SST-2",
    }

    data_dir = f'data/{args.task}'
    if args.task.upper() in task_datasets_rename:
        data_dir = f'data/{task_datasets_rename[args.task]}'

    if args.output_dir is None:
        output_dir = data_dir
    else:
        output_dir = args.output_dir
    tx.utils.maybe_create_dir(output_dir)

    processors = {
        "COLA": data_utils.ColaProcessor,
        "MNLI": data_utils.MnliProcessor,
        "MRPC": data_utils.MrpcProcessor,
        "XNLI": data_utils.XnliProcessor,
        'SST': data_utils.SSTProcessor
    }
    processor = processors[args.task]()

    config_data = importlib.import_module(args.config_data)

    pretrained_model_dir = tx.modules.BERTEncoder.download_checkpoint(
        pretrained_model_name=args.pretrained_model_name)

    vocab_file = os.path.join(pretrained_model_dir, "vocab.txt")

    num_classes = len(processor.get_labels())
    num_train_data = len(processor.get_train_examples(data_dir))
    logging.info("num_classes: %d; num_train_data: %d", num_classes,
                 num_train_data)
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=args.lower_case)

    # Produces pickled files
    data_utils.prepare_record_data(
        processor=processor,
        tokenizer=tokenizer,
        data_dir=data_dir,
        max_seq_length=args.max_seq_length,
        output_dir=output_dir,
        feature_original_types=config_data.feature_original_types)
    modify_config_data(args.max_seq_length, num_train_data, num_classes)
Example No. 6
    def __init__(self,
                 vocab_path,
                 label_map_config=None,
                 max_seq_len=512,
                 max_ent_cnt=42,
                 do_lower_case=True,
                 in_tokens=False,
                 is_inference=False,
                 random_seed=None,
                 tokenizer="FullTokenizer",
                 is_classify=True,
                 is_regression=False,
                 for_cn=True,
                 task_id=0):
        self.max_seq_len = max_seq_len
        self.max_ent_cnt = max_ent_cnt
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_path, do_lower_case=do_lower_case)
        self.vocab = self.tokenizer.vocab
        self.pad_id = self.vocab["[PAD]"]
        self.cls_id = self.vocab["[CLS]"]
        self.sep_id = self.vocab["[SEP]"]
        self.in_tokens = in_tokens
        self.is_inference = is_inference
        self.for_cn = for_cn
        self.task_id = task_id

        np.random.seed(random_seed)

        self.is_classify = is_classify
        self.is_regression = is_regression
        self.current_example = 0
        self.current_epoch = 0
        self.num_examples = 0

        if label_map_config:
            with open(label_map_config, encoding='utf8') as f:
                self.label_map = json.load(f)
        else:
            self.label_map = None
        self.ner_map = {'PAD': 0, 'ORG': 1, 'LOC': 2, 'NUM': 3, 'TIME': 4, 'MISC': 5, 'PER': 6}
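        # Log-scale distance buckets: 0->0, 1->1, [2,4)->2, [4,8)->3, [8,16)->4,
        # [16,32)->5, [32,64)->6, [64,128)->7, [128,256)->8, [256,512)->9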
        distance_buckets = np.zeros((512), dtype='int64')
        distance_buckets[1] = 1
        distance_buckets[2:] = 2
        distance_buckets[4:] = 3
        distance_buckets[8:] = 4
        distance_buckets[16:] = 5
        distance_buckets[32:] = 6
        distance_buckets[64:] = 7
        distance_buckets[128:] = 8
        distance_buckets[256:] = 9
        self.distance_buckets = distance_buckets
Example No. 7
def create_tokenizer_from_hub_module(bert_hub_module_handle):
    """Get the vocab file and casing info from the Hub module."""
    with tf.Graph().as_default():
        bert_module = hub.Module(bert_hub_module_handle)
        tokenization_info = bert_module(signature="tokenization_info",
                                        as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run([
                tokenization_info["vocab_file"],
                tokenization_info["do_lower_case"]
            ])
    return tokenization.FullTokenizer(vocab_file=vocab_file,
                                      do_lower_case=do_lower_case)
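
A hedged usage sketch; the TF-Hub handle is an assumption, and the TF1-style hub.Module / tf.Session API used inside the function must be available.

BERT_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"  # assumed handle
tokenizer = create_tokenizer_from_hub_module(BERT_HUB)
print(tokenizer.tokenize("hide new secretions from the parental units"))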
Example No. 8
    def __init__(self, args):

        self.train_file = args.train_file
        self.max_seq_len = args.max_seq_len
        self.batch_size = args.batch_size
        self.epoch = args.epoch
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=args.vocab_path, do_lower_case=args.do_lower_case)
        self.vocab = self.tokenizer.vocab
        self.in_tokens = args.in_tokens

        self.current_train_example = -1
        self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
        self.current_train_epoch = -1
Example No. 9
def prepare_data():
    """
    Prepares the data and produces TFRecord files.
    """
    # Loads data
    logging.info("Loading data")

    task_datasets_rename = {
        "COLA": "CoLA",
        "SST": "SST-2",
    }

    data_dir = f'data/{args.task}'
    if args.task.upper() in task_datasets_rename:
        data_dir = f'data/{task_datasets_rename[args.task]}'

    if args.tfrecord_output_dir is None:
        tfrecord_output_dir = data_dir
    else:
        tfrecord_output_dir = args.tfrecord_output_dir
    tx.utils.maybe_create_dir(tfrecord_output_dir)

    processors = {
        "COLA": data_utils.ColaProcessor,
        "MNLI": data_utils.MnliProcessor,
        "MRPC": data_utils.MrpcProcessor,
        "XNLI": data_utils.XnliProcessor,
        'SST': data_utils.SSTProcessor
    }
    processor = processors[args.task]()

    from config_data import feature_original_types

    num_classes = len(processor.get_labels())
    num_train_data = len(processor.get_train_examples(data_dir))
    logging.info("num_classes: %d; num_train_data: %d", num_classes,
                 num_train_data)
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    # Produces TFRecord files
    data_utils.prepare_record_data(
        processor=processor,
        tokenizer=tokenizer,
        data_dir=data_dir,
        max_seq_length=args.max_seq_length,
        output_dir=tfrecord_output_dir,
        feature_original_types=feature_original_types)
    modify_config_data(args.max_seq_length, num_train_data, num_classes)
Example No. 10
def predict():
    """
    Load the SavedModel (.pb with variables).
    :return:
    """
    VOCAB_PATH_HZ = '/home/recsys/jixiaozhan/sansu_detect_bert/modelParams/chinese_L-12_H-768_A-12/vocab.txt'
    title = "hide new secretions from the parental units"
    MODEL_V2 = "/home/jixiaozhan/EasyTransfer/scripts/knowledge_distillation/vanilla_teacher_model/1607392874"
    tokenizer_hz = tokenization.FullTokenizer(vocab_file=VOCAB_PATH_HZ,
                                              do_lower_case=True)
    example = bert_33.get_input_features(title, tokenizer_hz)
    label_id = example.pop('label_ids')
    example['label_id'] = label_id

    predict_fn_hz_v2 = tf.contrib.predictor.from_saved_model(MODEL_V2)
    predict_pro_list2 = predict_fn_hz_v2(example)
Example No. 11
def main(_):

    if FLAGS.max_seq_length > 512:
        raise ValueError(
            "Cannot use sequence length {:d} because the BERT model "
            "was only trained up to sequence length {:d}".format(
                FLAGS.max_seq_length, 512))

    processor = raw_data_utils.get_processor(FLAGS.task_name)
    # Create tokenizer
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    if FLAGS.data_type == "sup":
        sup_out_dir = FLAGS.output_base_dir
        tf.logging.info("Create sup. data: subset {} => {}".format(
            FLAGS.sub_set, sup_out_dir))

        proc_and_save_sup_data(
            processor,
            FLAGS.sub_set,
            FLAGS.raw_data_dir,
            sup_out_dir,
            tokenizer,
            FLAGS.max_seq_length,
            FLAGS.trunc_keep_right,
            FLAGS.worker_id,
            FLAGS.replicas,
            FLAGS.sup_size,
        )
    elif FLAGS.data_type == "unsup":
        assert FLAGS.aug_ops is not None, \
            "aug_ops is required to preprocess unsupervised data."
        unsup_out_dir = os.path.join(FLAGS.output_base_dir, FLAGS.aug_ops,
                                     str(FLAGS.aug_copy_num))
        data_stats_dir = os.path.join(FLAGS.raw_data_dir, "data_stats")

        tf.logging.info("Create unsup. data: subset {} => {}".format(
            FLAGS.sub_set, unsup_out_dir))
        proc_and_save_unsup_data(processor, FLAGS.sub_set, FLAGS.raw_data_dir,
                                 data_stats_dir, unsup_out_dir, tokenizer,
                                 FLAGS.max_seq_length, FLAGS.trunc_keep_right,
                                 FLAGS.aug_ops, FLAGS.aug_copy_num,
                                 FLAGS.worker_id, FLAGS.replicas,
                                 FLAGS.input_file)
Example No. 12
def prepare_data():
    """
    Prepares the data and produces TFRecord files.
    """
    # Loads data
    tf.logging.info("Loading data")

    task_datasets_rename = {
        "COLA": "CoLA",
        "SST": "SST-2",
    }

    data_dir = 'data/{}'.format(FLAGS.task)
    if FLAGS.task.upper() in task_datasets_rename:
        data_dir = 'data/{}'.format(task_datasets_rename[FLAGS.task])

    if FLAGS.tfrecords_output_dir is None:
        tfrecords_output_dir = data_dir
    else:
        tfrecords_output_dir = FLAGS.tfrecords_output_dir
    tx.utils.maybe_create_dir(tfrecords_output_dir)

    processors = {
        "COLA": data_utils.ColaProcessor,
        "MNLI": data_utils.MnliProcessor,
        "MRPC": data_utils.MrpcProcessor,
        "XNLI": data_utils.XnliProcessor,
        'SST': data_utils.SSTProcessor
    }
    processor = processors[FLAGS.task]()

    num_classes = len(processor.get_labels())
    num_train_data = len(processor.get_train_examples(data_dir))
    tf.logging.info('num_classes:%d; num_train_data:%d' %
                    (num_classes, num_train_data))
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    # Produces TFRecords files
    data_utils.prepare_TFRecord_data(processor=processor,
                                     tokenizer=tokenizer,
                                     data_dir=data_dir,
                                     max_seq_length=FLAGS.max_seq_length,
                                     output_dir=tfrecords_output_dir)
    modify_config_data(FLAGS.max_seq_length, num_train_data, num_classes)
Example No. 13
def process_unsgetext(text: str, vocab_file, do_lower_case=True):
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
    tokens_ = tokenizer.tokenize(text)
    if len(tokens_) + 2 > seq_length:
        tokens_ = tokens_[:seq_length - 2]
    tokens = ["[CLS]"] + tokens_ + ["[SEP]"]
    n = len(tokens)
    seg_ids = [0] * n
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * n
    if n < seq_length:
        seg_ids = seg_ids + [0] * (seq_length - n)
        input_ids = input_ids + [0] * (seq_length - n)
        input_mask = input_mask + [0] * (seq_length - n)
    assert len(seg_ids) == seq_length and len(input_ids) == seq_length and len(
        input_mask) == seq_length
    return InputFeature(input_ids, input_mask, seg_ids)
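
A hedged usage sketch of process_unsgetext; `seq_length` is read from module scope in the function above and `InputFeature` is defined elsewhere, so the values here are assumptions.

seq_length = 128  # assumed module-level constant consumed by process_unsgetext
feature = process_unsgetext("hide new secretions from the parental units",
                            vocab_file="vocab.txt",  # assumed path to a BERT vocab
                            do_lower_case=True)
assert len(feature.input_ids) == seq_length  # padded/truncated to seq_length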
Example No. 14
def file_based_convert_examples_to_features(
    examples, label_list, max_seq_length, tokenize_fn, num_passes=1,
    data_stats=None, aug_ops=None):
  """
  This function preprocesses data for XLNet.
  It converts a set of `InputExample`s into features.
  """


  if num_passes > 1:
    examples *= num_passes

  if FLAGS.xlnet and aug_ops:
    logging.info("XLNet Model")
    examples = tokenize_examples(
                 examples, tokenization.FullTokenizer(do_lower_case=False))
    logging.info("building vocab")
    word_vocab = build_vocab(examples)
    logging.info("augmenting data using {}".format(aug_ops))
    examples = word_level_augment.word_level_augment(
      examples, aug_ops, word_vocab, data_stats
    )

  features = []
  for (ex_index, example) in enumerate(examples):
    if ex_index % 5000 == 0:
      tf.logging.info("Writing example {} of {}".format(ex_index,
                                                        len(examples)))

    feature = convert_single_example(ex_index, example, label_list,
                                     max_seq_length, tokenize_fn, aug_ops)


    features.append(
       InputFeaturesXL(
           input_ids=feature.input_ids,
           input_mask=feature.input_mask,
           segment_ids=feature.segment_ids,
           label_ids=feature.label_id,
           is_real_example=int(feature.is_real_example)))

  return features
Example No. 15
    def test_full_tokenizer(self):
        vocab_tokens = [
            "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
            "runn", "##ing", ","
        ]
        with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
            if six.PY2:
                vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
            else:
                vocab_writer.write("".join([x + "\n" for x in vocab_tokens
                                            ]).encode("utf-8"))

            vocab_file = vocab_writer.name

        tokenizer = tokenization.FullTokenizer(vocab_file)
        os.unlink(vocab_file)

        tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
        self.assertAllEqual(tokens,
                            ["un", "##want", "##ed", ",", "runn", "##ing"])

        self.assertAllEqual(tokenizer.convert_tokens_to_ids(tokens),
                            [7, 4, 5, 10, 8, 9])
Example No. 16
use_one_hot_embeddings = False
params_senteval['classifier'] = {
    'nhid': nhid,
    'optim': 'adam',
    'batch_size': 64,
    'tenacity': 5,
    'epoch_size': 4
}

tf.logging.set_verbosity(tf.logging.INFO)

layer_indexes = [layers]

bert_config = modeling.BertConfig.from_json_file(bert_config_file)

tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=do_lower_case)

is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.contrib.tpu.RunConfig(
    master=master,
    tpu_config=tf.contrib.tpu.TPUConfig(
        num_shards=num_tpu_cores, per_host_input_for_training=is_per_host))
#####bert


class InputExample(object):
    def __init__(self, unique_id, text_a, text_b):
        self.unique_id = unique_id
        self.text_a = text_a
        self.text_b = text_b
Example No. 17
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "xnli": XnliProcessor,
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on. These do NOT count towards the metric (all tf.metrics
            # support a per-instance weight, and these get a weight of 0.0).
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, predict_file)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)

        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                probabilities = prediction["probabilities"]
                if i >= num_actual_predict_examples:
                    break
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples
Example No. 18
def main(_):


  if FLAGS.max_seq_length > 512:
    raise ValueError(
        "Cannot use sequence length {:d} because the BERT model "
        "was only trained up to sequence length {:d}".format(
            FLAGS.max_seq_length, 512))

  processor = raw_data_utils.get_processor(FLAGS.task_name)

  if not FLAGS.xlnet:
    # Create tokenizer
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
  else:
    sp = spm.SentencePieceProcessor()
    sp.Load(FLAGS.spiece_model_file)
    def tokenize_fn(text):
      text = preprocess_text(text, lower=False)
      return encode_ids(sp, text)

  if FLAGS.data_type == "sup":
    sup_out_dir = FLAGS.output_base_dir
    logging.info("Create sup. data: subset {} => {}".format(
        FLAGS.sub_set, sup_out_dir))
    if FLAGS.xlnet:

      proc_and_save_sup_data_xlnet(
          processor, FLAGS.sub_set, FLAGS.raw_data_dir, sup_out_dir,
          tokenize_fn, FLAGS.max_seq_length, FLAGS.trunc_keep_right,
          FLAGS.worker_id, FLAGS.replicas, FLAGS.sup_size,
      )

    else:
      proc_and_save_sup_data(
          processor, FLAGS.sub_set, FLAGS.raw_data_dir, sup_out_dir,
          tokenizer, FLAGS.max_seq_length, FLAGS.trunc_keep_right,
          FLAGS.worker_id, FLAGS.replicas, FLAGS.sup_size,
      )
  elif FLAGS.data_type == "unsup":
    assert FLAGS.aug_ops is not None, \
        "aug_ops is required to preprocess unsupervised data."
    unsup_out_dir = os.path.join(
        FLAGS.output_base_dir,
        FLAGS.aug_ops,
        str(FLAGS.aug_copy_num))
    data_stats_dir = os.path.join(FLAGS.raw_data_dir, "data_stats")


    logging.info("Create unsup. data: subset {} => {}".format(
        FLAGS.sub_set, unsup_out_dir))
    if FLAGS.xlnet:
      proc_and_save_unsup_data_xlnet(
            processor, FLAGS.sub_set,
            FLAGS.raw_data_dir, data_stats_dir, unsup_out_dir,
            tokenize_fn, FLAGS.max_seq_length, FLAGS.trunc_keep_right,
            FLAGS.aug_ops, FLAGS.aug_copy_num,
            FLAGS.worker_id, FLAGS.replicas)
    else:
      proc_and_save_unsup_data(
            processor, FLAGS.sub_set,
            FLAGS.raw_data_dir, data_stats_dir, unsup_out_dir,
            tokenizer, FLAGS.max_seq_length, FLAGS.trunc_keep_right,
            FLAGS.aug_ops, FLAGS.aug_copy_num,
            FLAGS.worker_id, FLAGS.replicas)
Example No. 19
    args.n_gpu = 1

    # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
    torch.distributed.init_process_group(backend='nccl')

# Log GPU information
logger.add_text('info', f"args: {args}")

# Modify batch size if accumulating gradients
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

# Reproducibility
utils.set_seeds(args.seed, multi_gpu=args.n_gpu > 0)

# Build dataloaders
tokenizer = tokenization.FullTokenizer(args.vocab,
                                       do_lower_case=args.do_lower_case)
tokenize = lambda x: tokenizer.tokenize(tokenizer.convert_to_unicode(x))
pipeline = [
    PipelineForPretrain(
        max_pred=20,  # what is this?
        mask_prob=0.15,  # actually this does nothing
        vocab_words=list(tokenizer.vocab.keys()),  # 
        indexer=tokenizer.convert_tokens_to_ids,
        max_len=args.max_seq_length)
]
dataloader = SentencePairDataLoader(args.text_file,
                                    batch_size=args.train_batch_size,
                                    tokenize=tokenize,
                                    max_len=args.max_seq_length,
                                    pipeline=pipeline)
Example No. 20
                        help='sequence length (default: 32)')
    parser.add_argument('--input_file',
                        type=str,
                        default="",
                        metavar='STRING',
                        help='input file path')
    parser.add_argument('--vocab_file',
                        type=str,
                        default="",
                        metavar='STRING',
                        help='vocab file path')
    args = parser.parse_args()

    input_file = args.input_file
    max_seq_length = args.max_seq_length
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=True)
    with tf.gfile.Open(input_file, "r") as f:
        reader = csv.reader(f, delimiter="\t")
        for line in reader:
            text_a = tokenization.convert_to_unicode(line[3])
            text_b = tokenization.convert_to_unicode(line[4])

            a_input_ids, a_input_mask, a_segment_ids = convert_single_example(
                text_a,
                None,
                max_seq_length=max_seq_length,
                tokenizer=tokenizer)
            b_input_ids, b_input_mask, b_segment_ids = convert_single_example(
                text_b,
                None,
                max_seq_length=max_seq_length,
Example No. 21
 def __init__(self, vocab_file, do_lower_case, max_seq_len):
     self.tokenizer = tokenization.FullTokenizer(
         vocab_file=vocab_file, do_lower_case=do_lower_case)
     self.max_seq_len = max_seq_len
Example No. 22
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), 'y_end': array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)}
    '''
    with open(save_path, 'wb') as f:
        pickle.dump(meta, f)


if __name__ == '__main__':

    # Load tokenizer
    tokenizer = tokenization.FullTokenizer(vocab_file='glove.42B.300d.txt1',
                                           do_lower_case=False)

    train_examples = read_squad_examples(
        input_file='original_data/train_sample.json', is_training=True)
    dev_examples = read_squad_examples(
        input_file='original_data/train_sample.json', is_training=False)

    train_features = convert_examples_to_features(train_examples,
                                                  tokenizer,
                                                  max_seq_length=400,
                                                  max_query_length=50,
                                                  is_training=True)
    dev_features = convert_examples_to_features(dev_examples,
                                                tokenizer,
                                                max_seq_length=400,
                                                max_query_length=50,
Example No. 23
def define_train_eval_input_fn():

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mismnli": MisMnliProcessor,
        "mrpc": MrpcProcessor,
        "rte": RteProcessor,
        "sst-2": Sst2Processor,
        "sts-b": StsbProcessor,
        "qqp": QqpProcessor,
        "qnli": QnliProcessor,
        "wnli": WnliProcessor,
    }
    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None

    train_examples = processor.get_train_examples(FLAGS.data_dir)
    num_train_steps = int(
        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
    if not tf.gfile.Exists(train_file):
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_file)
    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num examples = %d", len(train_examples))
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)
    train_input_fn = file_based_input_fn_builder(
        input_file=train_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)

    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
    num_actual_eval_examples = len(eval_examples)

    eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
    if not tf.gfile.Exists(eval_file):
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file)

    tf.logging.info("***** Running evaluation *****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(eval_examples), num_actual_eval_examples,
                    len(eval_examples) - num_actual_eval_examples)
    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

    # This tells the estimator to run through the entire set.
    eval_steps = None
    # However, if running eval on the TPU, you will need to specify the
    # number of steps.
    if FLAGS.use_tpu:
        assert len(eval_examples) % FLAGS.eval_batch_size == 0
        eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

    eval_drop_remainder = True if FLAGS.use_tpu else False
    eval_input_fn = file_based_input_fn_builder(
        input_file=eval_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=eval_drop_remainder)

    return label_list, train_input_fn, num_train_steps, eval_input_fn, eval_steps, num_warmup_steps
Example No. 24
def main(_):
    hvd.init()
    FLAGS.model_dir = FLAGS.model_dir if hvd.rank() == 0 else os.path.join(
        FLAGS.model_dir, str(hvd.rank()))
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    #FLAGS.num_train_steps = FLAGS.num_train_steps // hvd.size()
    #FLAGS.num_warmup_steps = FLAGS.num_warmup_steps // hvd.size()

    tf.logging.set_verbosity(tf.logging.INFO)

    processor = raw_data_utils.get_processor(FLAGS.task_name)
    label_list = processor.get_labels()

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file,
                                                     FLAGS.model_dropout)

    tf.gfile.MakeDirs(FLAGS.model_dir)

    flags_dict = tf.app.flags.FLAGS.flag_values_dict()
    with tf.gfile.Open(os.path.join(FLAGS.model_dir, "FLAGS.json"),
                       "w") as ouf:
        json.dump(flags_dict, ouf)

    tf.logging.info("warmup steps {}/{}".format(FLAGS.num_warmup_steps,
                                                FLAGS.num_train_steps))

    save_checkpoints_steps = 500  #FLAGS.num_train_steps // FLAGS.save_checkpoints_num
    tf.logging.info("setting save checkpoints steps to {:d}".format(
        save_checkpoints_steps))

    FLAGS.iterations_per_loop = min(save_checkpoints_steps,
                                    FLAGS.iterations_per_loop)
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    else:
        tpu_cluster_resolver = None
    # if not FLAGS.use_tpu and FLAGS.num_gpu > 1:
    #   train_distribute = tf.contrib.distribute.MirroredStrategy(
    #       num_gpus=FLAGS.num_gpu)
    # else:
    #   train_distribute = None

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        keep_checkpoint_max=1,
        # train_distribute=train_distribute,
        session_config=config,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            per_host_input_for_training=is_per_host))

    model_fn = uda.model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        clip_norm=FLAGS.clip_norm,
        num_train_steps=FLAGS.num_train_steps,
        num_warmup_steps=FLAGS.num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_one_hot_embeddings,
        num_labels=len(label_list),
        unsup_ratio=FLAGS.unsup_ratio,
        uda_coeff=FLAGS.uda_coeff,
        tsa=FLAGS.tsa,
        print_feature=False,
        print_structure=False,
    )

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        params={"model_dir": FLAGS.model_dir},
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.eval_batch_size)

    if FLAGS.do_train:
        tf.logging.info("  >>> sup data dir : {}".format(
            FLAGS.sup_train_data_dir))
        if FLAGS.unsup_ratio > 0:
            tf.logging.info("  >>> unsup data dir : {}".format(
                FLAGS.unsup_data_dir))

        train_input_fn = proc_data_utils.training_input_fn_builder(
            FLAGS.sup_train_data_dir, FLAGS.unsup_data_dir, FLAGS.aug_ops,
            FLAGS.aug_copy, FLAGS.unsup_ratio)
        train_size = processor.get_train_size(FLAGS.raw_data_dir)
        train_steps = int(train_size / FLAGS.train_batch_size)

    if FLAGS.do_eval:
        tf.logging.info("  >>> dev data dir : {}".format(FLAGS.eval_data_dir))
        eval_input_fn = proc_data_utils.evaluation_input_fn_builder(
            FLAGS.eval_data_dir, "clas")

        eval_size = processor.get_dev_size(FLAGS.raw_data_dir)
        eval_steps = int(eval_size / FLAGS.eval_batch_size)

        train_eval_input_fn = proc_data_utils.evaluation_input_fn_builder(
            FLAGS.sup_train_data_dir, "clas")

    if FLAGS.do_train and FLAGS.do_eval:
        hooks = [hvd.BroadcastGlobalVariablesHook(0)]

        tf.logging.info("***** Running training & evaluation *****")
        tf.logging.info("  Supervised batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Unsupervised batch size = %d",
                        FLAGS.train_batch_size * FLAGS.unsup_ratio)
        tf.logging.info("  training size = %d", train_size)
        tf.logging.info("  training num steps = %d", train_steps)
        tf.logging.info("  evaluation batch size = %d", FLAGS.eval_batch_size)
        tf.logging.info("  dev num steps = %d", eval_steps)
        best_acc = 0
        for _ in range(0, FLAGS.num_train_steps, save_checkpoints_steps):
            tf.logging.info("*** Running training ***")
            estimator.train(input_fn=train_input_fn,
                            steps=save_checkpoints_steps,
                            hooks=hooks)
            tf.logging.info("*** Running evaluation ***")

            train_result = estimator.evaluate(input_fn=train_eval_input_fn,
                                              steps=train_steps)
            tf.logging.info(">> Train Results:")
            for key in train_result.keys():
                tf.logging.info("  %s = %s", key, str(train_result[key]))
                train_result[key] = train_result[key].item()
            dev_result = estimator.evaluate(input_fn=eval_input_fn,
                                            steps=eval_steps)
            tf.logging.info(">> Results:")
            for key in dev_result.keys():
                tf.logging.info("  %s = %s", key, str(dev_result[key]))
                dev_result[key] = dev_result[key].item()
            best_acc = max(best_acc, dev_result["eval_precision"])
        tf.logging.info("***** Final evaluation result *****")
        tf.logging.info("Best acc: {:.3f}\n\n".format(best_acc))
    elif FLAGS.do_train:
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Supervised batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Unsupervised batch size = %d",
                        FLAGS.train_batch_size * FLAGS.unsup_ratio)
        tf.logging.info("  Num steps = %d", FLAGS.num_train_steps)
        estimator.train(input_fn=train_input_fn,
                        max_steps=FLAGS.num_train_steps)
    elif FLAGS.do_eval:
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Base evaluation batch size = %d",
                        FLAGS.eval_batch_size)
        tf.logging.info("  Num steps = %d", eval_steps)
        checkpoint_state = tf.train.get_checkpoint_state(FLAGS.model_dir)

        best_acc = 0
        for ckpt_path in checkpoint_state.all_model_checkpoint_paths:
            if not tf.gfile.Exists(ckpt_path + ".data-00000-of-00001"):
                tf.logging.info(
                    "Warning: checkpoint {:s} does not exist".format(
                        ckpt_path))
                continue
            tf.logging.info("Evaluating {:s}".format(ckpt_path))
            dev_result = estimator.evaluate(
                input_fn=eval_input_fn,
                steps=eval_steps,
                checkpoint_path=ckpt_path,
            )
            tf.logging.info(">> Results:")
            for key in dev_result.keys():
                tf.logging.info("  %s = %s", key, str(dev_result[key]))
                dev_result[key] = dev_result[key].item()
            best_acc = max(best_acc, dev_result["eval_precision"])
        tf.logging.info("***** Final evaluation result *****")
        tf.logging.info("Best acc: {:.3f}\n\n".format(best_acc))
        from utils import tokenization
        tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                               do_lower_case=True)
        id2label = dict(zip([i for i in range(len(label_list))], label_list))
        result = estimator.predict(input_fn=eval_input_fn)
        output_line = ""
        with open("label_test.txt", 'w') as writer:
            for re in result:
                sentence = re["input_ids"]
                gold = re["label_ids"]
                prediction = re["predict"]
                # output_line = "\n".join(id2label[id] for id in prediction if id!=0) + "\n"
                for gold_index, gold_item in enumerate(gold):
                    if gold_item >= 34:
                        gold[gold_index] = 0
                for pred_index, pred_item in enumerate(prediction):
                    if pred_item >= 34:
                        prediction[pred_index] = 0
                for w, gold_label, label in zip(
                        tokenizer.convert_ids_to_tokens([
                            int(s) for s in sentence
                        ]), [id2label[id] for id in gold],
                    [id2label[id] for id in prediction]):
                    if w == "[PAD]":
                        continue
                    #if label=="NEGATIVE":
                    #  continue
                    output_line = output_line + w + " " + gold_label + " " + label + "\n"
                output_line += "\n"
            writer.write(output_line)
Example No. 25
def main(_):
    """
    Builds the model and runs.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    tx.utils.maybe_create_dir(FLAGS.output_dir)
    bert_pretrain_dir = 'bert_pretrained_models/%s' % FLAGS.config_bert_pretrain

    # Loads BERT model configuration
    if FLAGS.config_format_bert == "json":
        bert_config = model_utils.transform_bert_to_texar_config(
            os.path.join(bert_pretrain_dir, 'bert_config.json'))
    elif FLAGS.config_format_bert == 'texar':
        bert_config = importlib.import_module(
            'bert_config_lib.config_model_%s' % FLAGS.config_bert_pretrain)
    else:
        raise ValueError('Unknown config_format_bert.')

    # Loads data
    processors = {
        "cola": data_utils.ColaProcessor,
        "mnli": data_utils.MnliProcessor,
        "mrpc": data_utils.MrpcProcessor,
        "xnli": data_utils.XnliProcessor,
        'sst': data_utils.SSTProcessor
    }

    processor = processors[FLAGS.task.lower()]()

    num_classes = len(processor.get_labels())
    num_train_data = len(processor.get_train_examples(config_data.data_dir))

    tokenizer = tokenization.FullTokenizer(vocab_file=os.path.join(
        bert_pretrain_dir, 'vocab.txt'),
                                           do_lower_case=FLAGS.do_lower_case)

    train_dataset = data_utils.get_dataset(processor,
                                           tokenizer,
                                           config_data.data_dir,
                                           config_data.max_seq_length,
                                           config_data.train_batch_size,
                                           mode='train',
                                           output_dir=FLAGS.output_dir)
    eval_dataset = data_utils.get_dataset(processor,
                                          tokenizer,
                                          config_data.data_dir,
                                          config_data.max_seq_length,
                                          config_data.eval_batch_size,
                                          mode='eval',
                                          output_dir=FLAGS.output_dir)
    test_dataset = data_utils.get_dataset(processor,
                                          tokenizer,
                                          config_data.data_dir,
                                          config_data.max_seq_length,
                                          config_data.test_batch_size,
                                          mode='test',
                                          output_dir=FLAGS.output_dir)

    iterator = tx.data.FeedableDataIterator({
        'train': train_dataset,
        'eval': eval_dataset,
        'test': test_dataset
    })
    batch = iterator.get_next()
    input_ids = batch["input_ids"]
    segment_ids = batch["segment_ids"]
    batch_size = tf.shape(input_ids)[0]
    input_length = tf.reduce_sum(1 - tf.to_int32(tf.equal(input_ids, 0)),
                                 axis=1)

    # Builds BERT
    with tf.variable_scope('bert'):
        embedder = tx.modules.WordEmbedder(vocab_size=bert_config.vocab_size,
                                           hparams=bert_config.embed)
        word_embeds = embedder(input_ids)

        # Creates segment embeddings for each type of tokens.
        segment_embedder = tx.modules.WordEmbedder(
            vocab_size=bert_config.type_vocab_size,
            hparams=bert_config.segment_embed)
        segment_embeds = segment_embedder(segment_ids)

        input_embeds = word_embeds + segment_embeds

        # The BERT model (a TransformerEncoder)
        encoder = tx.modules.TransformerEncoder(hparams=bert_config.encoder)
        output = encoder(input_embeds, input_length)

        # Builds layers for downstream classification, which is also initialized
        # with BERT pre-trained checkpoint.
        with tf.variable_scope("pooler"):
            # Uses the projection of the 1st-step hidden vector of BERT output
            # as the representation of the sentence
            bert_sent_hidden = tf.squeeze(output[:, 0:1, :], axis=1)
            bert_sent_output = tf.layers.dense(bert_sent_hidden,
                                               config_downstream.hidden_dim,
                                               activation=tf.tanh)
            output = tf.layers.dropout(bert_sent_output,
                                       rate=0.1,
                                       training=tx.global_mode_train())

    # Adds the final classification layer
    logits = tf.layers.dense(
        output,
        num_classes,
        kernel_initializer=tf.truncated_normal_initializer(stddev=0.02))
    preds = tf.argmax(logits, axis=-1, output_type=tf.int32)
    accu = tx.evals.accuracy(batch['label_ids'], preds)

    # Optimization

    loss = tf.losses.sparse_softmax_cross_entropy(labels=batch["label_ids"],
                                                  logits=logits)
    global_step = tf.Variable(0, trainable=False)

    # Builds learning rate decay scheduler
    static_lr = config_downstream.lr['static_lr']
    num_train_steps = int(num_train_data / config_data.train_batch_size *
                          config_data.max_train_epoch)
    num_warmup_steps = int(num_train_steps * config_data.warmup_proportion)
    lr = model_utils.get_lr(
        global_step,
        num_train_steps,  # lr is a Tensor
        num_warmup_steps,
        static_lr)

    train_op = tx.core.get_train_op(loss,
                                    global_step=global_step,
                                    learning_rate=lr,
                                    hparams=config_downstream.opt)

    # Train/eval/test routine

    def _run(sess, mode):
        fetches = {
            'accu': accu,
            'batch_size': batch_size,
            'step': global_step,
            'loss': loss,
        }

        if mode == 'train':
            fetches['train_op'] = train_op
            while True:
                try:
                    feed_dict = {
                        iterator.handle: iterator.get_handle(sess, 'train'),
                        tx.global_mode(): tf.estimator.ModeKeys.TRAIN,
                    }
                    rets = sess.run(fetches, feed_dict)
                    if rets['step'] % 50 == 0:
                        tf.logging.info('step:%d loss:%f' %
                                        (rets['step'], rets['loss']))
                    if rets['step'] == num_train_steps:
                        break
                except tf.errors.OutOfRangeError:
                    break

        if mode == 'eval':
            cum_acc = 0.0
            nsamples = 0
            while True:
                try:
                    feed_dict = {
                        iterator.handle: iterator.get_handle(sess, 'eval'),
                        tx.context.global_mode(): tf.estimator.ModeKeys.EVAL,
                    }
                    rets = sess.run(fetches, feed_dict)

                    cum_acc += rets['accu'] * rets['batch_size']
                    nsamples += rets['batch_size']
                except tf.errors.OutOfRangeError:
                    break

            tf.logging.info('dev accu: {}'.format(cum_acc / nsamples))

        if mode == 'test':
            _all_preds = []
            while True:
                try:
                    feed_dict = {
                        iterator.handle: iterator.get_handle(sess, 'test'),
                        tx.context.global_mode():
                        tf.estimator.ModeKeys.PREDICT,
                    }
                    _preds = sess.run(preds, feed_dict=feed_dict)
                    _all_preds.extend(_preds.tolist())
                except tf.errors.OutOfRangeError:
                    break

            output_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
            with tf.gfile.GFile(output_file, "w") as writer:
                writer.write('\n'.join(str(p) for p in _all_preds))

    with tf.Session() as sess:
        # Loads pretrained BERT model parameters
        init_checkpoint = os.path.join(bert_pretrain_dir, 'bert_model.ckpt')
        model_utils.init_bert_checkpoint(init_checkpoint)

        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        sess.run(tf.tables_initializer())

        # Restores trained model if specified
        saver = tf.train.Saver()
        if FLAGS.checkpoint:
            saver.restore(sess, FLAGS.checkpoint)

        iterator.initialize_dataset(sess)

        if FLAGS.do_train:
            iterator.restart_dataset(sess, 'train')
            _run(sess, mode='train')
            saver.save(sess, FLAGS.output_dir + '/model.ckpt')

        if FLAGS.do_eval:
            iterator.restart_dataset(sess, 'eval')
            _run(sess, mode='eval')

        if FLAGS.do_test:
            iterator.restart_dataset(sess, 'test')
            _run(sess, mode='test')
#!/usr/bin/python3
Example No. 27
                                         max_seq_length, tokenizer)

        def create_int_feature(values):
            f = tf.train.Feature(int64_list=tf.train.Int64List(
                value=list(values)))
            return f

        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(feature.input_ids)
        features["input_mask"] = create_int_feature(feature.input_mask)
        features["segment_ids"] = create_int_feature(feature.segment_ids)
        features["label_id"] = create_int_feature([feature.label_id])
        features["is_real_example"] = create_int_feature(
            [int(feature.is_real_example)])

        tf_example = tf.train.Example(features=tf.train.Features(
            feature=features))
        writer.write(tf_example.SerializeToString())
    writer.close()


if __name__ == '__main__':
    max_seq_length = 64
    tokenizer = tokenization.FullTokenizer(
        "/home/geb/PycharmProjects/bert_ngc/vocab_file/albert_zh/vocab.txt")
    sc = SentenceClassifierProcessor()
    examples = sc.get_train_examples("/home/geb/PycharmProjects/bert/data_dir")
    file_based_convert_examples_to_features(
        examples, sc.get_labels(), max_seq_length, tokenizer,
        "../tf_records/sentence_classifier/train.record0")
Example No. 28
        'context_id': context_idxss,
        'question_id': ques_idxss,
        'context_char_id': context_char_idxss,
        'question_char_id': ques_char_idxss,
        'y_start': y1s,
        'y_end': y2s
    }
    print('save to', save_path, len(qids), 'features')
    with open(save_path, 'wb') as f:
        pickle.dump(meta, f)


if __name__ == '__main__':

    # Load tokenizer
    tokenizer = tokenization.FullTokenizer(
        vocab_file='original_data/glove.840B.300d.txt', do_lower_case=False)
    train_examples = read_squad_examples(
        input_file='original_data/train-v1.1.json', is_training=True)
    dev_examples = read_squad_examples(
        input_file='original_data/dev-v1.1.json', is_training=False)

    train_features = convert_examples_to_features(train_examples,
                                                  tokenizer,
                                                  max_seq_length=400,
                                                  max_query_length=50,
                                                  is_training=True)
    dev_features = convert_examples_to_features(dev_examples,
                                                tokenizer,
                                                max_seq_length=400,
                                                max_query_length=50,
                                                is_training=False)
Example No. 29
def proc_and_save_unsup_data_xlnet(
    processor, sub_set,
    raw_data_dir, data_stats_dir, unsup_out_dir,
    tokenize_fn,
    max_seq_length, trunc_keep_right,
    aug_ops, aug_copy_num,
    worker_id, replicas):
  # Print the random seed to double-check that different runs use different seeds,
  # so that the same original example yields different augmented examples.
  random_seed = np.random.randint(0, 100000)
  logging.info("random seed: {:d}".format(random_seed))
  np.random.seed(random_seed)
  logging.info("getting examples")

  if sub_set == "train":
    ori_examples = processor.get_train_examples(raw_data_dir)
  elif sub_set.startswith("unsup"):
    print(sub_set)
    ori_examples = processor.get_unsup_examples(raw_data_dir, sub_set)
  else:
    assert False
  # this is the size before splitting the data for each worker
  data_total_size = len(ori_examples)
  if replicas != -1:
    ori_examples, start, end = get_data_for_worker(
        ori_examples, replicas, worker_id)
  else:
    start = 0
    end = len(ori_examples)

  logging.info("getting augmented examples")
  aug_examples = copy.deepcopy(ori_examples)

  # Doesn't do anything for tf-idf augmentation
  aug_examples = sent_level_augment.run_augment(
      aug_examples, aug_ops, sub_set,
      aug_copy_num,
      start, end, data_total_size)

  labels = processor.get_labels() + ["unsup"]
  logging.info("processing ori examples with labels: {}".format(labels))

  ori_features = file_based_convert_examples_to_features(
      ori_examples, labels, max_seq_length,
      tokenize_fn, num_passes=1)

  tokenized_ori_examples = tokenize_examples(
               ori_examples, tokenization.FullTokenizer(do_lower_case=False))

  if "idf" in aug_ops:
    data_stats = get_data_stats(
        data_stats_dir, sub_set,
        -1, replicas, tokenized_ori_examples)
  else:
    data_stats = None

  logging.info("processing aug examples using aug ops {}".format(aug_ops))

  aug_features = file_based_convert_examples_to_features(
      aug_examples, labels, max_seq_length,
      tokenize_fn, num_passes=1, data_stats=data_stats, aug_ops=aug_ops)

  logging.info("{} Original Features".format(len(ori_features)))
  logging.info("{} Augmented Features".format(len(aug_features)))
  unsup_features = []

  for ori_feat, aug_feat in zip(ori_features, aug_features):
    unsup_features.append(PairedUnsupInputFeaturesXL(
        ori_feat.input_ids,
        ori_feat.input_mask,
        ori_feat.segment_ids,
        ori_feat.is_real_example,
        aug_feat.input_ids,
        aug_feat.input_mask,
        aug_feat.segment_ids,
        aug_feat.is_real_example
        ))
  logging.info("There are {} total unsupervised records".format(len(unsup_features)))
  dump_tfrecord(unsup_features, unsup_out_dir, worker_id)
Example No. 30
import sys, os
sys.path.append(os.getcwd())
import glob
import tensorflow.compat.v1 as tf
import numpy as np
import cv2
import argparse
import time
import traceback
import json
import utils.tokenization as tokenization
from utils.train_util import get_label_name_dict
from src.feats_extract.multimodal_feature_extract import MultiModalFeatureExtract

#################Inference Utils#################
tokenizer = tokenization.FullTokenizer(
    vocab_file='pretrain_models/robert/chinese_L-12_H-768_A-12/vocab.txt')


class TaggingModel():
    def __init__(self, configs):
        tag_id_file = configs.get('tag_id_file', None)
        model_pb = configs.get('model_pb', None)
        if tag_id_file is None:
            raise ValueError("tag_id_file is required in configs")
        else:
            self.label_name_dict = get_label_name_dict(tag_id_file, None)
        if model_pb is None:
            raise ValueError("model_pb is required in configs")
        else:
            config = tf.ConfigProto(allow_soft_placement=True)
            config.gpu_options.allow_growth = True