def __init__(self, args):
        self._tokenizer = tokenization.FullTokenizer(
            vocab_file=args.vocab_path, do_lower_case=args.do_lower_case)
        self._max_seq_length = args.max_seq_len
        self._doc_stride = args.doc_stride
        self._max_query_length = args.max_query_length
        self._in_tokens = args.in_tokens

        self._train_file = args.train_file
        self._predict_file = args.predict_file
        self._batch_size = args.batch_size
        self._with_negative = args.with_negative
        self._epoch = args.epoch
        self._sample_rate = args.sample_rate

        self.vocab = self._tokenizer.vocab
        self.vocab_size = len(self.vocab)
        self.pad_id = self.vocab["[PAD]"]
        self.cls_id = self.vocab["[CLS]"]
        self.sep_id = self.vocab["[SEP]"]
        self.mask_id = self.vocab["[MASK]"]

        self.current_train_example = -1
        self.num_train_examples = -1
        self.current_train_epoch = -1

        self.train_examples = None
        self.predict_examples = None
        self.predict_features = None
        self.num_examples = {'train': -1, 'predict': -1}
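
A minimal, self-contained sketch (not part of the snippet above) of how cached special-token ids like pad_id, cls_id and sep_id are typically assembled into fixed-length inputs; the helper name and the toy ids below are illustrative assumptions.

def build_input(a_ids, b_ids, cls_id, sep_id, pad_id, max_seq_len):
    # [CLS] a [SEP] b [SEP], then pad every field out to max_seq_len
    # (assumes len(a_ids) + len(b_ids) + 3 <= max_seq_len)
    input_ids = [cls_id] + a_ids + [sep_id] + b_ids + [sep_id]
    segment_ids = [0] * (len(a_ids) + 2) + [1] * (len(b_ids) + 1)
    input_mask = [1] * len(input_ids)
    pad_len = max_seq_len - len(input_ids)
    input_ids += [pad_id] * pad_len
    segment_ids += [0] * pad_len
    input_mask += [0] * pad_len
    return input_ids, segment_ids, input_mask

# build_input([7, 8], [9], cls_id=2, sep_id=3, pad_id=0, max_seq_len=8)
# -> ([2, 7, 8, 3, 9, 3, 0, 0], [0, 0, 0, 0, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 0, 0])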
Example No. 2
    def __init__(self, cfg):
        self.cfg = cfg

        self.TaskDataset = dataset_class(cfg.task)
        self.pipeline = None
        if cfg.need_prepro:
            tokenizer = tokenization.FullTokenizer(
                vocab_file=cfg.vocab, do_lower_case=cfg.do_lower_case)
            self.pipeline = [
                Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
                AddSpecialTokensWithTruncation(cfg.max_seq_length),
                TokenIndexing(tokenizer.convert_tokens_to_ids,
                              self.TaskDataset.labels, cfg.max_seq_length)
            ]
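            # stages above: convert to unicode & tokenize -> add [CLS]/[SEP] with truncation to max_seq_length -> map tokens and labels to ids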

        if cfg.mode == 'train':
            self.sup_data_dir = cfg.sup_data_dir
            self.sup_batch_size = cfg.train_batch_size
            self.shuffle = True
        elif cfg.mode == 'train_eval':
            self.sup_data_dir = cfg.sup_data_dir
            self.eval_data_dir = cfg.eval_data_dir
            self.sup_batch_size = cfg.train_batch_size
            self.eval_batch_size = cfg.eval_batch_size
            self.shuffle = True
        elif cfg.mode == 'eval':
            self.sup_data_dir = cfg.eval_data_dir
            self.sup_batch_size = cfg.eval_batch_size
            self.shuffle = False  # do not shuffle in eval mode

        if cfg.uda_mode:  # Only uda_mode
            self.unsup_data_dir = cfg.unsup_data_dir
            self.unsup_batch_size = cfg.train_batch_size * cfg.unsup_ratio
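
For example, with train_batch_size = 8 and unsup_ratio = 3, the unsupervised batch size computed above is 24.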
Example No. 3
def predict_v2():
    """
    Load the frozen (constant) .pb model.
    :return:
    """
    VOCAB_PATH_HZ = '/home/recsys/jixiaozhan/sansu_detect_bert/modelParams/chinese_L-12_H-768_A-12/vocab.txt'
    title = "hide new secretions from the parental units"
    model_file = "/home/jixiaozhan/EasyTransfer/scripts/knowledge_distillation/vanilla_teacher_model/tmp_model/saved_model.pb"
    tokenizer_hz = tokenization.FullTokenizer(vocab_file=VOCAB_PATH_HZ,
                                              do_lower_case=True)
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    with gfile.FastGFile(model_file, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        sess.graph.as_default()
        tf.import_graph_def(graph_def, name='')

    input_ids = sess.graph.get_tensor_by_name("input_ids:0")
    input_mask = sess.graph.get_tensor_by_name("input_mask:0")
    segment_ids = sess.graph.get_tensor_by_name("segment_ids:0")
    predictions = sess.graph.get_tensor_by_name('app/ez_dense/BiasAdd:0')[0]
    example = bert_33.get_input_features(title, tokenizer_hz)
    ret = sess.run(predictions,
                   feed_dict={
                       input_ids: np.array(example['input_ids']),
                       input_mask: np.array(example['input_mask']),
                       segment_ids: np.array(example['segment_ids'])
                   })
Example No. 4
 def build_tokenizer(self, bert_layer):
     '''
     Encodes text into tokens, masks, and segment flags
     :return: tokenization wrapper
     '''
     vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
     do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
     return tokenization.FullTokenizer(vocab_file, do_lower_case)
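
A hedged usage sketch of the wrapper above; the hub URL and the surrounding object name are assumptions, not part of the snippet.

import tensorflow_hub as hub

bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2",
    trainable=False)
# `model` stands for whatever instance defines build_tokenizer (assumed name)
tokenizer = model.build_tokenizer(bert_layer)
tokens = tokenizer.tokenize("hide new secretions from the parental units")
ids = tokenizer.convert_tokens_to_ids(tokens)
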
def main():
    """ Starts the data preparation
    """
    # Loads data
    logging.info("Loading data")

    task_datasets_rename = {
        "COLA": "CoLA",
        "SST": "SST-2",
    }

    data_dir = f'data/{args.task}'
    if args.task.upper() in task_datasets_rename:
        data_dir = f'data/{task_datasets_rename[args.task]}'

    if args.output_dir is None:
        output_dir = data_dir
    else:
        output_dir = args.output_dir
    tx.utils.maybe_create_dir(output_dir)

    processors = {
        "COLA": data_utils.ColaProcessor,
        "MNLI": data_utils.MnliProcessor,
        "MRPC": data_utils.MrpcProcessor,
        "XNLI": data_utils.XnliProcessor,
        'SST': data_utils.SSTProcessor
    }
    processor = processors[args.task]()

    config_data = importlib.import_module(args.config_data)

    pretrained_model_dir = tx.modules.BERTEncoder.download_checkpoint(
        pretrained_model_name=args.pretrained_model_name)

    vocab_file = os.path.join(pretrained_model_dir, "vocab.txt")

    num_classes = len(processor.get_labels())
    num_train_data = len(processor.get_train_examples(data_dir))
    logging.info("num_classes: %d; num_train_data: %d", num_classes,
                 num_train_data)
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=args.lower_case)

    # Produces pickled files
    data_utils.prepare_record_data(
        processor=processor,
        tokenizer=tokenizer,
        data_dir=data_dir,
        max_seq_length=args.max_seq_length,
        output_dir=output_dir,
        feature_original_types=config_data.feature_original_types)
    modify_config_data(args.max_seq_length, num_train_data, num_classes)
Example No. 6
    def __init__(self,
                 vocab_path,
                 label_map_config=None,
                 max_seq_len=512,
                 max_ent_cnt=42,
                 do_lower_case=True,
                 in_tokens=False,
                 is_inference=False,
                 random_seed=None,
                 tokenizer="FullTokenizer",
                 is_classify=True,
                 is_regression=False,
                 for_cn=True,
                 task_id=0):
        self.max_seq_len = max_seq_len
        self.max_ent_cnt = max_ent_cnt
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_path, do_lower_case=do_lower_case)
        self.vocab = self.tokenizer.vocab
        self.pad_id = self.vocab["[PAD]"]
        self.cls_id = self.vocab["[CLS]"]
        self.sep_id = self.vocab["[SEP]"]
        self.in_tokens = in_tokens
        self.is_inference = is_inference
        self.for_cn = for_cn
        self.task_id = task_id

        np.random.seed(random_seed)

        self.is_classify = is_classify
        self.is_regression = is_regression
        self.current_example = 0
        self.current_epoch = 0
        self.num_examples = 0

        if label_map_config:
            with open(label_map_config, encoding='utf8') as f:
                self.label_map = json.load(f)
        else:
            self.label_map = None
        self.ner_map = {'PAD': 0, 'ORG': 1, 'LOC': 2, 'NUM': 3, 'TIME': 4, 'MISC': 5, 'PER': 6}
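        # Log-scale distance buckets: 0->0, 1->1, [2,4)->2, [4,8)->3, [8,16)->4,
        # [16,32)->5, [32,64)->6, [64,128)->7, [128,256)->8, [256,512)->9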
        distance_buckets = np.zeros((512), dtype='int64')
        distance_buckets[1] = 1
        distance_buckets[2:] = 2
        distance_buckets[4:] = 3
        distance_buckets[8:] = 4
        distance_buckets[16:] = 5
        distance_buckets[32:] = 6
        distance_buckets[64:] = 7
        distance_buckets[128:] = 8
        distance_buckets[256:] = 9
        self.distance_buckets = distance_buckets
Example No. 7
def create_tokenizer_from_hub_module(bert_hub_module_handle):
    """Get the vocab file and casing info from the Hub module."""
    with tf.Graph().as_default():
        bert_module = hub.Module(bert_hub_module_handle)
        tokenization_info = bert_module(signature="tokenization_info",
                                        as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run([
                tokenization_info["vocab_file"],
                tokenization_info["do_lower_case"]
            ])
    return tokenization.FullTokenizer(vocab_file=vocab_file,
                                      do_lower_case=do_lower_case)
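
A hedged usage sketch; the TF-Hub handle is an assumption, and the TF1-style hub.Module / tf.Session API used inside the function must be available.

BERT_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"  # assumed handle
tokenizer = create_tokenizer_from_hub_module(BERT_HUB)
print(tokenizer.tokenize("hide new secretions from the parental units"))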
Example No. 8
    def __init__(self, args):

        self.train_file = args.train_file
        self.max_seq_len = args.max_seq_len
        self.batch_size = args.batch_size
        self.epoch = args.epoch
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=args.vocab_path, do_lower_case=args.do_lower_case)
        self.vocab = self.tokenizer.vocab
        self.in_tokens = args.in_tokens

        self.current_train_example = -1
        self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
        self.current_train_epoch = -1
Example No. 9
def prepare_data():
    """
    Prepares the data and produces TFRecord files.
    """
    # Loads data
    logging.info("Loading data")

    task_datasets_rename = {
        "COLA": "CoLA",
        "SST": "SST-2",
    }

    data_dir = f'data/{args.task}'
    if args.task.upper() in task_datasets_rename:
        data_dir = f'data/{task_datasets_rename[args.task]}'

    if args.tfrecord_output_dir is None:
        tfrecord_output_dir = data_dir
    else:
        tfrecord_output_dir = args.tfrecord_output_dir
    tx.utils.maybe_create_dir(tfrecord_output_dir)

    processors = {
        "COLA": data_utils.ColaProcessor,
        "MNLI": data_utils.MnliProcessor,
        "MRPC": data_utils.MrpcProcessor,
        "XNLI": data_utils.XnliProcessor,
        'SST': data_utils.SSTProcessor
    }
    processor = processors[args.task]()

    from config_data import feature_original_types

    num_classes = len(processor.get_labels())
    num_train_data = len(processor.get_train_examples(data_dir))
    logging.info("num_classes: %d; num_train_data: %d", num_classes,
                 num_train_data)
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    # Produces TFRecord files
    data_utils.prepare_record_data(
        processor=processor,
        tokenizer=tokenizer,
        data_dir=data_dir,
        max_seq_length=args.max_seq_length,
        output_dir=tfrecord_output_dir,
        feature_original_types=feature_original_types)
    modify_config_data(args.max_seq_length, num_train_data, num_classes)
Example No. 10
def predict():
    """
    Load the SavedModel (.pb with variables).
    :return:
    """
    VOCAB_PATH_HZ = '/home/recsys/jixiaozhan/sansu_detect_bert/modelParams/chinese_L-12_H-768_A-12/vocab.txt'
    title = "hide new secretions from the parental units"
    MODEL_V2 = "/home/jixiaozhan/EasyTransfer/scripts/knowledge_distillation/vanilla_teacher_model/1607392874"
    tokenizer_hz = tokenization.FullTokenizer(vocab_file=VOCAB_PATH_HZ,
                                              do_lower_case=True)
    example = bert_33.get_input_features(title, tokenizer_hz)
    label_id = example.pop('label_ids')
    example['label_id'] = label_id

    predict_fn_hz_v2 = tf.contrib.predictor.from_saved_model(MODEL_V2)
    predict_pro_list2 = predict_fn_hz_v2(example)
Example No. 11
def main(_):

    if FLAGS.max_seq_length > 512:
        raise ValueError(
            "Cannot use sequence length {:d} because the BERT model "
            "was only trained up to sequence length {:d}".format(
                FLAGS.max_seq_length, 512))

    processor = raw_data_utils.get_processor(FLAGS.task_name)
    # Create tokenizer
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    if FLAGS.data_type == "sup":
        sup_out_dir = FLAGS.output_base_dir
        tf.logging.info("Create sup. data: subset {} => {}".format(
            FLAGS.sub_set, sup_out_dir))

        proc_and_save_sup_data(
            processor,
            FLAGS.sub_set,
            FLAGS.raw_data_dir,
            sup_out_dir,
            tokenizer,
            FLAGS.max_seq_length,
            FLAGS.trunc_keep_right,
            FLAGS.worker_id,
            FLAGS.replicas,
            FLAGS.sup_size,
        )
    elif FLAGS.data_type == "unsup":
        assert FLAGS.aug_ops is not None, \
            "aug_ops is required to preprocess unsupervised data."
        unsup_out_dir = os.path.join(FLAGS.output_base_dir, FLAGS.aug_ops,
                                     str(FLAGS.aug_copy_num))
        data_stats_dir = os.path.join(FLAGS.raw_data_dir, "data_stats")

        tf.logging.info("Create unsup. data: subset {} => {}".format(
            FLAGS.sub_set, unsup_out_dir))
        proc_and_save_unsup_data(processor, FLAGS.sub_set, FLAGS.raw_data_dir,
                                 data_stats_dir, unsup_out_dir, tokenizer,
                                 FLAGS.max_seq_length, FLAGS.trunc_keep_right,
                                 FLAGS.aug_ops, FLAGS.aug_copy_num,
                                 FLAGS.worker_id, FLAGS.replicas,
                                 FLAGS.input_file)
Example No. 12
def prepare_data():
    """
    Prepares the data and produces TFRecord files.
    """
    # Loads data
    tf.logging.info("Loading data")

    task_datasets_rename = {
        "COLA": "CoLA",
        "SST": "SST-2",
    }

    data_dir = 'data/{}'.format(FLAGS.task)
    if FLAGS.task.upper() in task_datasets_rename:
        data_dir = 'data/{}'.format(task_datasets_rename[FLAGS.task])

    if FLAGS.tfrecords_output_dir is None:
        tfrecords_output_dir = data_dir
    else:
        tfrecords_output_dir = FLAGS.tfrecords_output_dir
    tx.utils.maybe_create_dir(tfrecords_output_dir)

    processors = {
        "COLA": data_utils.ColaProcessor,
        "MNLI": data_utils.MnliProcessor,
        "MRPC": data_utils.MrpcProcessor,
        "XNLI": data_utils.XnliProcessor,
        'SST': data_utils.SSTProcessor
    }
    processor = processors[FLAGS.task]()

    num_classes = len(processor.get_labels())
    num_train_data = len(processor.get_train_examples(data_dir))
    tf.logging.info('num_classes:%d; num_train_data:%d' %
                    (num_classes, num_train_data))
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    # Produces TFRecords files
    data_utils.prepare_TFRecord_data(processor=processor,
                                     tokenizer=tokenizer,
                                     data_dir=data_dir,
                                     max_seq_length=FLAGS.max_seq_length,
                                     output_dir=tfrecords_output_dir)
    modify_config_data(FLAGS.max_seq_length, num_train_data, num_classes)
Example No. 13
def process_unsgetext(text: str, vocab_file, do_lower_case=True):
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
    tokens_ = tokenizer.tokenize(text)
    if len(tokens_) + 2 > seq_length:
        tokens_ = tokens_[:seq_length - 2]
    tokens = ["[CLS]"] + tokens_ + ["[SEP]"]
    n = len(tokens)
    seg_ids = [0] * n
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * n
    if n < seq_length:
        seg_ids = seg_ids + [0] * (seq_length - n)
        input_ids = input_ids + [0] * (seq_length - n)
        input_mask = input_mask + [0] * (seq_length - n)
    assert len(seg_ids) == seq_length and len(input_ids) == seq_length and len(
        input_mask) == seq_length
    return InputFeature(input_ids, input_mask, seg_ids)
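
A hedged usage sketch of process_unsgetext; `seq_length` is read from module scope in the function above and `InputFeature` is defined elsewhere, so the values here are assumptions.

seq_length = 128  # assumed module-level constant consumed by process_unsgetext
feature = process_unsgetext("hide new secretions from the parental units",
                            vocab_file="vocab.txt",  # assumed path to a BERT vocab
                            do_lower_case=True)
assert len(feature.input_ids) == seq_length  # padded/truncated to seq_length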
Example No. 14
def file_based_convert_examples_to_features(
    examples, label_list, max_seq_length, tokenize_fn, num_passes=1,
    data_stats=None, aug_ops=None):
  """
  This function preprocesses data for XLNet.
  It converts a set of `InputExample`s into features.
  """


  if num_passes > 1:
    examples *= num_passes

  if FLAGS.xlnet and aug_ops:
    logging.info("XLNet Model")
    examples = tokenize_examples(
                 examples, tokenization.FullTokenizer(do_lower_case=False))
    logging.info("building vocab")
    word_vocab = build_vocab(examples)
    logging.info("augmenting data using {}".format(aug_ops))
    examples = word_level_augment.word_level_augment(
      examples, aug_ops, word_vocab, data_stats
    )

  features = []
  for (ex_index, example) in enumerate(examples):
    if ex_index % 5000 == 0:
      tf.logging.info("Writing example {} of {}".format(ex_index,
                                                        len(examples)))

    feature = convert_single_example(ex_index, example, label_list,
                                     max_seq_length, tokenize_fn, aug_ops)


    features.append(
       InputFeaturesXL(
           input_ids=feature.input_ids,
           input_mask=feature.input_mask,
           segment_ids=feature.segment_ids,
           label_ids=feature.label_id,
           is_real_example=int(feature.is_real_example)))

  return features
Example No. 15
    def test_full_tokenizer(self):
        vocab_tokens = [
            "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
            "runn", "##ing", ","
        ]
        with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
            if six.PY2:
                vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
            else:
                vocab_writer.write("".join([x + "\n" for x in vocab_tokens
                                            ]).encode("utf-8"))

            vocab_file = vocab_writer.name

        tokenizer = tokenization.FullTokenizer(vocab_file)
        os.unlink(vocab_file)

        tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
        self.assertAllEqual(tokens,
                            ["un", "##want", "##ed", ",", "runn", "##ing"])

        self.assertAllEqual(tokenizer.convert_tokens_to_ids(tokens),
                            [7, 4, 5, 10, 8, 9])
Example No. 16
use_one_hot_embeddings = False
params_senteval['classifier'] = {
    'nhid': nhid,
    'optim': 'adam',
    'batch_size': 64,
    'tenacity': 5,
    'epoch_size': 4
}

tf.logging.set_verbosity(tf.logging.INFO)

layer_indexes = [layers]

bert_config = modeling.BertConfig.from_json_file(bert_config_file)

tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=do_lower_case)

is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.contrib.tpu.RunConfig(
    master=master,
    tpu_config=tf.contrib.tpu.TPUConfig(
        num_shards=num_tpu_cores, per_host_input_for_training=is_per_host))
#####bert


class InputExample(object):
    def __init__(self, unique_id, text_a, text_b):
        self.unique_id = unique_id
        self.text_a = text_a
        self.text_b = text_b
Example No. 17
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "xnli": XnliProcessor,
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on. These do NOT count towards the metric (all tf.metrics
            # support a per-instance weight, and these get a weight of 0.0).
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, predict_file)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)

        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                probabilities = prediction["probabilities"]
                if i >= num_actual_predict_examples:
                    break
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples
Example No. 18
def main(_):


  if FLAGS.max_seq_length > 512:
    raise ValueError(
        "Cannot use sequence length {:d} because the BERT model "
        "was only trained up to sequence length {:d}".format(
            FLAGS.max_seq_length, 512))

  processor = raw_data_utils.get_processor(FLAGS.task_name)

  if not FLAGS.xlnet:
    # Create tokenizer
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
  else:
    sp = spm.SentencePieceProcessor()
    sp.Load(FLAGS.spiece_model_file)
    def tokenize_fn(text):
      text = preprocess_text(text, lower=False)
      return encode_ids(sp, text)

  if FLAGS.data_type == "sup":
    sup_out_dir = FLAGS.output_base_dir
    logging.info("Create sup. data: subset {} => {}".format(
        FLAGS.sub_set, sup_out_dir))
    if FLAGS.xlnet:

      proc_and_save_sup_data_xlnet(
          processor, FLAGS.sub_set, FLAGS.raw_data_dir, sup_out_dir,
          tokenize_fn, FLAGS.max_seq_length, FLAGS.trunc_keep_right,
          FLAGS.worker_id, FLAGS.replicas, FLAGS.sup_size,
      )

    else:
      proc_and_save_sup_data(
          processor, FLAGS.sub_set, FLAGS.raw_data_dir, sup_out_dir,
          tokenizer, FLAGS.max_seq_length, FLAGS.trunc_keep_right,
          FLAGS.worker_id, FLAGS.replicas, FLAGS.sup_size,
      )
  elif FLAGS.data_type == "unsup":
    assert FLAGS.aug_ops is not None, \
        "aug_ops is required to preprocess unsupervised data."
    unsup_out_dir = os.path.join(
        FLAGS.output_base_dir,
        FLAGS.aug_ops,
        str(FLAGS.aug_copy_num))
    data_stats_dir = os.path.join(FLAGS.raw_data_dir, "data_stats")


    logging.info("Create unsup. data: subset {} => {}".format(
        FLAGS.sub_set, unsup_out_dir))
    if FLAGS.xlnet:
      proc_and_save_unsup_data_xlnet(
            processor, FLAGS.sub_set,
            FLAGS.raw_data_dir, data_stats_dir, unsup_out_dir,
            tokenize_fn, FLAGS.max_seq_length, FLAGS.trunc_keep_right,
            FLAGS.aug_ops, FLAGS.aug_copy_num,
            FLAGS.worker_id, FLAGS.replicas)
    else:
      proc_and_save_unsup_data(
            processor, FLAGS.sub_set,
            FLAGS.raw_data_dir, data_stats_dir, unsup_out_dir,
            tokenizer, FLAGS.max_seq_length, FLAGS.trunc_keep_right,
            FLAGS.aug_ops, FLAGS.aug_copy_num,
            FLAGS.worker_id, FLAGS.replicas)
Example No. 19
    args.n_gpu = 1

    # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
    torch.distributed.init_process_group(backend='nccl')

# Log GPU information
logger.add_text('info', f"args: {args}")

# Modify batch size if accumulating gradients
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

# Reproducibility
utils.set_seeds(args.seed, multi_gpu=args.n_gpu > 0)

# Build dataloaders
tokenizer = tokenization.FullTokenizer(args.vocab,
                                       do_lower_case=args.do_lower_case)
tokenize = lambda x: tokenizer.tokenize(tokenizer.convert_to_unicode(x))
pipeline = [
    PipelineForPretrain(
        max_pred=20,  # what is this?
        mask_prob=0.15,  # actually this does nothing
        vocab_words=list(tokenizer.vocab.keys()),  # 
        indexer=tokenizer.convert_tokens_to_ids,
        max_len=args.max_seq_length)
]
dataloader = SentencePairDataLoader(args.text_file,
                                    batch_size=args.train_batch_size,
                                    tokenize=tokenize,
                                    max_len=args.max_seq_length,
                                    pipeline=pipeline)
Example No. 20
                        help='sequence length (default: 32)')
    parser.add_argument('--input_file',
                        type=str,
                        default="",
                        metavar='STRING',
                        help='input file path')
    parser.add_argument('--vocab_file',
                        type=str,
                        default="",
                        metavar='STRING',
                        help='vocab file path')
    args = parser.parse_args()

    input_file = args.input_file
    max_seq_length = args.max_seq_length
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=True)
    with tf.gfile.Open(input_file, "r") as f:
        reader = csv.reader(f, delimiter="\t")
        for line in reader:
            text_a = tokenization.convert_to_unicode(line[3])
            text_b = tokenization.convert_to_unicode(line[4])

            a_input_ids, a_input_mask, a_segment_ids = convert_single_example(
                text_a,
                None,
                max_seq_length=max_seq_length,
                tokenizer=tokenizer)
            b_input_ids, b_input_mask, b_segment_ids = convert_single_example(
                text_b,
                None,
                max_seq_length=max_seq_length,
Example No. 21
 def __init__(self, vocab_file, do_lower_case, max_seq_len):
     self.tokenizer = tokenization.FullTokenizer(
         vocab_file=vocab_file, do_lower_case=do_lower_case)
     self.max_seq_len = max_seq_len
Example No. 22
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), 'y_end': array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)}
    '''
    with open(save_path, 'wb') as f:
        pickle.dump(meta, f)


if __name__ == '__main__':

    # Load tokenizer
    tokenizer = tokenization.FullTokenizer(vocab_file='glove.42B.300d.txt1',
                                           do_lower_case=False)

    train_examples = read_squad_examples(
        input_file='original_data/train_sample.json', is_training=True)
    dev_examples = read_squad_examples(
        input_file='original_data/train_sample.json', is_training=False)

    train_features = convert_examples_to_features(train_examples,
                                                  tokenizer,
                                                  max_seq_length=400,
                                                  max_query_length=50,
                                                  is_training=True)
    dev_features = convert_examples_to_features(dev_examples,
                                                tokenizer,
                                                max_seq_length=400,
                                                max_query_length=50,
Example No. 23
def define_train_eval_input_fn():

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mismnli": MisMnliProcessor,
        "mrpc": MrpcProcessor,
        "rte": RteProcessor,
        "sst-2": Sst2Processor,
        "sts-b": StsbProcessor,
        "qqp": QqpProcessor,
        "qnli": QnliProcessor,
        "wnli": WnliProcessor,
    }
    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None

    train_examples = processor.get_train_examples(FLAGS.data_dir)
    num_train_steps = int(
        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
    if not tf.gfile.Exists(train_file):
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_file)
    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num examples = %d", len(train_examples))
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)
    train_input_fn = file_based_input_fn_builder(
        input_file=train_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)

    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
    num_actual_eval_examples = len(eval_examples)

    eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
    if not tf.gfile.Exists(eval_file):
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file)

    tf.logging.info("***** Running evaluation *****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(eval_examples), num_actual_eval_examples,
                    len(eval_examples) - num_actual_eval_examples)
    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

    # This tells the estimator to run through the entire set.
    eval_steps = None
    # However, if running eval on the TPU, you will need to specify the
    # number of steps.
    if FLAGS.use_tpu:
        assert len(eval_examples) % FLAGS.eval_batch_size == 0
        eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

    eval_drop_remainder = True if FLAGS.use_tpu else False
    eval_input_fn = file_based_input_fn_builder(
        input_file=eval_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=eval_drop_remainder)

    return label_list, train_input_fn, num_train_steps, eval_input_fn, eval_steps, num_warmup_steps
Example No. 24
def main(_):
    hvd.init()
    FLAGS.model_dir = FLAGS.model_dir if hvd.rank() == 0 else os.path.join(
        FLAGS.model_dir, str(hvd.rank()))
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    #FLAGS.num_train_steps = FLAGS.num_train_steps // hvd.size()
    #FLAGS.num_warmup_steps = FLAGS.num_warmup_steps // hvd.size()

    tf.logging.set_verbosity(tf.logging.INFO)

    processor = raw_data_utils.get_processor(FLAGS.task_name)
    label_list = processor.get_labels()

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file,
                                                     FLAGS.model_dropout)

    tf.gfile.MakeDirs(FLAGS.model_dir)

    flags_dict = tf.app.flags.FLAGS.flag_values_dict()
    with tf.gfile.Open(os.path.join(FLAGS.model_dir, "FLAGS.json"),
                       "w") as ouf:
        json.dump(flags_dict, ouf)

    tf.logging.info("warmup steps {}/{}".format(FLAGS.num_warmup_steps,
                                                FLAGS.num_train_steps))

    save_checkpoints_steps = 500  #FLAGS.num_train_steps // FLAGS.save_checkpoints_num
    tf.logging.info("setting save checkpoints steps to {:d}".format(
        save_checkpoints_steps))

    FLAGS.iterations_per_loop = min(save_checkpoints_steps,
                                    FLAGS.iterations_per_loop)
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    else:
        tpu_cluster_resolver = None
    # if not FLAGS.use_tpu and FLAGS.num_gpu > 1:
    #   train_distribute = tf.contrib.distribute.MirroredStrategy(
    #       num_gpus=FLAGS.num_gpu)
    # else:
    #   train_distribute = None

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        keep_checkpoint_max=1,
        # train_distribute=train_distribute,
        session_config=config,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            per_host_input_for_training=is_per_host))

    model_fn = uda.model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        clip_norm=FLAGS.clip_norm,
        num_train_steps=FLAGS.num_train_steps,
        num_warmup_steps=FLAGS.num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_one_hot_embeddings,
        num_labels=len(label_list),
        unsup_ratio=FLAGS.unsup_ratio,
        uda_coeff=FLAGS.uda_coeff,
        tsa=FLAGS.tsa,
        print_feature=False,
        print_structure=False,
    )

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        params={"model_dir": FLAGS.model_dir},
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.eval_batch_size)

    if FLAGS.do_train:
        tf.logging.info("  >>> sup data dir : {}".format(
            FLAGS.sup_train_data_dir))
        if FLAGS.unsup_ratio > 0:
            tf.logging.info("  >>> unsup data dir : {}".format(
                FLAGS.unsup_data_dir))

        train_input_fn = proc_data_utils.training_input_fn_builder(
            FLAGS.sup_train_data_dir, FLAGS.unsup_data_dir, FLAGS.aug_ops,
            FLAGS.aug_copy, FLAGS.unsup_ratio)
        train_size = processor.get_train_size(FLAGS.raw_data_dir)
        train_steps = int(train_size / FLAGS.train_batch_size)

    if FLAGS.do_eval:
        tf.logging.info("  >>> dev data dir : {}".format(FLAGS.eval_data_dir))
        eval_input_fn = proc_data_utils.evaluation_input_fn_builder(
            FLAGS.eval_data_dir, "clas")

        eval_size = processor.get_dev_size(FLAGS.raw_data_dir)
        eval_steps = int(eval_size / FLAGS.eval_batch_size)

        train_eval_input_fn = proc_data_utils.evaluation_input_fn_builder(
            FLAGS.sup_train_data_dir, "clas")

    if FLAGS.do_train and FLAGS.do_eval:
        hooks = [hvd.BroadcastGlobalVariablesHook(0)]

        tf.logging.info("***** Running training & evaluation *****")
        tf.logging.info("  Supervised batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Unsupervised batch size = %d",
                        FLAGS.train_batch_size * FLAGS.unsup_ratio)
        tf.logging.info("  training size = %d", train_size)
        tf.logging.info("  training num steps = %d", train_steps)
        tf.logging.info("  evaluation batch size = %d", FLAGS.eval_batch_size)
        tf.logging.info("  dev num steps = %d", eval_steps)
        best_acc = 0
        for _ in range(0, FLAGS.num_train_steps, save_checkpoints_steps):
            tf.logging.info("*** Running training ***")
            estimator.train(input_fn=train_input_fn,
                            steps=save_checkpoints_steps,
                            hooks=hooks)
            tf.logging.info("*** Running evaluation ***")

            train_result = estimator.evaluate(input_fn=train_eval_input_fn,
                                              steps=train_steps)
            tf.logging.info(">> Train Results:")
            for key in train_result.keys():
                tf.logging.info("  %s = %s", key, str(train_result[key]))
                train_result[key] = train_result[key].item()
            dev_result = estimator.evaluate(input_fn=eval_input_fn,
                                            steps=eval_steps)
            tf.logging.info(">> Results:")
            for key in dev_result.keys():
                tf.logging.info("  %s = %s", key, str(dev_result[key]))
                dev_result[key] = dev_result[key].item()
            best_acc = max(best_acc, dev_result["eval_precision"])
        tf.logging.info("***** Final evaluation result *****")
        tf.logging.info("Best acc: {:.3f}\n\n".format(best_acc))
    elif FLAGS.do_train:
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Supervised batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Unsupervised batch size = %d",
                        FLAGS.train_batch_size * FLAGS.unsup_ratio)
        tf.logging.info("  Num steps = %d", FLAGS.num_train_steps)
        estimator.train(input_fn=train_input_fn,
                        max_steps=FLAGS.num_train_steps)
    elif FLAGS.do_eval:
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Base evaluation batch size = %d",
                        FLAGS.eval_batch_size)
        tf.logging.info("  Num steps = %d", eval_steps)
        checkpoint_state = tf.train.get_checkpoint_state(FLAGS.model_dir)

        best_acc = 0
        for ckpt_path in checkpoint_state.all_model_checkpoint_paths:
            if not tf.gfile.Exists(ckpt_path + ".data-00000-of-00001"):
                tf.logging.info(
                    "Warning: checkpoint {:s} does not exist".format(
                        ckpt_path))
                continue
            tf.logging.info("Evaluating {:s}".format(ckpt_path))
            dev_result = estimator.evaluate(
                input_fn=eval_input_fn,
                steps=eval_steps,
                checkpoint_path=ckpt_path,
            )
            tf.logging.info(">> Results:")
            for key in dev_result.keys():
                tf.logging.info("  %s = %s", key, str(dev_result[key]))
                dev_result[key] = dev_result[key].item()
            best_acc = max(best_acc, dev_result["eval_precision"])
        tf.logging.info("***** Final evaluation result *****")
        tf.logging.info("Best acc: {:.3f}\n\n".format(best_acc))
        from utils import tokenization
        tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                               do_lower_case=True)
        id2label = dict(zip([i for i in range(len(label_list))], label_list))
        result = estimator.predict(input_fn=eval_input_fn)
        output_line = ""
        with open("label_test.txt", 'w') as writer:
            for re in result:
                sentence = re["input_ids"]
                gold = re["label_ids"]
                prediction = re["predict"]
                # output_line = "\n".join(id2label[id] for id in prediction if id!=0) + "\n"
                for gold_index, gold_item in enumerate(gold):
                    if gold_item >= 34:
                        gold[gold_index] = 0
                for pred_index, pred_item in enumerate(prediction):
                    if pred_item >= 34:
                        prediction[pred_index] = 0
                for w, gold_label, label in zip(
                        tokenizer.convert_ids_to_tokens([
                            int(s) for s in sentence
                        ]), [id2label[id] for id in gold],
                    [id2label[id] for id in prediction]):
                    if w == "[PAD]":
                        continue
                    #if label=="NEGATIVE":
                    #  continue
                    output_line = output_line + w + " " + gold_label + " " + label + "\n"
                output_line += "\n"
            writer.write(output_line)
Example No. 25
def main(_):
    """
    Builds the model and runs.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    tx.utils.maybe_create_dir(FLAGS.output_dir)
    bert_pretrain_dir = 'bert_pretrained_models/%s' % FLAGS.config_bert_pretrain

    # Loads BERT model configuration
    if FLAGS.config_format_bert == "json":
        bert_config = model_utils.transform_bert_to_texar_config(
            os.path.join(bert_pretrain_dir, 'bert_config.json'))
    elif FLAGS.config_format_bert == 'texar':
        bert_config = importlib.import_module(
            'bert_config_lib.config_model_%s' % FLAGS.config_bert_pretrain)
    else:
        raise ValueError('Unknown config_format_bert.')

    # Loads data
    processors = {
        "cola": data_utils.ColaProcessor,
        "mnli": data_utils.MnliProcessor,
        "mrpc": data_utils.MrpcProcessor,
        "xnli": data_utils.XnliProcessor,
        'sst': data_utils.SSTProcessor
    }

    processor = processors[FLAGS.task.lower()]()

    num_classes = len(processor.get_labels())
    num_train_data = len(processor.get_train_examples(config_data.data_dir))

    tokenizer = tokenization.FullTokenizer(vocab_file=os.path.join(
        bert_pretrain_dir, 'vocab.txt'),
                                           do_lower_case=FLAGS.do_lower_case)

    train_dataset = data_utils.get_dataset(processor,
                                           tokenizer,
                                           config_data.data_dir,
                                           config_data.max_seq_length,
                                           config_data.train_batch_size,
                                           mode='train',
                                           output_dir=FLAGS.output_dir)
    eval_dataset = data_utils.get_dataset(processor,
                                          tokenizer,
                                          config_data.data_dir,
                                          config_data.max_seq_length,
                                          config_data.eval_batch_size,
                                          mode='eval',
                                          output_dir=FLAGS.output_dir)
    test_dataset = data_utils.get_dataset(processor,
                                          tokenizer,
                                          config_data.data_dir,
                                          config_data.max_seq_length,
                                          config_data.test_batch_size,
                                          mode='test',
                                          output_dir=FLAGS.output_dir)

    iterator = tx.data.FeedableDataIterator({
        'train': train_dataset,
        'eval': eval_dataset,
        'test': test_dataset
    })
    batch = iterator.get_next()
    input_ids = batch["input_ids"]
    segment_ids = batch["segment_ids"]
    batch_size = tf.shape(input_ids)[0]
    input_length = tf.reduce_sum(1 - tf.to_int32(tf.equal(input_ids, 0)),
                                 axis=1)

    # Builds BERT
    with tf.variable_scope('bert'):
        embedder = tx.modules.WordEmbedder(vocab_size=bert_config.vocab_size,
                                           hparams=bert_config.embed)
        word_embeds = embedder(input_ids)

        # Creates segment embeddings for each type of tokens.
        segment_embedder = tx.modules.WordEmbedder(
            vocab_size=bert_config.type_vocab_size,
            hparams=bert_config.segment_embed)
        segment_embeds = segment_embedder(segment_ids)

        input_embeds = word_embeds + segment_embeds

        # The BERT model (a TransformerEncoder)
        encoder = tx.modules.TransformerEncoder(hparams=bert_config.encoder)
        output = encoder(input_embeds, input_length)

        # Builds layers for downstream classification, which is also initialized
        # with BERT pre-trained checkpoint.
        with tf.variable_scope("pooler"):
            # Uses the projection of the 1st-step hidden vector of BERT output
            # as the representation of the sentence
            bert_sent_hidden = tf.squeeze(output[:, 0:1, :], axis=1)
            bert_sent_output = tf.layers.dense(bert_sent_hidden,
                                               config_downstream.hidden_dim,
                                               activation=tf.tanh)
            output = tf.layers.dropout(bert_sent_output,
                                       rate=0.1,
                                       training=tx.global_mode_train())

    # Adds the final classification layer
    logits = tf.layers.dense(
        output,
        num_classes,
        kernel_initializer=tf.truncated_normal_initializer(stddev=0.02))
    preds = tf.argmax(logits, axis=-1, output_type=tf.int32)
    accu = tx.evals.accuracy(batch['label_ids'], preds)

    # Optimization

    loss = tf.losses.sparse_softmax_cross_entropy(labels=batch["label_ids"],
                                                  logits=logits)
    global_step = tf.Variable(0, trainable=False)

    # Builds learning rate decay scheduler
    static_lr = config_downstream.lr['static_lr']
    num_train_steps = int(num_train_data / config_data.train_batch_size *
                          config_data.max_train_epoch)
    num_warmup_steps = int(num_train_steps * config_data.warmup_proportion)
    lr = model_utils.get_lr(
        global_step,
        num_train_steps,  # lr is a Tensor
        num_warmup_steps,
        static_lr)

    train_op = tx.core.get_train_op(loss,
                                    global_step=global_step,
                                    learning_rate=lr,
                                    hparams=config_downstream.opt)

    # Train/eval/test routine

    def _run(sess, mode):
        fetches = {
            'accu': accu,
            'batch_size': batch_size,
            'step': global_step,
            'loss': loss,
        }

        if mode == 'train':
            fetches['train_op'] = train_op
            while True:
                try:
                    feed_dict = {
                        iterator.handle: iterator.get_handle(sess, 'train'),
                        tx.global_mode(): tf.estimator.ModeKeys.TRAIN,
                    }
                    rets = sess.run(fetches, feed_dict)
                    if rets['step'] % 50 == 0:
                        tf.logging.info('step:%d loss:%f' %
                                        (rets['step'], rets['loss']))
                    if rets['step'] == num_train_steps:
                        break
                except tf.errors.OutOfRangeError:
                    break

        if mode == 'eval':
            cum_acc = 0.0
            nsamples = 0
            while True:
                try:
                    feed_dict = {
                        iterator.handle: iterator.get_handle(sess, 'eval'),
                        tx.context.global_mode(): tf.estimator.ModeKeys.EVAL,
                    }
                    rets = sess.run(fetches, feed_dict)

                    cum_acc += rets['accu'] * rets['batch_size']
                    nsamples += rets['batch_size']
                except tf.errors.OutOfRangeError:
                    break

            tf.logging.info('dev accu: {}'.format(cum_acc / nsamples))

        if mode == 'test':
            _all_preds = []
            while True:
                try:
                    feed_dict = {
                        iterator.handle: iterator.get_handle(sess, 'test'),
                        tx.context.global_mode():
                        tf.estimator.ModeKeys.PREDICT,
                    }
                    _preds = sess.run(preds, feed_dict=feed_dict)
                    _all_preds.extend(_preds.tolist())
                except tf.errors.OutOfRangeError:
                    break

            output_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
            with tf.gfile.GFile(output_file, "w") as writer:
                writer.write('\n'.join(str(p) for p in _all_preds))

    with tf.Session() as sess:
        # Loads pretrained BERT model parameters
        init_checkpoint = os.path.join(bert_pretrain_dir, 'bert_model.ckpt')
        model_utils.init_bert_checkpoint(init_checkpoint)

        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        sess.run(tf.tables_initializer())

        # Restores trained model if specified
        saver = tf.train.Saver()
        if FLAGS.checkpoint:
            saver.restore(sess, FLAGS.checkpoint)

        iterator.initialize_dataset(sess)

        if FLAGS.do_train:
            iterator.restart_dataset(sess, 'train')
            _run(sess, mode='train')
            saver.save(sess, FLAGS.output_dir + '/model.ckpt')

        if FLAGS.do_eval:
            iterator.restart_dataset(sess, 'eval')
            _run(sess, mode='eval')

        if FLAGS.do_test:
            iterator.restart_dataset(sess, 'test')
            _run(sess, mode='test')
#!/usr/bin/python3
Example No. 27
                                         max_seq_length, tokenizer)

        def create_int_feature(values):
            f = tf.train.Feature(int64_list=tf.train.Int64List(
                value=list(values)))
            return f

        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(feature.input_ids)
        features["input_mask"] = create_int_feature(feature.input_mask)
        features["segment_ids"] = create_int_feature(feature.segment_ids)
        features["label_id"] = create_int_feature([feature.label_id])
        features["is_real_example"] = create_int_feature(
            [int(feature.is_real_example)])

        tf_example = tf.train.Example(features=tf.train.Features(
            feature=features))
        writer.write(tf_example.SerializeToString())
    writer.close()


if __name__ == '__main__':
    max_seq_length = 64
    tokenizer = tokenization.FullTokenizer(
        "/home/geb/PycharmProjects/bert_ngc/vocab_file/albert_zh/vocab.txt")
    sc = SentenceClassifierProcessor()
    examples = sc.get_train_examples("/home/geb/PycharmProjects/bert/data_dir")
    file_based_convert_examples_to_features(
        examples, sc.get_labels(), max_seq_length, tokenizer,
        "../tf_records/sentence_classifier/train.record0")
Example No. 28
        'context_id': context_idxss,
        'question_id': ques_idxss,
        'context_char_id': context_char_idxss,
        'question_char_id': ques_char_idxss,
        'y_start': y1s,
        'y_end': y2s
    }
    print('save to', save_path, len(qids), 'features')
    with open(save_path, 'wb') as f:
        pickle.dump(meta, f)


if __name__ == '__main__':

    # Load tokenizer
    tokenizer = tokenization.FullTokenizer(
        vocab_file='original_data/glove.840B.300d.txt', do_lower_case=False)
    train_examples = read_squad_examples(
        input_file='original_data/train-v1.1.json', is_training=True)
    dev_examples = read_squad_examples(
        input_file='original_data/dev-v1.1.json', is_training=False)

    train_features = convert_examples_to_features(train_examples,
                                                  tokenizer,
                                                  max_seq_length=400,
                                                  max_query_length=50,
                                                  is_training=True)
    dev_features = convert_examples_to_features(dev_examples,
                                                tokenizer,
                                                max_seq_length=400,
                                                max_query_length=50,
                                                is_training=False)
Example No. 29
def proc_and_save_unsup_data_xlnet(
    processor, sub_set,
    raw_data_dir, data_stats_dir, unsup_out_dir,
    tokenize_fn,
    max_seq_length, trunc_keep_right,
    aug_ops, aug_copy_num,
    worker_id, replicas):
  # Print the random seed to double-check that different runs use different seeds,
  # so that the same original example yields different augmented examples.
  random_seed = np.random.randint(0, 100000)
  logging.info("random seed: {:d}".format(random_seed))
  np.random.seed(random_seed)
  logging.info("getting examples")

  if sub_set == "train":
    ori_examples = processor.get_train_examples(raw_data_dir)
  elif sub_set.startswith("unsup"):
    print(sub_set)
    ori_examples = processor.get_unsup_examples(raw_data_dir, sub_set)
  else:
    assert False
  # this is the size before splitting the data for each worker
  data_total_size = len(ori_examples)
  if replicas != -1:
    ori_examples, start, end = get_data_for_worker(
        ori_examples, replicas, worker_id)
  else:
    start = 0
    end = len(ori_examples)

  logging.info("getting augmented examples")
  aug_examples = copy.deepcopy(ori_examples)

  # Doesn't do anything for tf-idf augmentation
  aug_examples = sent_level_augment.run_augment(
      aug_examples, aug_ops, sub_set,
      aug_copy_num,
      start, end, data_total_size)

  labels = processor.get_labels() + ["unsup"]
  logging.info("processing ori examples with labels: {}".format(labels))

  ori_features = file_based_convert_examples_to_features(
      ori_examples, labels, max_seq_length,
      tokenize_fn, num_passes=1)

  tokenized_ori_examples = tokenize_examples(
               ori_examples, tokenization.FullTokenizer(do_lower_case=False))

  if "idf" in aug_ops:
    data_stats = get_data_stats(
        data_stats_dir, sub_set,
        -1, replicas, tokenized_ori_examples)
  else:
    data_stats = None

  logging.info("processing aug examples using aug ops {}".format(aug_ops))

  aug_features = file_based_convert_examples_to_features(
      aug_examples, labels, max_seq_length,
      tokenize_fn, num_passes=1, data_stats=data_stats, aug_ops=aug_ops)

  logging.info("{} Original Features".format(len(ori_features)))
  logging.info("{} Augmented Features".format(len(aug_features)))
  unsup_features = []

  for ori_feat, aug_feat in zip(ori_features, aug_features):
    unsup_features.append(PairedUnsupInputFeaturesXL(
        ori_feat.input_ids,
        ori_feat.input_mask,
        ori_feat.segment_ids,
        ori_feat.is_real_example,
        aug_feat.input_ids,
        aug_feat.input_mask,
        aug_feat.segment_ids,
        aug_feat.is_real_example
        ))
  logging.info("There are {} total unsupervised records".format(len(unsup_features)))
  dump_tfrecord(unsup_features, unsup_out_dir, worker_id)
Example No. 30
import sys, os
sys.path.append(os.getcwd())
import glob
import tensorflow.compat.v1 as tf
import numpy as np
import cv2
import argparse
import time
import traceback
import json
import utils.tokenization as tokenization
from utils.train_util import get_label_name_dict
from src.feats_extract.multimodal_feature_extract import MultiModalFeatureExtract

#################Inference Utils#################
tokenizer = tokenization.FullTokenizer(
    vocab_file='pretrain_models/robert/chinese_L-12_H-768_A-12/vocab.txt')


class TaggingModel():
    def __init__(self, configs):
        tag_id_file = configs.get('tag_id_file', None)
        model_pb = configs.get('model_pb', None)
        if tag_id_file is None:
            raise ValueError("tag_id_file is required in configs")
        else:
            self.label_name_dict = get_label_name_dict(tag_id_file, None)
        if model_pb is None:
            raise ValueError("model_pb is required in configs")
        else:
            config = tf.ConfigProto(allow_soft_placement=True)
            config.gpu_options.allow_growth = True