def __init__(self, args):
    self._tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_path, do_lower_case=args.do_lower_case)
    self._max_seq_length = args.max_seq_len
    self._doc_stride = args.doc_stride
    self._max_query_length = args.max_query_length
    self._in_tokens = args.in_tokens
    self._train_file = args.train_file
    self._predict_file = args.predict_file
    self._batch_size = args.batch_size
    self._with_negative = args.with_negative
    self._epoch = args.epoch
    self._sample_rate = args.sample_rate

    self.vocab = self._tokenizer.vocab
    self.vocab_size = len(self.vocab)
    self.pad_id = self.vocab["[PAD]"]
    self.cls_id = self.vocab["[CLS]"]
    self.sep_id = self.vocab["[SEP]"]
    self.mask_id = self.vocab["[MASK]"]

    self.current_train_example = -1
    self.num_train_examples = -1
    self.current_train_epoch = -1

    self.train_examples = None
    self.predict_examples = None
    self.predict_features = None
    self.num_examples = {'train': -1, 'predict': -1}
def __init__(self, cfg):
    self.cfg = cfg
    self.TaskDataset = dataset_class(cfg.task)
    self.pipeline = None
    if cfg.need_prepro:
        tokenizer = tokenization.FullTokenizer(
            vocab_file=cfg.vocab, do_lower_case=cfg.do_lower_case)
        self.pipeline = [
            Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
            AddSpecialTokensWithTruncation(cfg.max_seq_length),
            TokenIndexing(tokenizer.convert_tokens_to_ids,
                          self.TaskDataset.labels, cfg.max_seq_length)
        ]

    if cfg.mode == 'train':
        self.sup_data_dir = cfg.sup_data_dir
        self.sup_batch_size = cfg.train_batch_size
        self.shuffle = True
    elif cfg.mode == 'train_eval':
        self.sup_data_dir = cfg.sup_data_dir
        self.eval_data_dir = cfg.eval_data_dir
        self.sup_batch_size = cfg.train_batch_size
        self.eval_batch_size = cfg.eval_batch_size
        self.shuffle = True
    elif cfg.mode == 'eval':
        self.sup_data_dir = cfg.eval_data_dir
        self.sup_batch_size = cfg.eval_batch_size
        self.shuffle = False  # Do not shuffle in eval mode

    if cfg.uda_mode:  # UDA mode only
        self.unsup_data_dir = cfg.unsup_data_dir
        self.unsup_batch_size = cfg.train_batch_size * cfg.unsup_ratio
def predict_v2():
    """
    Load the frozen (constant-graph) pb model and run one prediction.
    :return:
    """
    VOCAB_PATH_HZ = '/home/recsys/jixiaozhan/sansu_detect_bert/modelParams/chinese_L-12_H-768_A-12/vocab.txt'
    title = "hide new secretions from the parental units"
    model_file = "/home/jixiaozhan/EasyTransfer/scripts/knowledge_distillation/vanilla_teacher_model/tmp_model/saved_model.pb"
    tokenizer_hz = tokenization.FullTokenizer(vocab_file=VOCAB_PATH_HZ, do_lower_case=True)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    with gfile.FastGFile(model_file, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        sess.graph.as_default()
        tf.import_graph_def(graph_def, name='')

    input_ids = sess.graph.get_tensor_by_name("input_ids:0")
    input_mask = sess.graph.get_tensor_by_name("input_mask:0")
    segment_ids = sess.graph.get_tensor_by_name("segment_ids:0")
    predictions = sess.graph.get_tensor_by_name('app/ez_dense/BiasAdd:0')[0]

    example = bert_33.get_input_features(title, tokenizer_hz)
    ret = sess.run(predictions, feed_dict={
        input_ids: np.array(example['input_ids']),
        input_mask: np.array(example['input_mask']),
        segment_ids: np.array(example['segment_ids'])
    })
def build_tokenizer(self, bert_layer):
    '''
    Builds a tokenizer that encodes text into tokens, masks, and segment flags.
    :return: tokenization wrapper
    '''
    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    return tokenization.FullTokenizer(vocab_file, do_lower_case)
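# A minimal usage sketch for build_tokenizer above (hedged): the TF-Hub handle
# is only illustrative, and the resolved_object attributes follow the usual
# layout of the TF2 BERT SavedModels on TF-Hub.
import tensorflow_hub as hub

bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2",
    trainable=False)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)
print(tokenizer.tokenize("hide new secretions from the parental units"))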
def main():
    """
    Starts the data preparation
    """
    # Loads data
    logging.info("Loading data")

    task_datasets_rename = {
        "COLA": "CoLA",
        "SST": "SST-2",
    }

    data_dir = f'data/{args.task}'
    if args.task.upper() in task_datasets_rename:
        data_dir = f'data/{task_datasets_rename[args.task]}'

    if args.output_dir is None:
        output_dir = data_dir
    else:
        output_dir = args.output_dir
    tx.utils.maybe_create_dir(output_dir)

    processors = {
        "COLA": data_utils.ColaProcessor,
        "MNLI": data_utils.MnliProcessor,
        "MRPC": data_utils.MrpcProcessor,
        "XNLI": data_utils.XnliProcessor,
        'SST': data_utils.SSTProcessor
    }
    processor = processors[args.task]()

    config_data = importlib.import_module(args.config_data)

    pretrained_model_dir = tx.modules.BERTEncoder.download_checkpoint(
        pretrained_model_name=args.pretrained_model_name)
    vocab_file = os.path.join(pretrained_model_dir, "vocab.txt")

    num_classes = len(processor.get_labels())
    num_train_data = len(processor.get_train_examples(data_dir))
    logging.info("num_classes: %d; num_train_data: %d",
                 num_classes, num_train_data)

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=args.lower_case)

    # Produces pickled files
    data_utils.prepare_record_data(
        processor=processor,
        tokenizer=tokenizer,
        data_dir=data_dir,
        max_seq_length=args.max_seq_length,
        output_dir=output_dir,
        feature_original_types=config_data.feature_original_types)

    modify_config_data(args.max_seq_length, num_train_data, num_classes)
def __init__(self,
             vocab_path,
             label_map_config=None,
             max_seq_len=512,
             max_ent_cnt=42,
             do_lower_case=True,
             in_tokens=False,
             is_inference=False,
             random_seed=None,
             tokenizer="FullTokenizer",
             is_classify=True,
             is_regression=False,
             for_cn=True,
             task_id=0):
    self.max_seq_len = max_seq_len
    self.max_ent_cnt = max_ent_cnt
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_path, do_lower_case=do_lower_case)
    self.vocab = self.tokenizer.vocab
    self.pad_id = self.vocab["[PAD]"]
    self.cls_id = self.vocab["[CLS]"]
    self.sep_id = self.vocab["[SEP]"]
    self.in_tokens = in_tokens
    self.is_inference = is_inference
    self.for_cn = for_cn
    self.task_id = task_id

    np.random.seed(random_seed)

    self.is_classify = is_classify
    self.is_regression = is_regression
    self.current_example = 0
    self.current_epoch = 0
    self.num_examples = 0

    if label_map_config:
        with open(label_map_config, encoding='utf8') as f:
            self.label_map = json.load(f)
    else:
        self.label_map = None

    self.ner_map = {'PAD': 0, 'ORG': 1, 'LOC': 2, 'NUM': 3,
                    'TIME': 4, 'MISC': 5, 'PER': 6}

    distance_buckets = np.zeros((512), dtype='int64')
    distance_buckets[1] = 1
    distance_buckets[2:] = 2
    distance_buckets[4:] = 3
    distance_buckets[8:] = 4
    distance_buckets[16:] = 5
    distance_buckets[32:] = 6
    distance_buckets[64:] = 7
    distance_buckets[128:] = 8
    distance_buckets[256:] = 9
    self.distance_buckets = distance_buckets
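# Hedged illustration of the distance_buckets table constructed above: the
# successive slice assignments map distances (presumably between entity
# mentions) into roughly logarithmic buckets 0, 1, [2,4), [4,8), ..., [256,512).
import numpy as np

buckets = np.zeros(512, dtype='int64')
buckets[1] = 1
for bucket_id, lower in enumerate([2, 4, 8, 16, 32, 64, 128, 256], start=2):
    buckets[lower:] = bucket_id
for d in (0, 1, 3, 7, 100, 511):
    print(d, '->', buckets[d])  # 0->0, 1->1, 3->2, 7->3, 100->7, 511->9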
def create_tokenizer_from_hub_module(bert_hub_module_handle):
    """Get the vocab file and casing info from the Hub module."""
    with tf.Graph().as_default():
        bert_module = hub.Module(bert_hub_module_handle)
        tokenization_info = bert_module(signature="tokenization_info",
                                        as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run([
                tokenization_info["vocab_file"],
                tokenization_info["do_lower_case"]
            ])
    return tokenization.FullTokenizer(vocab_file=vocab_file,
                                      do_lower_case=do_lower_case)
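# Hedged usage sketch for create_tokenizer_from_hub_module: the handle below
# is an example of the TF1-style BERT Hub modules that expose the
# "tokenization_info" signature used above.
BERT_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
tokenizer = create_tokenizer_from_hub_module(BERT_HUB)
tokens = tokenizer.tokenize("hide new secretions from the parental units")
input_ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens + ["[SEP]"])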
def __init__(self, args):
    self.train_file = args.train_file
    self.max_seq_len = args.max_seq_len
    self.batch_size = args.batch_size
    self.epoch = args.epoch
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_path, do_lower_case=args.do_lower_case)
    self.vocab = self.tokenizer.vocab
    self.in_tokens = args.in_tokens

    self.current_train_example = -1
    self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
    self.current_train_epoch = -1
def prepare_data():
    """
    Prepares the data and writes it as record files.
    """
    # Loads data
    logging.info("Loading data")

    task_datasets_rename = {
        "COLA": "CoLA",
        "SST": "SST-2",
    }

    data_dir = f'data/{args.task}'
    if args.task.upper() in task_datasets_rename:
        data_dir = f'data/{task_datasets_rename[args.task]}'

    if args.tfrecord_output_dir is None:
        tfrecord_output_dir = data_dir
    else:
        tfrecord_output_dir = args.tfrecord_output_dir
    tx.utils.maybe_create_dir(tfrecord_output_dir)

    processors = {
        "COLA": data_utils.ColaProcessor,
        "MNLI": data_utils.MnliProcessor,
        "MRPC": data_utils.MrpcProcessor,
        "XNLI": data_utils.XnliProcessor,
        'SST': data_utils.SSTProcessor
    }
    processor = processors[args.task]()

    from config_data import feature_original_types

    num_classes = len(processor.get_labels())
    num_train_data = len(processor.get_train_examples(data_dir))
    logging.info("num_classes: %d; num_train_data: %d",
                 num_classes, num_train_data)

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    # Produces TFRecord files
    data_utils.prepare_record_data(
        processor=processor,
        tokenizer=tokenizer,
        data_dir=data_dir,
        max_seq_length=args.max_seq_length,
        output_dir=tfrecord_output_dir,
        feature_original_types=feature_original_types)

    modify_config_data(args.max_seq_length, num_train_data, num_classes)
def predict():
    """
    Load the SavedModel (variables) pb model and run one prediction.
    :return:
    """
    VOCAB_PATH_HZ = '/home/recsys/jixiaozhan/sansu_detect_bert/modelParams/chinese_L-12_H-768_A-12/vocab.txt'
    title = "hide new secretions from the parental units"
    MODEL_V2 = "/home/jixiaozhan/EasyTransfer/scripts/knowledge_distillation/vanilla_teacher_model/1607392874"
    tokenizer_hz = tokenization.FullTokenizer(vocab_file=VOCAB_PATH_HZ, do_lower_case=True)

    example = bert_33.get_input_features(title, tokenizer_hz)
    label_id = example.pop('label_ids')
    example['label_id'] = label_id

    predict_fn_hz_v2 = tf.contrib.predictor.from_saved_model(MODEL_V2)
    predict_pro_list2 = predict_fn_hz_v2(example)
def main(_):
    if FLAGS.max_seq_length > 512:
        raise ValueError(
            "Cannot use sequence length {:d} because the BERT model "
            "was only trained up to sequence length {:d}".format(
                FLAGS.max_seq_length, 512))

    processor = raw_data_utils.get_processor(FLAGS.task_name)
    # Create tokenizer
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    if FLAGS.data_type == "sup":
        sup_out_dir = FLAGS.output_base_dir
        tf.logging.info("Create sup. data: subset {} => {}".format(
            FLAGS.sub_set, sup_out_dir))

        proc_and_save_sup_data(
            processor,
            FLAGS.sub_set,
            FLAGS.raw_data_dir,
            sup_out_dir,
            tokenizer,
            FLAGS.max_seq_length,
            FLAGS.trunc_keep_right,
            FLAGS.worker_id,
            FLAGS.replicas,
            FLAGS.sup_size,
        )
    elif FLAGS.data_type == "unsup":
        assert FLAGS.aug_ops is not None, \
            "aug_ops is required to preprocess unsupervised data."
        unsup_out_dir = os.path.join(FLAGS.output_base_dir,
                                     FLAGS.aug_ops,
                                     str(FLAGS.aug_copy_num))
        data_stats_dir = os.path.join(FLAGS.raw_data_dir, "data_stats")

        tf.logging.info("Create unsup. data: subset {} => {}".format(
            FLAGS.sub_set, unsup_out_dir))

        proc_and_save_unsup_data(processor, FLAGS.sub_set,
                                 FLAGS.raw_data_dir, data_stats_dir,
                                 unsup_out_dir, tokenizer,
                                 FLAGS.max_seq_length, FLAGS.trunc_keep_right,
                                 FLAGS.aug_ops, FLAGS.aug_copy_num,
                                 FLAGS.worker_id, FLAGS.replicas,
                                 FLAGS.input_file)
def prepare_data():
    """
    Prepares the data and writes it as TFRecord files.
    """
    # Loads data
    tf.logging.info("Loading data")

    task_datasets_rename = {
        "COLA": "CoLA",
        "SST": "SST-2",
    }

    data_dir = 'data/{}'.format(FLAGS.task)
    if FLAGS.task.upper() in task_datasets_rename:
        data_dir = 'data/{}'.format(task_datasets_rename[FLAGS.task])

    if FLAGS.tfrecords_output_dir is None:
        tfrecords_output_dir = data_dir
    else:
        tfrecords_output_dir = FLAGS.tfrecords_output_dir
    tx.utils.maybe_create_dir(tfrecords_output_dir)

    processors = {
        "COLA": data_utils.ColaProcessor,
        "MNLI": data_utils.MnliProcessor,
        "MRPC": data_utils.MrpcProcessor,
        "XNLI": data_utils.XnliProcessor,
        'SST': data_utils.SSTProcessor
    }
    processor = processors[FLAGS.task]()

    num_classes = len(processor.get_labels())
    num_train_data = len(processor.get_train_examples(data_dir))
    tf.logging.info('num_classes:%d; num_train_data:%d' %
                    (num_classes, num_train_data))

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    # Produces TFRecord files
    data_utils.prepare_TFRecord_data(processor=processor,
                                     tokenizer=tokenizer,
                                     data_dir=data_dir,
                                     max_seq_length=FLAGS.max_seq_length,
                                     output_dir=tfrecords_output_dir)

    modify_config_data(FLAGS.max_seq_length, num_train_data, num_classes)
def process_unsgetext(text: str, vocab_file, do_lower_case=True):
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
    tokens_ = tokenizer.tokenize(text)
    # Truncate so that [CLS] and [SEP] still fit within seq_length
    if len(tokens_) + 2 > seq_length:
        tokens_ = tokens_[:seq_length - 2]
    tokens = ["[CLS]"] + tokens_ + ["[SEP]"]
    n = len(tokens)
    seg_ids = [0] * n
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * n
    if n < seq_length:
        seg_ids = seg_ids + [0] * (seq_length - n)
        input_ids = input_ids + [0] * (seq_length - n)
        input_mask = input_mask + [0] * (seq_length - n)
    assert len(seg_ids) == seq_length and len(input_ids) == seq_length \
        and len(input_mask) == seq_length
    return InputFeature(input_ids, input_mask, seg_ids)
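# Hedged usage sketch for process_unsgetext: `seq_length` and `InputFeature`
# come from the surrounding module in the original snippet, so seq_length is
# set explicitly here and the vocab path is only illustrative.
seq_length = 128
feature = process_unsgetext("hide new secretions from the parental units",
                            vocab_file="vocab.txt", do_lower_case=True)
# The returned InputFeature wraps input_ids, input_mask and seg_ids,
# each padded (or truncated) to exactly seq_length entries.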
def file_based_convert_examples_to_features(examples,
                                            label_list,
                                            max_seq_length,
                                            tokenize_fn,
                                            num_passes=1,
                                            data_stats=None,
                                            aug_ops=None):
    """
    This is the function used to preprocess data for XLNet.
    It converts a set of `InputExample`s to a TFRecord file.
    """
    if num_passes > 1:
        examples *= num_passes

    if FLAGS.xlnet == True and aug_ops:
        logging.info("XLNet Model")
        examples = tokenize_examples(
            examples, tokenization.FullTokenizer(do_lower_case=False))
        logging.info("building vocab")
        word_vocab = build_vocab(examples)
        logging.info("augmenting data using {}".format(aug_ops))
        examples = word_level_augment.word_level_augment(
            examples, aug_ops, word_vocab, data_stats
        )

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 5000 == 0:
            tf.logging.info("Writing example {} of {}".format(
                ex_index, len(examples)))

        feature = convert_single_example(ex_index, example, label_list,
                                         max_seq_length, tokenize_fn, aug_ops)

        features.append(
            InputFeaturesXL(
                input_ids=feature.input_ids,
                input_mask=feature.input_mask,
                segment_ids=feature.segment_ids,
                label_ids=feature.label_id,
                is_real_example=int(feature.is_real_example)))
    return features
def test_full_tokenizer(self):
    vocab_tokens = [
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
        "runn", "##ing", ","
    ]
    with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
        if six.PY2:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
        else:
            vocab_writer.write("".join(
                [x + "\n" for x in vocab_tokens]).encode("utf-8"))

        vocab_file = vocab_writer.name

    tokenizer = tokenization.FullTokenizer(vocab_file)
    os.unlink(vocab_file)

    tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
    self.assertAllEqual(tokens,
                        ["un", "##want", "##ed", ",", "runn", "##ing"])

    self.assertAllEqual(tokenizer.convert_tokens_to_ids(tokens),
                        [7, 4, 5, 10, 8, 9])
use_one_hot_embeddings = False

params_senteval['classifier'] = {
    'nhid': nhid,
    'optim': 'adam',
    'batch_size': 64,
    'tenacity': 5,
    'epoch_size': 4
}

tf.logging.set_verbosity(tf.logging.INFO)

layer_indexes = [layers]

bert_config = modeling.BertConfig.from_json_file(bert_config_file)

tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=do_lower_case)

is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.contrib.tpu.RunConfig(
    master=master,
    tpu_config=tf.contrib.tpu.TPUConfig(
        num_shards=num_tpu_cores,
        per_host_input_for_training=is_per_host))


#####bert
class InputExample(object):

    def __init__(self, unique_id, text_a, text_b):
        self.unique_id = unique_id
        self.text_a = text_a
        self.text_b = text_b
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "xnli": XnliProcessor,
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be True."
        )

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on.
            # These do NOT count towards the metric (all tf.metrics
            # support a per-instance weight, and these get a weight of 0.0).
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, predict_file)

        tf.logging.info("***** Running prediction *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)

        output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                probabilities = prediction["probabilities"]
                if i >= num_actual_predict_examples:
                    break
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples
def main(_):
    if FLAGS.max_seq_length > 512:
        raise ValueError(
            "Cannot use sequence length {:d} because the BERT model "
            "was only trained up to sequence length {:d}".format(
                FLAGS.max_seq_length, 512))

    processor = raw_data_utils.get_processor(FLAGS.task_name)

    if FLAGS.xlnet == False:
        # Create tokenizer
        tokenizer = tokenization.FullTokenizer(
            vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    else:
        sp = spm.SentencePieceProcessor()
        sp.Load(FLAGS.spiece_model_file)

        def tokenize_fn(text):
            text = preprocess_text(text, lower=False)
            return encode_ids(sp, text)

    if FLAGS.data_type == "sup":
        sup_out_dir = FLAGS.output_base_dir
        logging.info("Create sup. data: subset {} => {}".format(
            FLAGS.sub_set, sup_out_dir))

        if FLAGS.xlnet == True:
            proc_and_save_sup_data_xlnet(
                processor, FLAGS.sub_set, FLAGS.raw_data_dir, sup_out_dir,
                tokenize_fn, FLAGS.max_seq_length, FLAGS.trunc_keep_right,
                FLAGS.worker_id, FLAGS.replicas, FLAGS.sup_size,
            )
        else:
            proc_and_save_sup_data(
                processor, FLAGS.sub_set, FLAGS.raw_data_dir, sup_out_dir,
                tokenizer, FLAGS.max_seq_length, FLAGS.trunc_keep_right,
                FLAGS.worker_id, FLAGS.replicas, FLAGS.sup_size,
            )
    elif FLAGS.data_type == "unsup":
        assert FLAGS.aug_ops is not None, \
            "aug_ops is required to preprocess unsupervised data."
        unsup_out_dir = os.path.join(
            FLAGS.output_base_dir, FLAGS.aug_ops, str(FLAGS.aug_copy_num))
        data_stats_dir = os.path.join(FLAGS.raw_data_dir, "data_stats")

        logging.info("Create unsup. data: subset {} => {}".format(
            FLAGS.sub_set, unsup_out_dir))

        if FLAGS.xlnet == True:
            proc_and_save_unsup_data_xlnet(
                processor, FLAGS.sub_set, FLAGS.raw_data_dir, data_stats_dir,
                unsup_out_dir, tokenize_fn, FLAGS.max_seq_length,
                FLAGS.trunc_keep_right, FLAGS.aug_ops, FLAGS.aug_copy_num,
                FLAGS.worker_id, FLAGS.replicas)
        else:
            proc_and_save_unsup_data(
                processor, FLAGS.sub_set, FLAGS.raw_data_dir, data_stats_dir,
                unsup_out_dir, tokenizer, FLAGS.max_seq_length,
                FLAGS.trunc_keep_right, FLAGS.aug_ops, FLAGS.aug_copy_num,
                FLAGS.worker_id, FLAGS.replicas)
args.n_gpu = 1
# Initializes the distributed backend, which takes care of synchronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl')

# Log GPU information
logger.add_text('info', f"args: {args}")

# Modify batch size if accumulating gradients
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

# Reproducibility
utils.set_seeds(args.seed, multi_gpu=args.n_gpu > 0)

# Build dataloaders
tokenizer = tokenization.FullTokenizer(args.vocab,
                                       do_lower_case=args.do_lower_case)
tokenize = lambda x: tokenizer.tokenize(tokenizer.convert_to_unicode(x))
pipeline = [
    PipelineForPretrain(
        max_pred=20,  # what is this?
        mask_prob=0.15,  # actually this does nothing
        vocab_words=list(tokenizer.vocab.keys()),  #
        indexer=tokenizer.convert_tokens_to_ids,
        max_len=args.max_seq_length)
]
dataloader = SentencePairDataLoader(args.text_file,
                                    batch_size=args.train_batch_size,
                                    tokenize=tokenize,
                                    max_len=args.max_seq_length,
                                    pipeline=pipeline)
                    help='sequence length (default: 32)')
parser.add_argument('--input_file', type=str, default="", metavar='STRING',
                    help='input file path')
parser.add_argument('--vocab_file', type=str, default="", metavar='STRING',
                    help='vocab file path')
args = parser.parse_args()

input_file = args.input_file
max_seq_length = args.max_seq_length

tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                       do_lower_case=True)

with tf.gfile.Open(input_file, "r") as f:
    reader = csv.reader(f, delimiter="\t")
    for line in reader:
        text_a = tokenization.convert_to_unicode(line[3])
        text_b = tokenization.convert_to_unicode(line[4])
        a_input_ids, a_input_mask, a_segment_ids = convert_single_example(
            text_a, None, max_seq_length=max_seq_length, tokenizer=tokenizer)
        b_input_ids, b_input_mask, b_segment_ids = convert_single_example(
            text_b, None, max_seq_length=max_seq_length,
def __init__(self, vocab_file, do_lower_case, max_seq_len):
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)
    self.max_seq_len = max_seq_len
           [0., 0., 0., ..., 0., 0., 0.],
           [0., 0., 0., ..., 0., 0., 0.],
           [0., 0., 0., ..., 0., 0., 0.]], dtype=float32),
     'y_end': array([[0., 0., 0., ..., 0., 0., 0.],
           [0., 0., 0., ..., 0., 0., 0.],
           [0., 0., 0., ..., 0., 0., 0.],
           [0., 0., 0., ..., 0., 0., 0.],
           [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)}
    '''
    with open(save_path, 'wb') as f:
        pickle.dump(meta, f)


if __name__ == '__main__':
    # Load tokenizer
    tokenizer = tokenization.FullTokenizer(vocab_file='glove.42B.300d.txt1',
                                           do_lower_case=False)
    train_examples = read_squad_examples(
        input_file='original_data/train_sample.json', is_training=True)
    dev_examples = read_squad_examples(
        input_file='original_data/train_sample.json', is_training=False)
    train_features = convert_examples_to_features(train_examples,
                                                  tokenizer,
                                                  max_seq_length=400,
                                                  max_query_length=50,
                                                  is_training=True)
    dev_features = convert_examples_to_features(dev_examples,
                                                tokenizer,
                                                max_seq_length=400,
                                                max_query_length=50,
def define_train_eval_input_fn():
    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mismnli": MisMnliProcessor,
        "mrpc": MrpcProcessor,
        "rte": RteProcessor,
        "sst-2": Sst2Processor,
        "sts-b": StsbProcessor,
        "qqp": QqpProcessor,
        "qnli": QnliProcessor,
        "wnli": WnliProcessor,
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None

    train_examples = processor.get_train_examples(FLAGS.data_dir)
    num_train_steps = int(
        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
    if (tf.gfile.Exists(train_file) == False):
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_file)

    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num examples = %d", len(train_examples))
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)

    train_input_fn = file_based_input_fn_builder(
        input_file=train_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)

    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
    num_actual_eval_examples = len(eval_examples)

    eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
    if (tf.gfile.Exists(eval_file) == False):
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file)

    tf.logging.info("***** Running evaluation *****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(eval_examples), num_actual_eval_examples,
                    len(eval_examples) - num_actual_eval_examples)
    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

    # This tells the estimator to run through the entire set.
    eval_steps = None
    # However, if running eval on the TPU, you will need to specify the
    # number of steps.
    if FLAGS.use_tpu:
        assert len(eval_examples) % FLAGS.eval_batch_size == 0
        eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

    eval_drop_remainder = True if FLAGS.use_tpu else False
    eval_input_fn = file_based_input_fn_builder(
        input_file=eval_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=eval_drop_remainder)

    return (label_list, train_input_fn, num_train_steps, eval_input_fn,
            eval_steps, num_warmup_steps)
def main(_):
    hvd.init()
    FLAGS.model_dir = FLAGS.model_dir if hvd.rank() == 0 else os.path.join(
        FLAGS.model_dir, str(hvd.rank()))
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    # FLAGS.num_train_steps = FLAGS.num_train_steps // hvd.size()
    # FLAGS.num_warmup_steps = FLAGS.num_warmup_steps // hvd.size()

    tf.logging.set_verbosity(tf.logging.INFO)

    processor = raw_data_utils.get_processor(FLAGS.task_name)
    label_list = processor.get_labels()

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file,
                                                     FLAGS.model_dropout)

    tf.gfile.MakeDirs(FLAGS.model_dir)

    flags_dict = tf.app.flags.FLAGS.flag_values_dict()
    with tf.gfile.Open(os.path.join(FLAGS.model_dir, "FLAGS.json"), "w") as ouf:
        json.dump(flags_dict, ouf)

    tf.logging.info("warmup steps {}/{}".format(FLAGS.num_warmup_steps,
                                                FLAGS.num_train_steps))

    save_checkpoints_steps = 500  # FLAGS.num_train_steps // FLAGS.save_checkpoints_num
    tf.logging.info("setting save checkpoints steps to {:d}".format(
        save_checkpoints_steps))

    FLAGS.iterations_per_loop = min(save_checkpoints_steps,
                                    FLAGS.iterations_per_loop)

    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    else:
        tpu_cluster_resolver = None

    # if not FLAGS.use_tpu and FLAGS.num_gpu > 1:
    #     train_distribute = tf.contrib.distribute.MirroredStrategy(
    #         num_gpus=FLAGS.num_gpu)
    # else:
    #     train_distribute = None

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        keep_checkpoint_max=1,
        # train_distribute=train_distribute,
        session_config=config,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            per_host_input_for_training=is_per_host))

    model_fn = uda.model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        clip_norm=FLAGS.clip_norm,
        num_train_steps=FLAGS.num_train_steps,
        num_warmup_steps=FLAGS.num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_one_hot_embeddings,
        num_labels=len(label_list),
        unsup_ratio=FLAGS.unsup_ratio,
        uda_coeff=FLAGS.uda_coeff,
        tsa=FLAGS.tsa,
        print_feature=False,
        print_structure=False,
    )

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        params={"model_dir": FLAGS.model_dir},
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.eval_batch_size)

    if FLAGS.do_train:
        tf.logging.info("  >>> sup data dir : {}".format(
            FLAGS.sup_train_data_dir))
        if FLAGS.unsup_ratio > 0:
            tf.logging.info("  >>> unsup data dir : {}".format(
                FLAGS.unsup_data_dir))

        train_input_fn = proc_data_utils.training_input_fn_builder(
            FLAGS.sup_train_data_dir,
            FLAGS.unsup_data_dir,
            FLAGS.aug_ops,
            FLAGS.aug_copy,
            FLAGS.unsup_ratio)
        train_size = processor.get_train_size(FLAGS.raw_data_dir)
        train_steps = int(train_size / FLAGS.train_batch_size)

    if FLAGS.do_eval:
        tf.logging.info("  >>> dev data dir : {}".format(FLAGS.eval_data_dir))
        eval_input_fn = proc_data_utils.evaluation_input_fn_builder(
            FLAGS.eval_data_dir, "clas")

        eval_size = processor.get_dev_size(FLAGS.raw_data_dir)
        eval_steps = int(eval_size / FLAGS.eval_batch_size)

        train_eval_input_fn = proc_data_utils.evaluation_input_fn_builder(
            FLAGS.sup_train_data_dir, "clas")

    if FLAGS.do_train and FLAGS.do_eval:
        hooks = [hvd.BroadcastGlobalVariablesHook(0)]
        tf.logging.info("***** Running training & evaluation *****")
        tf.logging.info("  Supervised batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Unsupervised batch size = %d",
                        FLAGS.train_batch_size * FLAGS.unsup_ratio)
        tf.logging.info("  training size = %d", train_size)
        tf.logging.info("  training num steps = %d", train_steps)
        tf.logging.info("  evaluation batch size = %d", FLAGS.eval_batch_size)
        tf.logging.info("  dev num steps = %d", eval_steps)
        best_acc = 0
        for _ in range(0, FLAGS.num_train_steps, save_checkpoints_steps):
            tf.logging.info("*** Running training ***")
            estimator.train(input_fn=train_input_fn,
                            steps=save_checkpoints_steps,
                            hooks=hooks)
            tf.logging.info("*** Running evaluation ***")
            train_result = estimator.evaluate(input_fn=train_eval_input_fn,
                                              steps=train_steps)
            tf.logging.info(">> Train Results:")
            for key in train_result.keys():
                tf.logging.info("  %s = %s", key, str(train_result[key]))
                train_result[key] = train_result[key].item()
            dev_result = estimator.evaluate(input_fn=eval_input_fn,
                                            steps=eval_steps)
            tf.logging.info(">> Results:")
            for key in dev_result.keys():
                tf.logging.info("  %s = %s", key, str(dev_result[key]))
                dev_result[key] = dev_result[key].item()
            best_acc = max(best_acc, dev_result["eval_precision"])
        tf.logging.info("***** Final evaluation result *****")
        tf.logging.info("Best acc: {:.3f}\n\n".format(best_acc))
    elif FLAGS.do_train:
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Supervised batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Unsupervised batch size = %d",
                        FLAGS.train_batch_size * FLAGS.unsup_ratio)
        tf.logging.info("  Num steps = %d", FLAGS.num_train_steps)
        estimator.train(input_fn=train_input_fn,
                        max_steps=FLAGS.num_train_steps)
    elif FLAGS.do_eval:
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Base evaluation batch size = %d",
                        FLAGS.eval_batch_size)
        tf.logging.info("  Num steps = %d", eval_steps)
        checkpoint_state = tf.train.get_checkpoint_state(FLAGS.model_dir)

        best_acc = 0
        for ckpt_path in checkpoint_state.all_model_checkpoint_paths:
            if not tf.gfile.Exists(ckpt_path + ".data-00000-of-00001"):
                tf.logging.info(
                    "Warning: checkpoint {:s} does not exist".format(ckpt_path))
                continue
            tf.logging.info("Evaluating {:s}".format(ckpt_path))
            dev_result = estimator.evaluate(
                input_fn=eval_input_fn,
                steps=eval_steps,
                checkpoint_path=ckpt_path,
            )
            tf.logging.info(">> Results:")
            for key in dev_result.keys():
                tf.logging.info("  %s = %s", key, str(dev_result[key]))
                dev_result[key] = dev_result[key].item()
            best_acc = max(best_acc, dev_result["eval_precision"])
        tf.logging.info("***** Final evaluation result *****")
        tf.logging.info("Best acc: {:.3f}\n\n".format(best_acc))

        from utils import tokenization
        tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                               do_lower_case=True)
        id2label = dict(zip([i for i in range(len(label_list))], label_list))
        result = estimator.predict(input_fn=eval_input_fn)
        output_line = ""
        with open("label_test.txt", 'w') as writer:
            for re in result:
                sentence = re["input_ids"]
                gold = re["label_ids"]
                prediction = re["predict"]
                # output_line = "\n".join(id2label[id] for id in prediction if id != 0) + "\n"
                for gold_index, gold_item in enumerate(gold):
                    if gold_item >= 34:
                        gold[gold_index] = 0
                for gold_index, gold_item in enumerate(prediction):
                    if gold_item >= 34:
                        gold[gold_index] = 0
                for w, gold_label, label in zip(
                        tokenizer.convert_ids_to_tokens(
                            [int(s) for s in sentence]),
                        [id2label[id] for id in gold],
                        [id2label[id] for id in prediction]):
                    if w == "[PAD]":
                        continue
                    # if label == "NEGATIVE":
                    #     continue
                    output_line = output_line + w + " " + gold_label + " " + label + "\n"
                output_line += "\n"
            writer.write(output_line)
def main(_): """ Builds the model and runs. """ tf.logging.set_verbosity(tf.logging.INFO) tx.utils.maybe_create_dir(FLAGS.output_dir) bert_pretrain_dir = 'bert_pretrained_models/%s' % FLAGS.config_bert_pretrain # Loads BERT model configuration if FLAGS.config_format_bert == "json": bert_config = model_utils.transform_bert_to_texar_config( os.path.join(bert_pretrain_dir, 'bert_config.json')) elif FLAGS.config_format_bert == 'texar': bert_config = importlib.import_module( 'bert_config_lib.config_model_%s' % FLAGS.config_bert_pretrain) else: raise ValueError('Unknown config_format_bert.') # Loads data processors = { "cola": data_utils.ColaProcessor, "mnli": data_utils.MnliProcessor, "mrpc": data_utils.MrpcProcessor, "xnli": data_utils.XnliProcessor, 'sst': data_utils.SSTProcessor } processor = processors[FLAGS.task.lower()]() num_classes = len(processor.get_labels()) num_train_data = len(processor.get_train_examples(config_data.data_dir)) tokenizer = tokenization.FullTokenizer(vocab_file=os.path.join( bert_pretrain_dir, 'vocab.txt'), do_lower_case=FLAGS.do_lower_case) train_dataset = data_utils.get_dataset(processor, tokenizer, config_data.data_dir, config_data.max_seq_length, config_data.train_batch_size, mode='train', output_dir=FLAGS.output_dir) eval_dataset = data_utils.get_dataset(processor, tokenizer, config_data.data_dir, config_data.max_seq_length, config_data.eval_batch_size, mode='eval', output_dir=FLAGS.output_dir) test_dataset = data_utils.get_dataset(processor, tokenizer, config_data.data_dir, config_data.max_seq_length, config_data.test_batch_size, mode='test', output_dir=FLAGS.output_dir) iterator = tx.data.FeedableDataIterator({ 'train': train_dataset, 'eval': eval_dataset, 'test': test_dataset }) batch = iterator.get_next() input_ids = batch["input_ids"] segment_ids = batch["segment_ids"] batch_size = tf.shape(input_ids)[0] input_length = tf.reduce_sum(1 - tf.to_int32(tf.equal(input_ids, 0)), axis=1) # Builds BERT with tf.variable_scope('bert'): embedder = tx.modules.WordEmbedder(vocab_size=bert_config.vocab_size, hparams=bert_config.embed) word_embeds = embedder(input_ids) # Creates segment embeddings for each type of tokens. segment_embedder = tx.modules.WordEmbedder( vocab_size=bert_config.type_vocab_size, hparams=bert_config.segment_embed) segment_embeds = segment_embedder(segment_ids) input_embeds = word_embeds + segment_embeds # The BERT model (a TransformerEncoder) encoder = tx.modules.TransformerEncoder(hparams=bert_config.encoder) output = encoder(input_embeds, input_length) # Builds layers for downstream classification, which is also initialized # with BERT pre-trained checkpoint. 
with tf.variable_scope("pooler"): # Uses the projection of the 1st-step hidden vector of BERT output # as the representation of the sentence bert_sent_hidden = tf.squeeze(output[:, 0:1, :], axis=1) bert_sent_output = tf.layers.dense(bert_sent_hidden, config_downstream.hidden_dim, activation=tf.tanh) output = tf.layers.dropout(bert_sent_output, rate=0.1, training=tx.global_mode_train()) # Adds the final classification layer logits = tf.layers.dense( output, num_classes, kernel_initializer=tf.truncated_normal_initializer(stddev=0.02)) preds = tf.argmax(logits, axis=-1, output_type=tf.int32) accu = tx.evals.accuracy(batch['label_ids'], preds) # Optimization loss = tf.losses.sparse_softmax_cross_entropy(labels=batch["label_ids"], logits=logits) global_step = tf.Variable(0, trainable=False) # Builds learning rate decay scheduler static_lr = config_downstream.lr['static_lr'] num_train_steps = int(num_train_data / config_data.train_batch_size * config_data.max_train_epoch) num_warmup_steps = int(num_train_steps * config_data.warmup_proportion) lr = model_utils.get_lr( global_step, num_train_steps, # lr is a Tensor num_warmup_steps, static_lr) train_op = tx.core.get_train_op(loss, global_step=global_step, learning_rate=lr, hparams=config_downstream.opt) # Train/eval/test routine def _run(sess, mode): fetches = { 'accu': accu, 'batch_size': batch_size, 'step': global_step, 'loss': loss, } if mode == 'train': fetches['train_op'] = train_op while True: try: feed_dict = { iterator.handle: iterator.get_handle(sess, 'train'), tx.global_mode(): tf.estimator.ModeKeys.TRAIN, } rets = sess.run(fetches, feed_dict) if rets['step'] % 50 == 0: tf.logging.info('step:%d loss:%f' % (rets['step'], rets['loss'])) if rets['step'] == num_train_steps: break except tf.errors.OutOfRangeError: break if mode == 'eval': cum_acc = 0.0 nsamples = 0 while True: try: feed_dict = { iterator.handle: iterator.get_handle(sess, 'eval'), tx.context.global_mode(): tf.estimator.ModeKeys.EVAL, } rets = sess.run(fetches, feed_dict) cum_acc += rets['accu'] * rets['batch_size'] nsamples += rets['batch_size'] except tf.errors.OutOfRangeError: break tf.logging.info('dev accu: {}'.format(cum_acc / nsamples)) if mode == 'test': _all_preds = [] while True: try: feed_dict = { iterator.handle: iterator.get_handle(sess, 'test'), tx.context.global_mode(): tf.estimator.ModeKeys.PREDICT, } _preds = sess.run(preds, feed_dict=feed_dict) _all_preds.extend(_preds.tolist()) except tf.errors.OutOfRangeError: break output_file = os.path.join(FLAGS.output_dir, "test_results.tsv") with tf.gfile.GFile(output_file, "w") as writer: writer.write('\n'.join(str(p) for p in _all_preds)) with tf.Session() as sess: # Loads pretrained BERT model parameters init_checkpoint = os.path.join(bert_pretrain_dir, 'bert_model.ckpt') model_utils.init_bert_checkpoint(init_checkpoint) sess.run(tf.global_variables_initializer()) sess.run(tf.local_variables_initializer()) sess.run(tf.tables_initializer()) # Restores trained model if specified saver = tf.train.Saver() if FLAGS.checkpoint: saver.restore(sess, FLAGS.checkpoint) iterator.initialize_dataset(sess) if FLAGS.do_train: iterator.restart_dataset(sess, 'train') _run(sess, mode='train') saver.save(sess, FLAGS.output_dir + '/model.ckpt') if FLAGS.do_eval: iterator.restart_dataset(sess, 'eval') _run(sess, mode='eval') if FLAGS.do_test: iterator.restart_dataset(sess, 'test') _run(sess, mode='test')
#!/usr/bin/python3
                                         max_seq_length, tokenizer)

        def create_int_feature(values):
            f = tf.train.Feature(int64_list=tf.train.Int64List(
                value=list(values)))
            return f

        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(feature.input_ids)
        features["input_mask"] = create_int_feature(feature.input_mask)
        features["segment_ids"] = create_int_feature(feature.segment_ids)
        features["label_id"] = create_int_feature([feature.label_id])
        features["is_real_example"] = create_int_feature(
            [int(feature.is_real_example)])

        tf_example = tf.train.Example(features=tf.train.Features(
            feature=features))
        writer.write(tf_example.SerializeToString())
    writer.close()


if __name__ == '__main__':
    max_seq_length = 64
    tokenizer = tokenization.FullTokenizer(
        "/home/geb/PycharmProjects/bert_ngc/vocab_file/albert_zh/vocab.txt")
    sc = SentenceClassifierProcessor()
    examples = sc.get_train_examples("/home/geb/PycharmProjects/bert/data_dir")
    file_based_convert_examples_to_features(
        examples, sc.get_labels(), max_seq_length, tokenizer,
        "../tf_records/sentence_classifier/train.record0")
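# Hedged sketch: reading back one of the records written above. The feature
# names and the 64-token max_seq_length mirror the writer code; the record
# path is the one used in the __main__ block.
import tensorflow as tf

name_to_features = {
    "input_ids": tf.io.FixedLenFeature([64], tf.int64),
    "input_mask": tf.io.FixedLenFeature([64], tf.int64),
    "segment_ids": tf.io.FixedLenFeature([64], tf.int64),
    "label_id": tf.io.FixedLenFeature([], tf.int64),
    "is_real_example": tf.io.FixedLenFeature([], tf.int64),
}
dataset = tf.data.TFRecordDataset(
    "../tf_records/sentence_classifier/train.record0")
dataset = dataset.map(
    lambda record: tf.io.parse_single_example(record, name_to_features))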
        'context_id': context_idxss,
        'question_id': ques_idxss,
        'context_char_id': context_char_idxss,
        'question_char_id': ques_char_idxss,
        'y_start': y1s,
        'y_end': y2s
    }
    print('save to', save_path, len(qids), 'features')
    with open(save_path, 'wb') as f:
        pickle.dump(meta, f)


if __name__ == '__main__':
    # Load tokenizer
    tokenizer = tokenization.FullTokenizer(
        vocab_file='original_data/glove.840B.300d.txt', do_lower_case=False)
    train_examples = read_squad_examples(
        input_file='original_data/train-v1.1.json', is_training=True)
    dev_examples = read_squad_examples(
        input_file='original_data/dev-v1.1.json', is_training=False)
    train_features = convert_examples_to_features(train_examples,
                                                  tokenizer,
                                                  max_seq_length=400,
                                                  max_query_length=50,
                                                  is_training=True)
    dev_features = convert_examples_to_features(dev_examples,
                                                tokenizer,
                                                max_seq_length=400,
                                                max_query_length=50,
                                                is_training=False)
def proc_and_save_unsup_data_xlnet(
        processor, sub_set,
        raw_data_dir, data_stats_dir, unsup_out_dir,
        tokenize_fn,
        max_seq_length, trunc_keep_right,
        aug_ops, aug_copy_num,
        worker_id, replicas):
    # Print the random seed just to double check that we use different random
    # seeds for different runs, so that we generate different augmented
    # examples for the same original example.
    random_seed = np.random.randint(0, 100000)
    logging.info("random seed: {:d}".format(random_seed))
    np.random.seed(random_seed)
    logging.info("getting examples")

    if sub_set == "train":
        ori_examples = processor.get_train_examples(raw_data_dir)
    elif sub_set.startswith("unsup"):
        print(sub_set)
        ori_examples = processor.get_unsup_examples(raw_data_dir, sub_set)
    else:
        assert False
    # This is the size before splitting data for each worker
    data_total_size = len(ori_examples)
    if replicas != -1:
        ori_examples, start, end = get_data_for_worker(
            ori_examples, replicas, worker_id)
    else:
        start = 0
        end = len(ori_examples)

    logging.info("getting augmented examples")
    aug_examples = copy.deepcopy(ori_examples)

    # Doesn't do anything for tf-idf augmentation
    aug_examples = sent_level_augment.run_augment(
        aug_examples, aug_ops, sub_set,
        aug_copy_num, start, end, data_total_size)

    labels = processor.get_labels() + ["unsup"]
    logging.info("processing ori examples with labels: {}".format(labels))
    ori_features = file_based_convert_examples_to_features(
        ori_examples, labels, max_seq_length, tokenize_fn, num_passes=1)

    tokenized_ori_examples = tokenize_examples(
        ori_examples, tokenization.FullTokenizer(do_lower_case=False))

    if "idf" in aug_ops:
        data_stats = get_data_stats(
            data_stats_dir, sub_set, -1, replicas, tokenized_ori_examples)
    else:
        data_stats = None

    logging.info("processing aug examples using aug ops {}".format(aug_ops))
    aug_features = file_based_convert_examples_to_features(
        aug_examples, labels, max_seq_length, tokenize_fn,
        num_passes=1, data_stats=data_stats, aug_ops=aug_ops)

    logging.info("{} Original Features".format(len(ori_features)))
    logging.info("{} Augmented Features".format(len(aug_features)))

    unsup_features = []
    for ori_feat, aug_feat in zip(ori_features, aug_features):
        unsup_features.append(PairedUnsupInputFeaturesXL(
            ori_feat.input_ids,
            ori_feat.input_mask,
            ori_feat.segment_ids,
            ori_feat.is_real_example,
            aug_feat.input_ids,
            aug_feat.input_mask,
            aug_feat.segment_ids,
            aug_feat.is_real_example
        ))
    logging.info("There are {} total unsupervised records".format(
        len(unsup_features)))
    dump_tfrecord(unsup_features, unsup_out_dir, worker_id)
import sys, os
sys.path.append(os.getcwd())
import glob
import tensorflow.compat.v1 as tf
import numpy as np
import cv2
import argparse
import time
import traceback
import json
import utils.tokenization as tokenization
from utils.train_util import get_label_name_dict
from src.feats_extract.multimodal_feature_extract import MultiModalFeatureExtract

#################Inference Utils#################
tokenizer = tokenization.FullTokenizer(
    vocab_file='pretrain_models/robert/chinese_L-12_H-768_A-12/vocab.txt')


class TaggingModel():
    def __init__(self, configs):
        tag_id_file = configs.get('tag_id_file', None)
        model_pb = configs.get('model_pb', None)
        if tag_id_file is None:
            raise ValueError("tag_id_file is required")
        else:
            self.label_name_dict = get_label_name_dict(tag_id_file, None)
        if model_pb is None:
            raise ValueError("model_pb is required")
        else:
            config = tf.ConfigProto(allow_soft_placement=True)
            config.gpu_options.allow_growth = True