def tokenizer():
    train_examples = read_squad_examples(
        input_file='./data/train-v2.0.json',
        is_training=True,
        version_2_with_negative=True)
    tokenizer = BertTokenizer.from_pretrained('bert-large-uncased',
                                              do_lower_case=True)
    for (example_index, example) in enumerate(train_examples):
        query_tokens = tokenizer.tokenize(example.question_text)
        print(query_tokens)
        input_ids = tokenizer.convert_tokens_to_ids(query_tokens)
        print(input_ids)
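# A minimal sketch (assumed, not part of the snippet above) of turning a
# tokenized question into model-ready inputs: wrap it with [CLS]/[SEP] and
# build matching segment ids and attention mask, following standard BERT
# conventions. The helper name `build_question_inputs` is hypothetical.
def build_question_inputs(tokenizer, question_text):
    query_tokens = tokenizer.tokenize(question_text)
    tokens = ['[CLS]'] + query_tokens + ['[SEP]']
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    segment_ids = [0] * len(tokens)  # single segment: the question only
    input_mask = [1] * len(tokens)   # no padding yet, every position is real
    return input_ids, segment_ids, input_mask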
def __init__(
    self,
    eval_script: str = "data/squad/v1.1/evaluate-v1.1.py",
    predict_file: str = "",
    output_dir: str = "./",
    n_best_size: int = 20,
    max_answer_length: int = 30,
    version_2_with_negative: bool = False,
    max_seq_length: int = 384,
    doc_stride: int = 128,
    max_query_length: int = 64,
    vocab_file: str = "",
    do_lower_case: bool = True,
    max_len: int = 512,
):
    tokenizer = BertTokenizer(vocab_file,
                              do_lower_case=do_lower_case,
                              max_len=max_len)  # for bert large
    self.eval_examples = read_squad_examples(
        input_file=predict_file,
        is_training=False,
        version_2_with_negative=version_2_with_negative)
    self.eval_features = convert_examples_to_features(
        examples=self.eval_examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False,
    )
    self.output_dir = output_dir
    self.eval_script = eval_script
    self.predict_file = predict_file
    args = Namespace(
        version_2_with_negative=version_2_with_negative,
        n_best_size=n_best_size,
        max_answer_length=max_answer_length,
        verbose_logging=False,
        do_lower_case=do_lower_case,
    )
    self.args = args
    self.all_results: List[RawResult] = []
def get_dataloader(args):
    '''return dataloader for inference'''
    # Preprocess input data
    tokenizer = BertTokenizer(args.vocab_file,
                              do_lower_case=args.do_lower_case,
                              max_len=512)  # for bert large
    cached_features_file = args.predict_file + '_{}_{}.bin'.format(
        args.max_seq_length, args.doc_stride)
    try:
        with open(cached_features_file, "rb") as reader:
            eval_features = pickle.load(reader)
    except Exception:
        # Recompute (and cache) the features if the cache is missing or unreadable.
        eval_examples = read_squad_examples(
            input_file=args.predict_file,
            is_training=False,
            version_2_with_negative=args.version_2_with_negative)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)
        with open(cached_features_file, "wb") as writer:
            pickle.dump(eval_features, writer)

    data = []
    for feature in eval_features:
        input_ids = torch.tensor(feature.input_ids, dtype=torch.int64)
        input_mask = torch.tensor(feature.input_mask, dtype=torch.int64)
        segment_ids = torch.tensor(feature.segment_ids, dtype=torch.int64)
        inp = (input_ids, segment_ids, input_mask)
        data.append(inp)

    if args.nbatches > 0:
        data = data[:args.nbatches * args.batch_size]

    test_loader = torch.utils.data.DataLoader(data,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              num_workers=1,
                                              pin_memory=True)
    return test_loader
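# A minimal usage sketch for get_dataloader() above. `model` stands in for a
# BertForQuestionAnswering-style network called as model(input_ids,
# token_type_ids, attention_mask), matching the tuple order built above; both
# `args` and `model` are assumptions here, not part of the original snippet.
test_loader = get_dataloader(args)
model.eval()
with torch.no_grad():
    for input_ids, segment_ids, input_mask in test_loader:
        start_logits, end_logits = model(input_ids, segment_ids, input_mask)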
def process_data_and_get_input_max_min(data_list,
                                       fixer,
                                       input_tensor_names,
                                       num_runs,
                                       vocab_file,
                                       do_lower_case,
                                       seq_length,
                                       doc_stride=128,
                                       max_query_length=64,
                                       batch_size=8,
                                       preprocess_fn="default_preprocess"):
    """Process input data and get input max and min."""
    eval_features = []

    def append_feature(feature):
        eval_features.append(feature)

    eval_examples = read_squad_examples(input_file=data_list, is_training=False)
    eval_examples = eval_examples[0:batch_size * num_runs]
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
    convert_examples_to_features(examples=eval_examples,
                                 tokenizer=tokenizer,
                                 max_seq_length=seq_length,
                                 doc_stride=doc_stride,
                                 max_query_length=max_query_length,
                                 is_training=False,
                                 output_fn=append_feature)

    input_dicts = []
    input_node_names = [
        node_name.split(':')[0] for node_name in input_tensor_names
    ]
    for i in range(num_runs):
        inputs = process_feature_batch(eval_features, batch_size, i)
        input_dict = dict(zip(input_node_names, inputs))
        input_dicts.append(input_dict)

    fixer.get_input_max_min(input_dicts, batch_size)
    print("quantize input end")
# Predict all tokens
start_logits, end_logits = model(tokens_tensor, segments_tensors, input_mask)
start_ind = torch.argmax(start_logits).item()
end_ind = torch.argmax(end_logits).item()
print(all_tokens[start_ind:end_ind + 1])

# Messing around, trying to recreate what happened in run_squad.py
predict_file = '/data/squad/dev-v1.1.json'

# eval_examples is a list of 10570 'SquadExample' objects;
# each object contains fields for qas_id, question_text, and doc_tokens.
eval_examples = run_squad.read_squad_examples(input_file=predict_file,
                                              is_training=False)
eval_features = run_squad.convert_examples_to_features(
    examples=eval_examples,
    tokenizer=tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=False)

# write_predictions(eval_examples, eval_features, all_results,
#                   args.n_best_size, args.max_answer_length,
#                   args.do_lower_case, output_prediction_file,
#                   output_nbest_file, args.verbose_logging)
def _validate_squad(args, model, tokenizer):
    eval_examples = run_squad.read_squad_examples(
        input_file=args.predict_file,
        is_training=False,
        version_2_with_negative=args.version_2_with_negative)
    eval_features = run_squad.convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=False)

    run_squad.logger.info("***** Running predictions *****")
    run_squad.logger.info("  Num orig examples = %d", len(eval_examples))
    run_squad.logger.info("  Num split examples = %d", len(eval_features))
    run_squad.logger.info("  Batch size = %d", args.predict_batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                   dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = run_squad.TensorDataset(all_input_ids, all_input_mask,
                                        all_segment_ids, all_example_index)

    # Run prediction for full data
    eval_sampler = run_squad.SequentialSampler(eval_data)
    eval_dataloader = run_squad.DataLoader(eval_data,
                                           sampler=eval_sampler,
                                           batch_size=args.predict_batch_size)

    model.eval()
    all_results = []
    run_squad.logger.info("Start evaluating")
    for input_ids, input_mask, segment_ids, example_indices in run_squad.tqdm(
            eval_dataloader, desc="Evaluating"):
        if len(all_results) % 1000 == 0:
            run_squad.logger.info("Processing example: %d" % (len(all_results)))
        input_ids = input_ids.cuda()
        input_mask = input_mask.cuda()
        segment_ids = segment_ids.cuda()
        with torch.no_grad():
            batch_start_logits, batch_end_logits = model(input_ids, segment_ids,
                                                         input_mask)
        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            all_results.append(
                run_squad.RawResult(unique_id=unique_id,
                                    start_logits=start_logits,
                                    end_logits=end_logits))

    output_prediction_file = os.path.join("predictions.json")
    output_nbest_file = os.path.join("nbest_predictions.json")
    output_null_log_odds_file = os.path.join("null_odds.json")
    run_squad.write_predictions(eval_examples, eval_features, all_results,
                                args.n_best_size, args.max_answer_length,
                                args.do_lower_case, output_prediction_file,
                                output_nbest_file, output_null_log_odds_file,
                                args.verbose_logging,
                                args.version_2_with_negative,
                                args.null_score_diff_threshold)
    result = _calc_metric_squad(args.predict_file, output_prediction_file)
    os.remove(output_prediction_file)
    os.remove(output_nbest_file)
    os.remove(output_null_log_odds_file)
    return result  # {'exact_match': exact_match, 'f1': f1}
def _train_squad(args, stage):
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
    _set_seed(args.seed)

    tokenizer = run_squad.BertTokenizer(args.vocab_file,
                                        do_lower_case=args.do_lower_case,
                                        max_len=512)  # for bert large
    # tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    train_examples = run_squad.read_squad_examples(
        input_file=args.train_file,
        is_training=True,
        version_2_with_negative=args.version_2_with_negative)
    num_train_optimization_steps = int(
        len(train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps) * args.num_train_epochs

    config = run_squad.BertConfig.from_json_file(args.config_file)
    model: nn.Module = run_squad.BertForQuestionAnswering(config)
    _load_checkpoint(model, args.init_checkpoint)
    if stage == PruningPhase.admm:
        _hard_mask(model, args.sparsity_config)
    model.cuda()
    if args.fp16 and args.old:
        model.half()

    with open(args.sparsity_config, 'r') as f:
        raw_dict = yaml.load(f, Loader=yaml.SafeLoader)
    masks = dict.fromkeys(raw_dict['prune_ratios'].keys())
    plain_model = getattr(model, 'module', model)
    for param_name in masks:
        param = get_parameter_by_name(plain_model, param_name)
        if param is None:
            raise Exception(f'Cannot find {param_name}')
        non_zero_mask = torch.ne(param, 0).to(param.dtype)
        masks[param_name] = non_zero_mask

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    # hack to remove pooler, which is not used,
    # since it produces None grads that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    if args.fp16:
        try:
            # from fused_adam_local import FusedAdamBert as FusedAdam
            from apex.optimizers import FusedAdam
            from apex.fp16_utils.fp16_optimizer import FP16_Optimizer
            # from apex.contrib.optimizers import FP16_Optimizer
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex "
                "to use distributed and fp16 training.")
        # import ipdb; ipdb.set_trace()
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            if args.old:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                model, optimizer = amp.initialize(model,
                                                  optimizer,
                                                  opt_level="O2",
                                                  keep_batchnorm_fp32=False,
                                                  loss_scale="dynamic")
        else:
            if args.old:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
            else:
                model, optimizer = amp.initialize(model,
                                                  optimizer,
                                                  opt_level="O2",
                                                  keep_batchnorm_fp32=False,
                                                  loss_scale=args.loss_scale)
        if not args.old and args.do_train:
            scheduler = run_squad.LinearWarmUpScheduler(
                optimizer,
                warmup=args.warmup_proportion,
                total_steps=num_train_optimization_steps)
    else:
        optimizer = run_squad.BertAdam(optimizer_grouped_parameters,
                                       lr=args.learning_rate,
                                       warmup=args.warmup_proportion,
                                       t_total=num_train_optimization_steps)

    model = torch.nn.DataParallel(model)

    global_step = 0
    cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format(
        list(filter(None, args.bert_model.split('/'))).pop(),
        str(args.max_seq_length), str(args.doc_stride),
        str(args.max_query_length))
    # train_features = None
    try:
        with open(cached_train_features_file, "rb") as reader:
            train_features = pickle.load(reader)
    except Exception:
        # Recompute (and cache) the training features if the cache is missing
        # or unreadable.
        train_features = run_squad.convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)
        if args.local_rank == -1 or torch.distributed.get_rank() == 0:
            run_squad.logger.info(
                "  Saving train features into cached file %s",
                cached_train_features_file)
            with open(cached_train_features_file, "wb") as writer:
                pickle.dump(train_features, writer)

    run_squad.logger.info("***** Running training *****")
    run_squad.logger.info("  Num orig examples = %d", len(train_examples))
    run_squad.logger.info("  Num split examples = %d", len(train_features))
    run_squad.logger.info("  Batch size = %d", args.train_batch_size)
    run_squad.logger.info("  Num steps = %d", num_train_optimization_steps)

    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_start_positions = torch.tensor(
        [f.start_position for f in train_features], dtype=torch.long)
    all_end_positions = torch.tensor([f.end_position for f in train_features],
                                     dtype=torch.long)
    train_data = run_squad.TensorDataset(all_input_ids, all_input_mask,
                                         all_segment_ids, all_start_positions,
                                         all_end_positions)
    train_sampler = run_squad.RandomSampler(train_data)
    train_dataloader = run_squad.DataLoader(train_data,
                                            sampler=train_sampler,
                                            batch_size=args.train_batch_size)

    model.train()
    for _ in run_squad.trange(int(args.num_train_epochs), desc="Epoch"):
        for step, batch in enumerate(
                run_squad.tqdm(train_dataloader, desc="Iteration")):
            # Terminate early for benchmarking
            if args.max_steps > 0 and global_step > args.max_steps:
                break
            if torch.cuda.device_count() == 1:
                batch = tuple(
                    t.cuda()
                    for t in batch)  # multi-gpu does scattering it-self
            input_ids, input_mask, segment_ids, start_positions, end_positions = batch
            loss = model(input_ids, segment_ids, input_mask, start_positions,
                         end_positions)
            if torch.cuda.device_count() > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                if args.old:
                    # noinspection PyUnboundLocalVariable
                    optimizer.backward(loss)
                else:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
            else:
                loss.backward()
            # if args.fp16:
            #     optimizer.backward(loss)
            # else:
            #     loss.backward()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # modify learning rate with special warm up for BERT,
                    # which FusedAdam doesn't do
                    if not args.old:
                        # noinspection PyUnboundLocalVariable
                        scheduler.step()
                    else:
                        lr_this_step = args.learning_rate * run_squad.warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
                plain_model = getattr(model, 'module', model)
                for param_name, mask in masks.items():
                    param = get_parameter_by_name(plain_model, param_name)
                    param.data *= mask.to(param.dtype)
            if step % args.log_freq == 0:
                # logger.info("Step {}: Loss {}, LR {} ".format(global_step, loss.item(), lr_this_step))
                run_squad.logger.info("Step {}: Loss {}, LR {} ".format(
                    global_step, loss.item(),
                    optimizer.param_groups[0]['lr']))

    return model, tokenizer
infer_ctx = InferContext(args.url,
                         protocol,
                         args.model_name,
                         model_version,
                         http_headers=args.http_headers,
                         verbose=args.verbose)

# Preprocess input data
tokenizer = BertTokenizer(args.vocab_file,
                          do_lower_case=args.do_lower_case,
                          max_len=512)  # for bert large
cached_features_file = args.predict_file + '_{}_{}.bin'.format(
    args.max_seq_length, args.doc_stride)
eval_examples = read_squad_examples(
    input_file=args.predict_file,
    is_training=False,
    version_2_with_negative=args.version_2_with_negative)
try:
    with open(cached_features_file, "rb") as reader:
        eval_features = pickle.load(reader)
except Exception:
    # Recompute (and cache) the features if the cache is missing or unreadable.
    eval_features = convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=False)
    with open(cached_features_file, "wb") as writer:
        pickle.dump(eval_features, writer)
# Create the tokenizer.
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=do_lower_case)

#%%
# Load the configuration from file
bert_config = modeling.BertConfig.from_json_file(bert_config_file)

config = tf.compat.v1.ConfigProto(log_device_placement=True)
run_config = tf.estimator.RunConfig(model_dir=output_dir,
                                    session_config=config,
                                    save_checkpoints_steps=1000,
                                    keep_checkpoint_max=1)

#%%
# Read the training examples from the training file:
train_examples = run_squad.read_squad_examples(input_file=train_file,
                                               is_training=True)
num_train_steps = int(
    len(train_examples) / global_batch_size * num_train_epochs)
num_warmup_steps = int(num_train_steps * warmup_proportion)

# Pre-shuffle the input to avoid having to make a very large shuffle
# buffer in the `input_fn`.
rng = random.Random(12345)
rng.shuffle(train_examples)

start_index = 0
end_index = len(train_examples)
tmp_filenames = os.path.join(output_dir, "train.tf_record")

# We write to a temporary file to avoid storing very large constant tensors
# in memory.
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    bert_config = rs.modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    rs.validate_flags_or_throw(bert_config)
    tf.gfile.MakeDirs(FLAGS.output_dir)
    tokenizer = rs.tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = rs.read_squad_examples(input_file=FLAGS.train_file,
                                                is_training=True)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        # Pre-shuffle the input to avoid having to make a very large shuffle
        # buffer in the `input_fn`.
        rng = random.Random(12345)
        rng.shuffle(train_examples)

    model_fn = rs.model_fn_builder(bert_config=bert_config,
                                   init_checkpoint=FLAGS.init_checkpoint,
                                   learning_rate=FLAGS.learning_rate,
                                   num_train_steps=num_train_steps,
                                   num_warmup_steps=num_warmup_steps,
                                   use_tpu=FLAGS.use_tpu,
                                   use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        # We write to a temporary file to avoid storing very large constant
        # tensors in memory.
        train_writer = rs.FeatureWriter(filename=os.path.join(
            FLAGS.output_dir, "train.tf_record"),
                                        is_training=True)
        rs.convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=FLAGS.max_seq_length,
            doc_stride=FLAGS.doc_stride,
            max_query_length=FLAGS.max_query_length,
            is_training=True,
            output_fn=train_writer.process_feature)
        train_writer.close()

        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num orig examples = %d", len(train_examples))
        tf.logging.info("  Num split examples = %d", train_writer.num_features)
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        del train_examples

        train_input_fn = rs.input_fn_builder(input_file=train_writer.filename,
                                             seq_length=FLAGS.max_seq_length,
                                             is_training=True,
                                             drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_predict:
        eval_examples = rs.read_squad_examples(input_file=FLAGS.predict_file,
                                               is_training=False)

        act_seq_len = get_act_seq_len(eval_examples, tokenizer,
                                      FLAGS.max_seq_length, FLAGS.doc_stride,
                                      FLAGS.max_query_length)

        eval_writer = rs.FeatureWriter(filename=os.path.join(
            FLAGS.output_dir, "eval.tf_record"),
                                       is_training=False)
        eval_features = []

        def append_feature(feature):
            eval_features.append(feature)
            eval_writer.process_feature(feature)

        rs.convert_examples_to_features(examples=eval_examples,
                                        tokenizer=tokenizer,
                                        max_seq_length=FLAGS.max_seq_length,
                                        doc_stride=FLAGS.doc_stride,
                                        max_query_length=FLAGS.max_query_length,
                                        is_training=False,
                                        output_fn=append_feature)
        eval_writer.close()

        tf.logging.info("***** Running predictions *****")
        tf.logging.info("  Num orig examples = %d", len(eval_examples))
        tf.logging.info("  Num split examples = %d", len(eval_features))
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_input_fn = rs.input_fn_builder(
            input_file=eval_writer.filename,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)

        # If running eval on the TPU, you will need to specify the number of
        # steps.
        all_results = []
        for idx, result in enumerate(
                estimator.predict(predict_input_fn,
                                  yield_single_examples=True)):
            if len(all_results) % 1000 == 0:
                tf.logging.info("Processing example: %d" % (len(all_results)))
            unique_id = int(result["unique_ids"])
            start_logits = [float(x) for x in result["start_logits"].flat]
            end_logits = [float(x) for x in result["end_logits"].flat]
            all_results.append(
                rs.RawResult(unique_id=unique_id,
                             start_logits=start_logits[:act_seq_len[idx]],
                             end_logits=end_logits[:act_seq_len[idx]]))

        output_prediction_file = os.path.join(FLAGS.output_dir,
                                              "predictions.json")
        output_nbest_file = os.path.join(FLAGS.output_dir,
                                         "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(FLAGS.output_dir,
                                                 "null_odds.json")

        rs.write_predictions(eval_examples, eval_features, all_results,
                             FLAGS.n_best_size, FLAGS.max_answer_length,
                             FLAGS.do_lower_case, output_prediction_file,
                             output_nbest_file, output_null_log_odds_file)
def get_dataloader_fn(
    precision: str = 'fp32',
    batch_size: int = 8,
    vocab_file: str = "",
    do_lower_case: bool = True,
    predict_file: str = "",
    max_len: int = 512,
    max_seq_length: int = 384,
    doc_stride: int = 128,
    max_query_length: int = 64,
    version_2_with_negative: bool = False,
    pad_to_batch_size: bool = True,
):
    # Preprocess input data
    tokenizer = BertTokenizer(vocab_file,
                              do_lower_case=do_lower_case,
                              max_len=max_len)
    eval_examples = read_squad_examples(
        input_file=predict_file,
        is_training=False,
        version_2_with_negative=version_2_with_negative)
    eval_features = convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False,
    )

    # get inputs
    all_unique_ids = [f.unique_id for f in eval_features]
    all_input_ids = [f.input_ids for f in eval_features]
    all_input_mask = [f.input_mask for f in eval_features]
    all_segment_ids = [f.segment_ids for f in eval_features]

    if pad_to_batch_size:
        # each batch should have a fixed size
        f = eval_features[-1]
        padding = batch_size - (len(all_unique_ids) % batch_size)
        all_unique_ids += [f.unique_id for _ in range(padding)]
        all_input_ids += [f.input_ids for _ in range(padding)]
        all_input_mask += [f.input_mask for _ in range(padding)]
        all_segment_ids += [f.segment_ids for _ in range(padding)]

    all_unique_ids = torch.tensor(all_unique_ids,
                                  dtype=torch.int32,
                                  requires_grad=False)
    all_input_ids = torch.tensor(all_input_ids,
                                 dtype=torch.int32,
                                 requires_grad=False)
    all_input_mask = torch.tensor(all_input_mask,
                                  dtype=torch.int32,
                                  requires_grad=False)
    all_segment_ids = torch.tensor(all_segment_ids,
                                   dtype=torch.int32,
                                   requires_grad=False)

    eval_data = torch.utils.data.TensorDataset(all_unique_ids, all_input_ids,
                                               all_input_mask, all_segment_ids)
    eval_sampler = torch.utils.data.SequentialSampler(eval_data)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_data,
        sampler=eval_sampler,
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,
    )

    dtype = {'fp32': np.float32, 'fp16': np.float16}
    dtype = dtype[precision]

    def _get_dataloader():
        """return dataloader for inference"""
        for unique_id, input_ids, input_mask, segment_ids in eval_dataloader:
            unique_id = unique_id.cpu().numpy()
            input_ids = input_ids.cpu().numpy()
            input_mask = input_mask.cpu().numpy()
            segment_ids = segment_ids.cpu().numpy()
            x = {
                "input__0": input_ids,
                "input__1": segment_ids,
                "input__2": input_mask
            }
            y_real = {
                "output__0": np.zeros([batch_size, max_seq_length],
                                      dtype=dtype),
                "output__1": np.zeros([batch_size, max_seq_length],
                                      dtype=dtype),
            }
            yield (unique_id, x, y_real)

    return _get_dataloader
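# A minimal sketch of consuming the generator returned by get_dataloader_fn()
# above. The keyword values are illustrative assumptions; only the
# (unique_id, x, y_real) structure comes from the snippet itself.
dataloader_fn = get_dataloader_fn(precision='fp16',
                                  batch_size=8,
                                  vocab_file='vocab.txt',
                                  predict_file='dev-v1.1.json')
for unique_id, x, y_real in dataloader_fn():
    print(unique_id.shape, x["input__0"].shape, y_real["output__0"].shape)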
import json
import sys

sys.path.insert(0, '../bert')

from run_squad import read_squad_examples
import tokenization
from konlpy.tag import Mecab

if __name__ == "__main__":
    vocab = set()
    mecab = Mecab('../mecab-ko-dic-2.1.1-20180720')
    train_examples = read_squad_examples('./KorQuAD_v1.0_train.json',
                                         is_training=True)
    dev_examples = read_squad_examples('./KorQuAD_v1.0_dev.json',
                                       is_training=True)
    tokenizer = tokenization.FullTokenizer(vocab_file='./vocab.txt',
                                           do_lower_case=False)

    def add_to_vocab(vocab, tokenizer, examples):
        for (example_index, example) in enumerate(examples):
            query_tokens = tokenizer.tokenize(example.question_text)
            vocab |= set(query_tokens)
            for (i, token) in enumerate(example.doc_tokens):
                sub_tokens = tokenizer.tokenize(token)
                vocab |= set(sub_tokens)

    print("starting build vocab")
    add_to_vocab(vocab, tokenizer, train_examples)
    add_to_vocab(vocab, tokenizer, dev_examples)
    print("finished adding vocabs")

    with open('./vocab.txt', 'w') as file:
        # The original snippet is truncated here; presumably each collected
        # token is written out on its own line.
        for token in sorted(vocab):
            file.write(token + '\n')
def get_dataset(self, dataset_path, is_training,
                context_truncated_len=400, utterance_truncated_len=100):
    examples = read_squad_examples(dataset_path, is_training)
    if self.ctx_emb == 'bert':
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    elif self.ctx_emb == 'xlnet':
        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
    features = convert_examples_to_features(examples,
                                            tokenizer,
                                            max_seq_length=2500,
                                            doc_stride=2500,
                                            max_query_length=2500,
                                            is_training=is_training)

    with open(dataset_path) as f:
        raw_examples = json.load(f)

    # since problems are flattened by convert_examples_to_features
    index_feature = 0
    for example in tqdm(raw_examples['data']):
        for paragraph in example['paragraphs']:
            paragraph['context_raw'] = paragraph['context']

            # Since only `qa_feature.token_to_orig_map` (below) maps tokens
            # to space-split word-level indices in the context,
            # `word_offsets` is required to map space-split word-level
            # indices to char-level indices.
            word_offsets = [0]
            for word in paragraph['context'].split(' '):
                word_offsets.append(len(word) + 1 + word_offsets[-1])

            for index_q, qa in enumerate(paragraph['qas']):
                qa_feature = features[index_feature]
                index_feature += 1

                # In `features[index_feature].segment_ids`, question and
                # context are concatenated. To separate them, the 0/1 values
                # stored in `segment_ids` are used.
                question_len = qa_feature.segment_ids.index(1)
                question = qa_feature.input_ids[:question_len]
                if index_q == 0:  # do only once for a paragraph
                    context_len = \
                        qa_feature.segment_ids[question_len:].index(0)
                    context = (  # [question[0]]  # [CLS] token
                        qa_feature.input_ids[question_len:question_len +
                                             context_len])
                    paragraph['context_offset'] = (  # [0]
                        [
                            word_offsets[qa_feature.token_to_orig_map[i]]
                            for i in range(question_len,
                                           question_len + context_len - 1)
                        ] + [len(paragraph['context'])])
                    paragraph['context_tokenized'] = qa_feature.input_ids
                    paragraph['context'] = context

                qa['question_tokenized'] = tokenizer.tokenize(qa['question'])
                qa['question'] = question
                qa['orig_answer_raw'] = qa['orig_answer']['text']
                qa['orig_answer_text'] = tokenizer.tokenize(
                    qa['orig_answer_raw'])
                qa['orig_answer_start'] = qa_feature.start_position - question_len
                qa['orig_answer_end'] = qa_feature.end_position - question_len
                assert qa['orig_answer_end'] < len(paragraph['context'])

                # answer indicator for previous questions
                qa['answer_indicator'] = [0] * context_len
                for offset in range(1, min(3 + 1, index_q + 1)):
                    index_prev = index_q - offset
                    start, end = (
                        paragraph['qas'][index_prev]['orig_answer_start'],
                        paragraph['qas'][index_prev]['orig_answer_end'] + 1)
                    qa['answer_indicator'][start:end] = ([offset] *
                                                         (end - start))

                if is_training:
                    for answer in qa['answers']:
                        answer['raw'] = answer['text']
                        answer['text'] = tokenizer.tokenize(answer['text'])

    return QuACDataset(raw_examples['data'],
                       context_truncated_len=context_truncated_len,
                       utterance_truncated_len=utterance_truncated_len,
                       padding=0)
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--json_file",
        default=None,
        type=str,
        help="predictions jsonfile location (output of run_squad). "
        "E.g., train-v1.1.json")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        help="The output directory where the model checkpoints and "
        "predictions will be written.")
    parser.add_argument("--OG", action='store_true', help="test")
    args = parser.parse_args()

    with open(args.json_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)

    # if not os.path.exists(args.output_dir):
    #     os.makedirs(args.output_dir)

    train_examples = run_squad.read_squad_examples(
        args.json_file, is_training=True, version_2_with_negative=True)

    max_seq_len = 384
    max_query_len = 64
    max_answer_len = 30
    exceed_seq_lens = []
    exceed_query_lens = []
    exceed_answer_lens = []
    exceed_seq_len_counter = 0
    exceed_query_len_counter = 0
    exceed_answer_len_counter = 0
    overall_counter = 0
    max_s = 0
    max_q = 0
    max_a = 0

    tokenizer = BertTokenizer.from_pretrained(
        'bert-large-uncased',
        do_lower_case=True)  # added_flag, currently hardcoded
    train_features = run_squad.convert_examples_to_features(
        examples=train_examples,
        tokenizer=tokenizer,
        max_seq_length=512,
        doc_stride=128,
        max_query_length=512,
        is_training=True)

    for example in train_features:
        overall_counter += 1
        if sum(example.input_mask) > max_seq_len:
            exceed_seq_lens.append(example.tokens)
            exceed_seq_len_counter += 1
            if sum(example.input_mask) > max_s:
                max_s = sum(example.input_mask)
        if sum(example.segment_ids_flipped) > max_query_len:
            exceed_query_lens.append(example.tokens)
            exceed_query_len_counter += 1
            if sum(example.segment_ids_flipped) > max_q:
                max_q = sum(example.segment_ids_flipped)
        if (example.end_position - example.start_position) > max_answer_len:
            exceed_answer_len_counter += 1
            exceed_answer_lens.append(example.tokens)
            if (example.end_position - example.start_position) > max_a:
                max_a = (example.end_position - example.start_position)

    print("Number of examples: %d." % overall_counter)
    print("Number of sequences that exceeded max_seq_len of %d is %d." %
          (max_seq_len, exceed_seq_len_counter))
    print("Number of queries that exceeded max_query_len of %d is %d." %
          (max_query_len, exceed_query_len_counter))
    print("Number of answers that exceeded max_answer_len of %d is %d." %
          (max_answer_len, exceed_answer_len_counter))
    print("Max seq length found was %d." % max_s)
    print("Max query length found was %d." % max_q)
    print("Max answer length found was %d." % max_a)