Example #1
def tokenizer():
    train_examples = read_squad_examples(
        input_file='./data/train-v2.0.json', is_training=True, version_2_with_negative=True)
    tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)
    
    for (example_index, example) in enumerate(train_examples):
        query_tokens = tokenizer.tokenize(example.question_text)
        print(query_tokens)
        
        input_ids = tokenizer.convert_tokens_to_ids(query_tokens)
        print(input_ids)
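As a minimal, hedged sketch of what the loop above prints for one made-up question (same tokenizer settings as the example): WordPiece lower-cases the text, splits out-of-vocabulary words into '##'-prefixed subword pieces, and convert_tokens_to_ids maps each piece to its row in the vocabulary.

tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)
tokens = tokenizer.tokenize("When was the Eiffel Tower built?")
ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)  # lower-cased WordPiece tokens; rare words appear as '##' pieces
print(ids)     # one vocabulary id per token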
Example #2
    def __init__(
        self,
        eval_script: str = "data/squad/v1.1/evaluate-v1.1.py",
        predict_file: str = "",
        output_dir: str = "./",
        n_best_size: int = 20,
        max_answer_length: int = 30,
        version_2_with_negative: bool = False,
        max_seq_length: int = 384,
        doc_stride: int = 128,
        max_query_length: int = 64,
        vocab_file: str = "",
        do_lower_case: bool = True,
        max_len: int = 512,
    ):

        tokenizer = BertTokenizer(vocab_file,
                                  do_lower_case=do_lower_case,
                                  max_len=max_len)  # for bert large

        self.eval_examples = read_squad_examples(
            input_file=predict_file,
            is_training=False,
            version_2_with_negative=version_2_with_negative)

        self.eval_features = convert_examples_to_features(
            examples=self.eval_examples,
            tokenizer=tokenizer,
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            is_training=False,
        )

        self.output_dir = output_dir
        self.eval_script = eval_script
        self.predict_file = predict_file

        args = Namespace(
            version_2_with_negative=version_2_with_negative,
            n_best_size=n_best_size,
            max_answer_length=max_answer_length,
            verbose_logging=False,
            do_lower_case=do_lower_case,
        )

        self.args = args

        self.all_results: List[RawResult] = []
def get_dataloader(args):
    ''' return dataloader for inference '''
    
    # Preprocess input data
    tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case, max_len=512) # for bert large
    
    cached_features_file = args.predict_file + '_{}_{}.bin'.format(args.max_seq_length, args.doc_stride)
    try:
        with open(cached_features_file, "rb") as reader:
            eval_features = pickle.load(reader)
    except Exception:  # cache miss: build the features from scratch
        eval_examples = read_squad_examples(
            input_file=args.predict_file,
            is_training=False,
            version_2_with_negative=args.version_2_with_negative)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)
        with open(cached_features_file, "wb") as writer:
            pickle.dump(eval_features, writer)
    
    data = []
    for feature in eval_features:
        input_ids = torch.tensor(feature.input_ids, dtype=torch.int64)
        input_mask = torch.tensor(feature.input_mask, dtype=torch.int64)
        segment_ids = torch.tensor(feature.segment_ids, dtype=torch.int64)
        inp = (input_ids, segment_ids, input_mask)
        data.append(inp)
    
    if args.nbatches > 0:
        data = data[:args.nbatches*args.batch_size]
    
    test_loader = torch.utils.data.DataLoader(
        data, 
        batch_size=args.batch_size, 
        shuffle=False, 
        num_workers=1, 
        pin_memory=True)
    
    return test_loader
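A hedged usage sketch for the loader above (the `args` namespace and the question-answering `model` are assumed to come from the surrounding script; the call signature `model(input_ids, segment_ids, input_mask)` matches the other examples on this page):

test_loader = get_dataloader(args)
model.eval()
with torch.no_grad():
    for input_ids, segment_ids, input_mask in test_loader:
        # each element is a batch tensor of shape [batch_size, max_seq_length]
        start_logits, end_logits = model(input_ids, segment_ids, input_mask)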
Example #4
def process_data_and_get_input_max_min(data_list,
                                       fixer,
                                       input_tensor_names,
                                       num_runs,
                                       vocab_file,
                                       do_lower_case,
                                       seq_length,
                                       doc_stride=128,
                                       max_query_length=64,
                                       batch_size=8,
                                       preprocess_fn="default_preprocess"):
    """Precess input data and get input max and min.
  """
    eval_features = []

    def append_feature(feature):
        eval_features.append(feature)

    eval_examples = read_squad_examples(input_file=data_list,
                                        is_training=False)
    eval_examples = eval_examples[0:batch_size * num_runs]
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)

    convert_examples_to_features(examples=eval_examples,
                                 tokenizer=tokenizer,
                                 max_seq_length=seq_length,
                                 doc_stride=doc_stride,
                                 max_query_length=max_query_length,
                                 is_training=False,
                                 output_fn=append_feature)
    input_dicts = []
    input_node_names = [
        node_name.split(':')[0] for node_name in input_tensor_names
    ]
    for i in range(num_runs):
        inputs = process_feature_batch(eval_features, batch_size, i)
        input_dict = dict(zip(input_node_names, inputs))
        input_dicts.append(input_dict)
    fixer.get_input_max_min(input_dicts, batch_size)

    print("quantize input end")
# Predict all tokens
start_logits, end_logits = model(tokens_tensor, segments_tensors, input_mask)
start_ind = torch.argmax(start_logits).item()
end_ind = torch.argmax(end_logits).item()

print(all_tokens[start_ind:end_ind + 1])
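Taking independent argmaxes over the start and end logits can yield end_ind < start_ind; below is a minimal sketch of a constrained span search (the length cap is an arbitrary choice, and this is not the n-best procedure that write_predictions uses):

def best_span(start_logits, end_logits, max_answer_length=30):
    """Return the (start, end) pair with the highest summed logit score,
    subject to start <= end and a maximum span length."""
    best, best_score = (0, 0), float('-inf')
    for s in range(len(start_logits)):
        for e in range(s, min(s + max_answer_length, len(end_logits))):
            score = start_logits[s] + end_logits[e]
            if score > best_score:
                best_score, best = score, (s, e)
    return best

start_ind, end_ind = best_span(start_logits.squeeze().tolist(),
                               end_logits.squeeze().tolist())
print(all_tokens[start_ind:end_ind + 1])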




# Messing around, trying to recreate what happened in run_squad.py

predict_file='/data/squad/dev-v1.1.json'
# eval_examples is a list of 10570 'SquadExample' objects;
# each object contains fields for qas_id, question_text, and doc_tokens.
eval_examples = run_squad.read_squad_examples(input_file=predict_file, is_training=False)

eval_features = run_squad.convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=384,
            doc_stride=128,
            max_query_length=64,
            is_training=False)

#write_predictions(eval_examples, eval_features, all_results,
#                          args.n_best_size, args.max_answer_length,
#                          args.do_lower_case, output_prediction_file,
#                          output_nbest_file, args.verbose_logging)
def _validate_squad(args, model, tokenizer):
    eval_examples = run_squad.read_squad_examples(
        input_file=args.predict_file,
        is_training=False,
        version_2_with_negative=args.version_2_with_negative)

    eval_features = run_squad.convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=False)

    run_squad.logger.info("***** Running predictions *****")
    run_squad.logger.info("  Num orig examples = %d", len(eval_examples))
    run_squad.logger.info("  Num split examples = %d", len(eval_features))
    run_squad.logger.info("  Batch size = %d", args.predict_batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                   dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = run_squad.TensorDataset(all_input_ids, all_input_mask,
                                        all_segment_ids, all_example_index)
    # Run prediction for full data
    eval_sampler = run_squad.SequentialSampler(eval_data)
    eval_dataloader = run_squad.DataLoader(eval_data,
                                           sampler=eval_sampler,
                                           batch_size=args.predict_batch_size)

    model.eval()
    all_results = []
    run_squad.logger.info("Start evaluating")
    for input_ids, input_mask, segment_ids, example_indices in run_squad.tqdm(
            eval_dataloader, desc="Evaluating"):
        if len(all_results) % 1000 == 0:
            run_squad.logger.info("Processing example: %d" %
                                  (len(all_results)))
        input_ids = input_ids.cuda()
        input_mask = input_mask.cuda()
        segment_ids = segment_ids.cuda()
        with torch.no_grad():
            batch_start_logits, batch_end_logits = model(
                input_ids, segment_ids, input_mask)
        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            all_results.append(
                run_squad.RawResult(unique_id=unique_id,
                                    start_logits=start_logits,
                                    end_logits=end_logits))
    output_prediction_file = os.path.join("predictions.json")
    output_nbest_file = os.path.join("nbest_predictions.json")
    output_null_log_odds_file = os.path.join("null_odds.json")
    run_squad.write_predictions(
        eval_examples, eval_features, all_results, args.n_best_size,
        args.max_answer_length, args.do_lower_case, output_prediction_file,
        output_nbest_file, output_null_log_odds_file, args.verbose_logging,
        args.version_2_with_negative, args.null_score_diff_threshold)

    result = _calc_metric_squad(args.predict_file, output_prediction_file)
    os.remove(output_prediction_file)
    os.remove(output_nbest_file)
    os.remove(output_null_log_odds_file)
    return result  # {'exact_match': exact_match, 'f1': f1}
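`_calc_metric_squad` is not shown here; as a rough, hedged sketch of the kind of scoring it delegates to (this mirrors the normalization and token-overlap F1 of the official SQuAD evaluation script; the function names below are placeholders):

import collections
import re
import string

def _normalize(text):
    """Lower-case, strip punctuation and articles, and collapse whitespace."""
    text = ''.join(ch for ch in text.lower() if ch not in set(string.punctuation))
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    return ' '.join(text.split())

def exact_match_score(prediction, ground_truth):
    return float(_normalize(prediction) == _normalize(ground_truth))

def f1_score(prediction, ground_truth):
    pred_tokens = _normalize(prediction).split()
    gold_tokens = _normalize(ground_truth).split()
    common = collections.Counter(pred_tokens) & collections.Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)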
def _train_squad(args, stage):
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    _set_seed(args.seed)

    tokenizer = run_squad.BertTokenizer(args.vocab_file,
                                        do_lower_case=args.do_lower_case,
                                        max_len=512)  # for bert large
    # tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None

    train_examples = run_squad.read_squad_examples(
        input_file=args.train_file,
        is_training=True,
        version_2_with_negative=args.version_2_with_negative)
    num_train_optimization_steps = int(
        len(train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps) * args.num_train_epochs

    config = run_squad.BertConfig.from_json_file(args.config_file)
    model: nn.Module = run_squad.BertForQuestionAnswering(config)
    _load_checkpoint(model, args.init_checkpoint)

    if stage == PruningPhase.admm:
        _hard_mask(model, args.sparsity_config)

    model.cuda()
    if args.fp16 and args.old:
        model.half()

    with open(args.sparsity_config, 'r') as f:
        raw_dict = yaml.load(f, Loader=yaml.SafeLoader)
        masks = dict.fromkeys(raw_dict['prune_ratios'].keys())

    plain_model = getattr(model, 'module', model)

    for param_name in masks:
        param = get_parameter_by_name(plain_model, param_name)
        if param is None: raise Exception(f'Cannot find {param_name}')
        non_zero_mask = torch.ne(param, 0).to(param.dtype)
        masks[param_name] = non_zero_mask

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # Hack to remove the pooler, which is not used and thus produces
    # None grads that break apex.
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    if args.fp16:
        try:
            # from fused_adam_local import FusedAdamBert as FusedAdam
            from apex.optimizers import FusedAdam
            from apex.fp16_utils.fp16_optimizer import FP16_Optimizer
            # from apex.contrib.optimizers import FP16_Optimizer
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        # import ipdb; ipdb.set_trace()
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)

        if args.loss_scale == 0:
            if args.old:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                model, optimizer = amp.initialize(model,
                                                  optimizer,
                                                  opt_level="O2",
                                                  keep_batchnorm_fp32=False,
                                                  loss_scale="dynamic")
        else:
            if args.old:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
            else:
                model, optimizer = amp.initialize(model,
                                                  optimizer,
                                                  opt_level="O2",
                                                  keep_batchnorm_fp32=False,
                                                  loss_scale=args.loss_scale)
        if not args.old and args.do_train:
            scheduler = run_squad.LinearWarmUpScheduler(
                optimizer,
                warmup=args.warmup_proportion,
                total_steps=num_train_optimization_steps)

    else:
        optimizer = run_squad.BertAdam(optimizer_grouped_parameters,
                                       lr=args.learning_rate,
                                       warmup=args.warmup_proportion,
                                       t_total=num_train_optimization_steps)

    model = torch.nn.DataParallel(model)

    global_step = 0
    cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format(
        list(filter(None, args.bert_model.split('/'))).pop(),
        str(args.max_seq_length), str(args.doc_stride),
        str(args.max_query_length))
    # train_features = None
    try:
        with open(cached_train_features_file, "rb") as reader:
            train_features = pickle.load(reader)
    except Exception:  # no cached features: convert the examples now
        train_features = run_squad.convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)
        if args.local_rank == -1 or torch.distributed.get_rank() == 0:
            run_squad.logger.info(
                "  Saving train features into cached file %s",
                cached_train_features_file)
            with open(cached_train_features_file, "wb") as writer:
                pickle.dump(train_features, writer)

    run_squad.logger.info("***** Running training *****")
    run_squad.logger.info("  Num orig examples = %d", len(train_examples))
    run_squad.logger.info("  Num split examples = %d", len(train_features))
    run_squad.logger.info("  Batch size = %d", args.train_batch_size)
    run_squad.logger.info("  Num steps = %d", num_train_optimization_steps)
    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_start_positions = torch.tensor(
        [f.start_position for f in train_features], dtype=torch.long)
    all_end_positions = torch.tensor([f.end_position for f in train_features],
                                     dtype=torch.long)
    train_data = run_squad.TensorDataset(all_input_ids, all_input_mask,
                                         all_segment_ids, all_start_positions,
                                         all_end_positions)
    train_sampler = run_squad.RandomSampler(train_data)
    train_dataloader = run_squad.DataLoader(train_data,
                                            sampler=train_sampler,
                                            batch_size=args.train_batch_size)

    model.train()
    for _ in run_squad.trange(int(args.num_train_epochs), desc="Epoch"):
        for step, batch in enumerate(
                run_squad.tqdm(train_dataloader, desc="Iteration")):
            # Terminate early for benchmarking

            if args.max_steps > 0 and global_step > args.max_steps:
                break

            if torch.cuda.device_count() == 1:
                batch = tuple(
                    t.cuda()
                    for t in batch)  # multi-gpu does scattering it-self
            input_ids, input_mask, segment_ids, start_positions, end_positions = batch
            loss = model(input_ids, segment_ids, input_mask, start_positions,
                         end_positions)
            if torch.cuda.device_count() > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                if args.old:
                    # noinspection PyUnboundLocalVariable
                    optimizer.backward(loss)
                else:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
            else:
                loss.backward()
            # if args.fp16:
            #    optimizer.backward(loss)
            # else:
            #    loss.backward()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # Modify learning rate with the special warm-up for BERT, which FusedAdam doesn't do
                    if not args.old:
                        # noinspection PyUnboundLocalVariable
                        scheduler.step()
                    else:
                        lr_this_step = args.learning_rate * run_squad.warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step

                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

                plain_model = getattr(model, 'module', model)
                for param_name, mask in masks.items():
                    param = get_parameter_by_name(plain_model, param_name)
                    param.data *= mask.to(param.dtype)

            if step % args.log_freq == 0:
                # logger.info("Step {}: Loss {}, LR {} ".format(global_step, loss.item(), lr_this_step))
                run_squad.logger.info("Step {}: Loss {}, LR {} ".format(
                    global_step, loss.item(), optimizer.param_groups[0]['lr']))

    return model, tokenizer
    infer_ctx = InferContext(args.url,
                             protocol,
                             args.model_name,
                             model_version,
                             http_headers=args.http_headers,
                             verbose=args.verbose)

    # Preprocess input data
    tokenizer = BertTokenizer(args.vocab_file,
                              do_lower_case=args.do_lower_case,
                              max_len=512)  # for bert large
    cached_features_file = args.predict_file + '_{}_{}.bin'.format(
        args.max_seq_length, args.doc_stride)

    eval_examples = read_squad_examples(
        input_file=args.predict_file,
        is_training=False,
        version_2_with_negative=args.version_2_with_negative)

    try:
        with open(cached_features_file, "rb") as reader:
            eval_features = pickle.load(reader)
    except Exception:  # cache miss: build the features from scratch
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)
        with open(cached_features_file, "wb") as writer:
            pickle.dump(eval_features, writer)
# Create the tokenizer.
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=do_lower_case)
#%%
# Load the configuration from file
bert_config = modeling.BertConfig.from_json_file(bert_config_file)

config = tf.compat.v1.ConfigProto(log_device_placement=True)

run_config = tf.estimator.RunConfig(model_dir=output_dir,
                                    session_config=config,
                                    save_checkpoints_steps=1000,
                                    keep_checkpoint_max=1)
#%%
# Read the training examples from the training file:
train_examples = run_squad.read_squad_examples(input_file=train_file,
                                               is_training=True)

num_train_steps = int(
    len(train_examples) / global_batch_size * num_train_epochs)
num_warmup_steps = int(num_train_steps * warmup_proportion)

# Pre-shuffle the input to avoid having to make a very large shuffle
# buffer in the `input_fn`.
rng = random.Random(12345)
rng.shuffle(train_examples)

start_index = 0
end_index = len(train_examples)
tmp_filenames = os.path.join(output_dir, "train.tf_record")

# We write to a temporary file to avoid storing very large constant tensors
# in memory.
Example #10
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  bert_config = rs.modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  rs.validate_flags_or_throw(bert_config)

  tf.gfile.MakeDirs(FLAGS.output_dir)

  tokenizer = rs.tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  train_examples = None
  num_train_steps = None
  num_warmup_steps = None
  if FLAGS.do_train:
    train_examples = rs.read_squad_examples(
        input_file=FLAGS.train_file, is_training=True)
    num_train_steps = int(
        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    # Pre-shuffle the input to avoid having to make a very large shuffle
    # buffer in the `input_fn`.
    rng = random.Random(12345)
    rng.shuffle(train_examples)

  model_fn = rs.model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      predict_batch_size=FLAGS.predict_batch_size)

  if FLAGS.do_train:
    # We write to a temporary file to avoid storing very large constant tensors
    # in memory.
    train_writer = rs.FeatureWriter(
        filename=os.path.join(FLAGS.output_dir, "train.tf_record"),
        is_training=True)
    rs.convert_examples_to_features(
        examples=train_examples,
        tokenizer=tokenizer,
        max_seq_length=FLAGS.max_seq_length,
        doc_stride=FLAGS.doc_stride,
        max_query_length=FLAGS.max_query_length,
        is_training=True,
        output_fn=train_writer.process_feature)
    train_writer.close()

    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num orig examples = %d", len(train_examples))
    tf.logging.info("  Num split examples = %d", train_writer.num_features)
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)
    del train_examples

    train_input_fn = rs.input_fn_builder(
        input_file=train_writer.filename,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

  if FLAGS.do_predict:
    eval_examples = rs.read_squad_examples(
        input_file=FLAGS.predict_file, is_training=False)

    act_seq_len = get_act_seq_len(eval_examples, tokenizer, FLAGS.max_seq_length,
                    FLAGS.doc_stride, FLAGS.max_query_length)

    eval_writer = rs.FeatureWriter(
        filename=os.path.join(FLAGS.output_dir, "eval.tf_record"),
        is_training=False)
    eval_features = []

    def append_feature(feature):
      eval_features.append(feature)
      eval_writer.process_feature(feature)

    rs.convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=FLAGS.max_seq_length,
        doc_stride=FLAGS.doc_stride,
        max_query_length=FLAGS.max_query_length,
        is_training=False,
        output_fn=append_feature)
    eval_writer.close()

    tf.logging.info("***** Running predictions *****")
    tf.logging.info("  Num orig examples = %d", len(eval_examples))
    tf.logging.info("  Num split examples = %d", len(eval_features))
    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

    all_results = []

    predict_input_fn = rs.input_fn_builder(
        input_file=eval_writer.filename,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=False)
    
    # If running eval on the TPU, you will need to specify the number of
    # steps.
    all_results = []
    for idx, result in enumerate(estimator.predict(
        predict_input_fn, yield_single_examples=True)):
      if len(all_results) % 1000 == 0:
        tf.logging.info("Processing example: %d" % (len(all_results)))
      unique_id = int(result["unique_ids"])
      start_logits = [float(x) for x in result["start_logits"].flat]
      end_logits = [float(x) for x in result["end_logits"].flat]
      all_results.append(
          rs.RawResult(
              unique_id=unique_id,
              start_logits=start_logits[:act_seq_len[idx]],
              end_logits=end_logits[:act_seq_len[idx]]))

    output_prediction_file = os.path.join(FLAGS.output_dir, "predictions.json")
    output_nbest_file = os.path.join(FLAGS.output_dir, "nbest_predictions.json")
    output_null_log_odds_file = os.path.join(FLAGS.output_dir, "null_odds.json")

    rs.write_predictions(eval_examples, eval_features, all_results,
                         FLAGS.n_best_size, FLAGS.max_answer_length,
                         FLAGS.do_lower_case, output_prediction_file,
                         output_nbest_file, output_null_log_odds_file)
def get_dataloader_fn(
    precision: str = 'fp32',
    batch_size: int = 8,
    vocab_file: str = "",
    do_lower_case: bool = True,
    predict_file: str = "",
    max_len: int = 512,
    max_seq_length: int = 384,
    doc_stride: int = 128,
    max_query_length: int = 64,
    version_2_with_negative: bool = False,
    pad_to_batch_size: bool = True,
):

    # Preprocess input data
    tokenizer = BertTokenizer(vocab_file,
                              do_lower_case=do_lower_case,
                              max_len=max_len)

    eval_examples = read_squad_examples(
        input_file=predict_file,
        is_training=False,
        version_2_with_negative=version_2_with_negative)
    eval_features = convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False,
    )

    # get inputs
    all_unique_ids = [f.unique_id for f in eval_features]
    all_input_ids = [f.input_ids for f in eval_features]
    all_input_mask = [f.input_mask for f in eval_features]
    all_segment_ids = [f.segment_ids for f in eval_features]

    if pad_to_batch_size:
        # each batch should have a fixed size
        f = eval_features[-1]
        padding = batch_size - (len(all_unique_ids) % batch_size)
        all_unique_ids += [f.unique_id for _ in range(padding)]
        all_input_ids += [f.input_ids for _ in range(padding)]
        all_input_mask += [f.input_mask for _ in range(padding)]
        all_segment_ids += [f.segment_ids for _ in range(padding)]

    all_unique_ids = torch.tensor(all_unique_ids,
                                  dtype=torch.int32,
                                  requires_grad=False)
    all_input_ids = torch.tensor(all_input_ids,
                                 dtype=torch.int32,
                                 requires_grad=False)
    all_input_mask = torch.tensor(all_input_mask,
                                  dtype=torch.int32,
                                  requires_grad=False)
    all_segment_ids = torch.tensor(all_segment_ids,
                                   dtype=torch.int32,
                                   requires_grad=False)
    eval_data = torch.utils.data.TensorDataset(all_unique_ids, all_input_ids,
                                               all_input_mask, all_segment_ids)
    eval_sampler = torch.utils.data.SequentialSampler(eval_data)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_data,
        sampler=eval_sampler,
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,
    )

    dtype = {'fp32': np.float32, 'fp16': np.float16}
    dtype = dtype[precision]

    def _get_dataloader():
        """return dataloader for inference"""
        for unique_id, input_ids, input_mask, segment_ids in eval_dataloader:
            unique_id = unique_id.cpu().numpy()
            input_ids = input_ids.cpu().numpy()
            input_mask = input_mask.cpu().numpy()
            segment_ids = segment_ids.cpu().numpy()
            x = {
                "input__0": input_ids,
                "input__1": segment_ids,
                "input__2": input_mask
            }
            y_real = {
                "output__0": np.zeros([batch_size, max_seq_length],
                                      dtype=dtype),
                "output__1": np.zeros([batch_size, max_seq_length],
                                      dtype=dtype),
            }
            yield (unique_id, x, y_real)

    return _get_dataloader
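A hedged usage sketch for the factory above (the file paths are placeholders): the returned callable is a generator function that yields (ids, inputs, reference_outputs) triples keyed by the Triton-style tensor names used in the code.

dataloader_fn = get_dataloader_fn(
    precision='fp16',
    batch_size=8,
    vocab_file='vocab.txt',
    predict_file='dev-v1.1.json',
)
for ids, x, y_real in dataloader_fn():
    print(ids.shape, x['input__0'].shape, y_real['output__0'].shape)
    break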
Example #12
import json
import sys
sys.path.insert(0, '../bert')
from run_squad import read_squad_examples
import tokenization
from konlpy.tag import Mecab

if __name__ == "__main__":
    vocab = set()
    mecab = Mecab('../mecab-ko-dic-2.1.1-20180720')
    train_examples = read_squad_examples('./KorQuAD_v1.0_train.json',
                                         is_training=True)
    dev_examples = read_squad_examples('./KorQuAD_v1.0_dev.json',
                                       is_training=True)
    tokenizer = tokenization.FullTokenizer(vocab_file='./vocab.txt',
                                           do_lower_case=False)

    def add_to_vocab(vocab, tokenizer, examples):
        for (example_index, example) in enumerate(examples):
            query_tokens = tokenizer.tokenize(example.question_text)
            vocab |= set(query_tokens)
            for token in example.doc_tokens:
                sub_tokens = tokenizer.tokenize(token)
                vocab |= set(sub_tokens)

    print("starting build vocab")
    add_to_vocab(vocab, tokenizer, train_examples)
    add_to_vocab(vocab, tokenizer, dev_examples)
    print("finished adding vocabs")
    with open('./vocab.txt', 'w') as file:
        # (truncated in the source; presumably writes the collected vocab, one token per line)
        for token in sorted(vocab):
            file.write(token + '\n')
Example #13
    def get_dataset(self,
                    dataset_path,
                    is_training,
                    context_truncated_len=400,
                    utterance_truncated_len=100):
        examples = read_squad_examples(dataset_path, is_training)

        if self.ctx_emb == 'bert':
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        elif self.ctx_emb == 'xlnet':
            tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

        features = convert_examples_to_features(examples,
                                                tokenizer,
                                                max_seq_length=2500,
                                                doc_stride=2500,
                                                max_query_length=2500,
                                                is_training=is_training)

        with open(dataset_path) as f:
            raw_examples = json.load(f)

        # since problems are flattened by convert_examples_to_features
        index_feature = 0

        for example in tqdm(raw_examples['data']):
            for paragraph in example['paragraphs']:
                paragraph['context_raw'] = paragraph['context']

                # Since only `qa_feature.token_to_orig_map` (below) maps tokens
                # to space-split word-level indices in the context,
                # `word_offsets` is required to map space-split word-level
                # indices to char-level indices.
                word_offsets = [0]
                for word in paragraph['context'].split(' '):
                    word_offsets.append(len(word) + 1 + word_offsets[-1])

                for index_q, qa in enumerate(paragraph['qas']):
                    qa_feature = features[index_feature]
                    index_feature += 1
                    # In `features[index_feature].segment_ids`, question and
                    # context are concatenated. To separate them, the 0/1
                    # values stored in `segment_ids` are used.
                    question_len = qa_feature.segment_ids.index(1)
                    question = qa_feature.input_ids[:question_len]

                    if index_q == 0:  # do only once for a paragraph
                        context_len = \
                            qa_feature.segment_ids[question_len:].index(0)
                        context = (
                            # [question[0]]  # [CLS] token
                            qa_feature.input_ids[question_len:question_len +
                                                 context_len])
                        paragraph['context_offset'] = (
                            # [0]
                            [
                                word_offsets[qa_feature.token_to_orig_map[i]]
                                for i in range(question_len, question_len +
                                               context_len - 1)
                            ] + [len(paragraph['context'])])
                        paragraph['context_tokenized'] = qa_feature.input_ids
                        paragraph['context'] = context

                    qa['question_tokenized'] = tokenizer.tokenize(
                        qa['question'])
                    qa['question'] = question
                    qa['orig_answer_raw'] = qa['orig_answer']['text']
                    qa['orig_answer_text'] = tokenizer.tokenize(
                        qa['orig_answer_raw'])
                    qa['orig_answer_start'] = qa_feature.start_position - question_len
                    qa['orig_answer_end'] = qa_feature.end_position - question_len
                    assert qa['orig_answer_end'] < len(paragraph['context'])

                    # answer indicator for previous questions
                    qa['answer_indicator'] = [0] * context_len
                    for offset in range(1, min(3 + 1, index_q + 1)):
                        index_prev = index_q - offset
                        start, end = (
                            paragraph['qas'][index_prev]['orig_answer_start'],
                            paragraph['qas'][index_prev]['orig_answer_end'] +
                            1)
                        qa['answer_indicator'][start:end] = ([offset] *
                                                             (end - start))

                    if is_training:
                        for answer in qa['answers']:
                            answer['raw'] = answer['text']
                            answer['text'] = tokenizer.tokenize(answer['text'])

        return QuACDataset(raw_examples['data'],
                           context_truncated_len=context_truncated_len,
                           utterance_truncated_len=utterance_truncated_len,
                           padding=0)
Example #14
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--json_file",
        default=None,
        type=str,
        help=
        "predictions jsonfile location (output of run_squad). E.g., train-v1.1.json"
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        help=
        "The output directory where the model checkpoints and predictions will be written."
    )
    parser.add_argument("--OG", action='store_true', help="test")

    args = parser.parse_args()

    with open(args.json_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)


#    if not os.path.exists(args.output_dir):
#        os.makedirs(args.output_dir)
    train_examples = run_squad.read_squad_examples(
        args.json_file, is_training=True, version_2_with_negative=True)
    max_seq_len = 384
    max_query_len = 64
    max_answer_len = 30

    exceed_seq_lens = []
    exceed_query_lens = []
    exceed_answer_lens = []

    exceed_seq_len_counter = 0
    exceed_query_len_counter = 0
    exceed_answer_len_counter = 0
    overall_counter = 0

    max_s = 0
    max_q = 0
    max_a = 0

    tokenizer = BertTokenizer.from_pretrained(
        'bert-large-uncased',
        do_lower_case=True)  # added_flag, currently hardcoded

    train_features = run_squad.convert_examples_to_features(
        examples=train_examples,
        tokenizer=tokenizer,
        max_seq_length=512,
        doc_stride=128,
        max_query_length=512,
        is_training=True)

    for example in train_features:
        overall_counter += 1
        if sum(example.input_mask) > max_seq_len:
            exceed_seq_lens.append(example.tokens)
            exceed_seq_len_counter += 1
        if sum(example.input_mask) > max_s:
            max_s = sum(example.input_mask)
        if sum(example.segment_ids_flipped) > max_query_len:
            exceed_query_lens.append(example.tokens)
            exceed_query_len_counter += 1
        if sum(example.segment_ids_flipped) > max_q:
            max_q = sum(example.segment_ids_flipped)
        if (example.end_position - example.start_position) > max_answer_len:
            exceed_answer_len_counter += 1
            exceed_answer_lens.append(example.tokens)
        if (example.end_position - example.start_position) > max_a:
            max_a = (example.end_position - example.start_position)

    print("Number of examples: %d." % overall_counter)
    print("Number of sequences that exceeded max_seq_len of %d is %d." %
          (max_seq_len, exceed_seq_len_counter))
    print("Number of queries that exceeded max_query_len of %d is %d." %
          (max_query_len, exceed_query_len_counter))
    print("Number of answers that exceeded max_answer_len of %d is %d." %
          (max_answer_len, exceed_answer_len_counter))
    print("Max seq length found was %d." % max_s)
    print("Max query length found was %d." % max_q)
    print("Max answer length found was %d." % max_a)