Beispiel #1
0
def get_generator_config(config, bert_config):
    """Get model config for the generator network."""
    gen_config = ElectraConfig.from_dict(bert_config.to_dict())
    gen_config.hidden_size = int(
        round(bert_config.hidden_size * config.generator_hidden_size))
    gen_config.num_hidden_layers = int(
        round(bert_config.num_hidden_layers * config.generator_layers))
    gen_config.intermediate_size = 4 * gen_config.hidden_size
    gen_config.num_attention_heads = max(1, gen_config.hidden_size // 64)
    return gen_config
Beispiel #2
0
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        # Set up model config
        self._config = config
        self.disc_config = ElectraConfig(
            vocab_size=30522,
            embedding_size=768,
            hidden_size=768,
            num_hidden_layers=12,
            num_attention_heads=12,
            intermediate_size=3072,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
        )
        if config.debug:
            self.disc_config.num_hidden_layers = 3
            self.disc_config.hidden_size = 128
            self.disc_config.intermediate_size = 128 * 4
            self.disc_config.num_attention_heads = 4

        # Set up discriminator
        self.discriminator = TFElectraForPreTraining(self.disc_config)

        # Set up generator
        gen_config = get_generator_config(config, self.disc_config)
        if config.electra_objective:
            if config.shared_embeddings:
                self.generator = TFElectraForMaskedLM(
                    gen_config,
                    shared_embeddings=True,
                    input_embeddings=self.discriminator.get_input_embeddings())
            else:
                self.generator = TFElectraForMaskedLM(gen_config)
        else:
            self.generator = TFElectraForMaskedLM(self.disc_config)
def main():
    args = parse_args()

    hvd.init()
    set_affinity(hvd.local_rank())

    if is_main_process():
        log("Running total processes: {}".format(get_world_size()))
    log("Starting process: {}".format(get_rank()))

    if is_main_process():
        dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                                           filename=args.json_summary),
                                dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step)])
    else:
        dllogger.init(backends=[])

    tf.random.set_seed(args.seed)
    dllogger.log(step="PARAMETER", data={"SEED": args.seed})
    # script parameters
    BATCH_SIZE = args.train_batch_size
    EVAL_BATCH_SIZE = args.predict_batch_size
    USE_XLA = args.xla
    USE_AMP = args.amp
    EPOCHS = args.num_train_epochs

    if not args.do_train:
        EPOCHS = args.num_train_epochs = 1
        log("Since running inference only, setting args.num_train_epochs to 1")

    if not os.path.exists(args.output_dir) and is_main_process():
        os.makedirs(args.output_dir)

    # TensorFlow configuration
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
    tf.config.optimizer.set_jit(USE_XLA)
    #tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP})
    
    if args.amp:
        policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16", loss_scale="dynamic")
        tf.keras.mixed_precision.experimental.set_policy(policy)
        print('Compute dtype: %s' % policy.compute_dtype)  # Compute dtype: float16
        print('Variable dtype: %s' % policy.variable_dtype)  # Variable dtype: float32

    if is_main_process():
        log("***** Loading tokenizer and model *****")
    # Load tokenizer and model from pretrained model/vocabulary. Specify the number of labels to classify (2+: classification, 1: regression)
    electra_model = args.electra_model
    config = ElectraConfig.from_pretrained(electra_model, cache_dir=args.cache_dir)
    config.update({"amp": args.amp})
    if args.vocab_file is None:
        tokenizer = ElectraTokenizer.from_pretrained(electra_model, cache_dir=args.cache_dir)
    else:
        tokenizer = ElectraTokenizer(
            vocab_file=args.vocab_file,
            do_lower_case=args.do_lower_case)

    model = TFElectraForQuestionAnswering.from_pretrained(electra_model, config=config, cache_dir=args.cache_dir, args=args)

    if is_main_process():
        log("***** Loading dataset *****")
    # Load data
    processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
    train_examples = processor.get_train_examples(args.data_dir) if args.do_train else None
    dev_examples = processor.get_dev_examples(args.data_dir) if args.do_predict else None

    if is_main_process():
        log("***** Loading features *****")
    # Load cached features
    squad_version = '2.0' if args.version_2_with_negative else '1.1'
    if args.cache_dir is None:
        args.cache_dir = args.data_dir
    cached_train_features_file = args.cache_dir.rstrip('/') + '/' + 'TF2_train-v{4}.json_{1}_{2}_{3}'.format(
        electra_model.split("/")[1], str(args.max_seq_length), str(args.doc_stride),
        str(args.max_query_length), squad_version)
    cached_dev_features_file = args.cache_dir.rstrip('/') + '/' + 'TF2_dev-v{4}.json_{1}_{2}_{3}'.format(
        electra_model.split("/")[1], str(args.max_seq_length), str(args.doc_stride),
        str(args.max_query_length), squad_version)

    try:
        with open(cached_train_features_file, "rb") as reader:
            train_features = pickle.load(reader) if args.do_train else []
        with open(cached_dev_features_file, "rb") as reader:
            dev_features = pickle.load(reader) if args.do_predict else []
    except:
        train_features = (  # TODO: (yy) do on rank 0?
            squad_convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=True,
                return_dataset="",
            )
            if args.do_train
            else []
        )
        dev_features = (
            squad_convert_examples_to_features(
                examples=dev_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=False,
                return_dataset="",
            )
            if args.do_predict
            else []
        )
        # Dump Cached features
        if not args.skip_cache and is_main_process():
            if args.do_train:
                log("***** Building Cache Files: {} *****".format(cached_train_features_file))
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)
            if args.do_predict:
                log("***** Building Cache Files: {} *****".format(cached_dev_features_file))
                with open(cached_dev_features_file, "wb") as writer:
                    pickle.dump(dev_features, writer)

    len_train_features = len(train_features)
    total_train_steps = int((len_train_features * EPOCHS / BATCH_SIZE) / get_world_size()) + 1
    train_steps_per_epoch = int((len_train_features / BATCH_SIZE) / get_world_size()) + 1
    len_dev_features = len(dev_features)
    total_dev_steps = int((len_dev_features / EVAL_BATCH_SIZE)) + 1

    train_dataset = get_dataset_from_features(train_features, BATCH_SIZE,
                                              v2=args.version_2_with_negative) if args.do_train else []
    dev_dataset = get_dataset_from_features(dev_features, EVAL_BATCH_SIZE, drop_remainder=False, ngpu=1, mode="dev",
                                            v2=args.version_2_with_negative) if args.do_predict else []

    opt = create_optimizer(init_lr=args.learning_rate, num_train_steps=total_train_steps,
                           num_warmup_steps=int(args.warmup_proportion * total_train_steps),
                           weight_decay_rate=args.weight_decay_rate,
                           layerwise_lr_decay=args.layerwise_lr_decay,
                           n_transformer_layers=model.num_hidden_layers)
    if USE_AMP:
        # loss scaling is currently required when using mixed precision
        opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, "dynamic")

    # Define loss function
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    loss_class = tf.keras.losses.BinaryCrossentropy(
        from_logits=True,
        name='binary_crossentropy'
    )
    metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
    model.compile(optimizer=opt, loss=loss, metrics=[metric])
    train_loss_results = []

    if args.do_train and is_main_process():
        log("***** Running training *****")
        log("  Num examples = ", len_train_features)
        log("  Num Epochs = ", args.num_train_epochs)
        log("  Instantaneous batch size per GPU = ", args.train_batch_size)
        log(
            "  Total train batch size (w. parallel, distributed & accumulation) = ",
            args.train_batch_size
            * get_world_size(),
        )
        log("  Total optimization steps =", total_train_steps)

    total_train_time = 0
    latency = []
    for epoch in range(EPOCHS):
        if args.do_train:
            epoch_loss_avg = tf.keras.metrics.Mean()
            epoch_perf_avg = tf.keras.metrics.Mean()
            epoch_start = time.time()

            epoch_iterator = tqdm(train_dataset, total=train_steps_per_epoch, desc="Iteration", mininterval=5,
                                  disable=not is_main_process())
            for iter, inputs in enumerate(epoch_iterator):
                # breaking criterion if max_steps if > 1
                if args.max_steps > 0 and (epoch * train_steps_per_epoch + iter) > args.max_steps:
                    break
                iter_start = time.time()
                # Optimize the model
                loss_value = train_step(model, inputs, loss, USE_AMP, opt, (iter == 0 and epoch == 0),
                                        v2=args.version_2_with_negative, loss_class=loss_class, fp16=USE_AMP)
                epoch_perf_avg.update_state(1. * BATCH_SIZE / (time.time() - iter_start))
                if iter % args.log_freq == 0:
                    if is_main_process():
                        log("\nEpoch: {:03d}, Step:{:6d}, Loss:{:12.8f}, Perf:{:5.0f}, loss_scale:{}, opt_step:{}".format(epoch, iter, loss_value,
                                                                                              epoch_perf_avg.result() * get_world_size(), opt.loss_scale if config.amp else 1,
                                                                                              int(opt.iterations)))
                    dllogger.log(step=(epoch, iter,), data={"step_loss": float(loss_value.numpy()),
                                                            "train_perf": float( epoch_perf_avg.result().numpy() * get_world_size())})

                # Track progress
                epoch_loss_avg.update_state(loss_value)  # Add current batch loss

            # End epoch
            train_loss_results.append(epoch_loss_avg.result())
            total_train_time += float(time.time() - epoch_start)
            # Summarize and save checkpoint at the end of each epoch
            if is_main_process():

                dllogger.log(step=tuple(), data={"e2e_train_time": total_train_time,
                                                 "training_sequences_per_second": float(
                                                     epoch_perf_avg.result().numpy() * get_world_size()),
                                                 "final_loss": float(epoch_loss_avg.result().numpy())})

            if not args.skip_checkpoint:
                if args.ci:
                    checkpoint_name = "{}/electra_base_qa_v2_{}_epoch_{}_ckpt".format(args.output_dir, args.version_2_with_negative, epoch + 1)
                else:
                    checkpoint_name = "checkpoints/electra_base_qa_v2_{}_epoch_{}_ckpt".format(args.version_2_with_negative, epoch + 1)
                if is_main_process():
                    model.save_weights(checkpoint_name)


        if args.do_predict and (args.evaluate_during_training or epoch == args.num_train_epochs - 1):
            if not args.do_train:
                log("***** Loading checkpoint: {} *****".format(args.init_checkpoint))
                model.load_weights(args.init_checkpoint).expect_partial()

            current_feature_id = 0
            all_results = []
            if is_main_process():
                log("***** Running evaluation *****")
                log("  Num Batches = ", total_dev_steps)
                log("  Batch size = ", args.predict_batch_size)

            raw_infer_start = time.time()
            if is_main_process():
                infer_perf_avg = tf.keras.metrics.Mean()
                dev_iterator = tqdm(dev_dataset, total=total_dev_steps, desc="Iteration", mininterval=5,
                                    disable=not is_main_process())
                for input_ids, input_mask, segment_ids, start_positions, end_positions, cls_index, p_mask, is_impossible in dev_iterator:
                    # training=False is needed only if there are layers with different
                    # behavior during training versus inference (e.g. Dropout).

                    iter_start = time.time()

                    if not args.joint_head:
                        batch_start_logits, batch_end_logits = infer_step(model, input_ids,
                                                                          attention_mask=input_mask,
                                                                          token_type_ids=segment_ids,
                                                                          )[:2]
                        #Synchronize with GPU to compute time
                        _ = batch_start_logits.numpy()
                                                            
                    else:
                        
                        outputs = infer_step(model, input_ids,
                                             attention_mask=input_mask,
                                             token_type_ids=segment_ids,
                                             cls_index=cls_index,
                                             p_mask=p_mask,
                                             )
                        #Synchronize with GPU to compute time
                        _ = outputs[0].numpy()

                    infer_time = (time.time() - iter_start)
                    infer_perf_avg.update_state(1. * EVAL_BATCH_SIZE / infer_time)
                    latency.append(infer_time)

                    for iter_ in range(input_ids.shape[0]):

                        if not args.joint_head:
                            start_logits = batch_start_logits[iter_].numpy().tolist()
                            end_logits = batch_end_logits[iter_].numpy().tolist()
                            dev_feature = dev_features[current_feature_id]
                            current_feature_id += 1
                            unique_id = int(dev_feature.unique_id)
                            all_results.append(RawResult(unique_id=unique_id,
                                                         start_logits=start_logits,
                                                         end_logits=end_logits))
                        else:
                            dev_feature = dev_features[current_feature_id]
                            current_feature_id += 1
                            unique_id = int(dev_feature.unique_id)
                            output = [output[iter_].numpy().tolist() for output in outputs]

                            start_logits = output[0]
                            start_top_index = output[1]
                            end_logits = output[2]
                            end_top_index = output[3]
                            cls_logits = output[4]
                            result = SquadResult(
                                unique_id,
                                start_logits,
                                end_logits,
                                start_top_index=start_top_index,
                                end_top_index=end_top_index,
                                cls_logits=cls_logits,
                            )

                            all_results.append(result)

                # Compute and save predictions
                answers, nbest_answers = get_answers(dev_examples, dev_features, all_results, args)

                output_prediction_file = os.path.join(args.output_dir, "predictions.json")
                output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
                e2e_infer_time = time.time() - raw_infer_start
                # if args.version_2_with_negative:
                #     output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
                # else:
                #     output_null_log_odds_file = None
                with open(output_prediction_file, "w") as f:
                    f.write(json.dumps(answers, indent=4) + "\n")
                with open(output_nbest_file, "w") as f:
                    f.write(json.dumps(nbest_answers, indent=4) + "\n")

                if args.do_eval:
                    if args.version_2_with_negative:
                        dev_file = "dev-v2.0.json"
                    else:
                        dev_file = "dev-v1.1.json"

                    eval_out = subprocess.check_output([sys.executable, args.eval_script,
                                                        args.data_dir + "/" + dev_file, output_prediction_file])
                    log(eval_out.decode('UTF-8'))
                    scores = str(eval_out).strip()
                    exact_match = float(scores.split(":")[1].split(",")[0])
                    if args.version_2_with_negative:
                        f1 = float(scores.split(":")[2].split(",")[0])
                    else:
                        f1 = float(scores.split(":")[2].split("}")[0])

                    log("Epoch: {:03d} Results: {}".format(epoch, eval_out.decode('UTF-8')))
                    log("**EVAL SUMMARY** - Epoch: {:03d},  EM: {:6.3f}, F1: {:6.3f}, Infer_Perf: {:4.0f} seq/s"
                          .format(epoch, exact_match, f1, infer_perf_avg.result()))

                latency_all = sorted(latency)[:-2]
                log(
                    "**LATENCY SUMMARY** - Epoch: {:03d},  Ave: {:6.3f} ms, 90%: {:6.3f} ms, 95%: {:6.3f} ms, 99%: {:6.3f} ms"
                    .format(epoch, sum(latency_all) / len(latency_all) * 1000,
                            sum(latency_all[:int(len(latency_all) * 0.9)]) / int(len(latency_all) * 0.9) * 1000,
                            sum(latency_all[:int(len(latency_all) * 0.95)]) / int(len(latency_all) * 0.95) * 1000,
                            sum(latency_all[:int(len(latency_all) * 0.99)]) / int(len(latency_all) * 0.99) * 1000,
                            ))
                dllogger.log(step=tuple(),
                             data={"inference_sequences_per_second": float(infer_perf_avg.result().numpy()), 
                                   "e2e_inference_time": e2e_infer_time})

    if is_main_process() and args.do_train and args.do_eval:
        log(
            "**RESULTS SUMMARY** - EM: {:6.3f}, F1: {:6.3f}, Train_Time: {:4.0f} s, Train_Perf: {:4.0f} seq/s, Infer_Perf: {:4.0f} seq/s"
            .format(exact_match, f1, total_train_time, epoch_perf_avg.result() * get_world_size(),
                    infer_perf_avg.result()))
        dllogger.log(step=tuple(), data={"exact_match": exact_match, "F1": f1})
Beispiel #4
0
def main():
    args = parse_args()
    print("***** Loading tokenizer and model *****")
    electra_model = args.electra_model
    config = ElectraConfig.from_pretrained(electra_model)
    tokenizer = ElectraTokenizer.from_pretrained(electra_model)
    model = TFElectraForQuestionAnswering.from_pretrained(electra_model,
                                                          config=config,
                                                          args=args)

    print("***** Loading fine-tuned checkpoint: {} *****".format(
        args.init_checkpoint))
    model.load_weights(args.init_checkpoint,
                       by_name=False,
                       skip_mismatch=False).expect_partial()

    question, text = args.question, args.context
    encoding = tokenizer.encode_plus(question, text, return_tensors='tf')
    input_ids, token_type_ids, attention_mask = encoding["input_ids"], encoding["token_type_ids"], \
                                                encoding["attention_mask"]
    all_tokens = tokenizer.convert_ids_to_tokens(input_ids.numpy()[0])
    if not args.joint_head:
        start_logits, end_logits = model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )[:2]
        start_logits = start_logits[0].numpy().tolist()
        end_logits = end_logits[0].numpy().tolist()
        result = RawResult(unique_id=0,
                           start_logits=start_logits,
                           end_logits=end_logits)

        start_indices = _get_best_indices(result.start_logits,
                                          args.n_best_size)
        end_indices = _get_best_indices(result.end_logits, args.n_best_size)
        predictions = get_predictions(start_indices, end_indices, result,
                                      len(all_tokens), args)
        null_score = result.start_logits[0] + result.end_logits[0]

    else:
        outputs = model(input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)
        output = [output[0].numpy().tolist() for output in outputs]
        start_logits = output[0]
        start_top_index = output[1]
        end_logits = output[2]
        end_top_index = output[3]
        cls_logits = output[4]
        result = SquadResult(
            0,
            start_logits,
            end_logits,
            start_top_index=start_top_index,
            end_top_index=end_top_index,
            cls_logits=cls_logits,
        )
        predictions = get_predictions_joint_head(result.start_top_index,
                                                 result.end_top_index, result,
                                                 len(all_tokens), args)
        null_score = result.cls_logits

    predictions = sorted(predictions,
                         key=lambda x: (x.start_logit + x.end_logit),
                         reverse=True)
    answer = predictions[0]
    answer = ' '.join(all_tokens[answer.start_index:answer.end_index + 1])
    if args.null_score_diff_threshold > null_score and args.version_2_with_negative:
        answer = ''

    print(answer)

    return answer