def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "cola": run_classifier.ColaProcessor,
        "mnli": run_classifier.MnliProcessor,
        "mrpc": run_classifier.MrpcProcessor,
    }

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be True.")

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = create_tokenizer_from_hub_module(FLAGS.bert_hub_module_handle)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
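        # Worked example: 10,000 training examples, batch size 32, and 3 epochs
        # give int(10000 / 32 * 3) = 937 steps; with warmup_proportion 0.1 the
        # first int(937 * 0.1) = 93 steps linearly warm up the learning rate.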

    model_fn = model_fn_builder(
        num_labels=len(label_list),
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        bert_hub_module_handle=FLAGS.bert_hub_module_handle)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_features = run_classifier.convert_examples_to_features(
            train_examples, label_list, FLAGS.max_seq_length, tokenizer)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = run_classifier.input_fn_builder(
            features=train_features,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_features = run_classifier.convert_examples_to_features(
            eval_examples, label_list, FLAGS.max_seq_length, tokenizer)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d", len(eval_examples))
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            # Eval will be slightly WRONG on the TPU because it will truncate
            # the last batch.
            eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)
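            # Worked example: 877 eval examples with eval_batch_size 8 give
            # int(877 / 8) = 109 steps, so 109 * 8 = 872 examples are scored
            # and the last 5 are silently dropped.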

        eval_drop_remainder = FLAGS.use_tpu
        eval_input_fn = run_classifier.input_fn_builder(
            features=eval_features,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        if FLAGS.use_tpu:
            # Discard batch remainder if running on TPU
            n = len(predict_examples)
            predict_examples = predict_examples[:(
                n - n % FLAGS.predict_batch_size)]
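            # e.g. 1,000 examples with predict_batch_size 8 keep all
            # 1000 - 1000 % 8 = 1000 examples; with 1,003 examples the last 3
            # would be discarded so every TPU batch is full.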

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        run_classifier.file_based_convert_examples_to_features(
            predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
            predict_file)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d", len(predict_examples))
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_input_fn = run_classifier.file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=FLAGS.use_tpu)

        result = estimator.predict(input_fn=predict_input_fn)

        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            tf.logging.info("***** Predict results *****")
            for prediction in result:
                probabilities = prediction["probabilities"]
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=learning_rate,
                     warmup=warmup_proportion,
                     t_total=t_total)
# optimizer = AdamW(optimizer_grouped_parameters,
#                   lr = learning_rate, # args.learning_rate - default is 5e-5, our notebook had 2e-5
#                   eps = 1e-8, # args.adam_epsilon  - default is 1e-8.
#                   correct_bias=False
#                 )

# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)  # PyTorch scheduler
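# A rough modern equivalent of the two commented alternatives above (a sketch,
# not the notebook's code): plain torch.optim.AdamW plus a hand-rolled linear
# warmup/decay schedule via LambdaLR.
#
# from torch.optim import AdamW
# from torch.optim.lr_scheduler import LambdaLR
#
# optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=1e-8)
# warmup_steps = int(t_total * warmup_proportion)
#
# def linear_warmup_decay(step):
#     if step < warmup_steps:
#         return step / max(1, warmup_steps)  # linear warmup
#     return max(0.0, (t_total - step) / max(1, t_total - warmup_steps))  # linear decay
#
# scheduler = LambdaLR(optimizer, lr_lambda=linear_warmup_decay)
# # call scheduler.step() after every optimizer.step()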


global_step = 0
train_features = convert_examples_to_features(train_examples, label_list,
                                              max_seq_length, tokenizer)
claim_features = convert_claims_to_features(train_examples, label_list,
                                            max_seq_length, tokenizer)
logger.info("***** Running training *****")
logger.info("  Num examples = %d", len(train_examples))
logger.info("  Batch size = %d", train_batch_size)
logger.info("  Num steps = %d", num_train_steps)

all_input_ids = torch.tensor([f.input_ids for f in train_features],
                             dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features],
                              dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                               dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in train_features],
                             dtype=torch.long)
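# A plausible continuation of this fragment (a sketch, assuming the same names
# as the full training loop below): wrap the tensors in a TensorDataset and a
# shuffling DataLoader.
# train_data = TensorDataset(all_input_ids, all_input_mask,
#                            all_segment_ids, all_label_ids)
# train_dataloader = DataLoader(train_data,
#                               sampler=RandomSampler(train_data),
#                               batch_size=train_batch_size)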
def train_and_test(data_dir,
                   bert_model="bert-base-uncased",
                   task_name=None,
                   output_dir=None,
                   max_seq_length=32,
                   do_train=False,
                   do_eval=False,
                   do_lower_case=False,
                   train_batch_size=32,
                   eval_batch_size=8,
                   learning_rate=5e-5,
                   num_train_epochs=5,
                   warmup_proportion=0.1,
                   no_cuda=False,
                   local_rank=-1,
                   seed=42,
                   gradient_accumulation_steps=1,
                   optimize_on_cpu=False,
                   fp16=False,
                   loss_scale=128,
                   saved_model=""):

    # ## Required parameters
    # parser.add_argument("--data_dir",
    #                     default=None,
    #                     type=str,
    #                     required=True,
    #                     help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    # parser.add_argument("--bert_model", default=None, type=str, required=True,
    #                     help="Bert pre-trained model selected in the list: bert-base-uncased, "
    #                          "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    # parser.add_argument("--task_name",
    #                     default=None,
    #                     type=str,
    #                     required=True,
    #                     help="The name of the task to train.")
    # parser.add_argument("--output_dir",
    #                     default=None,
    #                     type=str,
    #                     required=True,
    #                     help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    # parser.add_argument("--max_seq_length",
    #                     default=128,
    #                     type=int,
    #                     help="The maximum total input sequence length after WordPiece tokenization. \n"
    #                          "Sequences longer than this will be truncated, and sequences shorter \n"
    #                          "than this will be padded.")
    # parser.add_argument("--do_train",
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether to run training.")
    # parser.add_argument("--do_eval",
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether to run eval on the dev set.")
    # parser.add_argument("--do_lower_case",
    #                     default=False,
    #                     action='store_true',
    #                     help="Set this flag if you are using an uncased model.")
    # parser.add_argument("--train_batch_size",
    #                     default=32,
    #                     type=int,
    #                     help="Total batch size for training.")
    # parser.add_argument("--eval_batch_size",
    #                     default=8,
    #                     type=int,
    #                     help="Total batch size for eval.")
    # parser.add_argument("--learning_rate",
    #                     default=5e-5,
    #                     type=float,
    #                     help="The initial learning rate for Adam.")
    # parser.add_argument("--num_train_epochs",
    #                     default=3.0,
    #                     type=float,
    #                     help="Total number of training epochs to perform.")
    # parser.add_argument("--warmup_proportion",
    #                     default=0.1,
    #                     type=float,
    #                     help="Proportion of training to perform linear learning rate warmup for. "
    #                          "E.g., 0.1 = 10%% of training.")
    # parser.add_argument("--no_cuda",
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether not to use CUDA when available")
    # parser.add_argument("--local_rank",
    #                     type=int,
    #                     default=-1,
    #                     help="local_rank for distributed training on gpus")
    # parser.add_argument('--seed',
    #                     type=int,
    #                     default=42,
    #                     help="random seed for initialization")
    # parser.add_argument('--gradient_accumulation_steps',
    #                     type=int,
    #                     default=1,
    #                     help="Number of updates steps to accumulate before performing a backward/update pass.")
    # parser.add_argument('--optimize_on_cpu',
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether to perform optimization and keep the optimizer averages on CPU")
    # parser.add_argument('--fp16',
    #                     default=False,
    #                     action='store_true',
    #                     help="Whether to use 16-bit float precision instead of 32-bit")
    # parser.add_argument('--loss_scale',
    #                     type=float, default=128,
    #                     help='Loss scaling, positive power of 2 values can improve fp16 convergence.')

    # args = parser.parse_args()

    processors = {
        #         "cola": ColaProcessor,
        #         "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "stance": StanceProcessor
    }

    if local_rank == -1 or no_cuda:
        device = torch.device(
            "cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if fp16:
            logger.info(
                "16-bits training currently not supported in distributed training"
            )
            fp16 = False  # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(local_rank != -1))

    if gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(gradient_accumulation_steps))

    train_batch_size = int(train_batch_size / gradient_accumulation_steps)
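    # e.g. train_batch_size=32 with gradient_accumulation_steps=4 runs
    # forward/backward passes on mini-batches of 8 and calls optimizer.step()
    # every 4 batches, so the effective update batch size stays 32.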

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    if not do_train and not do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if do_train:
        #         if os.path.exists(output_dir) and os.listdir(output_dir):
        #             raise ValueError("Output directory ({}) already exists and is not empty.".format(output_dir))
        os.makedirs(output_dir, exist_ok=True)

    task_name = task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    #     tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    train_examples = None
    num_train_steps = None
    if do_train:
        train_examples = processor.get_train_examples(data_dir)
        num_train_steps = int(
            len(train_examples) / train_batch_size /
            gradient_accumulation_steps * num_train_epochs)

        # Prepare model
        #     model = BertForSequenceClassification.from_pretrained(bert_model,
        #                 cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(local_rank), num_labels = 2)

        model = BertForConsistencyCueClassification.from_pretrained(
            'bert-base-uncased', num_labels=2)
        model.to(device)
        if fp16:
            model.half()

        if local_rank != -1:
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], output_device=local_rank)
        elif n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # Prepare optimizer
        if fp16:
            param_optimizer = [
                (n, param.clone().detach().to('cpu').float().requires_grad_())
                for n, param in model.named_parameters()
            ]
        elif optimize_on_cpu:
            param_optimizer = [
                (n, param.clone().detach().to('cpu').requires_grad_())
                for n, param in model.named_parameters()
            ]
        else:
            param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in param_optimizer
                           if not any(nd in n for nd in no_decay)],
                'weight_decay_rate': 0.01
            },
            {
                'params': [p for n, p in param_optimizer
                           if any(nd in n for nd in no_decay)],
                'weight_decay_rate': 0.0
            },
        ]
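        # The substring test above matches by parameter name: e.g.
        # 'bert.embeddings.LayerNorm.gamma' contains 'gamma', so LayerNorm and
        # bias parameters get no weight decay, while all other weights decay
        # at rate 0.01.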
        t_total = num_train_steps
        if local_rank != -1:
            t_total = t_total // torch.distributed.get_world_size()
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=learning_rate,
                             warmup=warmup_proportion,
                             t_total=t_total)

    global_step = 0
    if do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=train_batch_size)

        model.train()
        for _ in trange(int(num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if fp16 and loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * loss_scale
                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % gradient_accumulation_steps == 0:
                    if fp16 or optimize_on_cpu:
                        if fp16 and loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                if param.grad is not None:
                                    param.grad.data = param.grad.data / loss_scale
                        is_nan = set_optimizer_params_grad(
                            param_optimizer,
                            model.named_parameters(),
                            test_nan=True)
                        if is_nan:
                            logger.info(
                                "FP16 TRAINING: Nan in gradients, reducing loss scaling"
                            )
                            loss_scale = loss_scale / 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        copy_optimizer_params_to_model(
                            model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()
                    global_step += 1

        torch.save(
            model.state_dict(),
            os.path.join(output_dir, "ibmcs_non_reverse_bertcons_epoch5.pth"))

    if do_eval and (local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_test_examples(data_dir)
        #         eval_examples = processor.get_dev_examples(data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     max_seq_length, tokenizer)
        claim_features = convert_claims_to_features(eval_examples, label_list,
                                                    max_seq_length, tokenizer)

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                     dtype=torch.long)

        claims_input_ids = torch.tensor([f.input_ids for f in claim_features],
                                        dtype=torch.long)
        claims_input_mask = torch.tensor(
            [f.input_mask for f in claim_features], dtype=torch.long)
        claims_segment_ids = torch.tensor(
            [f.segment_ids for f in claim_features], dtype=torch.long)
        claims_label_ids = torch.tensor([f.label_id for f in claim_features],
                                        dtype=torch.long)

        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids,
                                  claims_input_ids, claims_input_mask,
                                  claims_segment_ids, claims_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=eval_batch_size)

        #         model.load_state_dict(torch.load(saved_model))
        model_state_dict = torch.load(saved_model)
        model = BertForConsistencyCueClassification.from_pretrained(
            'bert-base-uncased', num_labels=2, state_dict=model_state_dict)
        model.to(device)

        model.eval()
        # eval_loss, eval_accuracy = 0, 0

        eval_tp, eval_pred_c, eval_gold_c = 0, 0, 0
        eval_loss, eval_macro_p, eval_macro_r = 0, 0, 0

        raw_score = []

        nb_eval_steps, nb_eval_examples = 0, 0
        for (input_ids, input_mask, segment_ids, label_ids, claim_input_ids,
             claim_input_mask, claim_segment_ids,
             claim_label_ids) in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)
            claim_input_ids = claim_input_ids.to(device)
            claim_input_mask = claim_input_mask.to(device)
            claim_segment_ids = claim_segment_ids.to(device)
            claim_label_ids = claim_label_ids.to(device)

            #             print("start")
            #             print(input_ids)
            #             print(input_mask)
            #             print(segment_ids)
            #             print(label_ids)
            #             print(claim_input_ids)
            #             print(claim_input_mask)
            #             print(claim_segment_ids)
            #             print(claim_label_ids)
            #             print("end")
            with torch.no_grad():
                tmp_eval_loss = model(input_ids=input_ids,
                                      token_type_ids=segment_ids,
                                      attention_mask=input_mask,
                                      labels=label_ids,
                                      input_ids2=claim_input_ids,
                                      token_type_ids2=claim_segment_ids,
                                      attention_mask2=claim_input_mask,
                                      labels2=claim_label_ids)

                logits = model(input_ids=input_ids,
                               token_type_ids=segment_ids,
                               attention_mask=input_mask,
                               input_ids2=claim_input_ids,
                               token_type_ids2=claim_segment_ids,
                               attention_mask2=claim_input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()

            # Micro F1 (aggregated tp, fp, fn counts across all examples)
            tmp_tp, tmp_pred_c, tmp_gold_c = tp_pcount_gcount(
                logits, label_ids)
            eval_tp += tmp_tp
            eval_pred_c += tmp_pred_c
            eval_gold_c += tmp_gold_c

            pred_label = np.argmax(logits, axis=1)
            raw_score += zip(logits, pred_label, label_ids)

            # Macro F1 (averaged P, R across mini batches)
            tmp_eval_p, tmp_eval_r, tmp_eval_f1 = p_r_f1(logits, label_ids)

            eval_macro_p += tmp_eval_p
            eval_macro_r += tmp_eval_r

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        # Micro F1 (aggregated tp, fp, fn counts across all examples)
        eval_micro_p = eval_tp / eval_pred_c
        eval_micro_r = eval_tp / eval_gold_c
        eval_micro_f1 = 2 * eval_micro_p * eval_micro_r / (eval_micro_p +
                                                           eval_micro_r)

        # Macro F1 (averaged P, R across mini batches)
        eval_macro_p = eval_macro_p / nb_eval_steps
        eval_macro_r = eval_macro_r / nb_eval_steps
        eval_macro_f1 = 2 * eval_macro_p * eval_macro_r / (eval_macro_p +
                                                           eval_macro_r)

        eval_loss = eval_loss / nb_eval_steps
        result = {
            'eval_loss': eval_loss,
            'eval_micro_p': eval_micro_p,
            'eval_micro_r': eval_micro_r,
            'eval_micro_f1': eval_micro_f1,
            'eval_macro_p': eval_macro_p,
            'eval_macro_r': eval_macro_r,
            'eval_macro_f1': eval_macro_f1,
            #                   'global_step': global_step,
            #                   'loss': tr_loss/nb_tr_steps
        }

        output_eval_file = os.path.join(
            output_dir,
            "train_on_ibmcs_eval_on_ibmcs_bert_cons_epoch5_eval_results.txt")
        output_raw_score = os.path.join(
            output_dir,
            "train_on_ibmcs_eval_on_ibmcs_bert_cons_epoch5_raw_score.csv")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        with open(output_raw_score, 'w') as fout:
            fields = [
                "undermine_score", "support_score", "predict_label", "gold"
            ]
            writer = csv.DictWriter(fout, fieldnames=fields)
            writer.writeheader()
            for score, pred, gold in raw_score:
                writer.writerow({
                    "undermine_score": str(score[0]),
                    "support_score": str(score[1]),
                    "predict_label": str(pred),
                    "gold": str(gold)
                })
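
# The metric helpers used above are not defined in this snippet. Below are
# hypothetical implementations consistent with how they are called (a sketch;
# the original project's versions may differ).
def tp_pcount_gcount(logits, label_ids, positive=1):
    """True positives, predicted-positive count, and gold-positive count."""
    preds = np.argmax(logits, axis=1)
    tp = int(np.sum((preds == positive) & (label_ids == positive)))
    return tp, int(np.sum(preds == positive)), int(np.sum(label_ids == positive))

def p_r_f1(logits, label_ids, positive=1):
    """Per-batch precision, recall, and F1."""
    tp, pred_c, gold_c = tp_pcount_gcount(logits, label_ids, positive)
    p = tp / pred_c if pred_c else 0.0
    r = tp / gold_c if gold_c else 0.0
    return p, r, (2 * p * r / (p + r) if p + r else 0.0)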
Example #4
def main(_):
    """For current work, we only use the following four categories, 
    but you can add others if you would like to."""
    categories = [
        'Tools_and_Home_Improvement',
        'Patio_Lawn_and_Garden',
        'Electronics',
        'Baby',
    ]

    models = ['FLTR', 'BertQA']

    data_path = os.path.join(FLAGS.data_dir, 'Annotated_Data.txt')
    data = pd.read_csv(data_path,
                       sep='\t',
                       encoding='utf-8',
                       converters={
                           'annotation_score': ast.literal_eval,
                           'reviews': ast.literal_eval
                       })
    data = data.reset_index()
    data['qr'] = data[['index', 'question', 'reviews']].apply(
        lambda x: [[x['index'], x['question'], i] for i in x['reviews']],
        axis=1)
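    # e.g. a row with index=3, question='Is it loud?' and
    # reviews=['works great', 'too loud'] becomes
    # [[3, 'Is it loud?', 'works great'], [3, 'Is it loud?', 'too loud']],
    # i.e. one (index, question, review) triple per review.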

    d = []
    for category in categories:
        qr = data[data['category'] == category]['qr'].tolist()
        qr = [item for sublist in qr for item in sublist]
        qr = pd.DataFrame(columns=['index', 'question', 'review'], data=qr)
        qr['label'] = 1

        temp = qr.copy()
        temp['question'] = temp['question'].apply(str)
        temp['review'] = temp['review'].apply(str)
        DATA_COLUMN_A = 'question'
        DATA_COLUMN_B = 'review'
        LABEL_COLUMN = 'label'
        label_list = [0, 1]

        tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                               do_lower_case=True)

        test_InputExamples = temp.apply(
            lambda x: run_classifier.InputExample(guid=None,
                                                  text_a=x[DATA_COLUMN_A],
                                                  text_b=x[DATA_COLUMN_B],
                                                  label=x[LABEL_COLUMN]),
            axis=1)

        test_features = run_classifier.convert_examples_to_features(
            test_InputExamples, label_list, FLAGS.max_seq_length, tokenizer)

        t = data[data['category'] == category]
        t = t.reset_index(drop=True)
        for model in models:

            OUTPUT_DIR = os.path.join(FLAGS.model_output_dir,
                                      category + '_' + model)
            run_config = tf.estimator.RunConfig(model_dir=OUTPUT_DIR,
                                                save_summary_steps=100,
                                                save_checkpoints_steps=100)

            model_fn = None
            if model == 'BertQA':
                model_fn = model_fn_builder_BertQA(
                    bert_config=modeling.BertConfig.from_json_file(
                        FLAGS.bert_config_file),
                    num_labels=len(label_list),
                    init_checkpoint=OUTPUT_DIR,
                    learning_rate=FLAGS.learning_rate,
                    num_train_steps=100,
                    num_warmup_steps=100)
            else:
                model_fn = model_fn_builder_FLTR(
                    bert_config=modeling.BertConfig.from_json_file(
                        FLAGS.bert_config_file),
                    num_labels=len(label_list),
                    init_checkpoint=OUTPUT_DIR,
                    learning_rate=FLAGS.learning_rate,
                    num_train_steps=100,
                    num_warmup_steps=100)

            estimator = tf.estimator.Estimator(
                model_fn=model_fn,
                config=run_config,
                params={"batch_size": FLAGS.train_batch_size})

            test_input_fn = run_classifier.input_fn_builder(
                features=test_features,
                seq_length=FLAGS.max_seq_length,
                is_training=False,
                drop_remainder=False)

            predictions = estimator.predict(test_input_fn)
            probabilities = [
                prediction['probabilities'] for prediction in predictions
            ]
            probabilities = [list(item) for item in probabilities]

            if model == 'FLTR':
                probabilities = [item[1] for item in probabilities]
            else:
                probabilities = [item[0] for item in probabilities]

            print(model, ' :', probabilities[:10])
            temp[model + '_score'] = probabilities
            temp_groupby = temp.groupby(
                ['index', 'question'],
                sort=False)[model + '_score'].apply(list).reset_index(
                    name=model + '_score')
            t = pd.concat([t, temp_groupby[model + '_score']], axis=1)

        if len(d) == 0:
            d = t
        else:
            d = pd.concat([d, t], axis=0, ignore_index=True)

    d.to_csv(os.path.join(FLAGS.data_dir, 'test_predictions.txt'),
             index=None,
             sep='\t',
             mode='w')
Example #5
def main(argv):

    BERT_MODEL = 'uncased_L-12_H-768_A-12'
    VOCAB_FILE = '/root/cyliu/tftuner/selftf/tf_job/nlp/zmwu/bert_tf2/vocab.txt'
    CONFIG_FILE = '/root/cyliu/tftuner/selftf/tf_job/nlp/zmwu/bert_tf2/bert_config.json'
    INIT_CHECKPOINT = '/root/cyliu/tftuner/selftf/tf_job/nlp/zmwu/bert_tf2/bert_model.ckpt'
    DO_LOWER_CASE = BERT_MODEL.startswith('uncased')
    model_dir = "{}/{}".format("/opt/tftuner", mltunerUtil.get_job_id())

    # fixed model hyperparameters
    TRAIN_BATCH_SIZE = mltunerUtil.get_batch_size()
    NUM_TRAIN_EPOCHS = 3
    LEARNING_RATE = mltunerUtil.get_learning_rate()
    WARMUP_PROPORTION = 0.05
    EVAL_BATCH_SIZE = 8
    MAX_SEQ_LENGTH = 128

    # data loading
    train_df = pd.read_csv(
        '/root/cyliu/tftuner/selftf/tf_job/nlp/zmwu/bert_tf2/train.csv')
    train_df = train_df.sample(1000)
    train, test = train_test_split(train_df, test_size=0.1, random_state=42)
    train_lines, train_labels = train.question_text.values, train.target.values
    test_lines, test_labels = test.question_text.values, test.target.values
    label_list = ['0', '1']
    tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE,
                                           do_lower_case=DO_LOWER_CASE)
    train_examples = create_examples(train_lines, 'train', labels=train_labels)

    num_train_steps = int(
        len(train_examples) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
    num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

    strategy = tf.distribute.experimental.ParameterServerStrategy()
    session_config = mltunerUtil.get_tf_session_config()
    config = tf.compat.v1.estimator.tpu.RunConfig(
        train_distribute=strategy,
        model_dir=model_dir,
        save_checkpoints_steps=None,
        save_checkpoints_secs=None,
        session_config=session_config)

    model_fn = run_classifier.model_fn_builder(
        bert_config=modeling.BertConfig.from_json_file(CONFIG_FILE),
        num_labels=len(label_list),
        init_checkpoint=None,
        learning_rate=LEARNING_RATE,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=False,  # if False, training falls back to CPU or GPU, depending on what is available
        use_one_hot_embeddings=True)

    estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
        use_tpu=False,  # if False, training falls back to CPU or GPU, depending on what is available
        model_fn=model_fn,
        config=config,
        train_batch_size=TRAIN_BATCH_SIZE,
        eval_batch_size=EVAL_BATCH_SIZE)

    class LoggerHook(tf.estimator.SessionRunHook):
        """Logs loss and runtime."""
        def __init__(self):
            self.last_run_timestamp = time.time()

        def after_run(self, run_context, run_values):
            session: tf.Session = run_context.session
            loss, step = session.run([
                tf.compat.v1.get_collection("losses")[0],
                tf.compat.v1.get_collection("global_step_read_op_cache")[0]
            ])
            logging.debug("step:{} loss:{}".format(step, loss))
            mltunerUtil.report_iter_loss(step, loss,
                                         time.time() - self.last_run_timestamp)
            self.last_run_timestamp = time.time()

    # prepare for training
    train_features = run_classifier.convert_examples_to_features(
        train_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    train_input_fn = input_fn_builder(features=train_features,
                                      seq_length=MAX_SEQ_LENGTH,
                                      is_training=True,
                                      drop_remainder=True)

    predict_examples = create_examples(test_lines, 'test')
    predict_features = run_classifier.convert_examples_to_features(
        predict_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    predict_input_fn = input_fn_builder(features=predict_features,
                                        seq_length=MAX_SEQ_LENGTH,
                                        is_training=False,
                                        drop_remainder=False)

    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=num_train_steps,
                                        hooks=[LoggerHook()])
    eval_spec = tf.estimator.EvalSpec(input_fn=predict_input_fn)

    # wait until the chief has initialized the model directory
    if not (mltunerUtil.is_chief() or mltunerUtil.is_ps()):
        time.sleep(1)
        if not tf.io.gfile.exists(model_dir):
            logging.debug("wait for chief init")
            time.sleep(1)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
Example #6
model_fn = run_classifier.model_fn_builder(
    bert_config=modeling.BertConfig.from_json_file(CONFIG_FILE),
    num_labels=len(label_list),
    init_checkpoint=INIT_CHECKPOINT,
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_tpu=False,
    use_one_hot_embeddings=True)

estimator = tf.contrib.tpu.TPUEstimator(use_tpu=False,
                                        model_fn=model_fn,
                                        config=run_config,
                                        train_batch_size=TRAIN_BATCH_SIZE,
                                        eval_batch_size=EVAL_BATCH_SIZE)

# estimator = tf.contrib.estimator.SavedModelEstimator(BERT_PRETRAINED_DIR)

# Eval the model.
eval_examples = processor.get_dev_examples(TASK_DATA_DIR)
eval_features = run_classifier.convert_examples_to_features(
    eval_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
eval_steps = int(len(eval_examples) / EVAL_BATCH_SIZE)
eval_input_fn = run_classifier.input_fn_builder(features=eval_features,
                                                seq_length=MAX_SEQ_LENGTH,
                                                is_training=False,
                                                drop_remainder=True)
result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

print(result)
Example #7
LABEL_COLUMN = 'label'
label_list = [0, 1]

# Use the InputExample class from BERT's run_classifier code to create examples from the data
train_InputExamples = train.apply(
    lambda x: run_classifier.InputExample(
        guid=None,  # Globally unique ID for bookkeeping, unused in this example
        text_a=x[DATA_COLUMN_A],
        text_b=x[DATA_COLUMN_B],
        label=x[LABEL_COLUMN]),
    axis=1)

test_InputExamples = test.apply(
    lambda x: run_classifier.InputExample(
        guid=None,
        text_a=x[DATA_COLUMN_A],
        text_b=x[DATA_COLUMN_B],
        label=x[LABEL_COLUMN]),
    axis=1)

# Convert our train and test features to InputFeatures that BERT understands.
train_features = run_classifier.convert_examples_to_features(
    train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
test_features = run_classifier.convert_examples_to_features(
    test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)


# Compute the number of train and warmup steps from batch size
num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

# Specify output directory and number of checkpoint steps to save
run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

model_fn = model_fn_builder(
    num_labels=len(label_list),
    # the snippet was truncated here; the remaining arguments are assumed from
    # the surrounding tutorial pattern
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps)
Example #8
def main(_):
    """For current work, we only use the following four categories, 
    but you can add others if you would like to."""
    categories = [
        'Tools_and_Home_Improvement',
        'Patio_Lawn_and_Garden',
        'Electronics',
        'Baby',
    ]
    
    data = None
    
    """Cross-domain pre-training (All_Categories) to boost the performance."""
    if FLAGS.category_name == "All_Categories":
        for category in categories:
            category_data_path = os.path.join(FLAGS.data_dir, category + '.txt')
            category_data = pd.read_csv(
                category_data_path,
                sep='\t',
                encoding='utf-8',
                nrows=10000,
                converters={
                    'reviewText': ast.literal_eval,
                    'FLTR_scores': ast.literal_eval
                })
            if data is None:
                data = category_data
            else:
                data = pd.concat([data, category_data], axis=0)
        data = data.sample(n=len(data))
    else:
        data_path = os.path.join(FLAGS.data_dir, FLAGS.category_name + '.txt')
        data = pd.read_csv(
            data_path,
            sep='\t',
            encoding='utf-8',  # nrows=10000,
            converters={
                'reviewText': ast.literal_eval,
                'FLTR_scores': ast.literal_eval
            })
    
    #data['len_questions'] = data["question"].apply(lambda x: len(x.split()))
    #data = data[data['len_questions']<=10]
    
    data['FLTR_Top10'] = data.apply(FLTR_Top10, axis=1)
    list_of_answers = list(data['answer'])
    list_of_answers = shuffle(list_of_answers)
    data['non_answer'] = list_of_answers
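    # Shuffling the answer column pairs each question with a random answer;
    # these mismatched pairs are stored as 'non_answer' (presumably used as
    # negative examples downstream).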
    
    train = data[:int(len(data) * 0.8)]
    train = train.sample(n=min(20000, len(train)))
    test = data[int(len(data) * 0.8):]
    print(train.shape, test.shape)

    DATA_COLUMN_A = 'senA'
    DATA_COLUMN_B = 'senB'
    LABEL_COLUMN = 'Label'
    label_list = [0, 1]
         
    train = train.apply(qar_pair, axis=1)
    test = test.apply(qar_pair, axis=1)

    temp = train.tolist()
    flat_list = [item for sublist in temp for item in sublist]
    train = pd.DataFrame(flat_list, columns=['senA', 'senB'])
    train['Label'] = 1
    train['senA'] = train['senA'].apply(str)
    train['senB'] = train['senB'].apply(str)

    temp = test.tolist()
    flat_list = [item for sublist in temp for item in sublist]
    test = pd.DataFrame(flat_list, columns=['senA', 'senB'])
    test['Label'] = 1
    test['senA'] = test['senA'].apply(str)
    test['senB'] = test['senB'].apply(str)
    print(train.shape, test.shape)
    
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=True)

    train_InputExamples = train.apply(
        lambda x: run_classifier.InputExample(guid=None,
                                              text_a=x[DATA_COLUMN_A],
                                              text_b=x[DATA_COLUMN_B],
                                              label=x[LABEL_COLUMN]),
        axis=1)

    test_InputExamples = test.apply(
        lambda x: run_classifier.InputExample(guid=None,
                                              text_a=x[DATA_COLUMN_A],
                                              text_b=x[DATA_COLUMN_B],
                                              label=x[LABEL_COLUMN]),
        axis=1)

    train_features = run_classifier.convert_examples_to_features(
        train_InputExamples, label_list, FLAGS.max_seq_length, tokenizer)
    test_features = run_classifier.convert_examples_to_features(
        test_InputExamples, label_list, FLAGS.max_seq_length, tokenizer)

    
    OUTPUT_DIR = os.path.join(FLAGS.model_output_dir,
                              FLAGS.category_name + "_BertQA")
    tf.gfile.MakeDirs(OUTPUT_DIR)

    run_config = tf.estimator.RunConfig(
        model_dir=OUTPUT_DIR,
        keep_checkpoint_max=2,
        save_summary_steps=FLAGS.save_summary_steps,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps)

    num_train_steps = int(
        len(train_features) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(
        bert_config=modeling.BertConfig.from_json_file(FLAGS.bert_config_file),
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps)

    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
        params={"batch_size": FLAGS.train_batch_size})

    train_input_fn = run_classifier.input_fn_builder(
        features=train_features,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)

    print("Beginning Training!")
    current_time = datetime.now()
    # early_stopping = tf.contrib.estimator.stop_if_no_decrease_hook(
    #     estimator, metric_name='loss', max_steps_without_decrease=1000, min_steps=100)

    estimator.train(input_fn=train_input_fn,
                    max_steps=num_train_steps)  # hooks=[early_stopping]
    print("Training took time ", datetime.now() - current_time)

    test_input_fn = run_classifier.input_fn_builder(
        features=test_features,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=True)

    predictions = estimator.predict(test_input_fn)
    x = [prediction['scores'] for prediction in predictions]
    print('\n')
    print("The accuracy of BertQA on " + FLAGS.category_name + " is: " +
          str(sum(i > 0 for i in x) / len(x)))
    print('\n')
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default="data/VSNLI/",
        type=str,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default="snliimg",
                        type=str,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default="output_vsnli",
        type=str,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument(
        "--tagger_path",
        default=None,
        type=str,
        help=
        "tagger_path for predictions if needing real-time tagging. Default: None, by loading pre-tagged data"
        "For example, the trained models by AllenNLP")
    parser.add_argument("--best_epochs",
                        default=1.0,
                        type=float,
                        help="Best training epochs for prediction.")
    parser.add_argument("--max_num_aspect",
                        default=3,
                        type=int,
                        help="max_num_aspect")

    ## Other parameters
    parser.add_argument("--grounding",
                        action='store_true',
                        help="whether to enable grounding.")
    parser.add_argument("--hypothesis_only",
                        action='store_true',
                        help="whether to enable grounding.")

    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_predict",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {"snliimg": SnliImgProcessor, "gsnliimg": GSnliImgProcessor}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval and not args.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be True."
        )

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    # num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)
    if args.tagger_path is not None:
        srl_predictor = SRLPredictor(args.tagger_path)
    else:
        srl_predictor = None
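    # When a tagger checkpoint is given, srl_predictor supplies semantic-role
    # tags for each example; convert_examples_to_features below consumes it
    # (presumably falling back to default tags when it is None).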
    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        if args.grounding:
            train_premise_examples, train_hypothesis_examples = processor.get_train_examples(
                args.data_dir)
            train_examples = (train_premise_examples,
                              train_hypothesis_examples)
            num_train_optimization_steps = int(
                len(train_premise_examples) / args.train_batch_size /
                args.gradient_accumulation_steps) * args.num_train_epochs
        else:
            train_examples = processor.get_train_examples(args.data_dir)
            num_train_optimization_steps = int(
                len(train_examples) / args.train_batch_size /
                args.gradient_accumulation_steps) * args.num_train_epochs

        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    train_features = None
    if args.do_train:
        if args.grounding:
            hypothesis_features = convert_examples_to_features(
                train_examples[1],
                label_list,
                args.max_seq_length,
                tokenizer,
                srl_predictor=srl_predictor)

            premises_features = convert_examples_to_features(
                train_examples[0],
                label_list,
                args.max_seq_length,
                tokenizer,
                srl_predictor=srl_predictor)
            train_features = (premises_features, hypothesis_features)

        else:
            train_features = convert_examples_to_features(
                train_examples,
                label_list,
                args.max_seq_length,
                tokenizer,
                srl_predictor=srl_predictor)
        # TagTokenizer.make_tag_vocab("tag_vocab", tag_vocab)
    tag_tokenizer = TagTokenizer()
    vocab_size = len(tag_tokenizer.ids_to_tags)
    print("tokenizer vocab size: ", str(vocab_size))
    tag_config = TagConfig(tag_vocab_size=vocab_size,
                           hidden_size=10,
                           layer_num=1,
                           output_dim=10,
                           dropout_prob=0.1,
                           num_aspect=args.max_num_aspect)
    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(
            args.local_rank))
    if args.grounding:
        if args.hypothesis_only:
            model = GroundedImgClassificationTag.from_pretrained(
                args.bert_model,
                cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
                'distributed_{}'.format(args.local_rank),
                num_labels=num_labels,
                tag_config=tag_config,
                image_emb_size=2048,
                hypothesis_only=True)
        else:
            model = GroundedImgClassificationTag.from_pretrained(
                args.bert_model,
                cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
                'distributed_{}'.format(args.local_rank),
                num_labels=num_labels,
                tag_config=tag_config,
                image_emb_size=2048)
    else:
        model = BertForSequenceImgClassificationTag.from_pretrained(
            args.bert_model,
            cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
            'distributed_{}'.format(args.local_rank),
            num_labels=num_labels,
            tag_config=tag_config,
            image_emb_size=2048)

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
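    # Standard BERT fine-tuning setup: apply weight decay (0.01) to all
    # parameters except biases and LayerNorm parameters.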
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
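        # FP16_Optimizer keeps fp32 master weights and manages loss scaling:
        # dynamic by default, static when --loss_scale is a positive power of 2.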
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    best_epoch = 0
    best_result = 0.0
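    # Standard ImageNet preprocessing for the image encoder: resize,
    # center-crop to 224x224, and normalize with ImageNet channel statistics.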
    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    if args.do_train:
        if not args.grounding:
            train_features = transform_tag_features(args.max_num_aspect,
                                                    train_features,
                                                    tag_tokenizer,
                                                    args.max_seq_length)
            logger.info("***** Running training *****")
            logger.info("  Num examples = %d", len(train_examples))
            logger.info("  Batch size = %d", args.train_batch_size)
            logger.info("  Num steps = %d", num_train_optimization_steps)
            # prepare the training data
            all_input_ids = [f.input_ids for f in train_features]
            all_input_mask = [f.input_mask for f in train_features]
            all_segment_ids = [f.segment_ids for f in train_features]
            all_label_ids = [f.label_id for f in train_features]
            all_start_end_idx = [
                f.orig_to_token_split_idx for f in train_features
            ]
            all_input_tag_ids = [f.input_tag_ids for f in train_features]
            all_images = [f.image for f in train_features]
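            # Bundle the text features with their image references; the
            # dataset is assumed to load each image from IMAGE_DIR and apply
            # `transform` on access.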
            train_data = SequenceImageDataset(all_input_ids, all_input_mask,
                                              all_segment_ids,
                                              all_start_end_idx,
                                              all_input_tag_ids, all_label_ids,
                                              all_images, transform, IMAGE_DIR)

        else:
            premises_train_features = transform_tag_features(
                args.max_num_aspect, train_features[0], tag_tokenizer,
                args.max_seq_length)
            hypothesis_train_features = transform_tag_features(
                args.max_num_aspect, train_features[1], tag_tokenizer,
                args.max_seq_length)

            assert len(premises_train_features) == len(
                hypothesis_train_features)
            logger.info("***** Running training *****")
            logger.info("  Num examples = %d", len(train_examples[0]))
            logger.info("  Batch size = %d", args.train_batch_size)
            logger.info("  Num steps = %d", num_train_optimization_steps)

            # prepare the premise training data
            all_premises_input_ids = [
                f.input_ids for f in premises_train_features
            ]
            all_premises_input_mask = [
                f.input_mask for f in premises_train_features
            ]
            all_premises_segment_ids = [
                f.segment_ids for f in premises_train_features
            ]
            all_premises_start_end_idx = [
                f.orig_to_token_split_idx for f in premises_train_features
            ]
            all_premises_input_tag_ids = [
                f.input_tag_ids for f in premises_train_features
            ]

            # prepare the hypothesis training data
            all_hypothesis_input_ids = [
                f.input_ids for f in hypothesis_train_features
            ]
            all_hypothesis_input_mask = [
                f.input_mask for f in hypothesis_train_features
            ]
            all_hypothesis_segment_ids = [
                f.segment_ids for f in hypothesis_train_features
            ]
            all_hypothesis_start_end_idx = [
                f.orig_to_token_split_idx for f in hypothesis_train_features
            ]
            all_hypothesis_input_tag_ids = [
                f.input_tag_ids for f in hypothesis_train_features
            ]

            all_images = [f.image for f in premises_train_features]
            all_label_ids = [f.label_id for f in premises_train_features]

            train_data = GroundedSequenceImageDataset(
                all_premises_input_ids, all_hypothesis_input_ids,
                all_premises_input_mask, all_hypothesis_input_mask,
                all_premises_segment_ids, all_hypothesis_segment_ids,
                all_premises_start_end_idx, all_hypothesis_start_end_idx,
                all_premises_input_tag_ids, all_hypothesis_input_tag_ids,
                all_label_ids, all_images, transform, IMAGE_DIR)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        # prepare validation data

        if args.grounding:
            eval_premise_examples, eval_hypothesis_examples = processor.get_dev_examples(
                args.data_dir)
            eval_hypothesis_features = convert_examples_to_features(
                eval_hypothesis_examples,
                label_list,
                args.max_seq_length,
                tokenizer,
                srl_predictor=srl_predictor)

            eval_premises_features = convert_examples_to_features(
                eval_premise_examples,
                label_list,
                args.max_seq_length,
                tokenizer,
                srl_predictor=srl_predictor)

            eval_premises_features = transform_tag_features(
                args.max_num_aspect, eval_premises_features, tag_tokenizer,
                args.max_seq_length)

            eval_hypothesis_features = transform_tag_features(
                args.max_num_aspect, eval_hypothesis_features, tag_tokenizer,
                args.max_seq_length)

            # prepare the premise evaluation data
            all_premises_input_ids = [
                f.input_ids for f in eval_premises_features
            ]
            all_premises_input_mask = [
                f.input_mask for f in eval_premises_features
            ]
            all_premises_segment_ids = [
                f.segment_ids for f in eval_premises_features
            ]
            all_premises_start_end_idx = [
                f.orig_to_token_split_idx for f in eval_premises_features
            ]
            all_premises_input_tag_ids = [
                f.input_tag_ids for f in eval_premises_features
            ]

            # prepare the hypothesis evaluation data
            all_hypothesis_input_ids = [
                f.input_ids for f in eval_hypothesis_features
            ]
            all_hypothesis_input_mask = [
                f.input_mask for f in eval_hypothesis_features
            ]
            all_hypothesis_segment_ids = [
                f.segment_ids for f in eval_hypothesis_features
            ]
            all_hypothesis_start_end_idx = [
                f.orig_to_token_split_idx for f in eval_hypothesis_features
            ]
            all_hypothesis_input_tag_ids = [
                f.input_tag_ids for f in eval_hypothesis_features
            ]

            all_images = [f.image for f in eval_hypothesis_features]
            all_label_ids = [f.label_id for f in eval_hypothesis_features]

            eval_data = GroundedSequenceImageDataset(
                all_premises_input_ids, all_hypothesis_input_ids,
                all_premises_input_mask, all_hypothesis_input_mask,
                all_premises_segment_ids, all_hypothesis_segment_ids,
                all_premises_start_end_idx, all_hypothesis_start_end_idx,
                all_premises_input_tag_ids, all_hypothesis_input_tag_ids,
                all_label_ids, all_images, transform, IMAGE_DIR)

        else:
            eval_examples = processor.get_dev_examples(args.data_dir)
            eval_features = convert_examples_to_features(
                eval_examples,
                label_list,
                args.max_seq_length,
                tokenizer,
                srl_predictor=srl_predictor)
            eval_features = transform_tag_features(args.max_num_aspect,
                                                   eval_features,
                                                   tag_tokenizer,
                                                   args.max_seq_length)

            all_input_ids = [f.input_ids for f in eval_features]
            all_input_mask = [f.input_mask for f in eval_features]
            all_segment_ids = [f.segment_ids for f in eval_features]
            all_label_ids = [f.label_id for f in eval_features]
            all_start_end_idx = [
                f.orig_to_token_split_idx for f in eval_features
            ]
            all_input_tag_ids = [f.input_tag_ids for f in eval_features]
            all_images = [f.image for f in eval_features]

            eval_data = SequenceImageDataset(all_input_ids, all_input_mask,
                                             all_segment_ids,
                                             all_start_end_idx,
                                             all_input_tag_ids, all_label_ids,
                                             all_images, transform, IMAGE_DIR)

            logger.info("***** Evaluation data *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)

        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            model.train()
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, start_end_idx, input_tag_ids, images, label_ids = batch

                loss = model(input_ids, segment_ids, input_mask, start_end_idx,
                             input_tag_ids, images, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
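                    # Scale the loss so the accumulated gradient matches the
                    # average over the full effective batch.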
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # manually apply the linear warmup learning-rate schedule used for BERT
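                    # warmup_linear ramps the multiplier from 0 to 1 over the
                    # first warmup_proportion of training (its post-warmup
                    # decay depends on the pytorch_pretrained_bert version).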
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / num_train_optimization_steps,
                        args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
            # Save the trained model for this epoch (unwrap DataParallel/DDP
            # so only the underlying module's weights are saved)
            model_to_save = model.module if hasattr(
                model, 'module') else model
            output_model_file = os.path.join(args.output_dir,
                                             str(epoch) + "_pytorch_model.bin")
            torch.save(model_to_save.state_dict(), output_model_file)

            # run evaluation on dev data
            model_state_dict = torch.load(output_model_file)

            if not args.grounding:

                predict_model = BertForSequenceImgClassificationTag.from_pretrained(
                    args.bert_model,
                    state_dict=model_state_dict,
                    num_labels=num_labels,
                    tag_config=tag_config,
                    image_emb_size=2048)
            else:
                predict_model = GroundedImgClassificationTag.from_pretrained(
                    args.bert_model,
                    state_dict=model_state_dict,
                    num_labels=num_labels,
                    tag_config=tag_config,
                    image_emb_size=2048)

            predict_model.to(device)
            predict_model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            total_precision = np.zeros(3)
            total_recall = np.zeros(3)
            total_fscore = np.zeros(3)
            total_support = np.zeros(3, dtype=int)
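            # Per-class precision/recall/F1 are summed per batch and averaged
            # over batches after the loop; labels [0, 1, 2] presumably
            # correspond to entailment/neutral/contradiction.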

            output_logits_file = os.path.join(
                args.output_dir,
                str(epoch) + "_eval_logits_results.tsv")
            with open(output_logits_file, "w") as writer:
                writer.write("index" + "\t" + "\t".join(
                    ["logits " + str(i)
                     for i in range(len(label_list))]) + "\n")

                for batch_number, batch in enumerate(
                        tqdm(eval_dataloader, desc="Evaluating")):
                    input_ids, input_mask, segment_ids, start_end_idx, input_tag_ids, images, label_ids = batch
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)
                    start_end_idx = start_end_idx.to(device)
                    input_tag_ids = input_tag_ids.to(device)
                    images = images.to(device)
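                    # Two forward passes: with label_ids the model returns the
                    # loss, with labels=None it returns the logits (the usual
                    # pytorch_pretrained_bert classification-head convention).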
                    with torch.no_grad():
                        tmp_eval_loss = predict_model(input_ids, segment_ids,
                                                      input_mask,
                                                      start_end_idx,
                                                      input_tag_ids, images,
                                                      label_ids)
                        logits = predict_model(input_ids, segment_ids,
                                               input_mask, start_end_idx,
                                               input_tag_ids, images, None)

                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    tmp_eval_accuracy = accuracy_score(label_ids,
                                                       np.argmax(logits,
                                                                 axis=1),
                                                       normalize=False)
                    eval_loss += tmp_eval_loss.mean().item()
                    eval_accuracy += tmp_eval_accuracy

                    precision, recall, fscore, support = precision_recall_fscore_support(
                        label_ids, np.argmax(logits, axis=1), labels=[0, 1, 2])
                    total_precision = total_precision + precision
                    total_recall = total_recall + recall
                    total_fscore = total_fscore + fscore
                    total_support = total_support + support

                    nb_eval_examples += input_ids.size(0)
                    nb_eval_steps += 1

                del predict_model
                eval_loss = eval_loss / nb_eval_steps
                eval_accuracy = eval_accuracy / nb_eval_examples
                total_precision = total_precision / (batch_number + 1)
                total_recall = total_recall / (batch_number + 1)
                total_fscore = total_fscore / (batch_number + 1)

                if eval_accuracy > best_result:
                    best_epoch = epoch
                    best_result = eval_accuracy
                loss = tr_loss / nb_tr_steps if args.do_train else None

                result = {
                    'eval_loss': eval_loss,
                    'loss': loss,
                    'eval_accuracy': eval_accuracy,
                    'total_precision': {
                        k: total_precision.tolist()[v]
                        for k, v in processor.get_labels_map().items()
                    },
                    'total_recall': {
                        k: total_recall.tolist()[v]
                        for k, v in processor.get_labels_map().items()
                    },
                    'total_fscore': {
                        k: total_fscore.tolist()[v]
                        for k, v in processor.get_labels_map().items()
                    },
                    'total_support': {
                        k: total_support.tolist()[v]
                        for k, v in processor.get_labels_map().items()
                    },
                    'macro_precision': total_precision.mean(),
                    'macro_recall': total_recall.mean(),
                    'macro_support': total_support.sum(),
                    'macro_f1score': total_fscore.mean(),
                    'number_of_examples': nb_eval_examples
                }

            output_eval_file = os.path.join(args.output_dir,
                                            "eval_results.txt")
            with open(output_eval_file, "a") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("Epoch: %s,  %s = %s", str(epoch), key,
                                str(result[key]))
                    writer.write("Epoch: %s, %s = %s\n" %
                                 (str(epoch), key, str(result[key])))
        logger.info("best epoch: %s, result:  %s", str(best_epoch),
                    str(best_result))
Example #10
def BerQA_train_predict(data, is_training=True):
    d = data.copy()
    scores = []
    max_inputs = 30000

    LEARNING_RATE = 2e-5
    NUM_TRAIN_EPOCHS = 2
    WARMUP_PROPORTION = 0.1
    # Model configs
    SAVE_CHECKPOINTS_STEPS = 1000
    SAVE_SUMMARY_STEPS = 500
    num_train_steps = 100
    max_steps = 100000

    DATA_COLUMN_A = 'senA'
    DATA_COLUMN_B = 'senB'
    LABEL_COLUMN = 'Label'
    label_list = [0, 1]

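    # Process the data in chunks of at most max_inputs rows, fine-tuning (or
    # predicting) on each chunk until the data or the step budget is exhausted.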
    while len(d) > 0 and num_train_steps <= max_steps:
        line = min(max_inputs, len(d))
        temp = d[:line]
        temp_t = temp.apply(qar_pair, axis=1)
        temp_t = temp_t.tolist()
        flat_list = [item for sublist in temp_t for item in sublist]
        temp_t = pd.DataFrame(flat_list, columns=['senA', 'senB'])
        temp_t['Label'] = 1
        temp_t['senA'] = temp_t['senA'].apply(str)
        temp_t['senB'] = temp_t['senB'].apply(str)

        temp_InputExamples = temp_t.apply(
            lambda x: run_classifier.InputExample(guid=None,
                                                  text_a=x[DATA_COLUMN_A],
                                                  text_b=x[DATA_COLUMN_B],
                                                  label=x[LABEL_COLUMN]),
            axis=1)

        temp_features = run_classifier.convert_examples_to_features(
            temp_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
        num_train_steps = int(
            len(temp_features) / BATCH_SIZE *
            NUM_TRAIN_EPOCHS) + num_train_steps
        num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)
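        # e.g., with WARMUP_PROPORTION = 0.1, 10% of the cumulative step
        # budget is spent in linear learning-rate warmup.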
        # Specify output directory and number of checkpoint steps to save
        run_config = tf.estimator.RunConfig(
            model_dir=OUTPUT_DIR,
            save_summary_steps=SAVE_SUMMARY_STEPS,
            save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

        model_fn = model_fn_builder(num_labels=len(label_list),
                                    learning_rate=LEARNING_RATE,
                                    num_train_steps=num_train_steps,
                                    num_warmup_steps=num_warmup_steps)

        estimator = tf.estimator.Estimator(model_fn=model_fn,
                                           config=run_config,
                                           params={"batch_size": BATCH_SIZE})

        input_fn = input_fn_builder(features=temp_features,
                                    seq_length=MAX_SEQ_LENGTH,
                                    is_training=is_training,
                                    drop_remainder=True)
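        # drop_remainder=True discards the final partial batch so every batch
        # has a static shape of BATCH_SIZE.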

        if is_training:
            print('Beginning Training!')
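            # Stop early if the loss has not decreased for 1000 steps
            # (evaluated only after at least 100 steps).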
            early_stopping = tf.contrib.estimator.stop_if_no_decrease_hook(
                estimator,
                metric_name='loss',
                max_steps_without_decrease=1000,
                min_steps=100)
            current_time = datetime.now()
            #tf.estimator.train_and_evaluate(estimator,train_spec=tf.estimator.TrainSpec(input_fn, hooks=[early_stopping]))
            estimator.train(input_fn=input_fn,
                            max_steps=num_train_steps,
                            hooks=[early_stopping])
            print("Training took time ", datetime.now() - current_time)
        else:
            predictions = estimator.predict(input_fn)
            outputs = [(prediction['probabilities'], prediction['crq'])
                       for prediction in predictions]
            x = [i[0] for i in outputs]
            y = [i[1] for i in outputs]
            print('\n')
            print('Accuracy of ' + category + ' is: ' +
                  str(sum(i > 0 for i in x) / len(x)))
            print('\n')
            scores = scores + y

        if len(d) > max_inputs:
            d = d[line:]
            d = d.reset_index(drop=True)
        else:
            d = []

    if not is_training:
        data = data[:len(scores)]
        scores = [item.tolist() for item in scores]
        #BERTQA_scores = pd.DataFrame(data=scores)
        data['BERTQA_scores'] = scores
        #data = pd.concat([data,BERTQA_scores],axis=1,ignore_index=True)
        return data