Example #1
    def _load_edge_model(self, bert_model_file, bert_config_file):
        # Build the edge-scoring model from its JSON config and load the
        # fine-tuned weights into the underlying BERT encoder.
        bert_config = BertConfig.from_json_file(bert_config_file)
        model = BertEdgeScorer(bert_config)
        model_states = torch.load(bert_model_file)
        print(model_states.keys())  # show which weight tensors the checkpoint contains
        model.bert.load_state_dict(model_states)

        # Move to the GPU and switch to inference mode.
        model.cuda()
        model.eval()
        return model
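
If the scorer has to run on a machine without a GPU, the same loading steps can be made device-agnostic. The sketch below is not from the original source: the standalone function name is hypothetical, BertEdgeScorer and BertConfig are assumed to be importable from the surrounding module, and torch.load's map_location argument is used to remap the stored tensors onto the CPU.

import torch

def load_edge_model_cpu(bert_model_file, bert_config_file):
    # Hypothetical CPU-only variant of _load_edge_model above; assumes
    # BertEdgeScorer and BertConfig are available from the surrounding module.
    bert_config = BertConfig.from_json_file(bert_config_file)
    model = BertEdgeScorer(bert_config)
    # map_location="cpu" lets the checkpoint load on machines without CUDA.
    model_states = torch.load(bert_model_file, map_location="cpu")
    model.bert.load_state_dict(model_states)
    model.eval()
    return model
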
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file,
                                     pytorch_dump_path):
    """Convert a bert model checkpoint for TensorFlow to PyTorch."""
    # Initialise PyTorch model
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = BertForPreTraining(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_bert(model, tf_checkpoint_path)

    # Save the PyTorch model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
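
A usage sketch for the converter: the three paths below are hypothetical placeholders for a standard pre-trained BERT checkpoint directory and should be adjusted to the actual files.

# Hypothetical paths; point these at a real TensorFlow BERT checkpoint.
convert_tf_checkpoint_to_pytorch(
    tf_checkpoint_path="uncased_L-12_H-768_A-12/bert_model.ckpt",
    bert_config_file="uncased_L-12_H-768_A-12/bert_config.json",
    pytorch_dump_path="uncased_L-12_H-768_A-12/pytorch_model.bin")
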
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--bert_config_file",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to the configuration file for the BERT model.")

    ## Other parameters
    parser.add_argument("--init_checkpoint",
                        default=None,
                        type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--discr",
                        default=False,
                        action='store_true',
                        help="Whether to do discriminative fine-tuning.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--save_checkpoints_steps",
                        default=1000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--accumulate_gradients",
                        type=int,
                        default=1,
                        help="Number of steps to accumulate gradient on (divide the batch_size and accumulate)")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', 
                        type=int, 
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumualte before performing a backward/update pass.")
    parser.add_argument('--layers',
                        type=int,
                        nargs='+',
                        default=[-2],
                        help="choose the layers that used for downstream tasks, "
                             "-2 means use pooled output, -1 means all layer,"
                             "else means the detail layers. default is -2")
    parser.add_argument('--num_datas',
                        default=None,
                        type=int,
                        help="Number of training examples to use.")
    parser.add_argument('--num_test_datas',
                        default=None,
                        type=int,
                        help="Number of test examples to use.")
    parser.add_argument('--pooling_type',
                        default=None,
                        type=str,
                        choices=[None, 'mean', 'max'])
    args = parser.parse_args()

    processors = {
        "sst": SSTProcessor,
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format(
                            args.accumulate_gradients))

    args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    summary_writer = SummaryWriter(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained("bert-large-cased")

    bert_config = BertConfig.from_json_file(args.bert_config_file)
    model = BertForSequenceClassification(bert_config, len(label_list), args.layers, pooling=args.pooling_type)

    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)


    # Exclude biases and LayerNorm parameters from weight decay. (With current
    # BERT parameter names these are 'bias', 'LayerNorm.weight' and
    # 'LayerNorm.bias'; the old 'gamma'/'beta' names no longer match anything.)
    no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias']

    optimizer_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    # AdamW reads the 'weight_decay' key from each parameter group;
    # a 'weight_decay_rate' key would be silently ignored.
    optimizer = AdamW(optimizer_parameters,
                      lr=args.learning_rate,
                      correct_bias=False)

    global_step = 0
    global_train_step = 0

    all_examples = processor.get_all_examples(args.data_dir)

    all_features = convert_examples_to_features(
        all_examples, label_list, args.max_seq_length, tokenizer)

    all_input_ids = all_features['input_ids']
    all_input_mask = all_features['attention_mask']
    all_segment_ids = all_features['token_type_ids']
    all_label_ids = all_features['labels']

    all_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

    # Hard-coded split sizes; the two lengths must sum to len(all_data).
    train_data, eval_data = random_split(all_data, [100000, 12428])

    eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size, shuffle=False)

    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Batch size = %d", args.train_batch_size)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        print("TOTAL STEPS: ", (len(train_dataloader)*int(args.num_train_epochs)))

        epoch = 0
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            epoch += 1
            model.train()
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, token_type_ids, label_ids = batch
                loss, _ = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=label_ids)
                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # Clip gradients before the update so clipping actually affects
                    # the step (clipping after optimizer.step() has no effect).
                    # possibly comment this out
                    max_grad_norm = 1.0
                    _clip_grad_norm(optimizer_parameters, max_grad_norm)

                    optimizer.step()    # We have accumulated enough gradients
                    # scheduler.step()
                    model.zero_grad()

                    summary_writer.add_scalar('Loss/train', loss.item(), global_step)
                    global_step += 1

            model.eval()
            eval_loss, eval_accuracy = 0, 0
            pos_eval_prec, pos_eval_recall, pos_eval_f1 = 0, 0, 0
            neg_eval_prec, neg_eval_recall, neg_eval_f1 = 0, 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            with open(os.path.join(args.output_dir, "results_ep"+str(epoch)+".txt"),"w") as f:
                for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluate"):
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        tmp_eval_loss, logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)

                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.detach().cpu().numpy()
                    outputs = np.argmax(logits, axis=1)
                    for output in outputs:
                        f.write(str(output)+"\n")
                    # Number of correct predictions in this batch; normalised by
                    # nb_eval_examples after the loop.
                    tmp_eval_accuracy = np.sum(outputs == label_ids)
                    tmp_eval_prec, tmp_eval_recall, tmp_eval_f1 = get_analytics_neg_sent(outputs, label_ids)

                    eval_loss += tmp_eval_loss.mean().item()
                    eval_accuracy += tmp_eval_accuracy
                    neg_eval_prec += tmp_eval_prec
                    neg_eval_recall += tmp_eval_recall
                    neg_eval_f1 += tmp_eval_f1

                    tmp_eval_prec, tmp_eval_recall, tmp_eval_f1 = get_analytics_pos_sent(outputs, label_ids)
                    pos_eval_prec += tmp_eval_prec
                    pos_eval_recall += tmp_eval_recall
                    pos_eval_f1 += tmp_eval_f1

                    global_train_step += 1

                    summary_writer.add_scalar("Loss/test", tmp_eval_loss.mean().item(), global_train_step)
                    summary_writer.add_scalar("Accuracy/test", tmp_eval_accuracy, global_train_step)

                    nb_eval_examples += input_ids.size(0)
                    nb_eval_steps += 1

            eval_loss = eval_loss / nb_eval_steps
            eval_accuracy = eval_accuracy / nb_eval_examples

            pos_eval_prec = pos_eval_prec / nb_eval_steps
            pos_eval_recall = pos_eval_recall / nb_eval_steps
            pos_eval_f1 = pos_eval_f1 / nb_eval_steps
            
            neg_eval_prec = neg_eval_prec / nb_eval_steps
            neg_eval_recall = neg_eval_recall / nb_eval_steps
            neg_eval_f1 = neg_eval_f1 / nb_eval_steps

            result = {'eval_loss': eval_loss,
                      'eval_accuracy': eval_accuracy,
                      'global_step': global_step,
                      'loss': tr_loss/nb_tr_steps,
                      'pos_eval_precision': pos_eval_prec,
                      'neg_eval_precision': neg_eval_prec,
                      'pos_eval_recall': pos_eval_recall,
                      'neg_eval_recall': neg_eval_recall,
                      'pos_eval_f1': pos_eval_f1,
                      'neg_eval_f1': neg_eval_f1}

            summary_writer.add_scalar("Epoch_loss/train", tr_loss, epoch)
            summary_writer.add_scalar("Epoch_loss/test", eval_loss, epoch)
            summary_writer.add_scalar("Epoch_accuracy/test", eval_accuracy, epoch)

            summary_writer.add_scalar("Epoch_positive_precision/test", pos_eval_prec, epoch)
            summary_writer.add_scalar("Epoch_negative_precision/test", neg_eval_prec, epoch)

            summary_writer.add_scalar("Epoch_positive_recall/test", pos_eval_recall, epoch)
            summary_writer.add_scalar("Epoch_negative_recall/test", neg_eval_recall, epoch)

            summary_writer.add_scalar("Epoch_positive_f1/test", pos_eval_f1, epoch)
            summary_writer.add_scalar("Epoch_negative_f1/test", neg_eval_f1, epoch)

            output_eval_file = os.path.join(args.output_dir, "eval_results_ep"+str(epoch)+".txt")
            print("output_eval_file=",output_eval_file)
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
            print("Saving model")
            torch.save(model.module.state_dict(), os.path.join(args.output_dir, "sst2-finetuned-bert-model_"+str(epoch)+".pth"))
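
The listing stops inside main(); presumably the original script closes with the standard entry-point guard, sketched here as an assumption.

if __name__ == "__main__":
    main()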