Example #1
def build_model(args):
    if args.clf_model.lower() == "cnn":
        # the CNN text model only needs a tokenizer; reuse DistilBERT's
        tokenizer = DistilBertTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)
        model = CNN_Text(args)

    elif args.clf_model.lower() == "robert":
        print("name is {}".format(args.model_name_or_path))
        tokenizer = RobertaTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)

        config = RobertaConfig.from_pretrained(args.model_name_or_path,
                                               num_labels=args.num_labels,
                                               finetuning_task=args.task_name)

        model = RobertaForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)
        # freeze the transformer weights
        if args.freeze:
            for n, p in model.named_parameters():
                if "bert" in n:
                    p.requires_grad = False
    elif args.clf_model.lower() == "bert":
        tokenizer = BertTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)

        config = BertConfig.from_pretrained(args.model_name_or_path,
                                            num_labels=args.num_labels,
                                            finetuning_task=args.task_name)

        model = BertForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)
        # optionally freeze the transformer weights (left disabled here)
        # if args.freeze:
        #     for n, p in model.named_parameters():
        #         if "bert" in n:
        #             p.requires_grad = False

    else:
        tokenizer = DistilBertTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)
        config = DistilBertConfig.from_pretrained(
            args.model_name_or_path,
            num_labels=args.num_labels,
            finetuning_task=args.task_name)
        model = DistilBertForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)

    model.expand_class_head(args.multi_head)
    model = model.to(args.device)
    return tokenizer, model
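A minimal sketch of how build_model might be invoked, assuming an argparse-style namespace that carries the fields read above; every value below is a hypothetical placeholder.

from argparse import Namespace

import torch

# Hypothetical arguments; the field names mirror the ones build_model reads.
args = Namespace(
    clf_model="bert",
    model_name_or_path="bert-base-uncased",
    do_lower_case=True,
    num_labels=2,
    task_name="sst-2",
    freeze=False,
    multi_head=1,  # forwarded to model.expand_class_head
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)
tokenizer, model = build_model(args)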
Example #2
def main():
    import config as args

    processors = {
        "bert": bertProcessor,
        "bertf1c": bertf1cProcessor,
        "berts": bertsProcessor,
        "bertsf1c": bertsf1cProcessor,
    }

    device = torch.device("cuda")
    n_gpu = torch.cuda.device_count()
    logger.info("device %s n_gpu %d", device, n_gpu)

    # args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}"
            .format(args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and 'model.pt' in os.listdir(
            args.output_dir):
        if args.do_train and not args.resume:
            raise ValueError(
                "Output directory ({}) already contains a trained model; "
                "set resume to continue from it or choose a different "
                "output_dir.".format(args.output_dir))
    else:
        os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    print(label_list)

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / n_class / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    model = BertForSequenceClassification(args.bert_dir, 1)
    model.to(device)

    param_optimizer = list(model.named_parameters())

    # exclude bias and LayerNorm (gamma/beta) parameters from weight decay
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.0
    }]

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    global_step = 0

    if args.do_eval:
        eval_examples = processor.get_test_examples(
            args.data_dir)  # evaluate on the test split
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer)

        input_ids = []
        input_mask = []
        segment_ids = []
        label_id = []

        for f in eval_features:
            input_ids.append([])
            input_mask.append([])
            segment_ids.append([])
            for i in range(n_class):
                input_ids[-1].append(f[i].input_ids)
                input_mask[-1].append(f[i].input_mask)
                segment_ids[-1].append(f[i].segment_ids)
            label_id.append([f[0].label_id])

        all_input_ids = torch.tensor(input_ids, dtype=torch.long)
        all_input_mask = torch.tensor(input_mask, dtype=torch.long)
        all_segment_ids = torch.tensor(segment_ids, dtype=torch.long)
        all_label_ids = torch.tensor(label_id, dtype=torch.float)

        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

    if args.do_train:
        best_metric = 0

        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        input_ids = []
        input_mask = []
        segment_ids = []
        label_id = []
        for f in train_features:
            input_ids.append([])
            input_mask.append([])
            segment_ids.append([])
            for i in range(n_class):
                input_ids[-1].append(f[i].input_ids)
                input_mask[-1].append(f[i].input_mask)
                segment_ids[-1].append(f[i].segment_ids)
            label_id.append([f[0].label_id])

        all_input_ids = torch.tensor(input_ids, dtype=torch.long)
        all_input_mask = torch.tensor(input_mask, dtype=torch.long)
        all_segment_ids = torch.tensor(segment_ids, dtype=torch.long)
        all_label_ids = torch.tensor(label_id, dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            model.train()
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                loss, _ = model(input_ids, segment_ids, input_mask, label_ids,
                                1)

                loss = loss.mean()
                # if args.gradient_accumulation_steps > 1:
                #     loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                #if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                model.zero_grad()
                global_step += 1

            model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            logits_all = []
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    tmp_eval_loss, logits = model(input_ids, segment_ids,
                                                  input_mask, label_ids, 1)

                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                for i in range(len(logits)):
                    logits_all += [logits[i]]

                tmp_eval_accuracy = accuracy(logits, label_ids.reshape(-1))

                eval_loss += tmp_eval_loss.mean().item()
                eval_accuracy += tmp_eval_accuracy

                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1

            eval_loss = eval_loss / nb_eval_steps
            eval_accuracy = eval_accuracy / nb_eval_examples

            if args.do_train:
                result = {
                    'eval_loss': eval_loss,
                    'global_step': global_step,
                    'loss': tr_loss / nb_tr_steps
                }
            else:
                result = {'eval_loss': eval_loss}

            eval_f1, eval_T2 = f1_eval(logits_all, eval_features)
            result["f1"] = eval_f1
            result["T2"] = eval_T2

            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))

            if eval_f1 >= best_metric:
                torch.save(model.state_dict(),
                           os.path.join(args.output_dir, "model_best.pt"))
                best_metric = eval_f1

        model.load_state_dict(
            torch.load(os.path.join(args.output_dir, "model_best.pt")))
        torch.save(model.state_dict(), os.path.join(args.output_dir,
                                                    "model.pt"))

    model.load_state_dict(torch.load(os.path.join(args.output_dir,
                                                  "model.pt")))

    if args.do_eval:
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)

        model.eval()
        eval_loss = 0
        nb_eval_steps, nb_eval_examples = 0, 0
        logits_all = []
        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss, logits = model(input_ids, segment_ids,
                                              input_mask, label_ids, 1)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            for i in range(len(logits)):
                logits_all += [logits[i]]

            eval_loss += tmp_eval_loss.mean().item()

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps

        if args.do_train:
            result = {
                'eval_loss': eval_loss,
                'global_step': global_step,
                'loss': tr_loss / nb_tr_steps
            }
        else:
            result = {'eval_loss': eval_loss}

        output_eval_file = os.path.join(args.output_dir,
                                        "eval_results_dev.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
        output_eval_file = os.path.join(args.output_dir, "logits_dev.txt")
        with open(output_eval_file, "w") as f:
            for i in range(len(logits_all)):
                for j in range(len(logits_all[i])):
                    f.write(str(logits_all[i][j]))
                    if j == len(logits_all[i]) - 1:
                        f.write("\n")
                    else:
                        f.write(" ")

        eval_examples = processor.get_test_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer)

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)

        input_ids = []
        input_mask = []
        segment_ids = []
        label_id = []

        for f in eval_features:
            input_ids.append([])
            input_mask.append([])
            segment_ids.append([])
            for i in range(n_class):
                input_ids[-1].append(f[i].input_ids)
                input_mask[-1].append(f[i].input_mask)
                segment_ids[-1].append(f[i].segment_ids)
            label_id.append([f[0].label_id])

        all_input_ids = torch.tensor(input_ids, dtype=torch.long)
        all_input_mask = torch.tensor(input_mask, dtype=torch.long)
        all_segment_ids = torch.tensor(segment_ids, dtype=torch.long)
        all_label_ids = torch.tensor(label_id, dtype=torch.float)

        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        model.eval()
        eval_loss = 0
        nb_eval_steps, nb_eval_examples = 0, 0
        logits_all = []
        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss, logits = model(input_ids, segment_ids,
                                              input_mask, label_ids, 1)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            for i in range(len(logits)):
                logits_all += [logits[i]]

            eval_loss += tmp_eval_loss.mean().item()

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps

        if args.do_train:
            result = {
                'eval_loss': eval_loss,
                'global_step': global_step,
                'loss': tr_loss / nb_tr_steps
            }
        else:
            result = {'eval_loss': eval_loss}

        output_eval_file = os.path.join(args.output_dir,
                                        "eval_results_test.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
        output_eval_file = os.path.join(args.output_dir, "logits_test.txt")
        with open(output_eval_file, "w") as f:
            for i in range(len(logits_all)):
                for j in range(len(logits_all[i])):
                    f.write(str(logits_all[i][j]))
                    if j == len(logits_all[i]) - 1:
                        f.write("\n")
                    else:
                        f.write(" ")
Example #3
def eval(data_iterator, net):
    sum_loss, sum_accuracy = 0, 0
    total_examples, total_steps = 0, 0
    for batch in data_iterator:
        # unpack one evaluation batch into MegEngine tensors
        # (names follow how they are used below)
        input_ids, segment_ids, input_mask, label_ids = tuple(
            mge.tensor(t) for t in batch
        )
        batch_size = input_ids.shape[0]
        loss, logits, label_ids = net_eval(
            input_ids, segment_ids, input_mask, label_ids, net=net
        )
        sum_loss += loss.mean().item()
        sum_accuracy += accuracy(logits, label_ids)
        total_examples += batch_size
        total_steps += 1

    result = {
        "eval_loss": sum_loss / total_steps,
        "eval_accuracy": sum_accuracy / total_examples,
    }

    logger.info("***** Eval results *****")
    for key in sorted(result.keys()):
        logger.info("%s = %s", key, str(result[key]))


if __name__ == "__main__":
    bert, config, vocab_file = create_hub_bert(args.pretrained_bert, pretrained=False)
    args.vocab_file = vocab_file
    model = BertForSequenceClassification(config, num_labels=2, bert=bert)
    mrpc_dataset = MRPCDataset(args)
    model.load_state_dict(mge.load(args.load_model_path))
    mrpc_dataset = MRPCDataset(args)
    eval_dataloader, eval_size = mrpc_dataset.get_eval_dataloader()
    eval(eval_dataloader, model)
Example #4
def train(args, train_dataset, tokenizer):
    """ Train the model """
    # Load the pretrained model
    nn.load_parameters(args.pretrained_model)
    # Drop final layer for task-specific fine-tuning
    nn.parameter.pop_parameter('affine_seq_class/affine/W')
    nn.parameter.pop_parameter('affine_seq_class/affine/b')

    train_dataloader = data_iterator(
        train_dataset, batch_size=args.train_batch_size)

    global_step = 0
    train_loss = 0.0
    model = BertForSequenceClassification()

    input_ids = nn.Variable((args.train_batch_size, args.max_seq_length))
    attention_mask = nn.Variable((args.train_batch_size, args.max_seq_length))
    token_type_ids = nn.Variable((args.train_batch_size, args.max_seq_length))
    labels = nn.Variable((args.train_batch_size, ))

    input_ids_eval = nn.Variable((args.eval_batch_size, args.max_seq_length))
    attention_mask_eval = nn.Variable(
        (args.eval_batch_size, args.max_seq_length))
    token_type_ids_eval = nn.Variable(
        (args.eval_batch_size, args.max_seq_length))
    labels_eval = nn.Variable((args.eval_batch_size, ))

    activation = F.gelu
    if args.activation == 'relu':
        activation = F.relu
    loss, _, train_error = model(args, input_ids=input_ids, attention_mask=attention_mask,
                                 token_type_ids=token_type_ids, labels=labels,
                                 num_labels=args.num_labels, vocab_size=args.vocab_size,
                                 num_embed_dim=args.num_embed_dim,
                                 num_pos_ids=args.num_position_ids,
                                 num_attention_layers=args.num_attention_layers,
                                 num_attention_embed_dim=args.num_attention_embed_dim,
                                 num_attention_heads=args.num_attention_heads,
                                 num_attention_dim_feedforward=args.num_attention_dim_feedforward,
                                 attention_activation=activation, pool_outmap=args.num_pool_outmap,
                                 embed_dropout_prob=args.embed_dropout,
                                 attention_dropout_prob=args.attention_dropout,
                                 dropout_prob=args.last_dropout, test=False)

    loss.persistent = True
    if args.solver == 'Adam':
        solver = S.Adam(args.learning_rate, eps=args.adam_epsilon)
    else:
        solver = S.AdamW(args.learning_rate, eps=args.adam_epsilon)
    solver.set_parameters(nn.get_parameters())

    monitor = Monitor(args.output_dir)
    monitor_loss = MonitorSeries(
        "Training Loss", monitor, interval=10)
    monitor_eloss = MonitorSeries(
        "Evaluation Loss", monitor, interval=10)
    monitor_train_error = MonitorSeries(
        "Training Error Rate", monitor, interval=10)
    monitor_lr = MonitorSeries(
        "Learning Rate", monitor, interval=10)

    total_steps = train_dataloader.size // args.train_batch_size
    var_linear = total_steps * args.num_train_epochs
    var_warmup = total_steps * (args.num_train_epochs - 1)
    for epoch in range(args.num_train_epochs):
        logger.info("Starting Epoch %d out of %d",
                    epoch+1, args.num_train_epochs)
        for it in range(total_steps):
            batch = train_dataloader.next()
            input_ids.d = batch[0]
            attention_mask.d = batch[1]
            token_type_ids.d = batch[2]
            labels.d = batch[3]

            learning_rate_linear = lr_linear(global_step, var_linear)
            learning_rate = args.learning_rate * learning_rate_linear

            if epoch == 0:
                learning_rate = args.learning_rate * (global_step/total_steps)
            if epoch > 0:
                learning_rate_linear = lr_linear(
                    (global_step-total_steps), var_warmup)
                learning_rate = args.learning_rate * learning_rate_linear

            solver.zero_grad()
            nn.forward_all([loss, train_error], clear_no_need_grad=True)
            loss.backward(clear_buffer=True)
            solver.weight_decay(args.weight_decay)
            solver.clip_grad_by_norm(args.max_grad_norm)
            solver.set_learning_rate(learning_rate)
            solver.update()

            monitor_loss.add(
                (train_dataloader.size//args.train_batch_size)*epoch+it,
                loss.d.copy())
            monitor_train_error.add(
                (train_dataloader.size//args.train_batch_size)*epoch+it,
                train_error.d.copy())
            monitor_lr.add(global_step, learning_rate)
            global_step += 1
            train_loss += F.mean(loss.data)

        eval_task_names = (
            "mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
        eval_outputs_dirs = (args.output_dir, args.output_dir +
                             '-MM') if args.task_name == "mnli" else (args.output_dir,)

        results = {}
        for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
            print(eval_task)
            eval_dataset = BERTDataSource(
                args, tokenizer, evaluate=True, shuffle=False)
            if not os.path.exists(eval_output_dir):
                os.makedirs(eval_output_dir)

            eval_dataloader = data_iterator(
                eval_dataset, batch_size=args.eval_batch_size)
            total_eval_steps = eval_dataloader.size // args.eval_batch_size
            eval_loss = 0.0
            nb_eval_steps = 0
            preds = None
            out_label_ids = None
            tmp_eval_loss, logits, eval_error = model(args, input_ids=input_ids_eval,
                                                      attention_mask=attention_mask_eval,
                                                      token_type_ids=token_type_ids_eval, labels=labels_eval,
                                                      num_labels=args.num_labels, vocab_size=args.vocab_size,
                                                      num_embed_dim=args.num_embed_dim,
                                                      num_pos_ids=args.num_position_ids,
                                                      num_attention_layers=args.num_attention_layers,
                                                      num_attention_embed_dim=args.num_attention_embed_dim,
                                                      num_attention_heads=args.num_attention_heads,
                                                      num_attention_dim_feedforward=args.num_attention_dim_feedforward,
                                                      attention_activation=activation, pool_outmap=args.num_pool_outmap,
                                                      embed_dropout_prob=args.embed_dropout,
                                                      attention_dropout_prob=args.attention_dropout,
                                                      dropout_prob=args.last_dropout, test=True)

            tmp_eval_loss.persistent = True
            eval_loss += F.mean(tmp_eval_loss)
            for it in range(total_eval_steps):
                print(it, "  ", total_eval_steps)
                batch_eval = eval_dataloader.next()
                input_ids_eval.d = batch_eval[0]
                attention_mask_eval.d = batch_eval[1]
                token_type_ids_eval.d = batch_eval[2]
                labels_eval.d = batch_eval[3]
                nb_eval_steps += 1
                eval_loss.forward()
                monitor_eloss.add(it, eval_loss.d.copy())

                if preds is None:
                    preds = logits.d.copy()
                    out_label_ids = labels_eval.d.copy()
                else:
                    preds = np.append(preds, logits.d.copy(), axis=0)

                    out_label_ids = np.append(
                        out_label_ids, labels_eval.d.copy(), axis=0)
            eval_loss = eval_loss.d / nb_eval_steps
            if args.output_mode == "classification":
                preds = np.argmax(preds, axis=1)
            elif args.output_mode == "regression":
                preds = np.squeeze(preds)

            result = compute_metrics(eval_task, preds, out_label_ids)
            results.update(result)

            output_eval_file = os.path.join(
                eval_output_dir, "eval_results.txt")
            with open(output_eval_file, "a") as writer:
                logger.info("***** Evaluation results *****")
                for key in sorted(result.keys()):
                    logger.info("%d  %s = %s\n", epoch +
                                1, key, str(result[key]))
                    writer.write("%d %s = %s\n" %
                                 (epoch+1, key, str(result[key])))
                print("results", results)
    return results
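The lr_linear helper is defined outside this snippet; from the way it is used above, a multiplicative factor applied to args.learning_rate that shrinks as global_step approaches a fixed step budget, a minimal sketch could be the following (the original schedule may differ in detail):

def lr_linear(current_step, max_steps):
    # Assumed behaviour: linearly decay the learning-rate factor from 1.0 to 0.0
    # over max_steps updates, never letting it go negative.
    return max(0.0, 1.0 - float(current_step) / float(max_steps))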
Example #5
checkpoint = "./output/tacred-large/checkpoint-12000/"
pretrained_model_name = "bert-large-cased"
do_lower = ("-uncased" in pretrained_model_name)
input_file = "/home/jiaming/datasets/TACRED/data/tsv_cased/test.tsv"
# output_eval_file = "./eval/tac_res.txt"
output_eval_file = "./eval/tac_res_large.txt"
batch_size = 16
"""
Start eval
"""
additional_special_tokens = ["[E11]", "[E12]", "[E21]", "[E22]"]
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name,
    do_lower_case=do_lower,
    additional_special_tokens=additional_special_tokens)
model = BertForSequenceClassification.from_pretrained(checkpoint)
model.to(device)

eval_dataset = load_examples(input_file, tokenizer)
eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(eval_dataset,
                             sampler=eval_sampler,
                             batch_size=batch_size)

eval_loss = 0.0
nb_eval_steps = 0
pred_logits = None
out_label_ids = None
input_ids = None
Example #6
# imports assumed for the names used below (pathlib, PyYAML, PyTorch, Catalyst utilities)
from pathlib import Path

import torch
import yaml
from catalyst.dl import SupervisedRunner
from catalyst.utils import prepare_cudnn, set_global_seed

from data import read_data
from model import BertForSequenceClassification
from utils import get_project_root

# loading config params
project_root: Path = get_project_root()
with open(str(project_root / "config.yml")) as f:
    params = yaml.load(f, Loader=yaml.FullLoader)

# read and process data
train_val_loaders, test_loaders = read_data(params)

# initialize the model
model = BertForSequenceClassification(
    pretrained_model_name=params["model"]["model_name"],
    num_classes=params["model"]["num_classes"],
)

# specify criterion for the multi-class classification task, optimizer and scheduler
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),
                             lr=float(params["training"]["learn_rate"]))
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)

# reproducibility
set_global_seed(params["general"]["seed"])
prepare_cudnn(deterministic=True)

# tell the runner which batch keys to pass to the model's forward method
# (the token ids under "features" plus the attention masks)
runner = SupervisedRunner(input_key=("features", "attention_mask"))
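The snippet stops after constructing the runner; a hedged continuation, assuming the Catalyst SupervisedRunner API this example targets, hands the objects created above to runner.train (logdir and num_epochs are placeholders, not values from the original config):

runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    loaders=train_val_loaders,
    logdir="./logdir",  # placeholder output directory
    num_epochs=3,       # placeholder epoch count
    verbose=True,
)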
Example #7
def main(i):
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default='Cross-Modal-BERT-master/data/text', type=str,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default='Cross-Modal-BERT-master/pre-trained BERT', type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name", default='Multi', type=str,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default='Cross-Modal-BERT-master/CM-BERT_output', type=str,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=50, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", default=True,
                        help="Whether to run training.'store_true'")
    parser.add_argument("--do_test", default=True,
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", default=True,
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=24, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=24, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--test_batch_size", default=24, type=int,
                        help="Total batch size for test.")
    parser.add_argument("--learning_rate", default=2e-5, type=float,
                        help="The initial learning rate for Adam.5e-5")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed', type=int, default=11111,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    args = parser.parse_args()
    processors = {
        "multi": PgProcessor,
    }

    num_labels_task = {
        "multi": 1,
    }

    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = 2
    logger.info("device: {} n_gpu: {}".format(device, n_gpu))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    seed_num = np.random.randint(1,10000)
    random.seed(seed_num)
    np.random.seed(seed_num)
    torch.manual_seed(seed_num)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed_num)

    if not args.do_train and not args.do_test:
        raise ValueError("At least one of `do_train` or `do_test` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()
    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    
    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format("-1"))
    ##############################################################################################################
    model = BertForSequenceClassification.from_pretrained(args.bert_model, cache_dir=cache_dir, num_labels = num_labels)
    # Freeze the embedding parameters; keep every encoder layer, the pooler
    # and the BertFinetun fusion module trainable
    for name, param in model.named_parameters():
        param.requires_grad = False
        if any("encoder.layer.{}".format(i) in name for i in range(12)):
            param.requires_grad = True
        if "BertFinetun" in name or "pooler" in name:
            param.requires_grad = True
    ##############################################################################################################
    model.to(device)

    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    new_decay = ['BertFine']

    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and not any(np in n for np in new_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
        {'params':[p for n, p in param_optimizer if not any(nd in n for nd in no_decay )and any(np in n for np in new_decay)],'lr':0.01}
    ]

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    train_audio, valid_audio, test_audio = pickle.load(
        open('Cross-Modal-BERT-master/data/audio/MOSI_cmu_audio_CLS.pickle', 'rb'))
    max_acc = 0
    min_loss = 100
    if args.do_train:
        train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_train_audio = torch.tensor(train_audio, dtype=torch.float32)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float32)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_train_audio, all_label_ids)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        ## Evaluate on the dev set after each epoch
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer)
        all_valid_audio = torch.tensor(valid_audio, dtype=torch.float32,requires_grad=True)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float32)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,all_valid_audio,all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            model.train()
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, all_train_audio, label_ids = batch
                loss = model(input_ids, all_train_audio,segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()
                
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0

            for input_ids, input_mask, segment_ids,all_valid_audio,label_ids in tqdm(eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)
                all_valid_audio = all_valid_audio.to(device)
                with torch.no_grad():
                    tmp_eval_loss = model(input_ids, all_valid_audio,segment_ids, input_mask,label_ids)
                    logits,_,_ = model(input_ids,all_valid_audio, segment_ids, input_mask)

                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                tmp_eval_accuracy = accuracy(logits, label_ids)

                eval_loss += tmp_eval_loss.mean().item()
                eval_accuracy += tmp_eval_accuracy

                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1

            eval_loss = eval_loss / nb_eval_steps
            eval_accuracy = eval_accuracy / nb_eval_examples
            loss = tr_loss/nb_tr_steps if args.do_train else None
            result = {'eval_loss': eval_loss,
                      'eval_accuracy': eval_accuracy,
                      'global_step': global_step,
                      'loss': loss}
            
            output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

            # Save a trained model and the associated configuration
            if eval_loss < min_loss:
                min_loss = eval_loss
                model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
                torch.save(model_to_save.state_dict(), output_model_file)
                output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
                with open(output_config_file, 'w') as f:
                    f.write(model_to_save.config.to_json_string())

    if args.do_test:
        ## Evaluate on the held-out test set
        test_examples = processor.get_test_examples(args.data_dir)
        test_features = convert_examples_to_features(test_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("")
        logger.info("***** Running test *****")
        logger.info("  Num examples = %d", len(test_examples))
        logger.info("  Batch size = %d", args.test_batch_size)
        all_test_audio = torch.tensor(test_audio, dtype=torch.float32,requires_grad=True)
        all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.float32)
        test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,all_test_audio)
        # Run prediction for full data
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.test_batch_size)
        model = BertForSequenceClassification.from_pretrained(args.bert_model, cache_dir=cache_dir, num_labels = num_labels)
        model.load_state_dict(torch.load('Cross-Modal-BERT-master/CM-BERT_output/pytorch_model.bin'))
        model.to(device)
        model.eval()
        test_loss, test_accuracy = 0, 0
        nb_test_steps, nb_test_examples = 0, 0
        predict_list = []
        truth_list = []
        text_attention_list = []
        fusion_attention_list = []
        with torch.no_grad():
            for input_ids, input_mask, segment_ids, label_ids, all_test_audio in tqdm(test_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)
                all_test_audio = all_test_audio.to(device)

                with torch.no_grad():
                    tmp_test_loss = model(input_ids, all_test_audio,segment_ids, input_mask, label_ids)
                    logits,text_attention,fusion_attention = model(input_ids, all_test_audio,segment_ids, input_mask)
                
                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                text_attention = text_attention.cpu().numpy()
                fusion_attention = fusion_attention.cpu().numpy()
                test_loss += tmp_test_loss.mean().item()

                for i in range(len(logits)):
                    predict_list.append(logits[i])
                    truth_list.append(label_ids[i])
                    text_attention_list.append(text_attention[i])
                    fusion_attention_list.append(fusion_attention[i])
                nb_test_examples += input_ids.size(0)
                nb_test_steps += 1
        
        exclude_zero = False
        non_zeros = np.array([i for i, e in enumerate(predict_list) if e != 0 or (not exclude_zero)])
        predict_list = np.array(predict_list).reshape(-1)
        truth_list = np.array(truth_list)
        predict_list1 = (predict_list[non_zeros] > 0)
        truth_list1 = (truth_list[non_zeros] > 0)
        test_loss = test_loss / nb_test_steps
        test_preds_a7 = np.clip(predict_list, a_min=-3., a_max=3.)
        test_truth_a7 = np.clip(truth_list, a_min=-3., a_max=3.)
        acc7 = accuracy_7(test_preds_a7,test_truth_a7)
        f_score = f1_score(predict_list1, truth_list1, average='weighted')
        acc = accuracy_score(truth_list1, predict_list1)
        corr = np.corrcoef(predict_list, truth_list)[0][1]
        mae = np.mean(np.absolute(predict_list - truth_list))
        loss = tr_loss/nb_tr_steps if args.do_train else None
        results = {'test_loss': test_loss,
                  'global_step': global_step,
                  'loss': loss,
                  'acc':acc,
                  'F1':f_score,
                  'mae':mae,
                  'corr':corr,
                  'acc7':acc7}
        logger.info("***** test results *****")
        for key in sorted(results.keys()):
            logger.info("  %s = %s", key, str(results[key]))
        return results
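accuracy_7 is defined elsewhere in this project; since predictions and labels are clipped to [-3, 3] before it is called, a plausible sketch of the usual 7-class MOSI accuracy is given below (the original implementation may round differently):

import numpy as np

def accuracy_7(preds, truths):
    # Assumed behaviour: round the clipped regression outputs to the nearest of the
    # seven sentiment classes (-3 ... 3) and report exact-match accuracy.
    return np.sum(np.round(preds) == np.round(truths)) / float(len(truths))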
Example #8
        optimizer.step()
        sum_loss += loss.mean().item()
        sum_accuracy += accuracy(logits, label_ids)
        total_examples += batch_size
        total_steps += 1

    result = {
        "train_loss": sum_loss / total_steps,
        "train_accuracy": sum_accuracy / total_examples,
    }

    logger.info("***** Train results *****")
    for key in sorted(result.keys()):
        logger.info("%s = %s", key, str(result[key]))


if __name__ == "__main__":
    bert, config, vocab_file = create_hub_bert(args.pretrained_bert, pretrained=True)
    args.vocab_file = vocab_file
    model = BertForSequenceClassification(config, num_labels=2, bert=bert)
    mrpc_dataset = MRPCDataset(args)
    optimizer = optim.Adam(model.parameters(requires_grad=True), lr=args.learning_rate,)
    mrpc_dataset = MRPCDataset(args)
    train_dataloader, train_size = mrpc_dataset.get_train_dataloader()
    eval_dataloader, eval_size = mrpc_dataset.get_eval_dataloader()
    for epoch in range(args.num_train_epochs):
        logger.info("***** Epoch {} *****".format(epoch + 1))
        train(train_dataloader, model, optimizer)
        mge.save(model.state_dict(), args.save_model_path)
        eval(eval_dataloader, model)
Example #9
from transformers import AdamW, BertConfig
from torch.autograd import Variable
from transformers import get_linear_schedule_with_warmup
import time
import datetime
import os
import os.path
import random
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torch

from dataloader_training import dataloader
from model import BertForSequenceClassification  # project-local model wrapper
train_dataloader = dataloader()
device = torch.device("cuda")
model = BertForSequenceClassification(3)
model.cuda()
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 10
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)
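# Note: get_linear_schedule_with_warmup expects scheduler.step() once per optimizer
# update (right after optimizer.step()), so total_steps above counts optimizer
# updates across all epochs, i.e. len(train_dataloader) * epochs.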


def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))


def save_checkpoint(state, is_best, filename='./checkpoint_4.pth.tar'):
Example #10
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
stop = set(stopwords.words('english'))
from keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from model import BertForSequenceClassification
import torch
device = torch.device("cuda")
model = BertForSequenceClassification(3)
model.cuda()
import json
import os
import os.path
import math
import torch.nn.functional as F
import numpy as np
import pandas as pd

# string=r"#$%&'!()*+,-.:;<=>?@[\]^_\"`{|}~"

# def remove_URL(text):
#     return re.sub(r"https?://\S+|www\.\S+", "", text)

# def remove_html(text):
#     html=re.compile(r'<.*?>')
#     return html.sub(r'',text)

# def remove_punct(text):
Example #11
def main():
    parser = ArgumentParser(
        description="BERT for relation extraction (classification)")
    parser.add_argument('--config', dest='config')
    args = parser.parse_args()
    config = Config(args.config)

    if os.path.exists(config.output_dir) and os.listdir(
            config.output_dir
    ) and config.train and not config.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(config.output_dir))

    # Setup CUDA, GPU & distributed training
    if config.local_rank == -1 or config.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not config.no_cuda else "cpu")
        config.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(config.local_rank)
        device = torch.device("cuda", config.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        config.n_gpu = 1
    config.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
        config.local_rank, device, config.n_gpu, bool(config.local_rank != -1))

    # Set seed
    set_seed(config.seed)

    # Prepare GLUE task
    processor = data_processors["semeval"]()
    output_mode = output_modes["semeval"]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if config.local_rank not in [-1, 0]:
        torch.distributed.barrier()
    # Make sure only the first process in distributed training will download model & vocab
    bertconfig = BertConfig.from_pretrained(config.pretrained_model_name,
                                            num_labels=num_labels,
                                            finetuning_task=config.task_name)
    # './large-uncased-model', num_labels=num_labels, finetuning_task=config.task_name)
    bertconfig.l2_reg_lambda = config.l2_reg_lambda
    bertconfig.latent_entity_typing = config.latent_entity_typing
    if config.l2_reg_lambda > 0:
        logger.info("using L2 regularization with lambda  %.5f",
                    config.l2_reg_lambda)
    if config.latent_entity_typing:
        logger.info("adding the component of latent entity typing: %s",
                    str(config.latent_entity_typing))
    tokenizer = BertTokenizer.from_pretrained(
        'bert-base-uncased',
        do_lower_case=True,
        additional_special_tokens=additional_special_tokens)
    # 'bert-large-uncased', do_lower_case=True, additional_special_tokens=additional_special_tokens)
    model = BertForSequenceClassification.from_pretrained(
        config.pretrained_model_name, config=bertconfig)
    # './large-uncased-model', config=bertconfig)

    if config.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(config.device)

    # logger.info("Training/evaluation parameters %s", config)

    # Training
    if config.train:
        train_dataset = load_and_cache_examples(config,
                                                config.task_name,
                                                tokenizer,
                                                evaluate=False)
        global_step, tr_loss = train(config, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if config.train and (config.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(config.output_dir) and config.local_rank in [
                -1, 0
        ]:
            os.makedirs(config.output_dir)

        logger.info("Saving model checkpoint to %s", config.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(config.output_dir)
        tokenizer.save_pretrained(config.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(config,
                   os.path.join(config.output_dir, 'training_config.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = BertForSequenceClassification.from_pretrained(
            config.output_dir)
        tokenizer = BertTokenizer.from_pretrained(
            config.output_dir,
            do_lower_case=True,
            additional_special_tokens=additional_special_tokens)
        model.to(config.device)

    # Evaluation
    results = {}
    if config.eval and config.local_rank in [-1, 0]:
        tokenizer = BertTokenizer.from_pretrained(
            config.output_dir,
            do_lower_case=True,
            additional_special_tokens=additional_special_tokens)
        checkpoints = [config.output_dir]
        if config.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(config.output_dir + '/**/' + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                '-')[-1] if len(checkpoints) > 1 else ""
            model = BertForSequenceClassification.from_pretrained(checkpoint)
            model.to(config.device)
            result = evaluate(config, model, tokenizer, prefix=global_step)
            result = dict(
                (k + '_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)

    return results
Example #12
        total_steps += 1

    result = {
        "train_loss": sum_loss / total_steps,
        "train_accuracy": sum_accuracy / total_examples,
    }

    logger.info("***** Train results *****")
    for key in sorted(result.keys()):
        logger.info("%s = %s", key, str(result[key]))


if __name__ == "__main__":
    bert, config, vocab_file = create_hub_bert(args.pretrained_bert,
                                               pretrained=True)
    args.vocab_file = vocab_file
    model = BertForSequenceClassification(config, num_labels=2, bert=bert)
    mrpc_dataset = MRPCDataset(args)
    optimizer = optim.Adam(
        model.parameters(requires_grad=True),
        lr=args.learning_rate,
    )
    mrpc_dataset = MRPCDataset(args)
    train_dataloader, train_size = mrpc_dataset.get_train_dataloader()
    eval_dataloader, eval_size = mrpc_dataset.get_eval_dataloader()
    for epoch in range(args.num_train_epochs):
        logger.info("***** Epoch {} *****".format(epoch + 1))
        train(train_dataloader, model, optimizer)
        mge.save(model.state_dict(), args.save_model_path)
        eval(eval_dataloader, model)
Example #13
import numpy as np
import time
from model import BertForSequenceClassification
from dataloader_training import dataloader
import torch
import math
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
import os
import os.path
validation_dataloader = dataloader()
device = torch.device("cuda")
model = BertForSequenceClassification(3)
model.cuda()


def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


resume_weights = './checkpoint_4.pth.tar'
if os.path.isfile(resume_weights):
    if device:
        checkpoint = torch.load(resume_weights)

    start_epoch = checkpoint['epoch']
    best_accuracy = checkpoint['best_accuracy']
    model.load_state_dict(checkpoint['state_dict'])
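The keys read above ('epoch', 'best_accuracy', 'state_dict') imply a matching save side; a hedged sketch of the dictionary a helper like save_checkpoint from Example #9 would receive (variable names here are hypothetical):

state = {
    'epoch': epoch + 1,              # hypothetical: epoch counter at save time
    'best_accuracy': best_accuracy,  # hypothetical: best validation accuracy so far
    'state_dict': model.state_dict(),
}
torch.save(state, './checkpoint_4.pth.tar')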