def main():
    parser = argparse.ArgumentParser()
    parser = add_xlmr_args(parser)
    parser.add_argument('--predict_file', type=str, default='')
    parser.add_argument('--out_file', type=str, default='')

    args = parser.parse_args()

    data_processor = DataProcessor(task=args.task_name)
    label_list = data_processor.get_labels()
    num_labels = len(label_list) + 1  # add one for IGNORE label

    model_cls = XLMRForTokenClassification

    hidden_size = 768 if 'base' in args.pretrained_path else 1024  # TODO: move this inside model.__init__
    device = 'cuda' if (torch.cuda.is_available()
                        and not args.no_cuda) else 'cpu'

    # creating model
    model = model_cls(pretrained_path=args.pretrained_path,
                      n_labels=num_labels,
                      hidden_size=hidden_size,
                      dropout_p=args.dropout,
                      device=device)

    # load best/ saved model
    state_dict = torch.load(args.load_model)
    model.load_state_dict(state_dict)
    logger.info("Loaded saved model")

    model.to(device)

    pred_examples = data_processor.get_pred_examples(args.predict_file)
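    # convert examples to feature tensors; note that the max sequence length is
    # hardcoded to 320 here rather than read from args.max_seq_length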
    pred_features = data_processor.convert_examples_to_features(
        pred_examples, label_list, 320, model.encode_word)

    pred_data = create_ner_dataset(pred_features)
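    # pred=True makes evaluate_model also return the gold and predicted label sequences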
    f1_score, report, y_true, y_pred = evaluate_model(model,
                                                      pred_data,
                                                      label_list,
                                                      args.eval_batch_size,
                                                      args.use_crf,
                                                      device,
                                                      pred=True)

    logger.info("\n%s", report)
    output_pred_file = args.out_file
    with open(output_pred_file, "w") as writer:
        for ex, pred in zip(pred_examples, y_pred):
            writer.write("Ex text: {}\n".format(ex.text))
            writer.write("Ex labels: {}\n".format(ex.label))
            writer.write("Ex preds: {}\n".format(pred))

            writer.write("*******************************\n")
Example #2
    #fig, (ax0, ax1, ax2) = plt.subplots(nrows=3, sharex=True)
    #ax0.errorbar(y, np.mean(f1, axis=1), yerr=get_ass_inclination(f1), fmt='-o')
    #ax0.set_title('f1, symmetric error')

    #ax1.errorbar(y, np.mean(precision, axis=1), yerr=get_ass_inclination(precision), fmt='-o')
    #ax1.set_title('Precision, symmetric error')

    #ax2.errorbar(y, np.mean(auc, axis=1), yerr=get_ass_inclination(auc), fmt='-o')
    #ax2.set_title('AUC, symmetric error')

    #plt.savefig('foo.png')


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser = add_xlmr_args(parser)

    parser.add_argument(
        "--divider",
        default=0.1,
        type=float,
        required=False,
        help="Training set will be divided into multiples of the given "
        "divider until it reaches a 90/10/10 split")
    parser.add_argument("--reps",
                        default=10,
                        type=int,
                        required=True,
                        help="Repetitions per division")
    args = parser.parse_args()
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser = add_xlmr_args(parser)
    parser.add_argument('--self_training', action='store_true', default=False,
                        help='Enable self-training on unlabeled data')
    parser.add_argument('--unlabeled_data_dir',
                        type=str,
                        default='data/unlabeled_data')
    parser.add_argument('--self_training_confidence', type=float, default=0.9)
    parser.add_argument('--K', type=int, default=50,
                        help='Confident samples to add per self-training round')
    parser.add_argument('--patience', type=int, default=10,
                        help='Epochs without validation F1 gain before stopping')

    args = parser.parse_args()

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
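    # (this is the per-forward-pass batch size; the effective batch size is
    # rebuilt through gradient accumulation)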

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    data_processor = SequenceLabelingProcessor(task=args.task_name)
    label_list = data_processor.get_labels()
    num_labels = len(label_list) + 1  # add one for IGNORE label

    train_examples = None
    num_train_optimization_steps = 0

    if args.do_train:
        train_examples = data_processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs

    # preparing model configs
    hidden_size = 768 if 'base' in args.pretrained_path else 1024  # TODO: move this inside model.__init__

    device = 'cuda' if (torch.cuda.is_available()
                        and not args.no_cuda) else 'cpu'

    if args.use_crf:
        model_cls = XLMRForTokenClassificationWithCRF
    else:
        model_cls = XLMRForTokenClassification

    # creating model
    model = model_cls(pretrained_path=args.pretrained_path,
                      n_labels=num_labels,
                      hidden_size=hidden_size,
                      dropout_p=args.dropout,
                      device=device)

    model.to(device)

    if args.load_model is not None:
        logging.info("Loading saved model {}".format(args.load_model))
        state_dict = torch.load(args.load_model)
        model.load_state_dict(state_dict, strict=True)

    no_decay = ['bias', 'final_layer_norm.weight']

    params = list(model.named_parameters())

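    # two parameter groups: weight decay applies to everything except biases
    # and layer-norm weights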
    optimizer_grouped_parameters = [
        {'params': [p for n, p in params if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in params if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    warmup_steps = int(args.warmup_proportion * num_train_optimization_steps)
    # create the optimizer and schedule here so they exist before the optional
    # fp16 wrapping below
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=warmup_steps,
                                     t_total=num_train_optimization_steps)

    # freeze model if necessary
    if args.freeze_model:
        logger.info("Freezing XLM-R model...")
        for n, p in model.named_parameters():
            if 'xlmr' in n and p.requires_grad:
                p.requires_grad = False

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

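    # label ids are 1-based; id 0 is reserved for the IGNORE label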
    label_map = {i: label for i, label in enumerate(label_list, 1)}
    if args.do_train:
        train_features = data_processor.convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, model.encode_word)

        if args.self_training:
            self_training_examples = data_processor.get_unlabeled_examples(
                args.unlabeled_data_dir)
            self_training_features = data_processor.convert_examples_to_features(
                self_training_examples, label_list, args.max_seq_length,
                model.encode_word)

            logging.info("Loaded {} Unlabeled examples".format(
                len(self_training_examples)))

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        train_data = create_ner_dataset(train_features)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        val_examples = data_processor.get_dev_examples(args.data_dir)
        val_features = data_processor.convert_examples_to_features(
            val_examples, label_list, args.max_seq_length, model.encode_word)

        val_data = create_ner_dataset(val_features)
        best_val_f1 = 0.0

        ############################# Self Training Loop ######################
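        # Train on the labeled set, then repeatedly pseudo-label the K most
        # confident unlabeled samples, add them to the training set, and retrain
        # until patience runs out or the unlabeled pool is exhausted.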
        n_iter = 0
        patience = 0
        while True:

            ############################ Inner Training Loop #####################

            n_iter += 1

            logger.info("Batches per epoch: %d", len(train_dataloader))
            for epoch_ in tqdm(range(args.num_train_epochs),
                               desc="Epoch",
                               disable=args.no_pbar):

                tr_loss = 0
                tbar = tqdm(train_dataloader,
                            desc="Iteration",
                            disable=args.no_pbar)

                model.train()
                for step, batch in enumerate(tbar):
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, label_ids, l_mask, valid_ids = batch
                    loss, _ = model(input_ids,
                                    label_ids,
                                    l_mask,
                                    valid_ids,
                                    get_sent_repr=True)

                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps

                    tr_loss += loss.item()
                    if args.fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), args.max_grad_norm)
                    else:
                        # gradients accumulate across micro-steps and are cleared
                        # by model.zero_grad() after each optimizer step
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       args.max_grad_norm)

                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        optimizer.step()
                        scheduler.step()  # Update learning rate schedule
                        model.zero_grad()

                    tbar.set_description('Loss = %.4f' % (tr_loss /
                                                          (step + 1)))
                logger.info("Evaluating on validation set...\n")
                #torch.save(model.state_dict(), open(os.path.join(args.output_dir, 'model.pt'), 'wb'))
                f1, report = evaluate_model_seq_labeling(
                    model, val_data, label_list, args.eval_batch_size,
                    args.use_crf, device)
                if f1 > best_val_f1:
                    best_val_f1 = f1
                    logger.info(
                        "\nFound better f1=%.4f on validation set. Saving model\n"
                        % (f1))
                    logger.info("\n%s\n" % (report))

                    torch.save(model.state_dict(),
                               os.path.join(args.output_dir, 'model.pt'))
                    patience = 0

                else:
                    logger.info("\nNo better F1 score: {}\n".format(f1))
                    patience += 1

            ######################################################################
            if not args.self_training:
                break
            if patience >= args.patience:
                logger.info("No more patience. Existing")
                break
            ## select confident pseudo-labels and grow the training set

            if not self_training_features:  # no more self-training data
                break

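            # the helper returns the K most confident samples plus the shrunken
            # unlabeled pool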
            confident_features, self_training_features = get_top_confidence_samples_seq_labeling(
                model,
                self_training_features,
                batch_size=args.eval_batch_size,
                K=args.K)

            # decode the pseudo-labels of the selected samples for debugging
            for f in confident_features:
                logging.debug("Pseudo-labels: %s",
                              [label_map[i] for i in f.label_id])
            logging.info("Got %d confident samples" %
                         (len(confident_features)))
            # append new features
            #train_features = data_processor.convert_examples_to_features(
            #         train_examples, label_list, args.max_seq_length, model.encode_word)

            train_features.extend(confident_features)

            print("now we have %d total examples" % len(train_features))

            train_data = create_ner_dataset(train_features)
            train_sampler = RandomSampler(train_data)
            train_dataloader = DataLoader(train_data,
                                          sampler=train_sampler,
                                          batch_size=args.train_batch_size)

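            # reset the learning rate and restart the warmup schedule for the
            # next self-training round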
            for g in optimizer.param_groups:
                g['lr'] = args.learning_rate

            scheduler.step(0)

            #print("Loading best last model...")
            #model.load_state_dict(torch.load(open(os.path.join(args.output_dir, 'model.pt'), 'rb')))

    # load best/ saved model
    state_dict = torch.load(os.path.join(args.output_dir, 'model.pt'))
    model.load_state_dict(state_dict)
    logger.info("Loaded saved model")

    model.to(device)

    if args.do_eval:
        if args.eval_on == "dev":
            eval_examples = data_processor.get_dev_examples(args.data_dir)
        elif args.eval_on == "test":
            eval_examples = data_processor.get_test_examples(args.data_dir)
        else:
            raise ValueError("eval on dev or test set only")
        eval_features = data_processor.convert_examples_to_features(
            eval_examples, label_list, args.max_seq_length, model.encode_word)

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)

        eval_data = create_ner_dataset(eval_features)
        f1_score, report = evaluate_model_seq_labeling(model, eval_data,
                                                       label_list,
                                                       args.eval_batch_size,
                                                       args.use_crf, device)

        logger.info("\n%s", report)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        logger.info("dataset = {}".format(args.data_dir))
        logger.info("model = {}".format(args.output_dir))
        with open(output_eval_file, "w") as writer:
            logger.info("***** Writing results to file *****")
            writer.write(report)
            logger.info("Done.")
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser = add_xlmr_args(parser)

    args = parser.parse_args()

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    processor = NerProcessor()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1  # add one for IGNORE label

    train_examples = None
    num_train_optimization_steps = 0

    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs

    # preparing model configs
    hidden_size = 768 if 'base' in args.pretrained_path else 1024  # TODO: move this inside model.__init__

    device = 'cuda' if (torch.cuda.is_available()
                        and not args.no_cuda) else 'cpu'

    # creating model
    model = XLMRForTokenClassification(pretrained_path=args.pretrained_path,
                                       n_labels=num_labels,
                                       hidden_size=hidden_size,
                                       dropout_p=0.2,
                                       device=device)
    # NOTE: dropout is hardcoded to 0.2 here instead of using args.dropout
    model.to(device)
    no_decay = ['bias', 'final_layer_norm.weight']

    params = list(model.named_parameters())

    optimizer_grouped_parameters = [
        {'params': [p for n, p in params if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in params if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]

    warmup_steps = int(args.warmup_proportion * num_train_optimization_steps)
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=warmup_steps,
                                     t_total=num_train_optimization_steps)

    # freeze model if necessary
    if args.freeze_model:
        logger.info("Freezing XLM-R model...")
        for n, p in model.named_parameters():
            if 'xlmr' in n and p.requires_grad:
                p.requires_grad = False

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    label_map = {i: label for i, label in enumerate(label_list, 1)}
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      model.encode_word)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        train_data = create_dataset(train_features)

        train_sampler = RandomSampler(train_data)

        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        # getting validation samples
        val_examples = processor.get_dev_examples(args.data_dir)
        val_features = convert_examples_to_features(val_examples, label_list,
                                                    args.max_seq_length,
                                                    model.encode_word)

        val_data = create_dataset(val_features)

        best_val_f1 = 0.0

        for _ in tqdm(range(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0

            tbar = tqdm(train_dataloader, desc="Iteration")

            model.train()
            for step, batch in enumerate(tbar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, label_ids, l_mask, valid_ids = batch
                loss = model(input_ids, label_ids, l_mask, valid_ids)

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                tbar.set_description('Loss = %.4f' % (tr_loss / (step + 1)))

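                # step the optimizer only once per gradient_accumulation_steps batches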
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

            logger.info("\nTesting on validation set...")
            f1, report = evaluate_model(model, val_data, label_list,
                                        args.eval_batch_size, device)
            if f1 > best_val_f1:
                best_val_f1 = f1
                logger.info(
                    "\nFound better f1=%.4f on validation set. Saving model\n"
                    % (f1))
                logger.info("%s\n" % (report))

                torch.save(model.state_dict(),
                           os.path.join(args.output_dir, 'model.pt'))

            else:
                logger.info("\nNo better F1 score: {}\n".format(f1))
    else:  # load a saved model
        state_dict = torch.load(os.path.join(args.output_dir, 'model.pt'))
        model.load_state_dict(state_dict)
        logger.info("Loaded saved model")

    model.to(device)

    if args.do_eval:
        if args.eval_on == "dev":
            eval_examples = processor.get_dev_examples(args.data_dir)
        elif args.eval_on == "test":
            eval_examples = processor.get_test_examples(args.data_dir)
        else:
            raise ValueError("eval on dev or test set only")
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     model.encode_word)

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)

        eval_data = create_dataset(eval_features)
        f1_score, report = evaluate_model(model, eval_data, label_list,
                                          args.eval_batch_size, device)

        logger.info("\n%s", report)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Writing results to file *****")
            writer.write(report)
            logger.info("Done.")
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser = add_xlmr_args(parser)

    args = parser.parse_args()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    processor = en_fr_processor()

    train_examples = processor.get_train_examples(args.data_dir)

    # preparing model configs
    hidden_size = 768 if 'base' in args.pretrained_path else 1024  # TODO: move this inside model.__init__

    device = 'cuda' if (torch.cuda.is_available()
                        and not args.no_cuda) else 'cpu'

    # creating model
    model = XLMR_Encoder_Decoder(pretrained_path=args.pretrained_path,
                                 hidden_size=hidden_size,
                                 dropout_p=args.dropout,
                                 device=device)

    model.encoder.to(device)
    model.decoder.to(device)

    # named_parameters() returns a generator, so materialize both before joining
    params = list(model.encoder.named_parameters()) + \
        list(model.decoder.named_parameters())

    optimizer_grouped_parameters = [{'params': [p for n, p in params]}]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    # warmup_steps and t_total appear to be placeholder values here
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=1, t_total=1)

    train_features = convert_examples_to_features(train_examples,
                                                  args.max_seq_length,
                                                  model.encoder.encode_word)

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    #logger.info("  Num steps = %d", num_train_optimization_steps)

    train_data = create_dataset(train_features)

    train_sampler = RandomSampler(train_data)

    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    for _ in tqdm(range(args.num_train_epochs), desc="Epoch"):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        tbar = tqdm(train_dataloader, desc="Iteration")

        model.encoder.train()
        for step, batch in enumerate(tbar):
            batch = tuple(t.to(device) for t in batch)
            src_tensor, target_tensor = batch
            enc_out = model.encoder(src_tensor)
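            # NOTE: the decoder forward pass and loss.backward() are missing from
            # this snippet, so the clip/step below has no gradients to apply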
            torch.nn.utils.clip_grad_norm_(model.encoder.parameters(),
                                           args.max_grad_norm)

            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.encoder.zero_grad()

    model.encoder.to(device)