Code example #1
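This snippet relies on module-level settings (labels, input_size, output_size, hidden_sizes, dropout, n_chunks, learning_rate, weight_decay, model_path) and helper functions (load_np_data, download_datasets, prepare_datasets, join_transformed_images, build_model, fit_model, evaluate_model) defined elsewhere in the original script.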
def main(num_examples, epochs, from_cache):
    print('Start train process with below properties:')
    print('Number of examples: {}'.format(num_examples))
    print('Train epochs: {}'.format(epochs))
    if from_cache:
        print('Load data from stage')
        X_train = load_np_data('X_train.npy')
        y_train = load_np_data('y_train.npy')
        X_test = load_np_data('X_test.npy')
        y_test = load_np_data('y_test.npy')
    else:
        download_datasets(labels)
        X_train, X_test, y_train, y_test = prepare_datasets(
            labels, num_examples)
        print('Generate new data and join to train dataset')
        X_train, y_train = join_transformed_images(X_train, y_train)

    train = torch.from_numpy(X_train).float()
    train_labels = torch.from_numpy(y_train).long()
    test = torch.from_numpy(X_test).float()
    test_labels = torch.from_numpy(y_test).long()

    print('Build model')
    model = build_model(input_size, output_size, hidden_sizes, dropout)
    print('Start fitting')
    fit_model(model,
              train,
              train_labels,
              epochs=epochs,
              n_chunks=n_chunks,
              learning_rate=learning_rate,
              weight_decay=weight_decay)
    evaluate_model(model, train, train_labels, test, test_labels)
    metainfo = {
        'input_size': input_size,
        'output_size': output_size,
        'hidden_layers': hidden_sizes,
        'dropout': dropout,
        'state_dict': model.state_dict()
    }
    print('End fit')
    torch.save(metainfo, model_path)
    print("Model saved to {}\n".format(model_path))
Code example #2
def main():
    parser = argparse.ArgumentParser()
    parser = add_xlmr_args(parser)
    parser.add_argument('--predict_file', type=str, default='')
    parser.add_argument('--out_file', type=str, default='')

    args = parser.parse_args()

    data_processor = DataProcessor(task=args.task_name)
    label_list = data_processor.get_labels()
    num_labels = len(label_list) + 1  # add one for IGNORE label

    model_cls = XLMRForTokenClassification

    hidden_size = 768 if 'base' in args.pretrained_path else 1024  # TODO: move this inside model.__init__
    device = 'cuda' if (torch.cuda.is_available()
                        and not args.no_cuda) else 'cpu'

    # creating model
    model = model_cls(pretrained_path=args.pretrained_path,
                      n_labels=num_labels,
                      hidden_size=hidden_size,
                      dropout_p=args.dropout,
                      device=device)

    # load the saved (best) model
    state_dict = torch.load(open(args.load_model, 'rb'))
    model.load_state_dict(state_dict)
    logger.info("Loaded saved model")

    model.to(device)

    pred_examples = data_processor.get_pred_examples(args.predict_file)
    pred_features = data_processor.convert_examples_to_features(
        pred_examples, label_list, 320, model.encode_word)  # 320 = hard-coded max sequence length

    pred_data = create_ner_dataset(pred_features)
    f1_score, report, y_true, y_pred = evaluate_model(model,
                                                      pred_data,
                                                      label_list,
                                                      args.eval_batch_size,
                                                      args.use_crf,
                                                      device,
                                                      pred=True)

    logger.info("\n%s", report)
    output_pred_file = args.out_file
    with open(output_pred_file, "w") as writer:
        for ex, pred in zip(pred_examples, y_pred):
            writer.write("Ex text: {}\n".format(ex.text))
            writer.write("Ex labels: {}\n".format(ex.label))
            writer.write("Ex preds: {}\n".format(pred))

            writer.write("*******************************\n")
Code example #3
    def evaluate(self, pretrained_path, dropout, path_model, device, num_labels,
                 data_path, label_list, max_seq_length=128, squeeze=True,
                 eval_batch_size=32, model_name="XLMR"):
        hidden_size = 768 if 'base' in pretrained_path else 1024
        if model_name == 'HERBERT':
            model = AutoTokenizerForTokenClassification(
                pretrained_path=pretrained_path, n_labels=num_labels, hidden_size=hidden_size, dropout_p=dropout,
                device=device)
        elif model_name == 'BERT_MULTILINGUAL':
            model = BertBaseMultilingualCased(
                pretrained_path=pretrained_path, n_labels=num_labels, hidden_size=hidden_size, dropout_p=dropout,
                device=device)
        elif model_name == 'Reformer':
            model = Reformer(n_labels=num_labels, hidden_size=512,
                             dropout=dropout, device=device, max_seq_length=max_seq_length,
                             batch_size=eval_batch_size)  # evaluate() has no train batch size; use the eval batch size
        else:
            model = XLMRForTokenClassification(pretrained_path=pretrained_path,
                                n_labels=num_labels, hidden_size=hidden_size,
                                dropout=dropout, device=device)
        output_dir = path_model
        logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                            datefmt='%m/%d/%Y %H:%M:%S',
                            level=logging.INFO,
                            filename=os.path.join(output_dir, "log.txt"))
        logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
        logger = logging.getLogger(__name__)
        state_dict = torch.load(open(os.path.join(path_model, 'model.pt'), 'rb'))
        model.load_state_dict(state_dict)
        logger.info("Loaded saved model")

        model.to(device)
        eval_examples, _ = get_examples(data_path)  # split_train_data is not defined in evaluate(), so always load from data_path

        eval_features = convert_examples_to_features(
            eval_examples, label_list, max_seq_length, model.encode_word)
        
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", eval_batch_size)
        eval_data = create_dataset(eval_features)
        f1_score, report = evaluate_model(model, eval_data, label_list, eval_batch_size, device)

        logger.info("\n%s", report)
        output_eval_file = os.path.join(output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Writing results to file *****")
            writer.write(report)
            logger.info("Done.")
Code example #4
    def train(self, output_dir, train_batch_size, gradient_accumulation_steps, seed,
              epochs, data_path, pretrained_path, valid_path, no_cuda=False, dropout=0.3,
              weight_decay=0.01, warmup_proportion=0.1, learning_rate=5e-5, adam_epsilon=1e-8,
              max_seq_length=128, squeeze=True, max_grad_norm=1.0, eval_batch_size=32, epoch_save_model=False,
              model_name='BERT', embedding_path=None, split_train_data=False, motherfile=False):
        if os.path.exists(output_dir) and os.listdir(output_dir):
            raise ValueError("Output directory (%s) already exists and is not empty." % output_dir)
        
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO,
                        filename=os.path.join(output_dir, "log.txt"))
        logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
        logger = logging.getLogger(__name__)

        if gradient_accumulation_steps < 1:
            raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
                             .format(gradient_accumulation_steps))

        train_batch_size = train_batch_size // gradient_accumulation_steps
    
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

        if motherfile:
            print(data_path)
            train_examples, train_label_list = get_examples_from_motherfile(data_path, 'train')
            val_examples, val_label_list = get_examples_from_motherfile(data_path, 'test')
            train_label_list.extend(val_label_list)
            label_list = list(set(train_label_list))
        elif split_train_data:
            examples, label_list = get_examples(data_path, 'train')
            random.shuffle(examples)
            train_examples = examples[0:int(len(examples)*0.6)]
            val_examples = examples[int(len(examples)*0.6):int(len(examples)*0.8)]
            eval_examples = examples[int(len(examples)*0.8):]
        else:
            train_examples, label_list = get_examples(data_path, 'train')
        num_labels = len(label_list) + 1  # add one for IGNORE label
        num_train_optimization_steps = int(
            len(train_examples) / train_batch_size / gradient_accumulation_steps) * epochs
        
        hidden_size = 300 if pretrained_path is None else 768 if 'base' in pretrained_path else 1024
        device = 'cuda:0' if (torch.cuda.is_available() and not no_cuda) else 'cpu'
        logger.info(device)
        print(pretrained_path)
        if model_name == 'HERBERT':
            model = AutoTokenizerForTokenClassification(
                pretrained_path=pretrained_path, n_labels=num_labels, hidden_size=hidden_size, dropout_p=dropout,
                device=device)
        elif model_name == 'BERT_MULTILINGUAL':
            model = BertBaseMultilingualCased(
                pretrained_path=pretrained_path, n_labels=num_labels, hidden_size=hidden_size, dropout_p=dropout,
                device=device)
        elif model_name == 'Reformer':
            model = Reformer(n_labels=num_labels, hidden_size=512,
                             dropout=dropout, device=device, max_seq_length=max_seq_length,
                             batch_size=train_batch_size)
        else:
            model = XLMRForTokenClassification(pretrained_path=pretrained_path,
                                n_labels=num_labels, hidden_size=hidden_size,
                                dropout=dropout, device=device)

        model.to(device)
        no_decay = ['bias', 'final_layer_norm.weight']

        params = list(model.named_parameters())

        optimizer_grouped_parameters = [
            {'params': [p for n, p in params if not any(
                nd in n for nd in no_decay)], 'weight_decay': weight_decay},
            {'params': [p for n, p in params if any(
                nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        warmup_steps = int(warmup_proportion * num_train_optimization_steps)
        optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps)

        train_features = convert_examples_to_features(
            train_examples, label_list, max_seq_length, model.encode_word)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        train_data = create_dataset(train_features)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(
            train_data, sampler=train_sampler, batch_size=train_batch_size)
        if not split_train_data and not motherfile:
            val_examples, _ = get_examples(valid_path, 'valid')
        val_features = convert_examples_to_features(
            val_examples, label_list, max_seq_length, model.encode_word)

        val_data = create_dataset(val_features)
        
        best_val_f1 = 0.0

        for epoch_no in range(1, epochs+1):
            logger.info("Epoch %d" % epoch_no)
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            
            model.train()
            steps = len(train_dataloader)
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, label_ids, l_mask, valid_ids, = batch
                loss = model(input_ids, label_ids, l_mask, valid_ids)
                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps

                loss.backward()
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(), max_grad_norm)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if step % 5 == 0:
                    logger.info('Step = %d/%d; Loss = %.4f' % (step+1, steps, tr_loss / (step+1)))
                if (step + 1) % gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()

            logger.info("\nTesting on validation set...")
            f1, report = evaluate_model(model, val_data, label_list, eval_batch_size, device)
            print(report)
            if f1 > best_val_f1:
                best_val_f1 = f1
                logger.info("\nFound better f1=%.4f on validation set. Saving model\n" % f1)
                logger.info("%s\n" % report)
                torch.save(model.state_dict(), open(os.path.join(output_dir, 'model.pt'), 'wb'))
                save_params(output_dir, dropout, num_labels, label_list)

            if epoch_save_model:
                epoch_output_dir = os.path.join(output_dir, "e%03d" % epoch_no)
                os.makedirs(epoch_output_dir)
                torch.save(model.state_dict(), open(os.path.join(epoch_output_dir, 'model.pt'), 'wb'))
                save_params(epoch_output_dir, dropout, num_labels, label_list)
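The training loop above divides the loss by gradient_accumulation_steps and only calls optimizer.step() every gradient_accumulation_steps batches, so several small batches add up to one effective update. A self-contained sketch of that pattern on a dummy regression model (all sizes and names below are illustrative, not from the original project):

import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

accumulation_steps = 4
max_grad_norm = 1.0

model = nn.Linear(10, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
loss_fn = nn.MSELoss()

dataset = TensorDataset(torch.randn(64, 10), torch.randn(64, 1))
loader = DataLoader(dataset, batch_size=4, shuffle=True)

model.train()
for step, (x, y) in enumerate(loader):
    loss = loss_fn(model(x), y)
    # Scale the loss so the accumulated gradient matches one large batch.
    loss = loss / accumulation_steps
    loss.backward()
    if (step + 1) % accumulation_steps == 0:
        # Clip the accumulated gradient once per effective update, then step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        optimizer.zero_grad()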
Code example #5
File: main.py  Project: smutuvi/NER_XLM_Roberta
def main():
    parser = argparse.ArgumentParser()
    parser = add_xlmr_args(parser)

    args = parser.parse_args()

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    processor = NerProcessor()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1  # add one for IGNORE label

    train_examples = None
    num_train_optimization_steps = 0

    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs

    # preparing model configs
    hidden_size = 768 if 'base' in args.pretrained_path else 1024  # TODO: move this inside model.__init__

    device = 'cuda' if (torch.cuda.is_available()
                        and not args.no_cuda) else 'cpu'

    # creating model
    model = XLMRForTokenClassification(pretrained_path=args.pretrained_path,
                                       n_labels=num_labels,
                                       hidden_size=hidden_size,
                                       dropout_p=0.2,  # dropout is hard-coded to 0.2 here
                                       device=device)
    model.to(device)
    no_decay = ['bias', 'final_layer_norm.weight']

    params = list(model.named_parameters())

    optimizer_grouped_parameters = [
        {'params': [p for n, p in params if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in params if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    warmup_steps = int(args.warmup_proportion * num_train_optimization_steps)
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=warmup_steps,
                                     t_total=num_train_optimization_steps)

    # freeze model if necessary
    if args.freeze_model:
        logger.info("Freezing XLM-R model...")
        for n, p in model.named_parameters():
            if 'xlmr' in n and p.requires_grad:
                p.requires_grad = False

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    label_map = {i: label for i, label in enumerate(label_list, 1)}
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      model.encode_word)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        train_data = create_dataset(train_features)

        train_sampler = RandomSampler(train_data)

        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        # getting validation samples
        val_examples = processor.get_dev_examples(args.data_dir)
        val_features = convert_examples_to_features(val_examples, label_list,
                                                    args.max_seq_length,
                                                    model.encode_word)

        val_data = create_dataset(val_features)

        best_val_f1 = 0.0

        for _ in tqdm(range(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0

            tbar = tqdm(train_dataloader, desc="Iteration")

            model.train()
            for step, batch in enumerate(tbar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, label_ids, l_mask, valid_ids, = batch
                loss = model(input_ids, label_ids, l_mask, valid_ids)

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                tbar.set_description('Loss = %.4f' % (tr_loss / (step + 1)))

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

            logger.info("\nTesting on validation set...")
            f1, report = evaluate_model(model, val_data, label_list,
                                        args.eval_batch_size, device)
            if f1 > best_val_f1:
                best_val_f1 = f1
                logger.info(
                    "\nFound better f1=%.4f on validation set. Saving model\n"
                    % (f1))
                logger.info("%s\n" % (report))

                torch.save(
                    model.state_dict(),
                    open(os.path.join(args.output_dir, 'model.pt'), 'wb'))

            else:
                logger.info("\nNo better F1 score: {}\n".format(f1))
    else:  # load a saved model
        state_dict = torch.load(
            open(os.path.join(args.output_dir, 'model.pt'), 'rb'))
        model.load_state_dict(state_dict)
        logger.info("Loaded saved model")

    model.to(device)

    if args.do_eval:
        if args.eval_on == "dev":
            eval_examples = processor.get_dev_examples(args.data_dir)
        elif args.eval_on == "test":
            eval_examples = processor.get_test_examples(args.data_dir)
        else:
            raise ValueError("eval on dev or test set only")
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     model.encode_word)

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)

        eval_data = create_dataset(eval_features)
        f1_score, report = evaluate_model(model, eval_data, label_list,
                                          args.eval_batch_size, device)

        logger.info("\n%s", report)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Writing results to file *****")
            writer.write(report)
            logger.info("Done.")