Code Example #1
import logging
import time

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import BertModel, BertTokenizer

# Project-local helpers assumed to be defined elsewhere in this project:
# RE_Dataset, BertClassifier, batch_label_to_idx, loss_to_int, evaluation

def main(paras):

    logger = logging.getLogger(__name__)
    if paras.save_log_file:
        logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                            datefmt='%m/%d/%Y %H:%M:%S',
                            level=paras.logging_level,
                            filename=f'{paras.log_save_path}/{paras.train_log_file}',
                            filemode='w')
    else:
        logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                            datefmt='%m/%d/%Y %H:%M:%S',
                            level=paras.logging_level, )

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    logger.info(f'Loading model: {paras.model_name}')
    tokenizer = BertTokenizer.from_pretrained(paras.model_name)
    bert = BertModel.from_pretrained(paras.model_name)


    train_dataset = RE_Dataset(paras, 'train')
    train_dataloader = DataLoader(train_dataset, batch_size=paras.batch_size,
                                  shuffle=paras.shuffle, drop_last=paras.drop_last)
    label_to_index = train_dataset.label_to_index
    special_token_list = list(train_dataset.special_token_set)
    # register the dataset's special tokens with the tokenizer and resize the
    # BERT embedding matrix so the newly added token ids are valid
    special_tokens_dict = {'additional_special_tokens': special_token_list}
    tokenizer.add_special_tokens(special_tokens_dict)
    bert.resize_token_embeddings(len(tokenizer))

    test_dataset = RE_Dataset(paras, 'test')
    test_dataloader = DataLoader(test_dataset, batch_size=paras.batch_size,
                                 shuffle=paras.shuffle, drop_last=paras.drop_last)

    bert_classifier = BertClassifier(bert, paras.hidden_size, paras.label_number,
                                     paras.dropout_prob).to(device)

    if paras.optimizer == 'adam':
        logger.info('Loading Adam optimizer.')
        optimizer = torch.optim.Adam(bert_classifier.parameters(), lr=paras.learning_rate)
    elif paras.optimizer == 'adamw':
        logger.info('Loading AdamW optimizer.')
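        # apply weight decay to all parameters except biases and LayerNorm weights,
        # following the usual BERT fine-tuning convention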
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in bert_classifier.named_parameters()
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in bert_classifier.named_parameters()
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=paras.learning_rate,
                          eps=paras.adam_epsilon)
    else:
        logger.warning(f'optimizer must be "adam" or "adamw", but got "{paras.optimizer}"; falling back to Adam.')
        logger.info('Loading Adam optimizer.')
        optimizer = torch.optim.Adam(bert_classifier.parameters(),
                                     lr=paras.learning_rate)


    logger.info('Training Start.')
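    # track the best test-set metrics seen so far; 'loss' == 0 doubles as a flag
    # for "no checkpoint saved yet" in the comparison below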
    best_eval = {'acc': 0, 'precision': 0, 'recall': 0, 'f1': 0, 'loss': 0}
    for epoch in range(paras.num_train_epochs):
        epoch_loss = 0
        bert_classifier.train()
        for step, batch in enumerate(train_dataloader):
            optimizer.zero_grad()

            batch_data, batch_label = batch

            encoded_data = tokenizer(batch_data,
                                     padding=True,
                                     truncation=True,
                                     return_tensors='pt',
                                     max_length=paras.max_sequence_length).to(device)

            label_tensor = batch_label_to_idx(batch_label, label_to_index).to(device)

            loss = bert_classifier(encoded_data, label_tensor)

            epoch_loss += loss_to_int(loss)

            logger.info(f'epoch: {epoch}, step: {step}, loss: {loss:.4f}')

            loss.backward()
            optimizer.step()

        epoch_loss = epoch_loss / len(train_dataloader)

        acc, precision, recall, f1 = evaluation(bert_classifier, tokenizer, test_dataloader,
                                                paras.max_sequence_length, label_to_index)

        logger.info(f'Epoch: {epoch}, Epoch-Average Loss: {epoch_loss:.4f}')
        logger.info(f'Accuracy: {acc:.4f}, Precision: {precision:.4f}, '
                    f'Recall: {recall:.4f}, F1-score: {f1:.4f}')

        if best_eval['loss'] == 0 or f1 > best_eval['f1']:
            best_eval['loss'] = epoch_loss
            best_eval['acc'] = acc
            best_eval['precision'] = precision
            best_eval['recall'] = recall
            best_eval['f1'] = f1
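            # checkpoint the full model object whenever the F1-score improves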
            torch.save(bert_classifier, f'{paras.log_save_path}/{paras.model_save_name}')

            with open(f'{paras.log_save_path}/{paras.checkpoint_file}', 'w') as wf:
                wf.write(f'Save time: {time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}\n')
                wf.write(f'Best F1-score: {best_eval["f1"]:.4f}\n')
                wf.write(f'Precision: {best_eval["precision"]:.4f}\n')
                wf.write(f'Recall: {best_eval["recall"]:.4f}\n')
                wf.write(f'Accuracy: {best_eval["acc"]:.4f}\n')
                wf.write(f'Epoch-Average Loss: {best_eval["loss"]:.4f}\n')

            logger.info(f'Updated model, best F1-score: {best_eval["f1"]:.4f}\n')

    logger.info(f'Train complete, Best F1-score: {best_eval["f1"]:.4f}.')
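As a rough usage sketch (not part of the original project): main expects a paras object, e.g. an argparse.Namespace, exposing the fields read above. All values below are illustrative; RE_Dataset will additionally need whatever project-specific fields (such as data file paths) it reads internally.

import argparse
import logging

# Illustrative values only; the real project defines its own defaults and
# RE_Dataset reads further project-specific fields (e.g. data file paths).
paras = argparse.Namespace(
    save_log_file=False,
    log_save_path='./output',
    train_log_file='train.log',
    checkpoint_file='checkpoint.txt',
    model_save_name='best_model.bin',
    logging_level=logging.INFO,
    model_name='bert-base-uncased',
    batch_size=16,
    shuffle=True,
    drop_last=False,
    hidden_size=768,
    label_number=10,
    dropout_prob=0.1,
    optimizer='adamw',
    learning_rate=2e-5,
    adam_epsilon=1e-8,
    num_train_epochs=3,
    max_sequence_length=128,
)
main(paras)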
Code Example #2
File: train.py  Project: pchlq/Tech4MentalHealth
from functools import partial
from typing import List, Tuple

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, RandomSampler
from tqdm import tqdm
from transformers import AdamW, BertModel, get_linear_schedule_with_warmup

# Module-level names assumed from the surrounding train.py / project:
# config, engine, BertClassifier, MentalHealthDataset, DEVICE, SEED, VALID_SIZE, LR

def run():
    def collate_fn(
            batch: List[Tuple[torch.LongTensor, torch.LongTensor]],
            device: torch.device) -> Tuple[torch.LongTensor, torch.LongTensor]:
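        # pad each sequence to the longest in the batch, stack the labels,
        # and move both tensors onto the target device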

        x, y = list(zip(*batch))
        x = pad_sequence(x, batch_first=True, padding_value=0)
        y = torch.stack(y)
        return x.to(device), y.to(device)

    df = pd.read_csv("../inputs/Train.csv")
    # test = pd.read_csv("../inputs/Test.csv")

    train_df, val_df = train_test_split(df,
                                        stratify=df.label,
                                        test_size=VALID_SIZE,
                                        random_state=SEED)

    labels = ["Depression", "Alcohol", "Suicide", "Drugs"]
    train = pd.concat([train_df["text"],
                       pd.get_dummies(train_df['label']).reindex(columns=labels)],
                      axis=1)

    valid = pd.concat([val_df["text"],
                       pd.get_dummies(val_df['label']).reindex(columns=labels)],
                      axis=1)

    if DEVICE == 'cpu':
        print('Running on CPU')
    else:
        n_gpu = torch.cuda.device_count()
        print(f'Running on {torch.cuda.get_device_name(0)} ({n_gpu} GPU(s) available)')

    train_dataset = MentalHealthDataset(config.TOKENIZER, train, lazy=True)
    valid_dataset = MentalHealthDataset(config.TOKENIZER, valid, lazy=True)
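    # bind the device argument so the DataLoader can call collate_fn(batch) directly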
    collate_fn = partial(collate_fn, device=DEVICE)

    train_sampler = RandomSampler(train_dataset)
    valid_sampler = RandomSampler(valid_dataset)

    train_iterator = DataLoader(train_dataset,
                                batch_size=config.TRAIN_BATCH_SIZE,
                                sampler=train_sampler,
                                collate_fn=collate_fn)

    valid_iterator = DataLoader(valid_dataset,
                                batch_size=config.VALID_BATCH_SIZE,
                                sampler=valid_sampler,
                                collate_fn=collate_fn)

    # model = BertClassifier().to(DEVICE)
    model = BertClassifier(BertModel.from_pretrained(config.BERT_PATH),
                           4).to(DEVICE)

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]

    # triangular learning rate: linearly grows until half of the first epoch, then linearly decays
    warmup_steps = 10 ** 3
    total_steps = len(train_iterator) * config.EPOCHS - warmup_steps
    optimizer = AdamW(optimizer_grouped_parameters, lr=LR, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps,
                                                total_steps)
    # scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=total_steps)

    # optimizer = torch.optim.Adam(model.parameters(), lr=LR) # 1e-4)
    # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min",
    #                                         patience=5, factor=0.3, min_lr=1e-10, verbose=True)

    for epoch in range(config.EPOCHS):
        print('=' * 5, f"EPOCH {epoch}", '=' * 5)
        engine.train_fn(train_iterator, model, optimizer, scheduler)
        engine.eval_fn(valid_iterator, model)

    model.eval()
    test_df = pd.read_csv("../inputs/Test.csv")
    submission = pd.read_csv('../inputs/SampleSubmission.csv')
    res = np.zeros((submission.shape[0], len(labels)))

    # run inference over the test set in TRAIN_BATCH_SIZE-sized chunks and write
    # the predicted probabilities into the matching rows of the submission frame
    for i in tqdm(range((len(test_df) + config.TRAIN_BATCH_SIZE - 1) // config.TRAIN_BATCH_SIZE)):
        batch_df = test_df.iloc[i * config.TRAIN_BATCH_SIZE:(i + 1) *
                                config.TRAIN_BATCH_SIZE]
        assert (batch_df["ID"] == submission["ID"]
                [i * config.TRAIN_BATCH_SIZE:(i + 1) *
                 config.TRAIN_BATCH_SIZE]).all(), "ID mismatch"
        texts = []
        for text in batch_df["text"].tolist():
            text = config.TOKENIZER.encode(text, add_special_tokens=True)
            if len(text) > config.MAX_LEN:
                text = text[:config.MAX_LEN -
                            1] + [config.TOKENIZER.sep_token_id]
            texts.append(torch.LongTensor(text))
        x = pad_sequence(
            texts,
            batch_first=True,
            padding_value=config.TOKENIZER.pad_token_id).to(DEVICE)
        mask = (x != config.TOKENIZER.pad_token_id).float().to(DEVICE)

        with torch.no_grad():
            _, outputs = model(x, attention_mask=mask)
        outputs = outputs.cpu().numpy()
        submission.loc[i *
                       config.TRAIN_BATCH_SIZE:(i * config.TRAIN_BATCH_SIZE +
                                                len(outputs) - 1),
                       labels] = outputs

    submission.to_csv("../subs/submission_2.csv", index=False)
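Neither BertClassifier nor the engine module appears on this page. A minimal sketch of the model interface this script assumes, i.e. forward(x, attention_mask=...) returning a (loss, probabilities) pair over the four labels (the actual class in pchlq/Tech4MentalHealth may differ):

import torch
import torch.nn as nn


class BertClassifier(nn.Module):
    """Minimal multi-label head over a pretrained BERT encoder (illustrative only)."""

    def __init__(self, bert, num_labels: int, dropout_prob: float = 0.1):
        super().__init__()
        self.bert = bert
        self.dropout = nn.Dropout(dropout_prob)
        self.classifier = nn.Linear(bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(outputs[1])   # pooled [CLS] representation
        logits = self.classifier(pooled)
        probs = torch.sigmoid(logits)       # per-label probabilities
        loss = None
        if labels is not None:
            loss = nn.BCEWithLogitsLoss()(logits, labels.float())
        return loss, probs

The (loss, probs) return shape matches how the script unpacks the model output during inference (`_, outputs = model(x, attention_mask=mask)`); engine.train_fn is assumed to pass labels and use the returned loss.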