Esempio n. 1
0
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
TEXT, LABEL, train, valid, test, train_iter, valid_iter, test_iter = load_file(filepath='data/',
                                                                               device=device)

INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = SentimentModel(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

optimizer = torch.optim.SGD(model.parameters(), lr=3e-3)

criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

def binary_accuracy(preds, y):
    
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    
    return acc

def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
def predict():
    # kfold type of data input
    data = pd.read_csv(config.TEST_FILE)
    data['Label_encoded'] = 0
    data['Sentiment_encoded'] = 0
    df_test = data

    test_data = CommentData(comments=df_test['Comment'],
                            labels=df_test['Label_encoded'],
                            sentiments=df_test['Sentiment_encoded'])

    test_dataloader = torch.utils.data.DataLoader(
        test_data,
        batch_size=config.TEST_BATCH_SIZE,
        # num_workers = 4
    )

    # model
    device = torch.device('cuda')

    model_config = BertConfig.from_pretrained(config.BERT_PATH)
    model_config.output_hidden_states = True

    model0 = SentimentModel(model_config, config.OUTPUT_SIZE_SENTIMENT)
    model0.to(device)
    # model0 = nn.DataParallel(model0)
    model0.load_state_dict(torch.load(config.SAVED_MODEL_PATH +
                                      '/model_0.bin'))
    model0.eval()

    model1 = SentimentModel(model_config, config.OUTPUT_SIZE_SENTIMENT)
    model1.to(device)
    # model1 = nn.DataParallel(model1)
    model1.load_state_dict(torch.load(config.SAVED_MODEL_PATH +
                                      '/model_1.bin'))
    model1.eval()

    model2 = SentimentModel(model_config, config.OUTPUT_SIZE_SENTIMENT)
    model2.to(device)
    # model2 = nn.DataParallel(model2)
    model2.load_state_dict(torch.load(config.SAVED_MODEL_PATH +
                                      '/model_2.bin'))
    model2.eval()

    model3 = SentimentModel(model_config, config.OUTPUT_SIZE_SENTIMENT)
    model3.to(device)
    # model3 = nn.DataParallel(model3)
    model3.load_state_dict(torch.load(config.SAVED_MODEL_PATH +
                                      '/model_3.bin'))
    model3.eval()

    model4 = SentimentModel(model_config, config.OUTPUT_SIZE_SENTIMENT)
    model4.to(device)
    # model4 = nn.DataParallel(model4)
    model4.load_state_dict(torch.load(config.SAVED_MODEL_PATH +
                                      '/model_4.bin'))
    model4.eval()

    # process raw output
    model_prediction = []

    with torch.no_grad():
        tq0 = tqdm(test_dataloader, total=len(test_dataloader))
        for bi, data in tqdm(enumerate(tq0)):
            # load data / ready to input
            input_ids = data['input_ids']
            token_type_ids = data['token_type_ids']
            attention_mask = data['attention_mask']

            label = data['label']
            sentiment = data['sentiment']

            # prepare input data
            input_ids = input_ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            attention_mask = attention_mask.to(device, dtype=torch.long)

            label = label.to(device, dtype=torch.long)
            sentiment = sentiment.to(device, dtype=torch.long)

            # forward(self, ids, mask, type_ids)

            out0 = model0(ids=input_ids,
                          mask=attention_mask,
                          type_ids=token_type_ids)

            out1 = model1(ids=input_ids,
                          mask=attention_mask,
                          type_ids=token_type_ids)

            out2 = model2(ids=input_ids,
                          mask=attention_mask,
                          type_ids=token_type_ids)

            out3 = model3(ids=input_ids,
                          mask=attention_mask,
                          type_ids=token_type_ids)

            out4 = model4(ids=input_ids,
                          mask=attention_mask,
                          type_ids=token_type_ids)

            out = (out0 + out1 + out2 + out3 + out4) / 5
            out = torch.softmax(out, dim=1).cpu().detach().numpy()

            for ix, result in enumerate(out):
                pred = np.argmax(result)
                model_prediction.append(pred)

    sample = pd.read_csv(config.TEST_FILE)
    sample['sentiment_pred'] = model_prediction
    sample.to_csv(config.OUTPUT_PATH + '/pred_sentiment.csv', index=False)
Esempio n. 3
0
def run(fold=0):
    # kfold type of data input
    data = pd.read_csv(config.TRAIN_FOLDS_FILE)
    df_train = data[data['kfold'] != fold].reset_index(drop=True)
    df_valid = data[data['kfold'] == fold].reset_index(drop=True)

    train_data = CommentData(comments=df_train['Comment'],
                             labels=df_train['Label_encoded'],
                             sentiments=df_train['Sentiment_encoded'])

    train_dataloader = torch.utils.data.DataLoader(
        train_data,
        batch_size=config.TRAIN_BATCH_SIZE,
        # num_workers = 4
    )

    valid_data = CommentData(comments=df_valid['Comment'],
                             labels=df_valid['Label_encoded'],
                             sentiments=df_valid['Sentiment_encoded'])

    valid_dataloader = torch.utils.data.DataLoader(
        valid_data,
        batch_size=config.VALID_BATCH_SIZE,
        # num_workers = 4
    )

    device = torch.device('cuda')

    model_config = RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model_config.output_hidden_states = True

    model = SentimentModel(model_config, config.OUTPUT_SIZE)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.001
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)

    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    # train_fn(data_loader, model, device, optimizer, scheduler=None)
    train_loss_rec = []
    eval_loss_rec = []

    early_stopping = utils.EarlyStopping(patience=5, mode='min')

    for epoch in range(config.EPOCHS):
        print(f'########### fold = {fold} epoch = {epoch} ############')
        loss_train = engine.train_fn(data_loader=train_dataloader,
                                     model=model,
                                     device=device,
                                     optimizer=optimizer,
                                     scheduler=scheduler)

        train_loss_rec.append(loss_train)

        losses_eval = engine.eval_fn(valid_dataloader, model, device)
        eval_loss_rec.append(losses_eval)

        print(f'train_loss = {loss_train}  eval_loss = {losses_eval}')
        # print(f'save model_{fold}.bin')
        # torch.save(model.state_dict(), config.OUTPUT_PATH + f'/model_{fold}.bin')
        early_stopping(losses_eval,
                       model,
                       model_path=config.OUTPUT_PATH +
                       f'/model_label_{fold}.bin')
        if early_stopping.early_stop:
            print('Early stopping')
            break
Esempio n. 4
0
def preprocess_and_train():
    # read dataset
    data = pd.read_csv('./training.1600000.processed.noemoticon.csv', encoding='latin-1', header=None)
    data.columns = ('target','uid', 'time', 'query', 'user', 'text')

    # create new dataframe
    sent_df = pd.DataFrame(None, columns=('target', 'text'))
    sent_df['target'] = data['target']
    sent_df['text'] = data['text'].apply(preprocess_text)
    sent_df['tweet_size'] = data['text'].apply(lambda x:len(x.split()))

    # select random sample of 400,000 tweets from total dataset (training on a smaller dataset)
    sent_df_sample = sent_df[(sent_df['tweet_size']>10) & (sent_df['target']==0)].sample(n=200000, random_state=SentConfig.SEED)
    sent_df_sample = sent_df_sample.append(sent_df[(sent_df['tweet_size']>10) & (sent_df['target']==4)].sample(n=200000, random_state=SentConfig.SEED))

    # split dataset into train, test, validation set
    train, test = train_test_split(sent_df_sample, test_size=0.1)
    train, val = train_test_split(train, test_size=0.05)

    # create necessary dataloaders, for advantage of batching by pytorch
    train_dl = SentimentDL(train)
    val_dl = SentimentDL(val)
    test_dl = SentimentDL(test)

    train_loader = DataLoader(train_dl, batch_size=SentConfig.TRAIN_BATCH_SIZE, shuffle=True)
    validation_loader = DataLoader(val_dl, batch_size=SentConfig.VALID_BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dl, batch_size=SentConfig.VALID_BATCH_SIZE, shuffle=True)

    # select the cuda device if available
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # create model object
    model = SentimentModel()
    model.to(device)

    # ready with optimizer and scheduler objects 

    # do not apply weight decay in AdamW  to, bias layer and normalization terms
    no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias']  # taken from https://huggingface.co/transformers/training.html 
    # more named parameteres in model.named_parameters()
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    # optim = AdamW(model.parameters(), lr=5e-5)
    optim = AdamW(optimizer_grouped_parameters, lr=5e-5)

    # learning rate scheduling
    num_train_steps = int((train_dl.__len__()/SentConfig.TRAIN_BATCH_SIZE)*SentConfig.EPOCHS)
    num_warmup_steps = int(0.05*num_train_steps)
    scheduler = get_cosine_schedule_with_warmup(optim, num_warmup_steps, num_train_steps)

    # Training : done on the basis of attaining better F1 score on the validation dataset
    
    scores = []
    min_f1 = 0
    
    for epoch in range(SentConfig.EPOCHS):
        _ = train_function(train_loader, model, optim, scheduler, device)
        _, results = evaluation_function(validation_loader, model, device)
        
        validation_f1 = round(f1_score(results[:,1], results[:,0]),4)
        accuracy = round(accuracy_score(results[:,1], results[:,0]), 4)
        
        scores.append((validation_f1, accuracy))
        print('epoch num: ', epoch, 'f1 score: ',validation_f1 , 'accuracy: ', accuracy)
        if validation_f1 > min_f1:
            # save  model if validation f1 score is 
            torch.save(model.state_dict(), "SentimentModel.bin")
            # update max loss
            min_f1 =  validation_f1

    # plotting scores

    scores = np.array(scores)
    fig, ax = plt.subplots(1, 2, figsize=(14,6))
    ax[0].plot(range(SentConfig.EPOCHS), scores[:,0], 'r')
    ax[1].plot(range(SentConfig.EPOCHS), scores[:,1])
    ax[0].set(xlabel='Epoch num', ylabel='F1 Score')
    ax[1].set(xlabel='Epoch num', ylabel='Accuracy')
    ax[0].set_title('validation set f1 score at each epoch')
    ax[1].set_title('validation set accuracy at each apoch')

    # F1 score calculation on test predictions

    state_dict_ = torch.load('SentimentModel.bin')
    model = SentimentModel()
    model.load_state_dict(state_dict_)
    model.to(device)
    
    _, results = evaluation_function(test_loader, model, device, inference=True)
    print(classification_report(results[:,1], results[:,0]))
    print(round(accuracy_score(results[:,1], results[:,0]),4))