Exemple #1
0
def main():
    """Evaluate a saved SentimentModel checkpoint on the test set.

    Builds the vocabulary from ``Config.train_path``, wraps
    ``Config.test_path`` in a ``DataLoader``, restores the weights stored at
    ``Config.model_save_path`` and hands loader, device, model and loss to
    ``test``.
    """
    Config = config.get_args()
    set_seed(Config.seed)

    # Only the vocabulary mappings are needed here; the two length statistics
    # returned by build_word_dict are not used for evaluation.
    word2ix, ix2word, _, _ = build_word_dict(Config.train_path)

    test_data = CommentDataSet(Config.test_path, word2ix, ix2word)
    test_loader = DataLoader(
        test_data,
        batch_size=16,
        shuffle=False,
        num_workers=0,
        collate_fn=mycollate_fn,
    )

    # Placeholder embedding matrix of the right shape; presumably the real
    # values are restored from the checkpoint below -- TODO confirm.
    weight = torch.zeros(len(word2ix), Config.embedding_dim)

    model = SentimentModel(embedding_dim=Config.embedding_dim,
                           hidden_dim=Config.hidden_dim,
                           LSTM_layers=Config.LSTM_layers,
                           drop_prob=Config.drop_prob,
                           pre_weight=weight)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    criterion = nn.CrossEntropyLoss()

    # map_location lets a checkpoint that was saved on a GPU be restored on a
    # CPU-only machine, which the device fallback above explicitly allows.
    state_dict = torch.load(Config.model_save_path, map_location=device)
    model.load_state_dict(state_dict, strict=True)
    # Moving an already-placed module is a no-op, so this is safe even if
    # ``test`` also moves the model.
    model.to(device)

    confuse_meter = test(test_loader, device, model, criterion)
Exemple #2
0
def main():
    """Classify the single comment given in ``Config.comment_str``.

    Builds the vocabulary from ``Config.train_path``, restores a
    ``SentimentModel`` from ``Config.model_save_path``, runs ``predict`` on
    the comment and prints the comment together with the result.
    """
    Config = config.get_args()
    set_seed(Config.seed)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Only the word -> index mapping is needed for single-comment inference.
    word2ix, _, _, _ = build_word_dict(Config.train_path)

    # Placeholder embedding matrix of the right shape; presumably the real
    # values are restored from the checkpoint below -- TODO confirm.
    weight = torch.zeros(len(word2ix), Config.embedding_dim)
    model = SentimentModel(embedding_dim=Config.embedding_dim,
                           hidden_dim=Config.hidden_dim,
                           LSTM_layers=Config.LSTM_layers,
                           drop_prob=Config.drop_prob,
                           pre_weight=weight)

    # map_location lets a checkpoint that was saved on a GPU be restored on a
    # CPU-only machine, which the device fallback above explicitly allows.
    state_dict = torch.load(Config.model_save_path, map_location=device)
    model.load_state_dict(state_dict, strict=True)
    # Moving an already-placed module is a no-op, so this is safe even if
    # ``predict`` also moves the model.
    model.to(device)

    result = predict(Config.comment_str, model, device, word2ix)
    print(Config.comment_str, result)
Exemple #3
0
            loss = criterion(predictions, batch.label.float())
            
            acc = binary_accuracy(predictions, batch.label)
        
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            
    return epoch_loss / len(iterator),  epoch_acc / len(iterator)

import time

# Number of passes over the training data.
N_epoches = 5

# Lowest validation loss seen so far; starting at +inf means the first epoch
# always writes a checkpoint.
best_valid_loss = float('inf')

for epoch in range(N_epoches):
    epoch_start = time.monotonic()

    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iter, criterion)

    # Wall-clock duration of this epoch as whole minutes and leftover seconds;
    # these values feed the progress line printed below.
    epoch_mins, epoch_secs = divmod(int(time.monotonic() - epoch_start), 60)

    # Keep only the weights that achieve the best validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'Sentiment-model.pt')

    print(f'Epoch:  {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain  Loss: {train_loss: .3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\tValid  Loss: {valid_loss: .3f} | Valid Acc: {valid_acc*100:.2f}%')

# Evaluate the best checkpoint (not the last epoch's weights) on the test set.
model.load_state_dict(torch.load('Sentiment-model.pt'))

test_loss, test_acc = evaluate(model, test_iter, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
def predict(n_folds=5):
    """Predict sentiment for ``config.TEST_FILE`` with an ensemble of models.

    Loads ``n_folds`` checkpoints named ``model_<i>.bin`` from
    ``config.SAVED_MODEL_PATH``, averages their outputs for every batch,
    applies a softmax and takes the arg-max of each row as the predicted
    class. The test CSV is then written, with an added ``sentiment_pred``
    column, to ``config.OUTPUT_PATH + '/pred_sentiment.csv'``.

    Args:
        n_folds: Number of checkpoints to ensemble. Defaults to 5, which is
            the number of models this function previously hard-coded.

    Raises:
        ValueError: If ``n_folds`` is smaller than 1.
    """
    if n_folds < 1:
        raise ValueError(f'n_folds must be at least 1, got {n_folds}')

    # Read the test file once; ``sample`` stays untouched so it can be written
    # back out with only the prediction column added.
    sample = pd.read_csv(config.TEST_FILE)

    # CommentData is constructed with label and sentiment series; the test
    # rows get placeholder zeros for both (on a copy, not on ``sample``).
    df_test = sample.assign(Label_encoded=0, Sentiment_encoded=0)

    test_data = CommentData(comments=df_test['Comment'],
                            labels=df_test['Label_encoded'],
                            sentiments=df_test['Sentiment_encoded'])

    test_dataloader = torch.utils.data.DataLoader(
        test_data,
        batch_size=config.TEST_BATCH_SIZE,
    )

    # Fall back to the CPU when no GPU is present instead of failing outright.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model_config = BertConfig.from_pretrained(config.BERT_PATH)
    model_config.output_hidden_states = True

    # Build one model per checkpoint, all in eval mode.
    models = []
    for fold in range(n_folds):
        model = SentimentModel(model_config, config.OUTPUT_SIZE_SENTIMENT)
        model.to(device)
        # map_location makes GPU-saved checkpoints loadable on any device.
        state_dict = torch.load(
            f'{config.SAVED_MODEL_PATH}/model_{fold}.bin',
            map_location=device,
        )
        model.load_state_dict(state_dict)
        model.eval()
        models.append(model)

    model_prediction = []

    with torch.no_grad():
        # A single progress bar over the batches.
        for batch in tqdm(test_dataloader, total=len(test_dataloader)):
            input_ids = batch['input_ids'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device,
                                                        dtype=torch.long)
            attention_mask = batch['attention_mask'].to(device,
                                                        dtype=torch.long)

            # Sum the model outputs in checkpoint order, then average.
            summed = None
            for model in models:
                out = model(ids=input_ids,
                            mask=attention_mask,
                            type_ids=token_type_ids)
                summed = out if summed is None else summed + out

            averaged = summed / len(models)
            probs = torch.softmax(averaged, dim=1).cpu().numpy()

            # One predicted class index per row of the batch.
            model_prediction.extend(np.argmax(row) for row in probs)

    sample['sentiment_pred'] = model_prediction
    sample.to_csv(config.OUTPUT_PATH + '/pred_sentiment.csv', index=False)
def preprocess_and_train():
    """Train a SentimentModel on a balanced tweet sample and report metrics.

    Steps, in order:

    1. Read ``./training.1600000.processed.noemoticon.csv`` (latin-1, no
       header row) and keep the target, the preprocessed text and the word
       count of the raw text.
    2. Draw 200,000 tweets with target 0 and 200,000 with target 4, each
       restricted to tweets of more than 10 words.
    3. Split into train / validation / test sets and wrap them in loaders.
    4. Train with AdamW and a cosine schedule with warm-up, saving the weights
       to ``SentimentModel.bin`` whenever the validation F1 score improves.
    5. Plot validation F1 and accuracy per epoch.
    6. Reload the best weights and print a classification report and the
       accuracy on the test set.
    """
    # read dataset
    data = pd.read_csv('./training.1600000.processed.noemoticon.csv',
                       encoding='latin-1', header=None)
    data.columns = ('target', 'uid', 'time', 'query', 'user', 'text')

    # create new dataframe
    sent_df = pd.DataFrame(None, columns=('target', 'text'))
    sent_df['target'] = data['target']
    sent_df['text'] = data['text'].apply(preprocess_text)
    sent_df['tweet_size'] = data['text'].apply(lambda x: len(x.split()))

    # select a random sample of 400,000 tweets (200,000 per class) so that
    # training runs on a smaller, balanced dataset
    long_enough = sent_df['tweet_size'] > 10
    class_samples = [
        sent_df[long_enough & (sent_df['target'] == target)].sample(
            n=200000, random_state=SentConfig.SEED)
        for target in (0, 4)
    ]
    # DataFrame.append was removed in pandas 2.0; concat keeps the same row
    # order and original index labels.
    sent_df_sample = pd.concat(class_samples)

    # split dataset into train, test, validation set; seeded like the sampling
    # above so that the splits are reproducible
    train, test = train_test_split(sent_df_sample, test_size=0.1,
                                   random_state=SentConfig.SEED)
    train, val = train_test_split(train, test_size=0.05,
                                  random_state=SentConfig.SEED)

    # create necessary dataloaders, for advantage of batching by pytorch
    train_dl = SentimentDL(train)
    val_dl = SentimentDL(val)
    test_dl = SentimentDL(test)

    train_loader = DataLoader(train_dl, batch_size=SentConfig.TRAIN_BATCH_SIZE,
                              shuffle=True)
    # Evaluation loaders keep a fixed order so that runs are repeatable.
    validation_loader = DataLoader(val_dl,
                                   batch_size=SentConfig.VALID_BATCH_SIZE,
                                   shuffle=False)
    test_loader = DataLoader(test_dl, batch_size=SentConfig.VALID_BATCH_SIZE,
                             shuffle=False)

    # select the cuda device if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # create model object
    model = SentimentModel()
    model.to(device)

    # do not apply weight decay in AdamW to bias and normalization terms
    # (taken from https://huggingface.co/transformers/training.html)
    no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias']
    decay_params = []
    no_decay_params = []
    for name, param in model.named_parameters():
        if any(nd in name for nd in no_decay):
            no_decay_params.append(param)
        else:
            decay_params.append(param)
    optimizer_grouped_parameters = [
        {'params': decay_params, 'weight_decay': 0.01},
        {'params': no_decay_params, 'weight_decay': 0.0},
    ]

    optim = AdamW(optimizer_grouped_parameters, lr=5e-5)

    # learning rate scheduling: len(train_loader) is the number of batches per
    # epoch, including a final partial batch.
    # NOTE(review): assumes train_function steps the scheduler once per batch
    # -- confirm.
    num_train_steps = len(train_loader) * SentConfig.EPOCHS
    num_warmup_steps = int(0.05 * num_train_steps)
    scheduler = get_cosine_schedule_with_warmup(optim, num_warmup_steps,
                                                num_train_steps)

    # Training: the checkpoint is chosen by the best F1 score on the
    # validation dataset.
    scores = []
    # Start below any possible score so the first epoch always writes a
    # checkpoint; otherwise the reload below could fail or pick up a stale
    # file when F1 never rises above 0.
    best_f1 = float('-inf')

    for epoch in range(SentConfig.EPOCHS):
        train_function(train_loader, model, optim, scheduler, device)
        _, results = evaluation_function(validation_loader, model, device)

        validation_f1 = round(f1_score(results[:, 1], results[:, 0]), 4)
        accuracy = round(accuracy_score(results[:, 1], results[:, 0]), 4)

        scores.append((validation_f1, accuracy))
        print('epoch num: ', epoch, 'f1 score: ',validation_f1 , 'accuracy: ', accuracy)
        if validation_f1 > best_f1:
            # save the model whenever the validation F1 score improves
            torch.save(model.state_dict(), "SentimentModel.bin")
            best_f1 = validation_f1

    # plotting scores: column 0 is F1, column 1 is accuracy
    scores = np.array(scores)
    fig, ax = plt.subplots(1, 2, figsize=(14, 6))
    ax[0].plot(range(SentConfig.EPOCHS), scores[:, 0], 'r')
    ax[1].plot(range(SentConfig.EPOCHS), scores[:, 1])
    ax[0].set(xlabel='Epoch num', ylabel='F1 Score')
    ax[1].set(xlabel='Epoch num', ylabel='Accuracy')
    ax[0].set_title('validation set f1 score at each epoch')
    ax[1].set_title('validation set accuracy at each apoch')

    # F1 score calculation on test predictions, using the best checkpoint.
    # map_location keeps the load working if the device differs from the one
    # the checkpoint was saved on.
    state_dict_ = torch.load('SentimentModel.bin', map_location=device)
    model = SentimentModel()
    model.load_state_dict(state_dict_)
    model.to(device)

    _, results = evaluation_function(test_loader, model, device,
                                     inference=True)
    print(classification_report(results[:, 1], results[:, 0]))
    print(round(accuracy_score(results[:, 1], results[:, 0]), 4))