Code example #1
def main(params):
    # build dataset
    train_data = pd.read_csv('./data/train_final.csv')
    tokenizer = get_tokenizer('spacy', language='en')

    if params.emb_type == "GloVe":
        embedding = GloVe(
            name=params.emb_data, dim=params.emb_dim
        )  # GloVe embedding (project defaults: name='840B', dim=300)
    elif params.emb_type == "CharNGram":
        embedding = CharNGram()
    elif params.emb_type == "FastText":
        embedding = FastText(name=params.emb_data, dim=params.emb_dim)
    else:
        print("Wrong embedding type")
        exit()

    # hold out the first 1,000 rows for validation
    train_data, val_data = train_data[1000:], train_data[:1000]
    train_dataset = SentimentDataset(train_data, tokenizer, embedding)
    val_dataset = SentimentDataset(val_data, tokenizer, embedding)

    # batch_size (like device below) is presumably defined at module level,
    # outside this snippet
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True)
    val_dataloader = DataLoader(dataset=val_dataset,
                                batch_size=batch_size,
                                shuffle=False)

    model = SentimentClassificationModel(params.emb_dim, params.hidden_dim,
                                         params.dropout).to(device)
    crit = nn.CrossEntropyLoss().to(device)
    optim = torch.optim.Adam(params=model.parameters(), lr=1e-3)

    best_val_acc = 0
    early_stop_cnt = 0
    epoch = 0
    train_loss_list = []
    train_acc_list = []
    val_acc_list = []
    # run until the early-stopping counter, reset on improvement below, reaches 5
    while early_stop_cnt != 5:
        loss_list, train_acc = train.trainer(epoch, model, train_dataloader,
                                             crit, optim, device)
        val_acc = train.eval(epoch, model, val_dataloader, device, False)
        if val_acc > best_val_acc and epoch > 0:  # never save on the first epoch
            torch.save(model.state_dict(), './model/lstm_best.pt')
            best_val_acc = val_acc
            early_stop_cnt = 0

        early_stop_cnt += 1  # runs every epoch, so an improving epoch resets it to 1
        epoch += 1
        train_loss_list.extend(loss_list)
        train_acc_list.append(train_acc)
        val_acc_list.append(val_acc)

    print("Early stopping condition satisfied")
    plotting("train_loss", "steps", "loss", train_loss_list)
    plotting("train_accuracy", "epoch", "accuracy", train_acc_list)
    plotting('validation_accuracy', "epoch", "accuracy", val_acc_list)
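
None of the snippets on this page include SentimentDataset itself. A minimal hypothetical sketch of the interface example #1 appears to rely on (a torch Dataset over a pandas DataFrame; the 'text'/'label' column names and the get_vecs_by_tokens lookup are assumptions, not project code):

import torch
from torch.utils.data import Dataset

class SentimentDatasetSketch(Dataset):
    # Hypothetical stand-in for the project's SentimentDataset.
    def __init__(self, data, tokenizer, embedding, has_labels=True):
        self.data = data.reset_index(drop=True)  # DataFrame slice from main()
        self.tokenizer = tokenizer
        self.embedding = embedding
        self.has_labels = has_labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        tokens = self.tokenizer(row['text'])  # column name is an assumption
        vectors = self.embedding.get_vecs_by_tokens(tokens)  # (seq_len, emb_dim)
        if self.has_labels:
            return vectors, torch.tensor(row['label'])
        return vectors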
Code example #2
File: prepro.py Project: yuvalofek/NLP
 def decode(self, dataset):
     # Map integer encodings back to words; ids missing from enc2vocab become 'NAN'.
     decoded = []
     for i in range(len(dataset)):
         item = dataset.getitem(i)
         words = [self.enc2vocab.get(code, 'NAN') for code in item[1]]
         decoded.append([item[0], ' '.join(words).strip()])
     return SentimentDataset(data=decoded, data_from_file=False)
Code example #3
File: prepro.py Project: yuvalofek/NLP
 def encode(self, dataset):
     # Map each word to its integer id; out-of-vocabulary words get max_vocab + 2.
     encoded = []
     for i in range(len(dataset)):
         item = dataset.getitem(i)
         encoding = [self.vocab2enc.get(word, self.max_vocab + 2)
                     for word in item[1].split(' ')]
         encoded.append([item[0], encoding])
     return SentimentDataset(data=encoded, data_from_file=False)
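
Taken together, encode and decode are inverses up to out-of-vocabulary words. A standalone toy round-trip (the two-word vocabulary and ids here are illustrative, not project values):

vocab2enc = {'good': 1, 'bad': 2}
enc2vocab = {v: k for k, v in vocab2enc.items()}
max_vocab = len(vocab2enc)

sentence = 'good movie'
encoded = [vocab2enc.get(w, max_vocab + 2) for w in sentence.split(' ')]
decoded = ' '.join(enc2vocab.get(code, 'NAN') for code in encoded).strip()
print(encoded)  # [1, 4] -- 'movie' falls back to the unknown id
print(decoded)  # 'good NAN'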
Code example #4
File: prepro.py Project: yuvalofek/NLP
 def pad(self, dataset):
     # First pass: track the longest sequence seen so far.
     for i in range(len(dataset)):
         item = dataset.getitem(i)
         if len(item[1]) > self.max_len:
             self.max_len = len(item[1])
     # Second pass: right-pad every sequence with zeros up to max_len.
     padded_data = []
     for i in range(len(dataset)):
         item = dataset.getitem(i)
         padded_data.append([
             item[0],
             # concatenate instead of list.extend, which returns None and
             # would store [label, None] rather than the padded sequence
             item[1] + [0] * (self.max_len - len(item[1])),
         ])
     return SentimentDataset(data=padded_data, data_from_file=False)
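
A standalone illustration of the same padding rule on toy rows:

rows = [(1, [4, 9]), (0, [7])]
max_len = max(len(seq) for _, seq in rows)
padded = [(label, seq + [0] * (max_len - len(seq))) for label, seq in rows]
print(padded)  # [(1, [4, 9]), (0, [7, 0])]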
Code example #5
def test(params):
    tokenizer = get_tokenizer('spacy', language='en')
    embedding = GloVe(name=params.emb_data, dim=params.emb_dim)

    test_data = pd.read_csv('./data/eval_final_open.csv')
    test_dataset = SentimentDataset(test_data, tokenizer, embedding, False)
    test_dataloader = DataLoader(dataset=test_dataset,
                                 batch_size=batch_size,  # module-level, not shown
                                 shuffle=False)

    model = SentimentClassificationModel(params.emb_dim, params.hidden_dim,
                                         0.3).to(device)
    model.load_state_dict(torch.load('./model/lstm_best.pt'))

    inference = {'Id': list(range(len(test_data)))}
    inference['Category'] = train.eval(0, model, test_dataloader, device, True)

    df = pd.DataFrame(inference)
    df.to_csv("./data/out.csv", index=False)
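
One hardening note: torch.load above assumes the checkpoint's original device is available. A minimal sketch using map_location (standard torch API; the device selection here is an assumption, not project code):

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
state = torch.load('./model/lstm_best.pt', map_location=device)
# model.load_state_dict(state)  # model built exactly as in test() above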
Code example #6
File: train.py Project: yuvalofek/NLP
                        help='file path for saved model')
    parser.add_argument('--prepro_save_path',
                        type=str,
                        default='./prepro_vocab.json',
                        help='file path for saved preprocessor')
    return parser.parse_args()


if __name__ == '__main__':
    # Get arguments
    print('Getting arguments...')
    args = get_args()

    # make a dataset
    print('Importing dataset...')
    data = SentimentDataset(data=args.train_path)

    # preprocess and save word encodings
    preprocessor = Preprocessor(max_vocab=args.max_vocab)
    data = preprocessor.fit_transform(dataset=data)
    preprocessor.save(args.prepro_save_path)

    # validation split
    data.split_data(validation_count=args.validation_count)
    train_ds, val_ds = data.to_dataset()

    # to DataLoaders
    train_set = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True)
    val_set = DataLoader(val_ds, batch_size=args.batch_size, shuffle=False)

    print('Initializing model...')
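
fit_transform is not shown in these snippets; judging from encode (example #3) and pad (example #4), its fit step builds a frequency-capped vocabulary. A standalone sketch under that assumption (ids start at 1 to keep 0 free for padding; everything here is inferred, not project code):

from collections import Counter

def build_vocab(sentences, max_vocab):
    # Keep the max_vocab most frequent words and assign integer ids from 1.
    counts = Counter(w for s in sentences for w in s.split(' '))
    vocab2enc = {w: i + 1 for i, (w, _) in enumerate(counts.most_common(max_vocab))}
    enc2vocab = {i: w for w, i in vocab2enc.items()}
    return vocab2enc, enc2vocab

vocab2enc, enc2vocab = build_vocab(['good movie', 'bad movie'], max_vocab=5_000)
print(vocab2enc)  # {'movie': 1, 'good': 2, 'bad': 3}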
Code example #7
    parser = argparse.ArgumentParser()
    parser.add_argument('--test_path', type=str, default='./train.csv', help='test file path')
    parser.add_argument('--max_vocab', type=int, default=5_000, help='maximum vocab size')
    parser.add_argument('--model_path', type=str, default='./trained_model.pkl', help='path to trained model')
    parser.add_argument('--prepro_path', type=str, default='./prepro_vocab.json', help='path to fit preprocessor')
    return parser.parse_args()


if __name__ == '__main__':
    # Get arguments
    print('Getting arguments...')
    args = get_args()

    # make a dataset
    print('Importing dataset...')
    data = SentimentDataset(data=args.test_path)

    # load the saved word encodings and preprocess
    preprocessor = Preprocessor(max_vocab=args.max_vocab)
    preprocessor.load(args.prepro_path)  # restore the vocabulary saved by train.py
    data = preprocessor.transform(dataset=data)

    # no validation split at test time; keep only the test portion
    test_ds, _ = data.to_dataset()

    # to DataLoaders
    test_set = DataLoader(test_ds, batch_size=16, shuffle=False)

    # load saved model
    print('Loading trained model...')
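
The snippet cuts off here. Given the .pkl default of --model_path, a plausible, purely hypothetical continuation is a pickle load:

import pickle

with open(args.model_path, 'rb') as f:  # './trained_model.pkl' by default
    model = pickle.load(f)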
Code example #8
File: baseline.py Project: yuvalofek/NLP
import argparse
import numpy as np

from data import SentimentDataset


def get_args():
    """
    Parse flags
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--test_path', type=str, default='./train.csv', help='test file path')
    return parser.parse_args()


if __name__ == '__main__':
    # Get arguments
    print('Getting arguments...')
    args = get_args()

    # make a dataset
    print('Importing dataset...')
    data = SentimentDataset(data=args.test_path)

    # For 0/1 labels, the mean is the accuracy of always predicting class 1.
    labels = [item[0] for item in data.data]
    print(f'Baseline Accuracy: {np.round(np.mean(labels), 4) * 100}%')
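
A slightly stronger reference point is the majority-class baseline; a standalone toy sketch (the labels here are illustrative only):

import numpy as np

labels = np.array([1, 1, 0, 1])
p = labels.mean()  # accuracy of always predicting class 1
print(f'Always-1 accuracy: {p:.2%}')              # 75.00%
print(f'Majority accuracy: {max(p, 1 - p):.2%}')  # 75.00%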
Code example #9
    # pdb.set_trace()  # debugger breakpoint; enable only when debugging
    print("loading dataset")
    if opt.dataset == "imagenet32":
        train_dataset = Imagenet32Dataset(train=not opt.train_on_val,
                                          max_size=1 if opt.debug else -1)
        val_dataset = Imagenet32Dataset(train=False,
                                        max_size=1 if opt.debug else -1)
    elif opt.dataset == "cifar10":
        train_dataset = CIFAR10Dataset(train=not opt.train_on_val,
                                       max_size=1 if opt.debug else -1)
        val_dataset = CIFAR10Dataset(train=False, max_size=1 if opt.debug else -1)
    else:
        assert opt.dataset == "sentiment", f"unknown dataset: {opt.dataset}"
        train_dataset = SentimentDataset(train=not opt.train_on_val,
                                         max_size=1 if opt.debug else -1)
        val_dataset = SentimentDataset(train=False,
                                       max_size=1 if opt.debug else -1)

    print("creating dataloaders")
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=opt.batch_size,
        shuffle=True,
    )
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=opt.batch_size,
        shuffle=True,  # shuffling the validation set does not change its metrics
    )
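
As a design note, the if/elif chain above can collapse into a table-driven dispatch; a sketch reusing the snippet's own class names (not project code, and assuming the same constructor signatures):

DATASETS = {
    'imagenet32': Imagenet32Dataset,
    'cifar10': CIFAR10Dataset,
    'sentiment': SentimentDataset,
}
cls = DATASETS[opt.dataset]  # raises KeyError on an unknown dataset name
max_size = 1 if opt.debug else -1
train_dataset = cls(train=not opt.train_on_val, max_size=max_size)
val_dataset = cls(train=False, max_size=max_size)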