def main(params):
    # build dataset
    train_data = pd.read_csv('./data/train_final.csv')
    tokenizer = get_tokenizer('spacy', language='en')

    if params.emb_type == "GloVe":
        # GloVe embedding with default options (name='840B', dim=300)
        embedding = GloVe(name=params.emb_data, dim=params.emb_dim)
    elif params.emb_type == "CharNGram":
        embedding = CharNGram()
    elif params.emb_type == "FastText":
        # torchtext's FastText takes a language code rather than name/dim; its
        # pretrained vectors are 300-dimensional, so params.emb_dim should be 300 here
        embedding = FastText(language='en')
    else:
        print("Wrong embedding type")
        exit()

    # hold out the first 1000 rows for validation
    train_data, val_data = train_data[1000:], train_data[:1000]
    train_dataset = SentimentDataset(train_data, tokenizer, embedding)
    val_dataset = SentimentDataset(val_data, tokenizer, embedding)
    train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

    model = SentimentClassificationModel(params.emb_dim, params.hidden_dim, params.dropout).to(device)
    crit = nn.CrossEntropyLoss().to(device)
    optim = torch.optim.Adam(params=model.parameters(), lr=1e-3)

    best_val_acc = 0
    early_stop_cnt = 0
    epoch = 0
    train_loss_list = []
    train_acc_list = []
    val_acc_list = []

    # early stopping: the counter resets whenever validation accuracy improves,
    # and training stops once it reaches 5
    while early_stop_cnt != 5:
        loss_list, train_acc = train.trainer(epoch, model, train_dataloader, crit, optim, device)
        val_acc = train.eval(epoch, model, val_dataloader, device, False)

        if val_acc > best_val_acc and epoch > 0:
            torch.save(model.state_dict(), './model/lstm_best.pt')
            best_val_acc = val_acc
            early_stop_cnt = 0

        early_stop_cnt += 1
        epoch += 1

        train_loss_list.extend(loss_list)
        train_acc_list.append(train_acc)
        val_acc_list.append(val_acc)

    print("Early stopping condition satisfied")

    plotting("train_loss", "steps", "loss", train_loss_list)
    plotting("train_accuracy", "epoch", "accuracy", train_acc_list)
    plotting('validation_accuracy', "epoch", "accuracy", val_acc_list)
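# A minimal sketch of how main() might be invoked; the flag names below are
# assumptions inferred from the attributes read off `params` above, and
# `batch_size`/`device` are assumed to be module-level globals in this file.
import argparse

def get_params():
    parser = argparse.ArgumentParser()
    parser.add_argument('--emb_type', type=str, default='GloVe',
                        choices=['GloVe', 'CharNGram', 'FastText'])
    parser.add_argument('--emb_data', type=str, default='840B')
    parser.add_argument('--emb_dim', type=int, default=300)
    parser.add_argument('--hidden_dim', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.3)
    return parser.parse_args()

# main(get_params())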
def decode(self, dataset):
    encoded = list()
    for i in range(len(dataset)):
        item = dataset.getitem(i)
        encoding = list()
        for word in item[1]:
            # map each index back to its word; unknown indices become 'NAN'
            encoding.append(self.enc2vocab.get(word, 'NAN'))
        encoded.append(list([item[0], ' '.join(encoding).strip()]))
    return SentimentDataset(data=encoded, data_from_file=False)
def encode(self, dataset):
    encoded = list()
    for i in range(len(dataset)):
        item = dataset.getitem(i)
        encoding = list()
        for word in item[1].split(' '):
            # out-of-vocabulary words map to the reserved index max_vocab + 2
            encoding.append(self.vocab2enc.get(word, self.max_vocab + 2))
        encoded.append(list([item[0], encoding]))
    return SentimentDataset(data=encoded, data_from_file=False)
def pad(self, dataset):
    # find the longest sequence to determine the padding length
    for i in range(len(dataset)):
        item = dataset.getitem(i)
        if len(item[1]) > self.max_len:
            self.max_len = len(item[1])

    padded_data = list()
    for i in range(len(dataset)):
        item = dataset.getitem(i)
        # list.extend() returns None, so pad in place first and then append the pair
        item[1].extend([0 for _ in range(self.max_len - len(item[1]))])
        padded_data.append([item[0], item[1]])
    return SentimentDataset(data=padded_data, data_from_file=False)
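# Hypothetical walk-through of encode() followed by pad() on a toy record,
# assuming vocab2enc = {'good': 3, 'movie': 4}, max_vocab = 5000, and a longest
# sequence of length 5 in the dataset (values are illustrative only):
#
#   ['1', 'good movie unknownword']
#       -> encode -> ['1', [3, 4, 5002]]          # OOV words map to max_vocab + 2
#       -> pad    -> ['1', [3, 4, 5002, 0, 0]]    # zero-padded to the longest length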
def test(params):
    tokenizer = get_tokenizer('spacy', language='en')
    embedding = GloVe(name=params.emb_data, dim=params.emb_dim)

    test_data = pd.read_csv('./data/eval_final_open.csv')
    test_dataset = SentimentDataset(test_data, tokenizer, embedding, False)
    test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

    # restore the best checkpoint saved during training
    model = SentimentClassificationModel(params.emb_dim, params.hidden_dim, 0.3).to(device)
    model.load_state_dict(torch.load('./model/lstm_best.pt'))

    inference = {'Id': [i for i in range(len(test_data))]}
    inference['Category'] = train.eval(0, model, test_dataloader, device, True)
    df = pd.DataFrame(inference)
    df.to_csv("./data/out.csv", index=False)
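# The submission file written above has one row per test example with the two
# columns built from the `inference` dict; the label values shown here are
# illustrative, not real predictions:
#
#   Id,Category
#   0,1
#   1,0
#   2,1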
                        help='file path for saved model')
    parser.add_argument('--prepro_save_path', type=str, default='./prepro_vocab.json',
                        help='file path for saved preprocessor')
    return parser.parse_args()


if __name__ == '__main__':
    # Get arguments
    print('Getting arguments...')
    args = get_args()

    # make a dataset
    print('Importing dataset...')
    data = SentimentDataset(data=args.train_path)

    # preprocess and save word encodings
    preprocessor = Preprocessor(max_vocab=args.max_vocab)
    data = preprocessor.fit_transform(dataset=data)
    preprocessor.save(args.prepro_save_path)

    # validation split
    data.split_data(validation_count=args.validation_count)
    train_ds, val_ds = data.to_dataset()

    # to DataLoaders
    train_set = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True)
    val_set = DataLoader(val_ds, batch_size=args.batch_size, shuffle=False)

    print('Initializing model...')
    parser = argparse.ArgumentParser()
    parser.add_argument('--test_path', type=str, default='./train.csv',
                        help='test file path')
    parser.add_argument('--max_vocab', type=int, default=5_000,
                        help='maximum vocab size')
    parser.add_argument('--model_path', type=str, default='./trained_model.pkl',
                        help='path to trained model')
    parser.add_argument('--prepro_path', type=str, default='./prepro_vocab.json',
                        help='path to fitted preprocessor')
    return parser.parse_args()


if __name__ == '__main__':
    # Get arguments
    print('Getting arguments...')
    args = get_args()

    # make a dataset
    print('Importing dataset...')
    data = SentimentDataset(data=args.test_path)

    # load the fitted preprocessor and apply its word encodings
    preprocessor = Preprocessor(max_vocab=args.max_vocab)
    preprocessor.load(args.prepro_path)
    data = preprocessor.transform(dataset=data)

    # no validation split at test time
    test_ds, _ = data.to_dataset()

    # to DataLoaders
    test_set = DataLoader(test_ds, batch_size=16, shuffle=False)

    # load saved model
    print('Loading trained model...')
import argparse

import numpy as np

from data import SentimentDataset


def get_args():
    """ Parse flags """
    parser = argparse.ArgumentParser()
    parser.add_argument('--test_path', type=str, default='./train.csv',
                        help='test file path')
    return parser.parse_args()


if __name__ == '__main__':
    # Get arguments
    print('Getting arguments...')
    args = get_args()

    # make a dataset
    print('Importing dataset...')
    data = SentimentDataset(data=args.test_path)

    # for 0/1 labels, the mean is the accuracy of always predicting the positive class
    labels = [item[0] for item in data.data]
    print(f'Baseline Accuracy: {np.round(np.mean(labels), 4) * 100}%')
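# A possible companion check (not in the original script): the majority-class
# baseline, assuming the labels are encoded as 0/1; the label list below is a
# toy example for illustration.
import numpy as np

labels = [1, 1, 0, 1, 0]                     # toy labels
p = np.mean(labels)                          # fraction of positive labels
print(f'Always-predict-1 accuracy: {p * 100:.2f}%')
print(f'Majority-class accuracy:  {max(p, 1 - p) * 100:.2f}%')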
print("loading dataset")
if opt.dataset == "imagenet32":
    train_dataset = Imagenet32Dataset(train=not opt.train_on_val, max_size=1 if opt.debug else -1)
    val_dataset = Imagenet32Dataset(train=0, max_size=1 if opt.debug else -1)
elif opt.dataset == "cifar10":
    train_dataset = CIFAR10Dataset(train=not opt.train_on_val, max_size=1 if opt.debug else -1)
    val_dataset = CIFAR10Dataset(train=0, max_size=1 if opt.debug else -1)
else:
    assert opt.dataset == "sentiment"
    train_dataset = SentimentDataset(train=not opt.train_on_val, max_size=1 if opt.debug else -1)
    val_dataset = SentimentDataset(train=0, max_size=1 if opt.debug else -1)

print("creating dataloaders")
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=opt.batch_size,
    shuffle=True,
)
val_dataloader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=opt.batch_size,
    shuffle=True,
)
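# A minimal sketch of the argparse options implied by the attributes read off
# `opt` above; the flag names follow those attributes, but the defaults are
# assumptions rather than the project's actual values.
import argparse

def get_opt():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default='cifar10',
                        choices=['imagenet32', 'cifar10', 'sentiment'])
    parser.add_argument('--train_on_val', action='store_true',
                        help='train on the validation split instead of the training split')
    parser.add_argument('--debug', action='store_true',
                        help='cap each dataset at a single example for a quick smoke test')
    parser.add_argument('--batch_size', type=int, default=64)
    return parser.parse_args()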