def save_imdb_to_tsv():
    TEXT = data.Field(lower=True, include_lengths=True, batch_first=True)
    LABEL = data.Field(sequential=False, unk_token=False)
    train, val = IMDB.splits(TEXT, LABEL)
    str2label = {
        'negative': '0',
        'positive': '1',
    }
    test = val.examples
    dev = train.examples[-len(test):]
    train = train.examples[:-len(test)]

    def save_to_tsv(examples, fname):
        with open(fname, 'w') as f:
            f.write('sentence\tlabel\n')
            for e in examples:
                t, l = e.text, e.label
                t = ' '.join(t)
                l = str2label[l]
                f.write(f'{t}\t{l}\n')

    # tsv format
    tsv_dir = 'data/imdb/fine-tune'
    save_to_tsv(train, os.path.join(tsv_dir, 'train.tsv'))
    save_to_tsv(dev, os.path.join(tsv_dir, 'dev.tsv'))
    save_to_tsv(test, os.path.join(tsv_dir, 'test.tsv'))

def imdb(embedding=None):
    # make splits for data
    train, test = IMDB.splits(TEXT, LABEL)
    train, valid = train.split(random_state=random.seed(SEED))

    TEXT.build_vocab(
        train,
        vectors=embedding,
        specials=["<pad>", "<null>"],
        unk_init=torch.Tensor.normal_,
        max_size=25000,
    )
    TEXT.null_token = "<null>"
    # Need to build the vocab for the labels because they are `pos` and `neg`
    # This will convert them to numerical values
    LABEL.build_vocab(train)

    # make iterator for splits
    train_iter, valid_iter, test_iter = BucketIterator.splits(
        (train, valid, test),
        batch_size=64,
        sort_within_batch=True,
    )
    return train_iter, valid_iter, test_iter

def test(config):
    device = 'cuda' if config['cuda'] else 'cpu'
    model = TextCNN.load(config['model_path']).to(device)
    with open(f"{config['text_vocab']}", "rb") as f:
        TEXT = dill.load(f)
    with open(f"{config['label_vocab']}", "rb") as f:
        LABEL = dill.load(f)

    _, test_data = IMDB.splits(TEXT, LABEL, root=config['data_path'])
    test_iter = torchtext.data.Iterator(test_data,
                                        batch_size=config['batch_size'],
                                        device=device)
    loss_fn = nn.CrossEntropyLoss(
        weight=torch.tensor(config['class_weight'], device=device))
    val_loss, accuracy = evaluate(model, test_iter, loss_fn)
    print(f"val_loss:{val_loss} - accuracy:{accuracy}")

def full_split(cls, root_dir, val_size=1000,
               load_processed=True, save_processed=True):
    '''Generates the full train/val/test split'''
    spd = os.path.join(root_dir, 'imdb', 'processed/')
    train_path = os.path.join(spd, 'train.pkl')
    val_path = os.path.join(spd, 'val.pkl')
    test_path = os.path.join(spd, 'test.pkl')
    if (load_processed and os.path.exists(train_path)
            and os.path.exists(val_path) and os.path.exists(test_path)):
        print(" [*] Loading pre-processed IMDB objects.")
        with open(train_path, 'rb') as train_f, \
                open(val_path, 'rb') as val_f, \
                open(test_path, 'rb') as test_f:
            return pickle.load(train_f), pickle.load(val_f), pickle.load(test_f)

    # This means we're not loading from pickle
    itrain, itest = IMDB.splits(RawField(), RawField(), root=root_dir)
    vocab = Vocabulary([x.text for x in itrain] + [x.text for x in itest],
                       f_min=100)
    # For val we take middle val_size values as this is where pos/neg switch occurs
    mid = len(itrain) // 2
    grab = val_size // 2
    train = cls([[x.text, x.label] for x in itrain[:mid - grab]] +
                [[x.text, x.label] for x in itrain[mid + grab:]], vocab)
    val = cls([[x.text, x.label] for x in itrain[mid - grab:mid + grab]], vocab)
    test = cls([[x.text, x.label] for x in itest], vocab)
    if save_processed:
        if not os.path.exists(spd):
            os.makedirs(spd)
        with open(train_path, 'wb') as f:
            pickle.dump(train, f)
        with open(val_path, 'wb') as f:
            pickle.dump(val, f)
        with open(test_path, 'wb') as f:
            pickle.dump(test, f)
    return train, val, test

def prepare_data(self):
    self.text_field = Field(sequential=True, fix_length=200,
                            include_lengths=True)
    self.label_field = LabelField()
    train_val, test = IMDB.splits(self.text_field, self.label_field)
    random.seed(42)
    train, val = train_val.split(random_state=random.getstate())
    self.text_field.build_vocab(
        train, vectors=GloVe())  # vectors=FastText('simple')
    self.label_field.build_vocab(train)
    self.train_iter, self.test_iter, self.val_iter = BucketIterator.splits(
        (train, test, val), batch_size=self.batch_size)
    self.train_iter.sort_within_batch = True
    self.val_iter.sort_within_batch = True

def process_sents():
    def insert_index(dataset: data.Dataset):
        examples = dataset.examples
        fields = dataset.fields
        for i, e in enumerate(examples):
            setattr(e, 'index', i)
        fields['index'] = data.Field(sequential=False, use_vocab=False)
        dataset.examples = examples
        dataset.fields = fields
        return dataset

    text = data.Field(lower=True, include_lengths=True)
    label = data.Field(sequential=False, is_target=True, use_vocab=False)
    train_data, test_data = IMDB.splits(text, label)
    train_data = insert_index(train_data)
    test_data = insert_index(test_data)

    # save data
    torch.save(train_data.examples, 'data/imdb/train.data')
    torch.save(test_data.examples, 'data/imdb/test.data')
    torch.save(train_data.fields, 'data/imdb/fields')

def load(self, split_ratio=None, random_state=None, verbose=False):
    if split_ratio is None:
        split_ratio = self.split_ratio
    assert split_ratio <= 1
    if random_state is None:
        random_state = self.random_state

    # create fields - tokenize text & create label classes
    self.TEXT = data.Field(tokenize='spacy')
    self.LABEL = data.LabelField(dtype=torch.float)

    # load dataset
    self.train_data, self.test_data = IMDB.splits(self.TEXT, self.LABEL)

    # split training into train & validation
    self.train_data, self.valid_data = self.train_data.split(
        split_ratio=split_ratio, random_state=random_state)

    if verbose:
        print('Training data size: ', len(self.train_data))
        print('Validation data size: ', len(self.valid_data))
        print('Test data size: ', len(self.test_data))

def get_data_loader(doc_processor: DocumentDataPreprocessor,
                    batch_size=3,
                    dataset_path='data/IMDB/aclImdb/train',
                    MAX_WORD_COUNT=1000,
                    MIN_DOC_THRESHOLD=300,
                    MIN_WORD_COUNT=0,
                    num_samples=None):
    text_preprocessing = None  # lambda x: mdl.model_processor(x)
    label_preprocessing = None  # lambda x: 1 if 'pos' else 0
    TEXT = torchtext.data.RawField(preprocessing=text_preprocessing)
    LABEL = torchtext.data.RawField(is_target=True,
                                    preprocessing=label_preprocessing)
    dataset = IMDB(dataset_path, text_field=TEXT, label_field=LABEL)
    data_objects = [{'text': i.text, 'label': i.label} for i in dataset.examples]
    df = pandas.DataFrame(data_objects)
    df['training_content'] = df.apply(
        lambda row: doc_processor.formatter(row['text']), axis=1)
    # Filtering post cleanup.
    df = df[df['training_content'].str.split().str.len() <= MAX_WORD_COUNT]
    df = df[df['training_content'].str.split().str.len() >= MIN_WORD_COUNT]
    if num_samples is not None:
        df = df.sample(n=num_samples)
    labels = df['label']
    training_content_df = df['training_content']
    tensor_dataset, column_split_order = doc_processor.prepare_dataset(
        training_content_df, labels, max_length=1024)
    dataloader = DataLoader(
        tensor_dataset,                         # The training samples.
        sampler=RandomSampler(tensor_dataset),  # Select batches randomly
        batch_size=batch_size                   # Trains with this batch size.
    )
    return dataloader, column_split_order

from torchtext import data  # needed for data.Field / data.BucketIterator below
from torchtext.vocab import GloVe, FastText, CharNGram
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch
from torchtext.datasets import IMDB
import sys
import time
from apex import amp

is_cuda = torch.cuda.is_available()

TEXT = data.Field(lower=True, fix_length=200, batch_first=False)
LABEL = data.Field(sequential=False)
train, test = IMDB.splits(TEXT, LABEL)
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300),
                 max_size=10000, min_freq=10)
LABEL.build_vocab(train)
train_iter, test_iter = data.BucketIterator.splits((train, test), batch_size=64)
train_iter.repeat = False
test_iter.repeat = False


class IMDBrnn(nn.Module):
    def __init__(self, vocab, hidden_size, n_cat, bs=1, nl=2):
        super().__init__()
        self.hidden_size = hidden_size
        self.bs = bs

    output = F.softmax(output, dim=-1)
    print(output)


if __name__ == "__main__":
    text_field = Field(use_vocab=False,
                       tokenize=tokenize_and_trunc,
                       preprocessing=tokenizer.convert_tokens_to_ids,
                       batch_first=True,
                       init_token=init_token_idx,
                       eos_token=eos_token_idx,
                       pad_token=pad_token_idx,
                       unk_token=unk_token_idx)
    label_field = LabelField()
    train_data, test_data = IMDB.splits(text_field, label_field)
    train_data, valid_data = train_data.split()
    label_field.build_vocab(train_data)

    n_epochs = 5
    batch_size = 128
    rnn_hidden_size = 256
    dropout_p = 0.2
    num_classes = len(label_field.vocab)
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    model = BertGRU(bert.config.to_dict()['dim'],
                    rnn_hidden_size,
                    num_classes=num_classes,
                    dropout_p=dropout_p)

    for name, params in model.named_parameters():

from torchtext.datasets import IMDB
from os.path import join, exists
from os import mkdir
from tqdm import tqdm
import pandas as pd
import torch
import torch.nn as nn

print("loading dataset...")
train_iter, test_iter = IMDB("datasets", split=('train', 'test'))


# tokenize
def tokenize(text):
    return [t.lower() for t in text.split()]


train_set = [(label, tokenize(line))
             for label, line in tqdm(train_iter, desc="tokenizing trainset...")]
test_set = [(label, tokenize(line))
            for label, line in tqdm(test_iter, desc="tokenizing testset...")]

# vocab
vocab = sorted(list(set(t for (_, tokens) in train_set for t in tokens)))
PADDING_IDX = 0
vocab.insert(PADDING_IDX, "<padding>")
UNKNOWN_IDX = 1
vocab.insert(UNKNOWN_IDX, "<unknown>")
token2idx = {token: idx for idx, token in enumerate(vocab)}

def main():
    args = parse_arguments()
    use_cuda = torch.cuda.is_available()

    # visdom for plotting
    vis = Visdom()
    win_g, win_d, win_w = None, None, None
    assert vis.check_connection()

    # load datasets
    print("[!] preparing dataset...")
    TEXT = Field(lower=True, fix_length=args.seq_len,
                 tokenize=list, batch_first=True)
    LABEL = Field(sequential=False)
    train_data, test_data = IMDB.splits(TEXT, LABEL)
    TEXT.build_vocab(train_data)
    LABEL.build_vocab(train_data)
    train_iter, test_iter = BucketIterator.splits(
        (train_data, test_data), batch_size=args.batch_size, repeat=True)
    vocab_size = len(TEXT.vocab)
    print("[TRAIN]:%d (dataset:%d)\t[TEST]:%d (dataset:%d)\t[VOCAB]:%d"
          % (len(train_iter), len(train_iter.dataset),
             len(test_iter), len(test_iter.dataset), vocab_size))

    # instantiate models
    G = Generator(dim=512, seq_len=args.seq_len, vocab_size=vocab_size)
    D = Discriminator(dim=512, seq_len=args.seq_len, vocab_size=vocab_size)
    optim_G = optim.Adam(G.parameters(), lr=args.lr, betas=(0.5, 0.9))
    optim_D = optim.Adam(D.parameters(), lr=args.lr, betas=(0.5, 0.9))

    global one, mone
    one = torch.FloatTensor([1])
    mone = one * -1
    if use_cuda:
        G, D = G.cuda(), D.cuda()
        one, mone = one.cuda(), mone.cuda()

    train_iter = iter(train_iter)
    batch_size = args.batch_size
    for b in range(1, args.batchs + 1):
        # (1) Update D network
        for p in D.parameters():  # reset requires_grad
            p.requires_grad = True
        for iter_d in range(args.critic_iters):  # CRITIC_ITERS
            batch = next(train_iter)
            text, label = batch.text, batch.label
            text = to_onehot(text, vocab_size)
            if use_cuda:
                text = text.cuda()
            real = Variable(text)
            d_loss, wasserstein = train_discriminator(
                D, G, optim_D, real, args.lamb, batch_size, use_cuda)

        # (2) Update G network
        for p in D.parameters():
            p.requires_grad = False  # to avoid computation
        g_loss = train_generator(D, G, optim_G, batch_size, use_cuda)

        # plot losses on visdom
        win_d = plot('Discriminator Loss', vis, x=b, y=d_loss.data[0], win=win_d)
        win_g = plot('Generator Loss', vis, x=b, y=g_loss.data[0], win=win_g)
        win_w = plot('Wasserstein Distance', vis, x=b,
                     y=wasserstein.data[0], win=win_w)

        if b % 500 == 0 and b > 1:
            samples = sample(G, TEXT, 1, args.seq_len, vocab_size, use_cuda)
            print("[%d] D:%5.2f G:%5.2f W:%5.2f \nsample:%s \t [%d]"
                  % (b, d_loss.data[0], g_loss.data[0], wasserstein.data[0],
                     samples[0], label.data[0]))
            log_sample("Sample %d" % b, vis, samples)
        if b % 5000 == 0 and b > 1:
            print("[!] saving model")
            if not os.path.isdir(".save"):
                os.makedirs(".save")
            torch.save(G.state_dict(), './.save/wgan_g_%d.pt' % (b))
            torch.save(D.state_dict(), './.save/wgan_d_%d.pt' % (b))

from nntoolbox.sequence.utils import extract_last
from nntoolbox.components import MLP, ConcatPool
from functools import partial

MAX_VOCAB_SIZE = 25000
BATCH_SIZE = 16

TEXT = data.Field(tokenize='spacy', include_lengths=True, fix_length=500)
LABEL = data.LabelField(dtype=torch.float)

# train_data, val_data, test_data = SST.splits(
#     text_field=TEXT,
#     label_field=LABEL
# )
train_val_data, test_data = IMDB.splits(TEXT, LABEL)
train_data, val_data = train_val_data.split(split_ratio=0.8)

train_iterator, val_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, val_data, test_data),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    device=get_device()
)

TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE, vectors="glove.6B.100d")
LABEL.build_vocab(train_data)

# max_length = 0
# for batch in train_iterator:
#     texts, text_lengths = batch.text

def train(config):
    try:
        split = config["split"]
        data_path = config["data_path"]
        pretrained_model_dir = config["pretrained_model_dir"]
        pretrained_model_file = config["pretrained_model_file"]
        last_model_path = config["last_model_path"]
        save_to = config["save_to"]
        min_freq = config["min_freq"]
        batch_size = config["batch_size"]
        max_sent_length = config["max_sent_length"]
        embed_dim = config["embed_dim"]
        filter_num = config["filter_num"]
        filter_widths = config["filter_widths"]
        learning_rate = config["learning_rate"]
        patience = config["patience"]
        lr_decay = config["lr_decay"]
        max_num_trial = config["max_num_trial"]
        max_epoch = config["max_epoch"]
        save_every = config["save_every"]
        cuda = config["cuda"]
        debug = config["debug"]
    except KeyError:
        print("Input Parameter Error")
        exit(1)

    if not Path(save_to).exists():
        Path(save_to).mkdir()

    device = torch.device("cuda:0" if (torch.cuda.is_available() and cuda)
                          else "cpu")

    # build torchtext fields
    TEXT = torchtext.data.Field(tokenize='spacy', lower=True)
    LABEL = torchtext.data.Field(dtype=torch.long)

    train_data, test_data = IMDB.splits(TEXT, LABEL, root=data_path)
    if debug:
        train_data, val_data = train_data.split(split_ratio=0.1)
    train_data, val_data = train_data.split(split_ratio=0.7)
    train_iter, val_iter = torchtext.data.Iterator.splits(
        (train_data, val_data), batch_size=batch_size, device=device)

    if (pretrained_model_file is not None) and (pretrained_model_dir is not None):
        pretrained_vector = Vectors(name=pretrained_model_file,
                                    cache=pretrained_model_dir)

    TEXT.build_vocab(train_data, min_freq=min_freq, vectors=pretrained_vector)
    LABEL.build_vocab(train_data)
    logging.info("saving TEXT/LABEL vocabulary...")
    with open(f"{save_to}/TEXT_vocab.bin", "wb") as f:
        dill.dump(TEXT, f)
    with open(f"{save_to}/LABEL_vocab.bin", "wb") as f:
        dill.dump(LABEL, f)

    assert embed_dim == TEXT.vocab.vectors.shape[-1], "incompatible embeddings"
    embed_num, class_num = len(TEXT.vocab), len(LABEL.vocab)

    model = TextCNN(embed_num, embed_dim, class_num, filter_num, filter_widths,
                    from_pretrained=TEXT.vocab.vectors).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    cross_entropy = nn.CrossEntropyLoss(
        weight=torch.tensor([0, 0, 1.0, 1.0],
                            device=device))  # class [<unk>,<pad>,'pos','neg']

    if last_model_path is not None:
        # load model
        logging.info(f'load model from {last_model_path}')
        params = torch.load(last_model_path,
                            map_location=lambda storage, loc: storage)
        model.load_state_dict(params['state_dict'])
        logging.info('restore parameters of the optimizers')
        optimizer.load_state_dict(torch.load(last_model_path + '.optim'))

    model.train()

    epoch = 0
    cur_trial = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    logging.info("begin training!")
    while True:
        epoch += 1
        train_loss = 0
        cum_cnt = 0
        step = 0
        for batch in iter(train_iter):
            feature, target = batch.text.T, batch.label.squeeze(0)
            step += 1
            optimizer.zero_grad()
            res = model(feature)
            loss = cross_entropy(res, target)
            train_loss += loss
            loss.backward()
            optimizer.step()
        train_loss = train_loss / step

        val_loss, accuracy = evaluate(model, val_iter, cross_entropy)
        logging.info(
            f'epoch {epoch}\t train_loss: {train_loss}\t val_loss:{val_loss}\t val_accuracy:{accuracy} speed:{time.time()-train_time:.2f}s/epoch\t time elapsed {time.time()-begin_time:.2f}s'
        )
        train_time = time.time()

        is_better = len(hist_valid_scores) == 0 or val_loss < min(hist_valid_scores)
        hist_valid_scores.append(val_loss)

        if epoch % save_every == 0:
            model.save(f"{save_to}/model_step_{epoch}")
            torch.save(optimizer.state_dict(),
                       f"{save_to}/model_step_{epoch}.optim")

        if is_better:
            cur_patience = 0
            model_save_path = f"{save_to}/model_best"
            print(f'save currently the best model to [{model_save_path}]')
            model.save(model_save_path)
            # also save the optimizers' state
            torch.save(optimizer.state_dict(), model_save_path + '.optim')
        elif cur_patience < patience:
            cur_patience += 1
            print('hit patience %d' % cur_patience)
            if cur_patience == patience:
                cur_trial += 1
                print(f'hit #{cur_trial} trial')
                if cur_trial == max_num_trial:
                    print('early stop!')
                    exit(0)

                # decay lr, and restore from previously best checkpoint
                lr = optimizer.param_groups[0]['lr'] * lr_decay
                logging.info(
                    f'load previously best model and decay learning rate to {lr}')

                # load model
                params = torch.load(model_save_path,
                                    map_location=lambda storage, loc: storage)
                model.load_state_dict(params['state_dict'])
                model = model.to(device)

                logging.info('restore parameters of the optimizers')
                optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

                # set new lr
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr

                # reset patience
                cur_patience = 0

        if epoch == max_epoch:
            print('reached maximum number of epochs!')
            exit(0)