def get_dataset(fix_length=25, lower=False, vectors=None, train_dir='train.csv',
                batch_size=1, device=None):
    train = pd.read_csv(train_dir, error_bad_lines=False)
    train['text'] = train['text'].apply(lambda x: remove_unnecessary(x))
    if vectors is not None:
        lower = True
    prepare_csv(train)

    TEXT = data.Field(sequential=True, lower=True, include_lengths=True,
                      batch_first=True, fix_length=fix_length)
    LABEL = data.Field(use_vocab=True, sequential=False, dtype=torch.float16)
    ID = data.Field(use_vocab=False, sequential=False, dtype=torch.float16)

    train_temp, val_temp = data.TabularDataset.splits(
        path='cache/', format='csv', skip_header=True,
        train='dataset_train.csv', validation='dataset_val.csv',
        fields=[('id', ID), ('target', LABEL), ('text', TEXT)])

    TEXT.build_vocab(train_temp, val_temp, max_size=20000, min_freq=10,
                     vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(train_temp)
    ID.build_vocab(train_temp, val_temp)

    word_embeddings = TEXT.vocab.vectors
    vocab_size = len(TEXT.vocab)

    train_iter = get_iterator(train_temp, batch_size=batch_size, train=True,
                              shuffle=True, repeat=False, device=device)
    val_iter = get_iterator(val_temp, batch_size=batch_size, train=True,
                            shuffle=True, repeat=False, device=device)

    print('Train samples:%d' % (len(train_temp)),
          'Valid samples:%d' % (len(val_temp)),
          'Train minibatch nb:%d' % (len(train_iter)),
          'Valid minibatch nb:%d' % (len(val_iter)))

    return vocab_size, word_embeddings, train_iter, val_iter
        word = re.sub(r'\n', ' ', word)
        newSample.append(word)
    return newSample


def postprocessing(batch, vocab):
    """
    Called after numericalising but before vectorising.
    """
    return batch


inputSize = 300
stopWords = {}
wordVectors = GloVe(name='6B', dim=inputSize)

################################################################################
####### The following determines the processing of label data (ratings) ########
################################################################################


def convertNetOutput(ratingOutput, categoryOutput):
    ratingOutputNew = torch.argmax(ratingOutput, dim=1, keepdim=False)
    categoryOutputNew = torch.argmax(categoryOutput, dim=1, keepdim=False)
    return ratingOutputNew, categoryOutputNew

################################################################################
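# Hedged illustration (not part of the original file): convertNetOutput turns the
# network's per-class scores into hard label indices with argmax. The tensors below
# are made-up stand-ins for real model outputs.
import torch

dummy_rating_logits = torch.tensor([[0.1, 2.3], [1.7, -0.4]])        # 2 reviews, 2 rating classes
dummy_category_logits = torch.tensor([[0.2, 0.1, 1.5, -1.0, 0.3],
                                      [2.0, 0.0, -0.5, 0.1, 0.4]])   # 2 reviews, 5 categories
ratings, categories = convertNetOutput(dummy_rating_logits, dummy_category_logits)
# ratings -> tensor([1, 0]); categories -> tensor([2, 0])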
for dataset in (train, val, test):
    for example in dataset:
        example.text = [word.lower() for word in example.text]

TEXT.build_vocab(train, val, test)
LABEL.build_vocab(train, val, test)

train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, val, test), batch_size=10)

# Build the vocabulary with word embeddings
url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
TEXT.vocab.load_vectors(vectors=Vectors('wiki.simple.vec', url=url))

glove = GloVe(name="6B", dim=300)
glove.vectors = glove.vectors[torch.arange(len(TEXT.vocab) + 10)]
idx = torch.arange(len(TEXT.vocab.vectors))
EMBEDDINGS = torch.cat((TEXT.vocab.vectors[idx], glove.vectors[idx]), dim=1)

# Generating a mapping from bigrams to indexes
all_bigrams = set()
for dataset in (train, val, test):
    for example in dataset:
        idx = [TEXT.vocab.stoi[word] for word in example.text]
        all_bigrams |= set((i,) for i in idx)
        for i in range(len(idx) - 1):
            all_bigrams.add((idx[i], idx[i + 1]))
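# Hedged continuation (not in the original snippet): the set built above is usually
# frozen into a deterministic bigram -> index mapping so that each uni/bigram can
# address a row of an embedding or count matrix. Names below are illustrative only.
bigram_to_index = {bigram: i for i, bigram in enumerate(sorted(all_bigrams))}
num_bigrams = len(bigram_to_index)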
def train(param):
    if not isinstance(param, dict):
        args = vars(param)
    else:
        args = param

    # GPUs
    if args['gpu_index'] is not None:
        args['gpus'] = str(args['gpu_index'])

    # DATASET
    ##########################################################
    dp_valid_kwargs = inspect.signature(DataProvider.__init__).parameters
    dp_kwargs = dict((name, args[name]) for name in dp_valid_kwargs if name in args)
    data_provider = DataProvider(**dp_kwargs)

    training_dataset, training_dataloader = data_provider.get_training_dataset_and_loader()
    validation_dataset, validation_dataloader = data_provider.get_validation_dataset_and_loader()
    ##########################################################

    # Set Seed
    if args['resume_from_checkpoint'] is None:
        if args['seed'] is not None:
            seed_everything(args['seed'])

    # MODEL
    ##########################################################
    # Check using pretraining
    pre_trained_word_embedding = args['pre_trained_word_embedding']
    if pre_trained_word_embedding is None:
        pass
    elif pre_trained_word_embedding == 'glove.6B.100d':
        assert args['embedding_dim'] == 100
    else:
        raise ModuleNotFoundError

    # get framework
    framework = get_class_by_name(args['model'])
    if args['spec_type'] != 'magnitude':
        args['input_channels'] = 4

    # Model instantiation
    args['vocab_size'] = len(training_dataset.vocab)
    model = framework(**args)

    if pre_trained_word_embedding is None:
        pass
    elif pre_trained_word_embedding == 'glove.6B.100d':
        with torch.no_grad():
            from torchtext.vocab import GloVe
            vocab = training_dataset.vocab
            glove = GloVe(name='6B', dim=100)
            for token in vocab:
                if token in glove.stoi.keys():
                    glove_i = glove.stoi[token]
                    embedding_i = training_dataset.word_to_idx[token]
                    model.spec2spec.embedding.weight[embedding_i] = glove.vectors[glove_i]
            pass
    else:
        raise ModuleNotFoundError

    if args['last_activation'] != 'identity' and args['spec_est_mode'] != 'masking':
        warn('Please check if you really want to use a mapping-based spectrogram estimation method '
             'with a final activation function.')

    ##########################################################
    # -- checkpoint
    ckpt_path = Path(args['ckpt_root_path'])
    mkdir_if_not_exists(ckpt_path)
    ckpt_path = ckpt_path.joinpath(args['model'])
    mkdir_if_not_exists(ckpt_path)
    run_id = args['run_id']
    ckpt_path = ckpt_path.joinpath(run_id)
    mkdir_if_not_exists(ckpt_path)

    save_top_k = args['save_top_k']
    checkpoint_callback = ModelCheckpoint(filepath=ckpt_path,
                                          save_top_k=save_top_k,
                                          verbose=False,
                                          monitor='val_loss',
                                          save_last=False,
                                          save_weights_only=args['save_weights_only'])
    args['checkpoint_callback'] = checkpoint_callback

    # -- early stop
    patience = args['patience']
    early_stop_callback = EarlyStopping(monitor='val_loss',
                                        min_delta=0.0,
                                        patience=patience,
                                        verbose=False)
    args['early_stop_callback'] = early_stop_callback

    # -- logger setting
    log = args['log']
    if log == 'False':
        args['logger'] = False
    elif log == 'wandb':
        args['logger'] = WandbLogger(project=args['task'],
                                     tags=args['model'],
                                     offline=False,
                                     id=run_id)
        args['logger'].log_hyperparams(model.hparams)
        args['logger'].watch(model, log='all')
    elif log == 'tensorboard':
        raise NotImplementedError
    else:
        args['logger'] = True  # default

    default_save_path = 'etc/lightning_logs'
    mkdir_if_not_exists(default_save_path)

    valid_kwargs = inspect.signature(Trainer.__init__).parameters
    trainer_kwargs = dict((name, args[name]) for name in valid_kwargs if name in args)

    # Trainer definition
    trainer = Trainer(**trainer_kwargs)

    for key in args.keys():
        print('{}:{}'.format(key, args[key]))

    if args['auto_lr_find']:
        lr_find = trainer.tuner.lr_find(model,
                                        training_dataloader,
                                        validation_dataloader,
                                        early_stop_threshold=None,
                                        min_lr=1e-5)
        print(f"Found lr: {lr_find.suggestion()}")
        return 0

    if args['resume_from_checkpoint'] is not None:
        'resume'

    trainer.fit(model, training_dataloader, validation_dataloader)

    return None
"should've", "shouldn", "so", "some", "such", "t", "than", "that", "that'll", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those", "through", "to", "too", "under", "until", "up", "ve", "very", "was", "wasn", "wasn't", "we", "were", "weren", "weren't", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "won", "wouldn", "y", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves", "could", "he'd", "he'll", "he's", "here's", "how's", "i'd", "i'll", "i'm", "i've", "let's", "ought", "she'd", "she'll", "that's", "there's", "they'd", "they'll", "they're", "they've", "we'd", "we'll", "we're", "we've", "what's", "when's", "where's", "who's", "why's", "would" ] DIMENSION = 100 wordVectors = GloVe(name='6B', dim=DIMENSION) ########################################################################### ##### The following determines the processing of label data (ratings) ##### ########################################################################### def convertLabel(datasetLabel): """ Labels (product ratings) from the dataset are provided to you as floats, taking the values 1.0, 2.0, 3.0, 4.0, or 5.0. You may wish to train with these as they are, or you you may wish to convert them to another representation in this function. Consider regression vs classification. """ # class1 = torch.Tensor([1., 0., 0., 0., 0.])
def _all_train(self, num_epochs, model_label=None): # Create all train dataset concat_train_datasets = self.train_datasets[0] for train_idx in range(1, self.split_num): concat_train_datasets += self.train_datasets[train_idx] all_examples = [example for example in concat_train_datasets] # Create field word = data.Field( include_lengths=True, batch_first=True, lower=True, preprocessing=data.Pipeline(lambda w: re.sub('\d', '0', w) if self.config.is_digit else w)) char_nesting = data.Field( tokenize=list, batch_first=True, lower=self.config.is_lower, init_token=START_TAG, eos_token=STOP_TAG, preprocessing=data.Pipeline(lambda s: re.sub('\d', '0', s) if self.config.is_digit else s)) char = data.NestedField(char_nesting, include_lengths=True) label = data.Field(unk_token=UNLABELED_TAG, batch_first=True) fields = [(('word', 'char'), (word, char)), ('label', label)] # Load train, valid, test datasets all_train_dataset = Conll2003Dataset(examples=all_examples, fields=fields) _, valid_dataset, test_dataset = Conll2003Dataset.splits( fields=fields, path=self.config.dataset_path, separator=" ", train="eng.train", validation="eng.testa", test="eng.testb") # Build vocab word.build_vocab(all_train_dataset, valid_dataset, test_dataset, vectors=GloVe(name='6B', dim='100')) char.build_vocab(all_train_dataset, valid_dataset, test_dataset) label.build_vocab(all_train_dataset, valid_dataset, test_dataset) # UNKNOWN tag is -1 label.vocab.stoi = Counter( {k: v - 1 for k, v in label.vocab.stoi.items()}) # Don't count UNKNOWN tag num_tags = len(label.vocab) - 1 assert label.vocab.stoi[UNLABELED_TAG] == UNLABELED_ID # Create model model = BiLSTM_CRF(num_tags, label.vocab, char.vocab, word.vocab, self.config.emb_dict, dropout_rate=self.config.dropout_rate, inference_type=self.config.inference_type) if self.device != "cpu": model = model.to(self.device) self.trainer_config["path"] = self.base_save_path # Trainer if model_label is not None: self.trainer_config[ "path"] = self.base_save_path + "/all_train/{}".format( model_label) trainer = Trainer(model, self.trainer_config, all_train_dataset, self.valid_dataset, test_dataset=self.test_dataset, label_dict=self.label_dict) trainer.train(num_epochs)
def main(): # Use a GPU if available, as it should be faster. device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print("Using device: " + str(device)) # Load the training dataset, and create a data loader to generate a batch. textField = data.Field(lower=True, include_lengths=True, batch_first=True) labelField = data.Field(sequential=False) from imdb_dataloader import IMDB train, dev = IMDB.splits(textField, labelField, train="train", validation="dev") textField.build_vocab(train, dev, vectors=GloVe(name="6B", dim=50)) labelField.build_vocab(train, dev) trainLoader, testLoader = data.BucketIterator.splits((train, dev), shuffle=True, batch_size=64, sort_key=lambda x: len(x.text), sort_within_batch=True) # Create an instance of the network in memory (potentially GPU memory). Can change to NetworkCnn during development. #net = NetworkLstm().to(device) net = NetworkCnn().to(device) criterion = lossFunc() optimiser = topti.Adam(net.parameters(), lr=0.001) # Minimise the loss using the Adam algorithm. for epoch in range(10): running_loss = 0 # if epoch >= 1: # break for i, batch in enumerate(trainLoader): # Get a batch and potentially send it to GPU memory. inputs, length, labels = textField.vocab.vectors[batch.text[0]].to(device), batch.text[1].to( device), batch.label.type(torch.FloatTensor).to(device) # print(inputs) # print(inputs.size()) # print(length) # print(length.size()) # print(labels.size()) labels -= 1 # PyTorch calculates gradients by accumulating contributions to them (useful for # RNNs). Hence we must manually set them to zero before calculating them. optimiser.zero_grad() # Forward pass through the network. output = net(inputs, length) # print(output) # print(labels) loss = criterion(output, labels) # Calculate gradients. loss.backward() # Minimise the loss according to the gradient. optimiser.step() running_loss += loss.item() if i % 32 == 31: print("Epoch: %2d, Batch: %4d, Loss: %.3f" % (epoch + 1, i + 1, running_loss / 32)) running_loss = 0 true_pos, true_neg, false_pos, false_neg = 0, 0, 0, 0 # Evaluate network on the test dataset. We aren't calculating gradients, so disable autograd to speed up # computations and reduce memory usage. with torch.no_grad(): for batch in testLoader: # Get a batch and potentially send it to GPU memory. inputs, length, labels = textField.vocab.vectors[batch.text[0]].to(device), batch.text[1].to( device), batch.label.type(torch.FloatTensor).to(device) labels -= 1 outputs = net(inputs, length) tp_batch, tn_batch, fp_batch, fn_batch = measures(outputs, labels) true_pos += tp_batch true_neg += tn_batch false_pos += fp_batch false_neg += fn_batch accuracy = 100 * (true_pos + true_neg) / len(dev) matthews = MCC(true_pos, true_neg, false_pos, false_neg) print("Classification accuracy: %.2f%%\n" "Matthews Correlation Coefficient: %.2f" % (accuracy, matthews))
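# Hedged sketch (not from the original source): the helpers `measures` and `MCC` used in
# main() are defined elsewhere in the assignment; the standard formulas they are expected
# to follow look roughly like this. The threshold at 0.5 is an assumption.
import math
import torch


def measures_sketch(outputs, labels):
    """Count TP/TN/FP/FN for binary outputs thresholded at 0.5 (illustrative only)."""
    preds = (torch.sigmoid(outputs) >= 0.5).float()
    tp = int(((preds == 1) & (labels == 1)).sum())
    tn = int(((preds == 0) & (labels == 0)).sum())
    fp = int(((preds == 1) & (labels == 0)).sum())
    fn = int(((preds == 0) & (labels == 1)).sum())
    return tp, tn, fp, fn


def mcc_sketch(tp, tn, fp, fn):
    """Matthews Correlation Coefficient from confusion counts."""
    denom = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return (tp * tn - fp * fn) / denom if denom else 0.0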
def classic_train(args): """ Train the model in the ol' fashioned way, just like grandma used to Args args (argparse.ArgumentParser) """ if args.cuda and torch.cuda.is_available(): print("Using cuda") use_cuda = True elif args.cuda and not torch.cuda.is_available(): print("You do not have CUDA, turning cuda off") use_cuda = False else: use_cuda = False #Load the data print("\nLoading Vocab") vocab = du.load_vocab(args.vocab) print("Vocab Loaded, Size {}".format(len(vocab.stoi.keys()))) if args.use_pretrained: pretrained = GloVe(name='6B', dim=args.emb_size, unk_init=torch.Tensor.normal_) vocab.load_vectors(pretrained) print("Vectors Loaded") #Set add_eos to false if you want to decode arbitrarly long conditioned on the latents (done in paper), recommended to set this to false if generating #event sequences (since length is not that important and we dont need the latents capturing it), if generating raw text its probably better to have it on #In the DAVAE class there is a train() fuction that also takes in add_eos, it should match this one print("Loading Dataset") dataset = du.SentenceDataset(args.train_data, vocab, args.src_seq_length, add_eos=False) print("Finished Loading Dataset {} examples".format(len(dataset))) batches = BatchIter(dataset, args.batch_size, sort_key=lambda x: len(x.text), train=True, sort_within_batch=True, device=-1) data_len = len(dataset) if args.load_model: print("Loading the Model") model = torch.load(args.load_model) else: print("Creating the Model") bidir_mod = 2 if args.bidir else 1 latents = example_tree( args.num_latent_values, (bidir_mod * args.enc_hid_size, args.latent_dim), use_cuda=use_cuda, nohier_mode=args.nohier) #assume bidirectional hidsize = (args.enc_hid_size, args.dec_hid_size) model = DAVAE(args.emb_size, hidsize, vocab, latents, layers=args.nlayers, use_cuda=use_cuda, pretrained=args.use_pretrained, dropout=args.dropout) #create the optimizer if args.load_opt: print("Loading the optimizer state") optimizer = torch.load(args.load_opt) else: print("Creating the optimizer anew") optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) start_time = time.time() #start of epoch 1 curr_epoch = 1 valid_loss = [0.0] for iteration, bl in enumerate( batches ): #this will continue on forever (shuffling every epoch) till epochs finished batch, batch_lens = bl.text target, target_lens = bl.target if use_cuda: batch = Variable(batch.cuda()) else: batch = Variable(batch) model.zero_grad() latent_values, latent_root, diff, dec_outputs = model( batch, batch_lens) # train set to True so returns total loss loss, _ = monolithic_compute_loss(iteration, model, target, target_lens, latent_values, latent_root, diff, dec_outputs, use_cuda, args=args) # backward propagation loss.backward() # Gradient clipping torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) # Optimize optimizer.step() # End of an epoch - run validation if ((args.batch_size * iteration) % data_len == 0 or iteration % args.validate_after == 0) and iteration != 0: print("\nFinished Training Epoch/iteration {}/{}".format( curr_epoch, iteration)) # do validation print("Loading Validation Dataset.") val_dataset = du.SentenceDataset(args.valid_data, vocab, args.src_seq_length, add_eos=False) print("Finished Loading Validation Dataset {} examples.".format( len(val_dataset))) val_batches = BatchIter(val_dataset, args.batch_size, sort_key=lambda x: len(x.text), train=False, sort_within_batch=True, device=-1) valid_loss = 0.0 for v_iteration, bl in enumerate(val_batches): batch, batch_lens = bl.text 
target, target_lens = bl.target batch_lens = batch_lens.cpu() if use_cuda: batch = Variable(batch.cuda(), volatile=True) else: batch = Variable(batch, volatile=True) latent_values, latent_root, diff, dec_outputs = model( batch, batch_lens) # train set to False so returns only CE loss loss, ce_loss = monolithic_compute_loss(iteration, model, target, target_lens, latent_values, latent_root, diff, dec_outputs, use_cuda, args=args, train=False) valid_loss = valid_loss + ce_loss.data.clone() valid_loss = valid_loss / (v_iteration + 1) print("**Validation loss {:.2f}.**\n".format(valid_loss[0])) # Check max epochs and break if (args.batch_size * iteration) % data_len == 0: curr_epoch += 1 if curr_epoch > args.epochs: print("Max epoch {}-{} reached. Exiting.\n".format( curr_epoch, args.epochs)) break # Save the checkpoint if iteration % args.save_after == 0 and iteration != 0: print("Saving checkpoint for epoch {} at {}.\n".format( curr_epoch, args.save_model)) # curr_epoch and validation stats appended to the model name torch.save( model, "{}_{}_{}_.epoch_{}.loss_{:.2f}.pt".format( args.save_model, args.commit_c, args.commit2_c, curr_epoch, float(valid_loss[0]))) torch.save( optimizer, "{}.{}.epoch_{}.loss_{:.2f}.pt".format(args.save_model, "optimizer", curr_epoch, float(valid_loss[0])))
def train(mode='train', train_path='train.conllx', model='dozat', dev_path='dev.conllx', test_path='test.conllx', ud=True, output_dir='output', emb_dim=0, char_emb_dim=0, char_model=None, tagger=None, batch_size=5000, n_iters=10, dropout_p=0.33, num_layers=1, print_every=1, eval_every=100, bi=True, lr=0.001, adam_beta1=0.9, adam_beta2=0.999, weight_decay=0., plateau=False, resume=False, lr_decay=1.0, lr_decay_steps=5000, clip=5., momentum=0, optimizer='adam', glove=True, seed=42, dim=0, window_size=0, num_filters=0, **kwargs): device = torch.device(type='cuda') if use_cuda else torch.device( type='cpu') if not os.path.exists(output_dir): os.makedirs(output_dir) cfg = locals().copy() torch.manual_seed(seed) np.random.seed(seed) # load data component dataset_obj = ConllXDataset fields = get_data_fields() _form = fields['form'][-1] _pos = fields['pos'][-1] _chars = fields['chars'][-1] train_dataset = dataset_obj(train_path, fields) dev_dataset = dataset_obj(dev_path, fields) test_dataset = dataset_obj(test_path, fields) logger.info("Loaded %d train examples" % len(train_dataset)) logger.info("Loaded %d dev examples" % len(dev_dataset)) logger.info("Loaded %d test examples" % len(test_dataset)) form_vocab_path = os.path.join(output_dir, 'vocab.form.pth.tar') pos_vocab_path = os.path.join(output_dir, 'vocab.pos.pth.tar') char_vocab_path = os.path.join(output_dir, 'vocab.char.pth.tar') if not resume: # build vocabularies # words have a min frequency of 2 to be included; others become <unk> # words without a Glove vector are initialized ~ N(0, 0.5) mimicking Glove # Note: this requires the latest torchtext development version from Github. # - git clone https://github.com/pytorch/text.git torchtext # - cd torchtext # - python setup.py build # - python setup.py install def unk_init(x): # return 0.01 * torch.randn(x) return torch.zeros(x) if glove: logger.info("Using Glove vectors") glove_vectors = GloVe(name='6B', dim=100) _form.build_vocab(train_dataset, min_freq=2, unk_init=unk_init, vectors=glove_vectors) n_unks = 0 unk_set = set() # for now, set UNK words manually # (torchtext does not seem to support it yet) for i, token in enumerate(_form.vocab.itos): if token not in glove_vectors.stoi: n_unks += 1 unk_set.add(token) _form.vocab.vectors[i] = unk_init(emb_dim) # print(n_unks, unk_set) else: _form.build_vocab(train_dataset, min_freq=2) _pos.build_vocab(train_dataset) _chars.build_vocab(train_dataset) # save vocabularies torch.save(_form.vocab, form_vocab_path) torch.save(_pos.vocab, pos_vocab_path) torch.save(_chars.vocab, char_vocab_path) else: # load vocabularies _form.vocab = torch.load(form_vocab_path) _pos.vocab = torch.load(pos_vocab_path) _chars.vocab = torch.load(char_vocab_path) print("First 10 vocabulary entries, words: ", " ".join(_form.vocab.itos[:10])) print("First 10 vocabulary entries, pos tags: ", " ".join(_pos.vocab.itos[:10])) print("First 10 vocabulary entries, chars: ", " ".join(_chars.vocab.itos[:10])) n_words = len(_form.vocab) n_tags = len(_pos.vocab) n_chars = len(_chars.vocab) def batch_size_fn(new, count, sofar): return len(new.form) + 1 + sofar # iterators train_iter = Iterator(train_dataset, batch_size, train=True, sort_within_batch=True, batch_size_fn=batch_size_fn, device=device) dev_iter = Iterator(dev_dataset, 32, train=False, sort_within_batch=True, device=device) test_iter = Iterator(test_dataset, 32, train=False, sort_within_batch=True, device=device) # uncomment to see what a mini-batch looks like numerically # e.g. 
some things are being inserted dynamically (ROOT at the start of seq, # padding items, maybe UNKs..) # batch = next(iter(train_iter)) # print("form", batch.form) # print("pos", batch.pos) # print("deprel", batch.deprel) # print("head", batch.head) # if n_iters or eval_every are negative, we set them to that many # number of epochs iters_per_epoch = (len(train_dataset) // batch_size) + 1 if eval_every < 0: logger.info("Setting eval_every to %d epoch(s) = %d iters" % (-1 * eval_every, -1 * eval_every * iters_per_epoch)) eval_every = iters_per_epoch * eval_every if n_iters < 0: logger.info("Setting n_iters to %d epoch(s) = %d iters" % (-1 * n_iters, -1 * n_iters * iters_per_epoch)) n_iters = -1 * n_iters * iters_per_epoch # load up the model model = Tagger(n_words=n_words, n_tags=n_tags, n_chars=n_chars, form_vocab=_form.vocab, char_vocab=_chars.vocab, pos_vocab=_pos.vocab, **cfg) # set word vectors if glove: _form.vocab.vectors = _form.vocab.vectors / torch.std( _form.vocab.vectors) # print(torch.std(_form.vocab.vectors)) model.encoder.embedding.weight.data.copy_(_form.vocab.vectors) model.encoder.embedding.weight.requires_grad = True model = model.cuda() if use_cuda else model start_iter = 1 best_iter = 0 best_pos_acc = -1. test_pos_acc = -1. # optimizer and learning rate scheduler trainable_parameters = [p for p in model.parameters() if p.requires_grad] if optimizer == 'sgd': optimizer = torch.optim.SGD(trainable_parameters, lr=lr, momentum=momentum) else: optimizer = torch.optim.Adam(trainable_parameters, lr=lr, betas=(adam_beta1, adam_beta2)) # learning rate schedulers if not plateau: scheduler = LambdaLR(optimizer, lr_lambda=lambda t: lr_decay**t) else: scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.75, patience=5, min_lr=1e-4) # load model and vocabularies if resuming if resume: if os.path.isfile(resume): print("=> loading checkpoint '{}'".format(resume)) checkpoint = torch.load(resume) start_iter = checkpoint['iter_i'] best_pos_acc = checkpoint['best_pos_acc'] test_pos_acc = checkpoint['test_pos_acc'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (iter {})".format( resume, checkpoint['iter_i'])) else: print("=> no checkpoint found at '{}'".format(resume)) print_parameters(model) # print some stuff just for fun logger.info("Most common words: %s" % _form.vocab.freqs.most_common(20)) logger.info("Word vocab size: %s" % n_words) logger.info("Most common XPOS-tags: %s" % _pos.vocab.freqs.most_common()) logger.info("POS vocab size: %s" % n_tags) # logger.info("Most common chars: %s" % _chars.nesting_field.vocab.freqs.most_common()) logger.info("Chars vocab size: %s" % n_chars) print("First training example:") print_example(train_dataset[0]) print("First dev example:") print_example(dev_dataset[0]) print("First test example:") print_example(test_dataset[0]) logger.info("Training starts..") upos_var, morph_var = None, None for iter_i in range(start_iter, n_iters + 1): if not plateau and iter_i % (912344 // batch_size) == 0: scheduler.step() model.train() batch = next(iter(train_iter)) form_var, lengths = batch.form pos_var = batch.pos char_var, sentence_lengths, word_lengths = batch.chars lengths = lengths.view(-1).tolist() result = model(form_var=form_var, char_var=char_var, pos_var=pos_var, lengths=lengths, word_lengths=word_lengths) # rows sum to 1 # print(torch.exp(output_graph).sum(-1)) # print sizes # print(head_logits.data.cpu().size()) targets = dict(pos=batch.pos) all_losses = 
model.get_loss(scores=result, targets=targets) loss = all_losses['loss'] loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), clip) optimizer.step() optimizer.zero_grad() if iter_i % print_every == 0: # get scores for this batch if model.tagger == "linear": pos_predictions = result['output'].max(2)[1] else: pos_predictions = result['sequence'] predictions = dict(pos=pos_predictions) targets = dict(pos=batch.pos) pos_acc = model.get_accuracy(predictions=predictions, targets=targets) if not plateau: lr = scheduler.get_lr()[0] else: lr = [group['lr'] for group in optimizer.param_groups][0] fmt = "Iter %08d loss %8.4f pos-acc %5.2f lr %.5f" logger.info(fmt % (iter_i, loss, pos_acc, lr)) if iter_i % eval_every == 0: # parse dev set and save to file for official evaluation dev_out_path = 'dev.iter%08d.conll' % iter_i dev_out_path = os.path.join(output_dir, dev_out_path) predict_and_save(dataset=dev_dataset, model=model, dataset_path=dev_path, out_path=dev_out_path) _dev_pos_acc = get_pos_acc(dev_path, dev_out_path) logger.info("Evaluation dev Iter %08d " "pos-acc %5.2f" % (iter_i, _dev_pos_acc)) # parse test set and save to file for official evaluation test_out_path = 'test.iter%08d.conll' % iter_i test_out_path = os.path.join(output_dir, test_out_path) predict_and_save(dataset=test_dataset, model=model, dataset_path=test_path, out_path=test_out_path) _test_pos_acc = get_pos_acc(test_path, test_out_path) logger.info("Evaluation test Iter %08d " "pos-acc %5.2f" % (iter_i, _test_pos_acc)) if plateau: scheduler.step(_dev_pos_acc) if _dev_pos_acc > best_pos_acc: best_iter = iter_i best_pos_acc = _dev_pos_acc test_pos_acc = _test_pos_acc is_best = True else: is_best = False save_checkpoint( output_dir, { 'iter_i': iter_i, 'state_dict': model.state_dict(), 'best_iter': best_iter, 'test_pos_acc': test_pos_acc, 'optimizer': optimizer.state_dict(), }, False) logger.info("Done Training") logger.info( "Best model Iter %08d Dev POS-acc %12.4f Test POS-acc %12.4f " % (best_iter, best_pos_acc, test_pos_acc))
args = get_args()
torch.cuda.set_device(args.gpu)

inputs = data.Field(lower=args.lower)
answers = data.Field(sequential=False)

train, dev, test = datasets.SNLI.splits(inputs, answers)

inputs.build_vocab(train, dev, test)
if args.word_vectors:
    if os.path.isfile(args.vector_cache):
        inputs.vocab.vectors = torch.load(args.vector_cache)
    else:
        # FIXME: quick fix. Do bring the actual arguments
        inputs.vocab.load_vectors(vectors=GloVe(name='6B', dim=300))
        makedirs(os.path.dirname(args.vector_cache))
        torch.save(inputs.vocab.vectors, args.vector_cache)
answers.build_vocab(train)

train_iter, dev_iter, test_iter = data.BucketIterator.splits(
    (train, dev, test), batch_size=args.batch_size, device=args.gpu)

config = args
config.n_embed = len(inputs.vocab)
config.d_out = len(answers.vocab)
config.n_cells = config.n_layers

# double the number of cells for bidirectional networks
if config.birnn:
    config.n_cells *= 2
def to_cmdline_kwarg(key, value): if len(key) == 1: key = "-{}".format(key) else: key = "--{}".format(re.sub(r"_", "-", key)) value = str(value) return key, value kwargs_pairs = (to_cmdline_kwarg(key, value) for key, value in kwargs.items()) cmdline_args = list(sum(kwargs_pairs, ())) args = parser.parse_args(cmdline_args) VECTORS = { "GloVe": GloVe(name='6B', dim=300), # "FastText": FastText() } def create_data_loaders(args): LOG.info("importing IMDB dataset") train_dataset, eval_dataset, = \ data.make_imdb_dataset(args.total_num_labeled, VECTORS[args.vectors], args.exclude_unlabeled, args.seed, args.use_gpu) LOG.info("building torchtext iterators") if args.total_num_labeled == -1: train_iter = tdata.BucketIterator( dataset=train_dataset, batch_size=args.batch_size, sort_key=lambda x: len(x.text),
def load(self): if self.run_mode == 'word': WORD_MODE_FIELD = data.Field(sequential=True, lower=True, include_lengths=True, batch_first=True) self.field = WORD_MODE_FIELD if self.run_mode == 'bert': # Load the BERT tokenizer. print('Loading BERT tokenizer...') self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) self.max_input_length = self.tokenizer.max_model_input_sizes['bert-base-uncased'] def tokenize_and_cut(sentence): tokens = self.tokenizer.tokenize(sentence) # tokens = tokens[:max_input_length-2] return tokens BERT_FIELD = data.Field(batch_first = True, use_vocab = False, tokenize = tokenize_and_cut, preprocessing = self.tokenizer.convert_tokens_to_ids, init_token = self.tokenizer.cls_token_id, eos_token = self.tokenizer.sep_token_id, pad_token = self.tokenizer.pad_token_id, unk_token = self.tokenizer.unk_token_id) self.field = BERT_FIELD if self.run_mode == 'sentence': WORD_FIELD = data.Field(sequential=True, lower=True, tokenize='spacy') SENTENCE_FIELD = data.NestedField(WORD_FIELD, tokenize=sent_tokenize, include_lengths=True) self.field = SENTENCE_FIELD def split_start(x, y): idx = x[0].split(",")[0] if idx == 'None': return int(-1) else: return int(idx) process_start = data.Pipeline(split_start) START_INDEX = data.Field(sequential=True, postprocessing=process_start, use_vocab=False) def split_end(x, y): if len(x[0].split(",")) > 1: idx = x[0].split(",")[len(x[0].split(",")) - 1] if idx == 'None': return int(-1) else: return int(idx) else: idx = x[0].split(",")[0] if idx == 'None': return int(-1) else: return int(idx) process_end = data.Pipeline(split_end) END_INDEX = data.Field(sequential=True, postprocessing=process_end, use_vocab=False) def floor_label(x, y): return math.floor(float(x[0])) process_answerable = data.Pipeline(floor_label) ANSWERABLE = data.Field(sequential=True, postprocessing=process_answerable, use_vocab=False) if self.run_mode == 'word': col_dict = {'story_text': WORD_MODE_FIELD, 'question': WORD_MODE_FIELD, 'word_start_index_1': START_INDEX, 'word_end_index_1': END_INDEX, 'is_answer_absent': ANSWERABLE} elif self.run_mode == 'bert': col_dict = {'story_text': BERT_FIELD, 'question': BERT_FIELD, 'word_start_index_1': START_INDEX, 'word_end_index_1': END_INDEX, 'is_answer_absent': ANSWERABLE} elif self.run_mode == 'sentence': col_dict = {'story_text': SENTENCE_FIELD, 'question': WORD_FIELD, 'word_start_index_1': START_INDEX, 'word_end_index_1': END_INDEX, 'is_answer_absent': ANSWERABLE} def populateDatafields(somedf, col_dict): datafields = [] for col in somedf.columns: if col in col_dict.keys(): datafields.append((col, col_dict[col])) else: datafields.append((col, None)) return datafields newsqa_df = pd.read_csv(self.train_data_path) datafields = populateDatafields(newsqa_df, col_dict) print("Building Dataset...") self.training_data=data.TabularDataset(path = self.train_data_path,\ format = 'csv',\ fields = datafields,\ skip_header = True) self.validation_data=data.TabularDataset(path = self.val_data_path,\ format = 'csv',\ fields = datafields,\ skip_header = True) if self.verbose: count = 0 for t in self.training_data: print("*******************************") print("Story Text: ", len(t.story_text), t.story_text) print("Question: ", t.question) print("Start Index: ", t.word_start_index_1) print("End Index: ", t.word_end_index_1) print("Unanswerable: ", t.is_answer_absent) if count > 5: break count += 1 print("Building Vocab...") if self.run_mode == 'word': WORD_MODE_FIELD.build_vocab(self.training_data, 
self.validation_data, min_freq = 3, vectors=GloVe(name = '6B', dim = 300)) if self.verbose: print("Length of Vocab: ", len(WORD_MODE_FIELD.vocab)) elif self.run_mode == 'sentence': SENTENCE_FIELD.build_vocab(self.training_data, self.validation_data, min_freq = 3, vectors=GloVe(name = '6B', dim = 300)) if self.verbose: print("Length of Vocab: ", len(SENTENCE_FIELD.vocab)) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') print("Initializing the iterator...") # Define the train iterator self.train_iterator = data.BucketIterator( self.training_data, batch_size = self.batch_size, sort_key = lambda x: len(x.story_text), sort_within_batch = True, repeat=False, shuffle=True, device = device) self.val_iterator = data.BucketIterator( self.validation_data, batch_size = 1, sort_key = lambda x: len(x.story_text), sort_within_batch = False, sort=False, repeat=False, shuffle=False, device = device) if self.verbose: for batch in self.train_iterator: print("Story: ", batch.story_text[0].shape, batch.story_text[1].shape) print("Start/End: ", batch.word_start_index_1, batch.word_end_index_1, batch.is_answer_absent) break
    def init_GloVe(self, name, dim, cache=None):
        return GloVe(name, dim, cache=cache)
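# Hedged usage sketch (not from the original class): torchtext's GloVe downloads and
# caches the requested vectors on first use; individual words can then be looked up by
# key. The '6B'/300d choice below simply mirrors the other snippets in this file.
from torchtext.vocab import GloVe

vectors = GloVe(name='6B', dim=300)       # downloads/caches glove.6B.300d if not present
print(vectors['language'].shape)          # torch.Size([300])
print(vectors.stoi.get('language'))       # row index of 'language' in vectors.vectors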
tgt = TargetField(stop_words=stopwords)
max_len = 100


def len_filter(example):
    return len(example.src) <= max_len and len(example.tgt) <= max_len


train = torchtext.data.TabularDataset(
    path=opt.train_path, format='tsv',
    fields=[('src', src), ('tgt', tgt)],
    filter_pred=len_filter
)
dev = torchtext.data.TabularDataset(
    path=opt.dev_path, format='tsv',
    fields=[('src', src), ('tgt', tgt)],
    filter_pred=len_filter
)

src.build_vocab(train, max_size=50000, vectors=GloVe(name='6B', dim=300))
tgt.build_vocab(train, max_size=50000, vectors=GloVe(name='6B', dim=300))
input_vocab = src.vocab
output_vocab = tgt.vocab
print(len(train), len(dev))

# NOTE: If the source field name and the target field name
# are different from 'src' and 'tgt' respectively, they have
# to be set explicitly before any training or inference
# seq2seq.src_field_name = 'src'
# seq2seq.tgt_field_name = 'tgt'

# Prepare loss
weight = torch.ones(len(tgt.vocab))
pad = tgt.vocab.stoi[tgt.pad_token]
loss = Perplexity(weight, pad)
def __init__(self, args): path = '.data/squad' dataset_path = path + '/torchtext/' train_examples_path = dataset_path + 'train_examples.pt' dev_examples_path = dataset_path + 'dev_examples.pt' print("preprocessing data files...") if not os.path.exists(f'{path}/{args.train_file}l'): self.preprocess_file(f'{path}/{args.train_file}') if not os.path.exists(f'{path}/{args.dev_file}l'): self.preprocess_file(f'{path}/{args.dev_file}') self.RAW = data.RawField() # explicit declaration for torchtext compatibility self.RAW.is_target = False self.CHAR_NESTING = data.Field(batch_first=True, tokenize=list, lower=True) self.CHAR = data.NestedField(self.CHAR_NESTING, tokenize=word_tokenize) self.WORD = data.Field(batch_first=True, tokenize=word_tokenize, lower=True, include_lengths=True) self.LABEL = data.Field(sequential=False, unk_token=None, use_vocab=False) dict_fields = { 'id': ('id', self.RAW), 's_idx': ('s_idx', self.LABEL), 'e_idx': ('e_idx', self.LABEL), 'context': [('c_word', self.WORD), ('c_char', self.CHAR)], 'question': [('q_word', self.WORD), ('q_char', self.CHAR)] } list_fields = [('id', self.RAW), ('s_idx', self.LABEL), ('e_idx', self.LABEL), ('c_word', self.WORD), ('c_char', self.CHAR), ('q_word', self.WORD), ('q_char', self.CHAR)] if os.path.exists(dataset_path): print("loading splits...") train_examples = torch.load(train_examples_path) dev_examples = torch.load(dev_examples_path) self.train = data.Dataset(examples=train_examples, fields=list_fields) self.dev = data.Dataset(examples=dev_examples, fields=list_fields) else: print("building splits...") self.train, self.dev = data.TabularDataset.splits( path=path, train=f'{args.train_file}l', validation=f'{args.dev_file}l', format='json', fields=dict_fields) os.makedirs(dataset_path) torch.save(self.train.examples, train_examples_path) torch.save(self.dev.examples, dev_examples_path) #cut too long context in the training set for efficiency. if args.context_threshold > 0: self.train.examples = [ e for e in self.train.examples if len(e.c_word) <= args.context_threshold ] print("building vocab...") self.CHAR.build_vocab(self.train, self.dev) self.WORD.build_vocab(self.train, self.dev, vectors=GloVe(name='6B', dim=args.word_dim)) print("building iterators...") device = torch.device( f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu") self.train_iter, self.dev_iter = \ data.BucketIterator.splits((self.train, self.dev), batch_sizes=[args.train_batch_size, args.dev_batch_size], device=device, sort_key=lambda x: len(x.c_word))
def load_dataset(test_sen=None): print("in load_dataset") """ tokenizer : Breaks sentences into a list of words. If sequential=False, no tokenization is applied Field : A class that stores information about the way of preprocessing fix_length : An important property of TorchText is that we can let the input to be variable length, and TorchText will dynamically pad each sequence to the longest sequence in that "batch". But here we are using fi_length which will pad each sequence to have a fix length of 200. build_vocab : It will first make a vocabulary or dictionary mapping all the unique words present in the train_data to an idx and then after it will use GloVe word embedding to map the index to the corresponding word embedding. vocab.vectors : This returns a torch tensor of shape (vocab_size x embedding_dim) containing the pre-trained word embeddings. BucketIterator : Defines an iterator that batches examples of similar lengths together to minimize the amount of padding needed. """ # def tokenizer(text): # create a tokenizer function return [tok.text for tok in spacy_en.tokenizer(text)] TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True, fix_length=200) LABEL = data.Field(tensor_type=torch.FloatTensor, sequential=False) # tokenize = lambda x: x.split() # TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length=200) # LABEL = data.LabelField(tensor_type=torch.FloatTensor) # train_data, test_data = datasets.IMDB.splits(TEXT, LABEL) # print('data loaded') train_data, valid_data, test_data = data.TabularDataset.splits( path='../Github/Data/author_identification/', train='train.csv', validation='val.csv', test='test.csv', format='csv', fields=[('id', None), ('text', TEXT), ('author', LABEL)]) # # TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300)) # LABEL.build_vocab(train_data) TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300)) LABEL.build_vocab(train_data) word_embeddings = TEXT.vocab.vectors print("Length of Text Vocabulary: " + str(len(TEXT.vocab))) print("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size()) print("Label Length: " + str(len(LABEL.vocab))) from torchtext.data import Iterator, BucketIterator # train_iter, valid_iter = BucketIterator.splits( # (train_data, valid_data), # we pass in the datasets we want the iterator to draw data from # batch_size=64, # device=-1, # if you want to use the GPU, specify the GPU number here # sort_key=lambda x: len(x.text), # # the BucketIterator needs to be told what function it should use to group the data. # sort_within_batch=False, # repeat=False # we pass repeat=False because we want to wrap this Iterator layer. # ) # test_iter = Iterator(test_data, batch_size=64, device=-1, sort=False, sort_within_batch=False, repeat=False) # train_data, valid_data = train_data.split() # Further splitting of training_data to create new training_data & validation_data train_iter, valid_iter, test_iter = data.BucketIterator.splits( (train_data, valid_data, test_data), batch_size=32, sort_key=lambda x: len(x.text), repeat=False, shuffle=True) '''Alternatively we can also use the default configurations''' # train_iter, test_iter = datasets.IMDB.iters(batch_size=32) vocab_size = len(TEXT.vocab) return TEXT, LABEL, vocab_size, word_embeddings, train_iter, valid_iter, test_iter
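# Hedged usage sketch (not part of the original loader): the `word_embeddings` tensor
# returned above is typically copied into an nn.Embedding so the model starts from the
# pretrained GloVe rows. Variable names here are illustrative.
import torch.nn as nn

TEXT, LABEL, vocab_size, word_embeddings, train_iter, valid_iter, test_iter = load_dataset()
embedding = nn.Embedding(vocab_size, 300)
embedding.weight.data.copy_(word_embeddings)      # rows align with TEXT.vocab.stoi
# or, equivalently: embedding = nn.Embedding.from_pretrained(word_embeddings, freeze=False)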
def train(data_path, train_path, val_path, test_path, hidden_size, num_classes, num_layers, num_dir, batch_size, emb_dim, dropout, net_type, embfix): print('Training...') # define fields TEXT = data.Field(lower=True, init_token="<start>", eos_token="<end>") LABEL = data.Field(sequential=False, unk_token=None) # build dataset splits train, val, test = data.TabularDataset.splits(path=data_path, train=train_path, validation=val_path, test=test_path, format='tsv', fields=[('text', TEXT), ('label', LABEL)]) # build vocabs TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=emb_dim), min_freq=2) prevecs = TEXT.vocab.vectors #TEXT.build_vocab(train, min_freq=3) LABEL.build_vocab(train) # build iterators train_iter = data.BucketIterator(train, batch_size=batch_size, sort_key=lambda x: len(x.text), train=True) val_iter = data.Iterator(val, batch_size=batch_size, repeat=False, train=False, sort=False, shuffle=False) test_iter = data.Iterator(test, batch_size=batch_size, repeat=False, train=False, sort=False, shuffle=False) # print info print(max(LABEL.vocab.freqs.values())) print('num_classes: ', len(LABEL.vocab)) print('input_size: ', len(TEXT.vocab)) print('majority class acc:', max(LABEL.vocab.freqs.values()) / len(train)) print('random guess acc:', (max(LABEL.vocab.freqs.values()) / len(train))**2 + (min(LABEL.vocab.freqs.values()) / len(train))**2) num_classes = len(LABEL.vocab) input_size = len(TEXT.vocab) model = RNN(input_size=input_size, hidden_size=hidden_size, num_classes=num_classes, prevecs=prevecs, num_layers=num_layers, num_dir=num_dir, batch_size=batch_size, emb_dim=emb_dim, embfix=embfix, dropout=dropout, net_type=net_type) epochs = 100 criterion = nn.CrossEntropyLoss() optimizer = torch.optim.Adamax(model.parameters()) #optimizer = torch.optim.SGD(model.parameters(),lr=0.1, momentum=0.5) if int(torch.cuda.is_available()) == 1: model = model.cuda() # train model.train() best_val_acc = 0 for e in range(epochs): print('Epoch:', e) tot_loss = 0 corrects = 0 train_iter.repeat = False for batch_count, batch in enumerate(train_iter): #print('Batch:', batch_count) #print(batch.text) #print(batch.label) model.zero_grad() inp = batch.text.t() preds = model(inp) target = batch.label #print(preds, batch.label) loss = criterion(preds, batch.label) loss.backward() optimizer.step() _, preds = torch.max(preds, 1) corrects += int(preds.data.eq(target.data).sum()) tot_loss += loss.data[0] print('acc (train):', 100 * corrects / len(train_iter.dataset)) print('loss (train):', tot_loss) val_acc, _, val_loss = evaluate(val_iter, model, TEXT, LABEL) print('acc (val):', val_acc) print('loss (val):', val_loss) if val_acc > best_val_acc: test_acc, test_preds, test_loss = evaluate(test_iter, model, TEXT, LABEL) #print('Test acc:', test_acc) f = open('./preds/preds_' + str(e) + '.txt', 'w') for x in test_preds: f.write(str(int(x)) + '\n') f.close() torch.save(model.state_dict(), './models/e' + str(e) + '_' + str(val_acc) + '.pt')
# We'll use NestedField to tokenize each word into list of chars
CHAR_NESTING = data.Field(tokenize=list, init_token="<bos>", eos_token="<eos>")
CHAR = data.NestedField(CHAR_NESTING, init_token="<bos>", eos_token="<eos>")

fields = [(('word', 'char'), (WORD, CHAR)), (None, None), ('ptbtag', PTB_TAG)]
train, val, test = datasets.UDPOS.splits(fields=fields)

print(train.fields)
print(len(train))
print(vars(train[0]))

WORD.build_vocab(train.word, val.word, test.word, vectors=[GloVe(name='6B', dim='300')])
CHAR.build_vocab(train.char, val.char, test.char)
PTB_TAG.build_vocab(train.ptbtag)
print(CHAR.vocab.freqs)

train_iter, val_iter = data.BucketIterator.splits((train, val), batch_size=3)

batch = next(iter(train_iter))
print("words", batch.word)
print("chars", batch.char)
print("ptbtags", batch.ptbtag)

# Using the CoNLL 2000 Chunking dataset:
INPUTS = data.Field(init_token="<bos>", eos_token="<eos>")
CHUNK_TAGS = data.Field(init_token="<bos>", eos_token="<eos>")
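# Hedged illustration (not in the original example): with a NestedField, batch.char is a
# 3-D LongTensor of shape (batch_size, max_sentence_len, max_word_len), so a character
# level encoder can embed it one word at a time. The exact sizes depend on the batch.
print("char tensor shape:", batch.char.shape)   # e.g. torch.Size([3, 12, 9])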
import torch
import torch.nn as tnn
import torch.optim as topti
from torchtext import data
from torchtext.vocab import GloVe

from imdb_dataloader import IMDB  # local dataset helper, as imported in the training script above

textField = data.Field(lower=True, include_lengths=True, batch_first=True)
labelField = data.Field(sequential=False)

train, dev = IMDB.splits(textField, labelField, train="train", validation="dev")
print(textField)

textField.build_vocab(train, dev, vectors=GloVe(name="6B", dim=50))
labelField.build_vocab(train, dev)

trainLoader, testLoader = data.BucketIterator.splits(
    (train, dev), shuffle=True, batch_size=64,
    sort_key=lambda x: len(x.text), sort_within_batch=True)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# j = 0
# for i, batch in enumerate(trainLoader):
#     # Get a batch and potentially send it to GPU memory.
#     inputs, length, labels = textField.vocab.vectors[batch.text[0]].to(device), batch.text[1].to(
#         device), batch.label.type(torch.FloatTensor).to(device)
def return_data(args): name = args.dataset root = args.root batch_size = args.batch_size data_loader = dict() device = 0 if args.cuda else -1 if name in ['mnist', 'MNIST']: transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.5, ), (0.5, )), ]) train_kwargs = { 'root': root, 'mode': 'train', 'transform': transform, 'download': True, 'load_pred': args.load_pred, 'model_name': args.model_name } valid_kwargs = { 'root': root, 'mode': 'valid', 'transform': transform, 'download': True, 'load_pred': args.load_pred, 'model_name': args.model_name } test_kwargs = { 'root': root, 'mode': 'test', 'transform': transform, 'download': False, 'load_pred': args.load_pred, 'model_name': args.model_name } dset = MNIST_modified train_data = dset(**train_kwargs) valid_data = dset(**valid_kwargs) test_data = dset(**test_kwargs) # data loader num_workers = 0 train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=num_workers, drop_last=True, pin_memory=True) valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=False, num_workers=num_workers, drop_last=False, pin_memory=True) test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=num_workers, drop_last=False, pin_memory=True) data_loader[ 'x_type'] = torch.cuda.FloatTensor if args.cuda else torch.FloatTensor data_loader[ 'y_type'] = torch.cuda.LongTensor if args.cuda else torch.LongTensor elif name in ['imdb', 'IMDB']: embedding_dim = 100 max_total_num_words = 20000 text = data.Field(tokenize=tokenizer_twolevel, batch_first=True) label = data.Field(lower=True) label_pred = data.Field(use_vocab=False, fix_length=1) fname = data.Field(use_vocab=False, fix_length=1) train, valid, test = IMDB_modified.splits(text, label, label_pred, fname, root=root, model_name=args.model_name, load_pred=args.load_pred) print("build vocab...") text.build_vocab(train, vectors=GloVe(name='6B', dim=embedding_dim, cache=root), max_size=max_total_num_words) label.build_vocab(train) #label_pred.build_vocab(train) print("Create Iterator objects for multiple splits of a dataset...") train_loader, valid_loader, test_loader = data.Iterator.splits( (train, valid, test), batch_size=batch_size, device=device, repeat=False) data_loader['word_idx'] = text.vocab.itos data_loader[ 'x_type'] = torch.cuda.LongTensor if args.cuda else torch.LongTensor data_loader[ 'y_type'] = torch.cuda.LongTensor if args.cuda else torch.LongTensor data_loader['max_total_num_words'] = max_total_num_words data_loader['embedding_dim'] = embedding_dim data_loader['max_num_words'] = 50 #_, (text, _, _, _) = next(iter(train_loader)) data_loader['max_num_sents'] = int( next(iter(train_loader)).text.size(-1) / data_loader['max_num_words']) else: raise UnknownDatasetError() data_loader['train'] = train_loader data_loader['valid'] = valid_loader data_loader['test'] = test_loader return data_loader
def main(): # Use a GPU if available, as it should be faster. device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print("Using device: " + str(device)) # Load the training dataset, and create a data loader to generate a batch. textField = PreProcessing.text_field labelField = data.Field(sequential=False) #print(1234) train, dev = IMDB.splits(textField, labelField, train="train", validation="dev") #print(1234) textField.build_vocab(train, dev, vectors=GloVe(name="6B", dim=50)) labelField.build_vocab(train, dev) trainLoader, testLoader = data.BucketIterator.splits((train, dev), shuffle=True, batch_size=64, sort_key=lambda x: len(x.text), sort_within_batch=True) net = Network().to(device) criterion =lossFunc() optimiser = topti.Adam(net.parameters(), lr=0.001) # Minimise the loss using the Adam algorithm. for epoch in range(10): running_loss = 0 for i, batch in enumerate(trainLoader): # Get a batch and potentially send it to GPU memory. inputs, length, labels = textField.vocab.vectors[batch.text[0]].to(device), batch.text[1].to( device), batch.label.type(torch.FloatTensor).to(device) labels -= 1 # PyTorch calculates gradients by accumulating contributions to them (useful for # RNNs). Hence we must manually set them to zero before calculating them. optimiser.zero_grad() # Forward pass through the network. output = net(inputs, length) loss = criterion(output, labels) # Calculate gradients. loss.backward() # Minimise the loss according to the gradient. optimiser.step() running_loss += loss.item() if i % 32 == 31: print("Epoch: %2d, Batch: %4d, Loss: %.3f" % (epoch + 1, i + 1, running_loss / 32)) running_loss = 0 num_correct = 0 # Save mode torch.save(net.state_dict(), "./model.pth") print("Saved model") # Evaluate network on the test dataset. We aren't calculating gradients, so disable autograd to speed up # computations and reduce memory usage. with torch.no_grad(): for batch in testLoader: # Get a batch and potentially send it to GPU memory. inputs, length, labels = textField.vocab.vectors[batch.text[0]].to(device), batch.text[1].to( device), batch.label.type(torch.FloatTensor).to(device) labels -= 1 # Get predictions outputs = torch.sigmoid(net(inputs, length)) predicted = torch.round(outputs) num_correct += torch.sum(labels == predicted).item() accuracy = 100 * num_correct / len(dev) print(f"Classification accuracy: {accuracy}")
def conll2003_dataset(tag_type, batch_size, root='./conll2003', train_file='eng.train.txt', validation_file='eng.testa.txt', test_file='eng.testb.txt', convert_digits=True): """ conll2003: Conll 2003 (Parser only. You must place the files) Extract Conll2003 dataset using torchtext. Applies GloVe 6B.200d and Char N-gram pretrained vectors. Also sets up per word character Field Parameters: tag_type: Type of tag to pick as task [pos, chunk, ner] batch_size: Batch size to return from iterator root: Dataset root directory train_file: Train filename validation_file: Validation filename test_file: Test filename convert_digits: If True will convert numbers to single 0's Returns: A dict containing: task: 'conll2003.' + tag_type iters: (train iter, validation iter, test iter) vocabs: (Inputs word vocabulary, Inputs character vocabulary, Tag vocabulary ) """ # Setup fields with batch dimension first inputs_word = data.Field(init_token="<bos>", eos_token="<eos>", batch_first=True, lower=True) inputs_char_nesting = data.Field(tokenize=list, init_token="<bos>", eos_token="<eos>", batch_first=True) inputs_char = data.NestedField(inputs_char_nesting, init_token="<bos>", eos_token="<eos>") labels = data.Field(init_token="<bos>", eos_token="<eos>", batch_first=True) fields = ([(('inputs_word', 'inputs_char'), (inputs_word, inputs_char))] + [('labels', labels) if label == tag_type else (None, None) for label in ['pos', 'chunk', 'ner']]) # Load the data train, val, test = SequenceTaggingDataset.splits( path=root, train=train_file, validation=validation_file, test=test_file, separator=' ', fields=tuple(fields)) # Build vocab inputs_char.build_vocab(train.inputs_char, val.inputs_char, test.inputs_char) inputs_word.build_vocab(train.inputs_word, val.inputs_word, test.inputs_word, max_size=50000, vectors=[GloVe(name='6B', dim='200'), CharNGram()]) labels.build_vocab(train.labels) # Get iterators train_iter, val_iter, test_iter = data.BucketIterator.splits( (train, val, test), batch_size=batch_size, device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")) train_iter.repeat = False return { 'task': 'conll2003.%s' % tag_type, 'iters': (train_iter, val_iter, test_iter), 'vocabs': (inputs_word.vocab, inputs_char.vocab, labels.vocab) }
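# Hedged usage sketch (not part of the original module): unpacking the dict returned by
# conll2003_dataset, assuming the default file layout declared in its signature.
ner_data = conll2003_dataset('ner', batch_size=32, root='./conll2003')
train_iter, val_iter, test_iter = ner_data['iters']
word_vocab, char_vocab, tag_vocab = ner_data['vocabs']
print(ner_data['task'], len(word_vocab), len(tag_vocab))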
def train(**args): params = EasyDict(args) params.gpu = int(params.gpu) config = ConfigParser() config.read('config.ini') if params.datasets == ['all']: params.datasets = ['imdb', 'amazon', 'yelp', 'rottentomatoes', 'hotel'] is_tokenizer_length_dataset_specific = Models(params.model) == Models.distilbert and ( params.tokenizer_length is None or params.tokenizer_length) is_number_prototypes_dataset_specific = Models(params.model) == Models.protoconv and ( params.pc_number_of_prototypes is None or params.pc_number_of_prototypes == -1) is_sep_loss_dataset_specific = Models(params.model) == Models.protoconv and ( params.pc_sep_loss_weight is None or params.pc_sep_loss_weight == -1) if_ce_loss_dataset_specific = Models(params.model) == Models.protoconv and ( params.pc_ce_loss_weight is None or params.pc_ce_loss_weight == -1) for dataset in params.datasets: params.data_set = dataset seed_everything(params.seed) if is_tokenizer_length_dataset_specific: params.tokenizer_length = dataset_tokens_length[params.data_set] if is_number_prototypes_dataset_specific: params.pc_number_of_prototypes = dataset_to_number_of_prototypes[params.data_set] if is_sep_loss_dataset_specific: params.pc_sep_loss_weight = dataset_to_separation_loss[params.data_set] if if_ce_loss_dataset_specific: weight = 1 - (params.pc_cls_loss_weight + params.pc_sep_loss_weight + params.pc_l1_loss_weight) assert weight > 0, f'Weight {weight} of cross entropy loss cannot be less or equal to 0' params.pc_ce_loss_weight = weight logger = DummyLogger() if params.logger: comet_config = EasyDict(config['cometml']) project_name = params.project_name if params.project_name else comet_config.projectname logger = CometLogger(api_key=comet_config.apikey, project_name=project_name, workspace=comet_config.workspace) # logger.experiment.log_code(folder='src') logger.log_hyperparams(params) base_callbacks = [LearningRateMonitor(logging_interval='epoch')] df_dataset = pd.read_csv(f'data/{params.data_set}/tokenized_data.csv') n_splits = get_n_splits(dataset=df_dataset, x_label='text', y_label='label', folds=params.fold) log_splits(n_splits, logger) embeddings = GloVe('42B', cache=params.cache) if Models(params.model) != Models.distilbert else None best_models_scores, number_of_prototypes = [], [] for fold_id, (train_index, val_index, test_index) in enumerate(n_splits): i = str(fold_id) model_checkpoint = ModelCheckpoint( filepath='checkpoints/fold_' + i + '_{epoch:02d}-{val_loss_' + i + ':.4f}-{val_acc_' + i + ':.4f}', save_weights_only=True, save_top_k=1, monitor='val_acc_' + i, period=params.pc_project_prototypes_every_n ) early_stop = EarlyStopping(monitor=f'val_loss_{i}', patience=10, verbose=True, mode='min', min_delta=0.005) callbacks = deepcopy(base_callbacks) + [model_checkpoint, early_stop] lit_module = model_to_litmodule[params.model] train_df, valid_df = df_dataset.iloc[train_index + val_index], df_dataset.iloc[test_index] model, train_loader, val_loader, *utils = lit_module.from_params_and_dataset(train_df, valid_df, params, fold_id, embeddings) trainer = Trainer(auto_lr_find=params.find_lr, logger=logger, max_epochs=params.epoch, callbacks=callbacks, gpus=params.gpu, deterministic=True, fast_dev_run=params.fast_dev_run, num_sanity_val_steps=0) trainer.tune(model, train_dataloader=train_loader, val_dataloaders=val_loader) trainer.fit(model, train_dataloader=train_loader, val_dataloaders=val_loader) for absolute_path in model_checkpoint.best_k_models.keys(): logger.experiment.log_model(Path(absolute_path).name, absolute_path) if 
model_checkpoint.best_model_score: best_models_scores.append(model_checkpoint.best_model_score.tolist()) logger.log_metrics({'best_model_score_' + i: model_checkpoint.best_model_score.tolist()}, step=0) if Models(params.model) == Models.protoconv and model_checkpoint.best_model_path: best_model = lit_module.load_from_checkpoint(model_checkpoint.best_model_path) saved_number_of_prototypes = sum(best_model.enabled_prototypes_mask.tolist()) number_of_prototypes.append(saved_number_of_prototypes) logger.log_hyperparams({ f'saved_prototypes_{fold_id}': saved_number_of_prototypes, f'best_model_path_{fold_id}': str(Path(model_checkpoint.best_model_path).name) }) if params.pc_visualize: data_visualizer = DataVisualizer(best_model) logger.experiment.log_html(f'<h1>Split {fold_id}</h1><br> <h3>Prototypes:</h3><br>' f'{data_visualizer.visualize_prototypes()}<br>') logger.experiment.log_figure(f'Prototypes similarity_{fold_id}', data_visualizer.visualize_similarity().figure) logger.experiment.log_html(f'<h3>Random prediction explanations:</h3><br>' f'{data_visualizer.visualize_random_predictions(val_loader, n=15)}') if len(best_models_scores) >= 1: avg_best, std_best = float(np.mean(np.array(best_models_scores))), float( np.std(np.array(best_models_scores))) table_entry = f'{avg_best:.3f} ($\pm${std_best:.3f})' logger.log_hyperparams({ 'avg_best_scores': avg_best, 'std_best_scores': std_best, 'table_entry': table_entry }) if len(number_of_prototypes) >= 1: logger.log_hyperparams({'avg_saved_prototypes': float(np.mean(np.array(number_of_prototypes)))}) logger.experiment.end()
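get_n_splits is not defined in this snippet; the fold loop only assumes it returns one (train_idx, val_idx, test_idx) triple per fold and that the index collections support `+` concatenation (i.e. plain Python lists). A hedged sketch of one possible implementation with scikit-learn's StratifiedKFold follows; the function body, the equal val/test sizing, and the fixed random_state are assumptions, not the project's actual helper.

from sklearn.model_selection import StratifiedKFold

def get_n_splits(dataset, x_label, y_label, folds):
    """Hypothetical sketch only: yield (train_idx, val_idx, test_idx) lists per fold."""
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=0)
    splits = []
    for train_val_idx, test_idx in skf.split(dataset[x_label], dataset[y_label]):
        # carve a validation chunk (same size as the test fold) out of the
        # remaining indices; the rest is used for training
        val_size = len(test_idx)
        val_idx, train_idx = train_val_idx[:val_size], train_val_idx[val_size:]
        splits.append((train_idx.tolist(), val_idx.tolist(), test_idx.tolist()))
    return splits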
def load_dataset(test_sen=None):
    """
    tokenizer : Breaks sentences into a list of words. If sequential=False, no tokenization is applied.
    Field : A class that stores information about the way of preprocessing.
    fix_length : TorchText can let the input be variable length and will dynamically pad each
        sequence to the longest sequence in that batch. Here we instead use fix_length, which pads
        every sequence to the same fixed length.
    build_vocab : First builds a vocabulary mapping every unique word in train_data to an index,
        then uses the GloVe word embeddings to map each index to the corresponding word vector.
    vocab.vectors : Returns a torch tensor of shape (vocab_size x embedding_dim) containing the
        pre-trained word embeddings.
    BucketIterator : Defines an iterator that batches examples of similar lengths together to
        minimize the amount of padding needed.
    """
    # TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True,
    #                   batch_first=True, fix_length=200)
    # `dtype` replaces the `tensor_type` argument removed in newer torchtext releases;
    # device placement is handled by the iterator, not by the Field.
    LABEL = data.LabelField(dtype=torch.float)
    INDEX = data.Field(dtype=torch.long)  # defined but unused in the fields mapping below
    TEXT = data.Field(sequential=True, fix_length=20000, tokenize=tokenizer,
                      pad_first=True, lower=True, batch_first=True)

    train_data, test_data = data.TabularDataset.splits(
        path='.', format='csv', skip_header=True,
        train='blogs_training.csv', test='blogs_testing.csv',
        fields=[('index', None), ('text', TEXT), ('fileIndex', None), ('label', LABEL),
                ('age', None), ('industry', None), ('hscope', None)])
    # train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

    TEXT.build_vocab(train_data, vectors=GloVe(name='twitter.27B', dim=100))
    LABEL.build_vocab(train_data)
    with open("TEXT.pickle", "wb") as f:
        pickle.dump(TEXT, f)

    word_embeddings = TEXT.vocab.vectors
    print("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
    print("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print("Label Length: " + str(len(LABEL.vocab)))

    # Further split training_data to create new training_data & validation_data
    train_data, valid_data = train_data.split()
    train_iter, valid_iter, test_iter = data.BucketIterator.splits(
        (train_data, valid_data, test_data), batch_size=32,
        sort_key=lambda x: len(x.text), repeat=False, shuffle=True)
    '''Alternatively we can also use the default configurations'''
    # train_iter, test_iter = datasets.IMDB.iters(batch_size=32)

    vocab_size = len(TEXT.vocab)
    return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter
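A short usage sketch for this loader, assuming blogs_training.csv / blogs_testing.csv exist in the working directory and a `tokenizer` callable is defined as above; the embedding layer shown here is only one common way to consume the returned GloVe vectors.

# Usage sketch: build an embedding layer from the returned GloVe vectors and
# read one batch from the training iterator.
TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter = load_dataset()

embedding = torch.nn.Embedding(vocab_size, word_embeddings.size(1))
embedding.weight.data.copy_(word_embeddings)   # initialise from GloVe twitter.27B (100-d)

for batch in train_iter:
    text = batch.text              # (batch, fix_length) token indices
    labels = batch.label           # float labels from the LabelField
    embedded = embedding(text)     # (batch, fix_length, 100)
    break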
# Fields must be defined before they are referenced in the `fields` mapping.
INPUT = data.Field(fix_length=50, batch_first=False)
LABEL = data.Field(sequential=False)

fields = {'label': ('label', LABEL), 'input': ('input', INPUT)}
train, valid = data.TabularDataset.splits(path='/jet/prs/workspace',
                                          train='train.json', test='valid.json',
                                          format='json', fields=fields)

# In[8]:
print(vars(train[0]))

# In[9]:
INPUT.build_vocab(train, vectors=GloVe(name='6B', dim=300),
                  max_size=10000, min_freq=10)
LABEL.build_vocab(train)

# In[ ]:
# FOR DEBUGGING ONLY
# print(INPUT.vocab.freqs)
# print(INPUT.vocab.vectors)
# print(INPUT.vocab.stoi)
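The snippet stops after building the vocab; a minimal follow-up sketch for constructing iterators over the two TabularDatasets is shown below. The batch size, device handling, and sort key are assumptions chosen to match the fields defined above.

# Sketch: bucketed iterators over the train/valid splits built above.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iter, valid_iter = data.BucketIterator.splits(
    (train, valid),
    batch_size=64,
    sort_key=lambda ex: len(ex.input),   # bucket examples by input length
    sort_within_batch=False,
    device=device)

for batch in train_iter:
    inputs, labels = batch.input, batch.label   # inputs: (fix_length, batch) since batch_first=False
    break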
inputs = data.Field(lower=True, include_lengths=True, batch_first=True)

print('Generating train, dev, test splits')
train, dev, test = datasets.IWSLT.splits(root=args.data, exts=['.en', '.de'],
                                         fields=[inputs, inputs])
train_iter, dev_iter, test_iter = data.Iterator.splits(
    (train, dev, test), batch_size=100,
    device=torch.device(args.device) if args.device >= 0 else None)

print('Building vocabulary')
inputs.build_vocab(train, dev, test)
inputs.vocab.load_vectors(vectors=GloVe(name='840B', dim=300, cache=args.embeddings))

outputs_last_layer_cove = MTLSTM(n_vocab=len(inputs.vocab),
                                 vectors=inputs.vocab.vectors)
outputs_both_layer_cove = MTLSTM(n_vocab=len(inputs.vocab),
                                 vectors=inputs.vocab.vectors, layer0=True)
outputs_both_layer_cove_with_glove = MTLSTM(n_vocab=len(inputs.vocab),
                                            vectors=inputs.vocab.vectors,
                                            layer0=True, residual_embeddings=True)

if args.device >= 0:
    outputs_last_layer_cove.cuda()
    outputs_both_layer_cove.cuda()
    outputs_both_layer_cove_with_glove.cuda()
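A forward-pass sketch for the three CoVe encoders built above. It assumes the cove package's MTLSTM forward accepts (token_ids, lengths), which matches the include_lengths=True field, and that the IWSLT splits expose the source side as batch.src (the default name when plain fields are passed); the shape comments are indicative only.

# Sketch: run one batch through each CoVe variant.
for batch in train_iter:
    tokens, lengths = batch.src                                        # (ids, lengths) pair
    cove_last = outputs_last_layer_cove(tokens, lengths)               # top-layer CoVe features
    cove_both = outputs_both_layer_cove(tokens, lengths)               # both BiLSTM layers
    cove_glove = outputs_both_layer_cove_with_glove(tokens, lengths)   # with GloVe residual
    print(cove_last.shape, cove_both.shape, cove_glove.shape)
    break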
def main(): patience_counter = 0 # handle and display arguments args = parser.parse_args() pprint.PrettyPrinter().pprint(args.__dict__) # handling timestamp: cur_date = datetime.now() now_str = '%d-%d-%d_%d:%d' % (cur_date.year, cur_date.month, cur_date.day, cur_date.hour, cur_date.minute) model_path, learning_curve_path, roc_curve_path, conf_mat_path, norm_conf_mat_path, args_path, roc_curve_path_ext_swapped ,roc_curve_path_ext_regular = set_plots_model_names( now_str, args) args.norm_conf_mat_path = norm_conf_mat_path args.roc_curve_path_ext_swapped = roc_curve_path_ext_swapped args.roc_curve_path_ext_regular = roc_curve_path_ext_regular # handle cuda usage args.use_cuda = args.yes_cuda > 0 and torch.cuda.is_available() device = torch.device("cuda" if args.use_cuda else "cpu") # set a seed to ensure deterministic start torch.manual_seed(args.seed) if args.use_cuda: torch.cuda.manual_seed(args.seed) # print type of execution print('CUDA device_count {0}'.format(torch.cuda.device_count()) if args.use_cuda else 'CPU') # to get the right dataset if args.task_type == "nsp": train, val, test, TEXT, LABELS = get_nsp_dataset(args) elif args.task_type == "snli": train, val, test, TEXT, LABELS = get_snli_dataset(args) # create batches: train_iter, val_iter, test_iter = data.BucketIterator.splits( (train, val, test), batch_sizes=(args.batch_size, args.batch_size, args.batch_size), sort_key=lambda x: len(x.premise), device=device, repeat=False) TEXT.build_vocab(train, vectors=GloVe(name='840B', dim=args.embedding_dim)) LABELS.build_vocab(train) print('#examples', len(train_iter.dataset), len(val_iter.dataset), len(test_iter.dataset)) model = LSTM_for_SNLI(args, TEXT, LABELS).to(device) optimizer = optim.Adam(model.req_grad_params, lr=args.lr, betas=(0.9, 0.999), amsgrad=True) loss_func = nn.CrossEntropyLoss().to(device) best_loss = float('inf') best_valid_acc = float('-inf') best_acc = 0. best_epoch = 0 test_losses = [] test_accuracies = [] valid_losses = [] valid_accuracies = [] train_losses = [] train_accuracies = [] is_last_one = False for epoch in range(1, args.epochs + 1): train_loss, train_acc = train_epoch(device, train_iter, model, epoch, optimizer, loss_func, args) train_losses.append( train_loss) train_accuracies.append( train_acc) valid_loss, valid_acc = evaluate_epoch(device, val_iter, model, epoch, loss_func, 'Valid', args) valid_losses.append(valid_loss) valid_accuracies.append(valid_acc) if valid_acc >= best_valid_acc: patience_counter = 0 best_valid_acc = valid_acc else: patience_counter += 1 if valid_loss < best_loss: best_loss = valid_loss best_acc = valid_acc best_epoch = epoch print('\tLowest Valid Loss {:.6f}, Acc. {:.1f}%, Epoch {}'. format(best_loss, 100 * best_acc, best_epoch)) if patience_counter > args.patience: is_last_one = True iter_test_loss, iter_test_accuracy = evaluate_epoch(device, test_iter, model, epoch, loss_func, 'Test', args, finish=is_last_one) test_losses.append(iter_test_loss) test_accuracies.append(iter_test_accuracy) # forced finish in case of overfitting if patience_counter > args.patience: print('Training terminated: PATIENCE exceeded') break # learning rate decay for param_group in optimizer.param_groups: print('lr: {:.6f} -> {:.6f}' .format(param_group['lr'], param_group['lr'] * args.lr_decay)) param_group['lr'] *= args.lr_decay # draw_results draw_learning_curve(train_accuracies, valid_accuracies, path=learning_curve_path) # works with torchtext bigger than 0.4.0 amd on dtu server there is 0.3.1 # ask admins? 
# Save model #if args.save_model: # torch.save(model, model_path) # print('Model saved: ', str(model_path)) # external dataset evaluation: if args.eed == True: evaluation_iter, eval_dataset = load_evaluation_dataset(TEXT, LABELS, path=args.eed_regular, ) test_targs, test_preds, raw_outputs_class_one, raw_outputs_class_two = [], [], [], [] ### Evaluate test set model.eval() for batch_idx, batch in enumerate(evaluation_iter): output = model(batch.premise[0], batch.premise[1], batch.hypothesis[0], batch.hypothesis[1]) preds = torch.max(output, 1)[1] if (args.use_cuda): raw_outputs_class_one += list(get_numpy(output[:, 0])) raw_outputs_class_two += list(get_numpy(output[:, 1])) test_targs += list(get_numpy(batch.label)) test_preds += list(get_numpy(preds.data)) else: raw_outputs_class_one += list(get_numpy(output[:, 0])) raw_outputs_class_two += list(get_numpy(output[:, 1])) test_preds += list(preds.data.numpy()) test_targs += list(batch.label.numpy()) test_accuracy = accuracy_score(test_targs, test_preds) print("\nEvaluation set Acc: %f" % (test_accuracy)) print('size of evaluation dataset: %d sentence pairs' % (len(eval_dataset))) y_test_preds = np.array(raw_outputs_class_two) y_test_targs = np.array(test_targs) draw_roc_curve(y_test_preds, y_test_targs, path=args.roc_curve_path_ext_regular) if args.eed == True: evaluation_iter, eval_dataset = load_evaluation_dataset(TEXT, LABELS, path=args.eed_swapped) test_targs, test_preds, raw_outputs_class_one, raw_outputs_class_two = [], [], [], [] ### Evaluate test set model.eval() for batch_idx, batch in enumerate(evaluation_iter): output = model(batch.premise[0], batch.premise[1], batch.hypothesis[0], batch.hypothesis[1]) preds = torch.max(output, 1)[1] if (args.use_cuda): raw_outputs_class_one += list(get_numpy(output[:, 0])) raw_outputs_class_two += list(get_numpy(output[:, 1])) test_targs += list(get_numpy(batch.label)) test_preds += list(get_numpy(preds.data)) else: raw_outputs_class_one += list(get_numpy(output[:, 0])) raw_outputs_class_two += list(get_numpy(output[:, 1])) test_preds += list(preds.data.numpy()) test_targs += list(batch.label.numpy()) test_accuracy = accuracy_score(test_targs, test_preds) print("\nEvaluation set Acc: %f" % (test_accuracy)) print('size of evaluation dataset: %d sentence pairs' % (len(eval_dataset))) y_test_preds = np.array(raw_outputs_class_two) y_test_targs = np.array(test_targs) draw_roc_curve(y_test_preds, y_test_targs, path=args.roc_curve_path_ext_swapped ) if args.test_con == True: evaluation_iter, eval_dataset = load_evaluation_dataset(TEXT, LABELS, path=args.con_data_path) test_targs, test_preds, raw_outputs_class_one, raw_outputs_class_two = [], [], [], [] ### Evaluate test set model.eval() for batch_idx, batch in enumerate(evaluation_iter): output = model(batch.premise[0], batch.premise[1], batch.hypothesis[0], batch.hypothesis[1]) preds = torch.max(output, 1)[1] if (args.use_cuda): raw_outputs_class_one += list(get_numpy(output[:, 0])) raw_outputs_class_two += list(get_numpy(output[:, 1])) test_targs += list(get_numpy(batch.label)) test_preds += list(get_numpy(preds.data)) else: raw_outputs_class_one += list(get_numpy(output[:, 0])) raw_outputs_class_two += list(get_numpy(output[:, 1])) test_preds += list(preds.data.numpy()) test_targs += list(batch.label.numpy()) test_accuracy = accuracy_score(test_targs, test_preds) print("\nEvaluation of Consecutive data set Acc: %f" % (test_accuracy)) print('size of evaluation dataset: %d sentence pairs' % (len(eval_dataset))) if args.test_rand == True: 
evaluation_iter, eval_dataset = load_evaluation_dataset(TEXT, LABELS, path=args.rand_data_path) test_targs, test_preds, raw_outputs_class_one, raw_outputs_class_two = [], [], [], [] ### Evaluate test set model.eval() for batch_idx, batch in enumerate(evaluation_iter): output = model(batch.premise[0], batch.premise[1], batch.hypothesis[0], batch.hypothesis[1]) preds = torch.max(output, 1)[1] if (args.use_cuda): raw_outputs_class_one += list(get_numpy(output[:, 0])) raw_outputs_class_two += list(get_numpy(output[:, 1])) test_targs += list(get_numpy(batch.label)) test_preds += list(get_numpy(preds.data)) else: raw_outputs_class_one += list(get_numpy(output[:, 0])) raw_outputs_class_two += list(get_numpy(output[:, 1])) test_preds += list(preds.data.numpy()) test_targs += list(batch.label.numpy()) test_accuracy = accuracy_score(test_targs, test_preds) print("\nEvaluation of Random data set Acc: %f" % (test_accuracy)) print('size of evaluation dataset: %d sentence pairs' % (len(eval_dataset))) if args.test_swap == True: evaluation_iter, eval_dataset = load_evaluation_dataset(TEXT, LABELS, path=args.swap_data_path) test_targs, test_preds, raw_outputs_class_one, raw_outputs_class_two = [], [], [], [] ### Evaluate test set model.eval() for batch_idx, batch in enumerate(evaluation_iter): output = model(batch.premise[0], batch.premise[1], batch.hypothesis[0], batch.hypothesis[1]) preds = torch.max(output, 1)[1] if (args.use_cuda): raw_outputs_class_one += list(get_numpy(output[:, 0])) raw_outputs_class_two += list(get_numpy(output[:, 1])) test_targs += list(get_numpy(batch.label)) test_preds += list(get_numpy(preds.data)) else: raw_outputs_class_one += list(get_numpy(output[:, 0])) raw_outputs_class_two += list(get_numpy(output[:, 1])) test_preds += list(preds.data.numpy()) test_targs += list(batch.label.numpy()) test_accuracy = accuracy_score(test_targs, test_preds) print("\nEvaluation of Swapped data set Acc: %f" % (test_accuracy)) print('size of evaluation dataset: %d sentence pairs' % (len(eval_dataset)))
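The evaluation blocks above (regular, swapped, consecutive, random, swap) repeat the same loop verbatim, differing only in the dataset path and the printed label. Below is a hedged refactoring sketch that collapses them into a single helper; it reuses the model call signature, load_evaluation_dataset, and accuracy_score from above, while `evaluate_on` itself is a hypothetical name.

def evaluate_on(model, iterator):
    """Collect targets, argmax predictions and raw class scores for one
    evaluation iterator (mirrors the repeated blocks above)."""
    model.eval()
    targs, preds, raw_class_one, raw_class_two = [], [], [], []
    with torch.no_grad():
        for batch in iterator:
            output = model(batch.premise[0], batch.premise[1],
                           batch.hypothesis[0], batch.hypothesis[1])
            pred = torch.max(output, 1)[1]
            raw_class_one += output[:, 0].cpu().tolist()
            raw_class_two += output[:, 1].cpu().tolist()
            preds += pred.cpu().tolist()
            targs += batch.label.cpu().tolist()
    return targs, preds, raw_class_one, raw_class_two

# e.g. the regular external evaluation then becomes:
# evaluation_iter, eval_dataset = load_evaluation_dataset(TEXT, LABELS, path=args.eed_regular)
# targs, preds, _, raw_two = evaluate_on(model, evaluation_iter)
# print("Evaluation set Acc: %f" % accuracy_score(targs, preds))
# draw_roc_curve(np.array(raw_two), np.array(targs), path=args.roc_curve_path_ext_regular)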
batch_first=True, stop_words=set(stopwords.words('english'))) #preprocessing=lambda x: [porter.stem(word) for word in x]) LABEL = Field(sequential=True, lower=True, use_vocab=True, is_target=True, unk_token=None, pad_token=None, batch_first=True) # make splits for data train, test = datasets.IMDB.splits(TEXT, LABEL) # build the vocabulary TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300)) LABEL.build_vocab(train) # make iterator for splits train_iter, test_iter = BucketIterator.splits((train, test), batch_sizes=(64, 64), device=device, sort_key=lambda x: len(x.text), sort_within_batch=False, repeat=False, shuffle=True) ''' Define model ''' model = LSTM(vocab_size=len(TEXT.vocab.stoi), embed_size=300, hidden_dim=400, batch_size=64,
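The model constructor above is cut off mid-call; once the model is built, the GloVe vectors loaded by TEXT.build_vocab are typically copied into its embedding layer. A minimal sketch, assuming the LSTM class exposes its embedding table as `model.embedding` (a guessed attribute name):

# Copy the 300-d GloVe vectors into the model's embedding table.
# `model.embedding` is an assumed attribute name; adapt it to the LSTM class definition.
model.embedding.weight.data.copy_(TEXT.vocab.vectors)
model.embedding.weight.requires_grad = False   # optionally freeze the pre-trained embeddings
model = model.to(device)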
"mightn", "more", "most", "my", "myself", "needn", "now", "o", "of", "off", "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own", "re", "s", "same", "shan", "she", "she's", "should", "should've", "shouldn", "so", "some", "such", "t", "than", "that", "that'll", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those", "through", "to", "too", "under", "until", "up", "ve", "very", "was", "wasn", "we", "were", "weren", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "won", "wouldn", "y", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves", "could", "he'd", "he'll", "he's", "here's", "how's", "i'd", "i'll", "i'm", "i've", "let's", "ought", "she'd", "she'll", "that's", "there's", "they'd", "they'll", "they're", "they've", "we'd", "we'll", "we're", "we've", "what's", "when's", "where's", "who's", "why's", "would" ] wordVectors = GloVe(name='6B', dim=300) ########################################################################### ##### The following determines the processing of label data (ratings) ##### ########################################################################### def convertLabel(datasetLabel): """ Labels (product ratings) from the dataset are provided to you as floats, taking the values 1.0, 2.0, 3.0, 4.0, or 5.0. You may wish to train with these as they are, or you you may wish to convert them to another representation in this function. Consider regression vs classification. """ # label = datasetLabel.view((1,-1))
def load_dataset(batch_size, test_sen=None):
    office_actions = pd.read_csv('/mnt/data/training-patent-data4144f61d-a15b-421e-9346-659741ee1c22/office_actions.csv',
                                 usecols=['app_id', 'ifw_number', 'rejection_102', 'rejection_103'],
                                 nrows=100000)

    abstractList = []
    idList = []
    rejectionColumn = []
    for num in range(10000):
        app_id = str(office_actions.app_id[num])
        filename = "/mnt/data/training-patent-data4144f61d-a15b-421e-9346-659741ee1c22/json_files_1/oa_" + app_id + ".json"
        try:
            with open(filename, 'r') as jfile:
                parsed_json = json.load(jfile)
        except FileNotFoundError:
            print("File not found:", filename)
            continue

        try:
            abstractList.append(parsed_json[0]['abstract_full'])
            idList.append(parsed_json[0]['application_number'])
        except IndexError:
            print("WARNING: file " + filename + " is empty!\n")
            continue

        # Binary label: 1 if an obviousness (103) rejection is present, else 0.
        n = int(office_actions.rejection_102[num])
        o = int(office_actions.rejection_103[num])
        if n == 0 and o == 0:
            rejType = 0  # neither
        elif n == 0 and o == 1:
            rejType = 1  # obviousness only
        elif n == 1 and o == 0:
            rejType = 0  # novelty only
        elif n == 1 and o == 1:
            rejType = 1  # both
        else:
            raise ValueError("Unexpected rejection flags: rejection_102=%s, rejection_103=%s" % (n, o))
        rejectionColumn.append(rejType)

    all_data = {'text': abstractList, 'label': rejectionColumn}
    df = pd.DataFrame(all_data, index=idList)

    tokenize = lambda x: x.split()
    TEXT = Field(sequential=True, tokenize=tokenize, lower=True,
                 include_lengths=True, batch_first=True, fix_length=200)
    LABEL = LabelField(sequential=False)
    # fields = {'Abstract': ('text', TEXT), 'RejectionType': ('labels', LABEL)}
    fields = {'text': TEXT, 'label': LABEL}
    ds = DataFrameDataset(df, fields)

    TEXT.build_vocab(ds, vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(ds)

    train_data, test_data = ds.split()
    # Further split training_data to create new training_data & validation_data
    train_data, valid_data = train_data.split()

    word_embeddings = TEXT.vocab.vectors
    print("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
    print("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print("Label Length: " + str(len(LABEL.vocab)))

    train_iter, valid_iter, test_iter = BucketIterator.splits(
        (train_data, valid_data, test_data), batch_size=batch_size,
        sort_key=lambda x: len(x.text), repeat=False, shuffle=True)

    vocab_size = len(TEXT.vocab)
    return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter
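DataFrameDataset is not a torchtext built-in; the class used above is defined elsewhere in the project. A minimal sketch of how such a wrapper could look is given below; the class body and its reliance on the {'text': ..., 'label': ...} field mapping are assumptions.

from torchtext.data import Dataset, Example

class DataFrameDataset(Dataset):
    """Hypothetical sketch: wrap a pandas DataFrame with 'text'/'label'
    columns as a torchtext Dataset, one Example per row."""

    def __init__(self, df, fields):
        field_list = [('text', fields['text']), ('label', fields['label'])]
        examples = [Example.fromlist([row['text'], row['label']], field_list)
                    for _, row in df.iterrows()]
        super().__init__(examples, field_list)

With this shape, ds.split() in load_dataset works through the inherited Dataset.split, which defaults to a 70/30 split.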