def train(self, train_corpus):
    # Turn on training mode which enables dropout.
    self.model.train()

    # Splitting the data in batches
    train_batches = helper.batchify(train_corpus.data, self.config.batch_size)
    print('number of train batches = ', len(train_batches))

    start = time.time()
    print_acc_total = 0
    plot_acc_total = 0

    num_batches = len(train_batches)
    for batch_no in range(1, num_batches + 1):
        # Clearing out all previous gradient computations.
        self.optimizer.zero_grad()
        train_sentences1, sent_len1, train_sentences2, sent_len2, train_labels = helper.batch_to_tensors(
            train_batches[batch_no - 1], self.dictionary)
        if self.config.cuda and torch.cuda.is_available():
            train_sentences1 = train_sentences1.cuda()
            train_sentences2 = train_sentences2.cuda()
            train_labels = train_labels.cuda()

        assert train_sentences1.size(0) == train_sentences2.size(0)

        score = self.model(train_sentences1, sent_len1, train_sentences2, sent_len2)
        n_correct = (torch.max(score, 1)[1].view(train_labels.size()).data == train_labels.data).sum()
        loss = self.criterion(score, train_labels)
        # Important if we are using nn.DataParallel()
        if loss.size(0) > 1:
            loss = loss.mean()
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
        clip_grad_norm(filter(lambda p: p.requires_grad, self.model.parameters()), self.config.max_norm)
        self.optimizer.step()

        print_acc_total += 100. * n_correct / len(train_batches[batch_no - 1])
        plot_acc_total += 100. * n_correct / len(train_batches[batch_no - 1])

        if batch_no % self.config.print_every == 0:
            print_acc_avg = print_acc_total / self.config.print_every
            print_acc_total = 0
            print('%s (%d %d%%) %.2f' % (
                helper.show_progress(start, batch_no / num_batches), batch_no,
                batch_no / num_batches * 100, print_acc_avg))

        if batch_no % self.config.plot_every == 0:
            plot_acc_avg = plot_acc_total / self.config.plot_every
            self.train_accuracies.append(plot_acc_avg)
            plot_acc_total = 0

def train(self):
    # Turn on training mode which enables dropout.
    self.model.train()

    # Splitting the data in batches
    batches, batch_labels = [], []
    for task_name, task in self.train_corpus.items():
        train_batches = helper.batchify(task.data, self.config.batch_size)
        batches.extend(train_batches)
        batch_labels.extend([task_name] * len(train_batches))
    combined = list(zip(batches, batch_labels))
    numpy.random.shuffle(combined)
    batches[:], batch_labels[:] = zip(*combined)
    print('number of train batches = ', len(batches))

    start = time.time()
    print_acc_total = 0
    plot_acc_total = 0
    num_back = 0

    num_batches = len(batches)
    for batch_no in range(1, num_batches + 1):
        # Clearing out all previous gradient computations.
        self.optimizer.zero_grad()
        if self.config.use_elmo:
            train_sentences1, sent_len1, train_sentences2, sent_len2, train_labels = helper.batch_to_elmo_tensors(
                batches[batch_no - 1], self.dictionary)
        else:
            train_sentences1, sent_len1, train_sentences2, sent_len2, train_labels = helper.batch_to_tensors(
                batches[batch_no - 1], self.dictionary)
        if self.config.cuda:
            train_sentences1 = train_sentences1.cuda()
            train_sentences2 = train_sentences2.cuda()
            train_labels = train_labels.cuda()

        assert train_sentences1.size(0) == train_sentences2.size(0)

        score = self.model(train_sentences1, sent_len1, train_sentences2, sent_len2,
                           batch_labels[batch_no - 1])
        n_correct = (torch.max(score, 1)[1].view(train_labels.size()).data == train_labels.data).sum()
        loss = self.criterion(score, train_labels)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
        clip_grad_norm(filter(lambda p: p.requires_grad, self.model.parameters()), self.config.max_norm)
        self.optimizer.step()

        print_acc_total += 100. * n_correct / len(batches[batch_no - 1])
        plot_acc_total += 100. * n_correct / len(batches[batch_no - 1])

        if batch_no % self.config.print_every == 0:
            sys.stdout.write("\b" * num_back)
            sys.stdout.write(" " * num_back)
            sys.stdout.write("\b" * num_back)
            log_info = '%s (%d %d%%) %.2f' % (
                helper.show_progress(start, batch_no / num_batches), batch_no,
                batch_no / num_batches * 100, print_acc_total / batch_no)
            sys.stdout.write(log_info)
            sys.stdout.flush()
            num_back = len(log_info)

def train(self, train_corpus):
    # Turn on training mode which enables dropout.
    self.model.train()

    # splitting the data in batches
    train_batches = helper.batchify(train_corpus.data, self.config.batch_size)
    print('number of train batches = ', len(train_batches))

    start = time.time()
    print_loss_total = 0
    plot_loss_total = 0

    num_batches = len(train_batches)
    for batch_no in range(1, num_batches + 1):
        # Clearing out all previous gradient computations.
        self.optimizer.zero_grad()
        session_queries, session_query_length, rel_docs, rel_docs_length, doc_labels = helper.session_to_tensor(
            train_batches[batch_no - 1], self.dictionary)
        if self.config.cuda:
            # batch_size x session_length x max_query_length
            session_queries = session_queries.cuda()
            # batch_size x session_length
            session_query_length = session_query_length.cuda()
            # batch_size x session_length x num_rel_docs_per_query x max_doc_length
            rel_docs = rel_docs.cuda()
            # batch_size x session_length x num_rel_docs_per_query
            rel_docs_length = rel_docs_length.cuda()
            # batch_size x session_length x num_rel_docs_per_query
            doc_labels = doc_labels.cuda()

        loss = self.model(session_queries, session_query_length, rel_docs, rel_docs_length, doc_labels)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
        clip_grad_norm(filter(lambda p: p.requires_grad, self.model.parameters()), self.config.max_norm)
        self.optimizer.step()

        print_loss_total += loss.data[0]
        plot_loss_total += loss.data[0]

        if batch_no % self.config.print_every == 0:
            print_loss_avg = print_loss_total / self.config.print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (
                helper.show_progress(start, batch_no / num_batches), batch_no,
                batch_no / num_batches * 100, print_loss_avg))

        if batch_no % self.config.plot_every == 0:
            plot_loss_avg = plot_loss_total / self.config.plot_every
            self.train_losses.append(plot_loss_avg)
            plot_loss_total = 0

def train(self, train_corpus):
    # Turn on training mode which enables dropout.
    self.model.train()

    # splitting the data in batches
    train_batches = helper.batchify(train_corpus.data, self.config.batch_size)
    print('number of train batches = ', len(train_batches))

    start = time.time()
    print_loss_total = 0
    plot_loss_total = 0

    num_batches = len(train_batches)
    for batch_no in range(1, num_batches + 1):
        # Clearing out all previous gradient computations.
        self.optimizer.zero_grad()
        train_queries, query_len, train_clicks, doc_len, click_labels = helper.batch_to_tensor(
            train_batches[batch_no - 1], self.dictionary)
        if self.config.cuda:
            # batch_size x max_query_length
            train_queries = train_queries.cuda()
            # batch_size x num_clicks_per_query x max_document_length
            train_clicks = train_clicks.cuda()
            # batch_size x num_clicks_per_query
            click_labels = click_labels.cuda()

        score = self.model(train_queries, query_len, train_clicks, doc_len)
        # loss = self.compute_loss(score, click_labels)
        loss = f.binary_cross_entropy_with_logits(score, click_labels)
        # Important if we are using nn.DataParallel()
        if loss.size(0) > 1:
            loss = loss.mean()
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
        clip_grad_norm(filter(lambda p: p.requires_grad, self.model.parameters()), self.config.max_norm)
        self.optimizer.step()

        print_loss_total += loss.data[0]
        plot_loss_total += loss.data[0]

        if batch_no % self.config.print_every == 0:
            print_loss_avg = print_loss_total / self.config.print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (
                helper.show_progress(start, batch_no / num_batches), batch_no,
                batch_no / num_batches * 100, print_loss_avg))

        if batch_no % self.config.plot_every == 0:
            plot_loss_avg = plot_loss_total / self.config.plot_every
            self.train_losses.append(plot_loss_avg)
            plot_loss_total = 0

def train(self, train_corpus):
    # Turn on training mode which enables dropout.
    self.model.train()

    # splitting the data in batches
    train_batches = helper.batchify(train_corpus.data, self.config.batch_size)
    print('number of train batches = ', len(train_batches))

    start = time.time()
    print_loss_total = 0
    plot_loss_total = 0

    num_batches = len(train_batches)
    for batch_no in range(1, num_batches + 1):
        # Clearing out all previous gradient computations.
        self.optimizer.zero_grad()
        videos, video_len, descriptions, des_len = helper.videos_to_tensor(
            train_batches[batch_no - 1], self.dictionary)
        if self.config.cuda:
            videos = videos.cuda()  # batch_size x max_images_per_video x num_image_features
            descriptions = descriptions.cuda()  # batch_size x max_description_length
            des_len = des_len.cuda()  # batch_size

        loss = self.model(videos, video_len, descriptions, des_len)
        # Important if we are using nn.DataParallel()
        if loss.size(0) > 1:
            loss = loss.mean()
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
        clip_grad_norm(filter(lambda p: p.requires_grad, self.model.parameters()), self.config.max_norm)
        self.optimizer.step()

        print_loss_total += loss.data[0]
        plot_loss_total += loss.data[0]

        if batch_no % self.config.print_every == 0:
            print_loss_avg = print_loss_total / self.config.print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (
                helper.show_progress(start, batch_no / num_batches), batch_no,
                batch_no / num_batches * 100, print_loss_avg))

        if batch_no % self.config.plot_every == 0:
            plot_loss_avg = plot_loss_total / self.config.plot_every
            self.train_losses.append(plot_loss_avg)
            plot_loss_total = 0

def train(self, train_corpus):
    # Turn on training mode which enables dropout.
    self.model.train()

    # splitting the data in batches
    train_batches = helper.batchify(train_corpus.data, self.config.batch_size)
    print('number of train batches = ', len(train_batches))

    start = time.time()
    print_loss_total = 0
    plot_loss_total = 0

    num_batches = len(train_batches)
    for batch_no in range(1, num_batches + 1):
        # Clearing out all previous gradient computations.
        self.optimizer.zero_grad()
        train_sessions, length, train_clicks, click_labels = helper.session_to_tensor(
            train_batches[batch_no - 1], self.dictionary)
        if self.config.cuda:
            # batch_size x session_length x max_query_length
            train_sessions = train_sessions.cuda()
            # batch_size x session_length x num_clicks_per_query x max_document_length
            train_clicks = train_clicks.cuda()
            # batch_size x session_length
            length = length.cuda()
            # batch_size x session_length x num_clicks_per_query
            click_labels = click_labels.cuda()

        loss = self.model(train_sessions, length, train_clicks, click_labels)
        # Important if we are using nn.DataParallel()
        if loss.size(0) > 1:
            loss = loss.mean()
        loss.backward()
        self.optimizer.step()

        print_loss_total += loss.data[0]
        plot_loss_total += loss.data[0]

        if batch_no % self.config.print_every == 0:
            print_loss_avg = print_loss_total / self.config.print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (
                helper.show_progress(start, batch_no / num_batches), batch_no,
                batch_no / num_batches * 100, print_loss_avg))

        if batch_no % self.config.plot_every == 0:
            plot_loss_avg = plot_loss_total / self.config.plot_every
            self.train_losses.append(plot_loss_avg)
            plot_loss_total = 0

def train(self, train_corpus):
    # Turn on training mode which enables dropout.
    self.model.train()

    # splitting the data in batches
    train_batches = helper.batchify(train_corpus.data, self.config.batch_size)
    print('number of train batches = ', len(train_batches))

    start = time.time()
    print_loss_total = 0
    plot_loss_total = 0

    num_batches = len(train_batches)
    for batch_no in range(1, num_batches + 1):
        # Clearing out all previous gradient computations.
        self.optimizer.zero_grad()
        train_queries, train_docs, click_labels = helper.batch_to_tensor(
            train_batches[batch_no - 1], self.dictionary, self.config.max_query_length,
            self.config.max_doc_length)
        if self.config.cuda:
            # batch_size x max_query_length x vocab_size
            train_queries = train_queries.cuda()
            # batch_size x num_rel_docs_per_query x max_doc_length x vocab_size
            train_docs = train_docs.cuda()
            # batch_size x num_rel_docs_per_query
            click_labels = click_labels.cuda()

        softmax_prob = self.model(train_queries, train_docs)
        loss = self.compute_loss(softmax_prob, click_labels)
        loss.backward()
        self.optimizer.step()

        print_loss_total += loss.data[0]
        plot_loss_total += loss.data[0]

        if batch_no % self.config.print_every == 0:
            print_loss_avg = print_loss_total / self.config.print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (
                helper.show_progress(start, batch_no / num_batches), batch_no,
                batch_no / num_batches * 100, print_loss_avg))

        if batch_no % self.config.plot_every == 0:
            plot_loss_avg = plot_loss_total / self.config.plot_every
            self.train_losses.append(plot_loss_avg)
            plot_loss_total = 0

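# Note: `self.compute_loss` used above is defined elsewhere in this class. Below is a minimal
# sketch of what such a loss might look like, assuming `softmax_prob` holds per-document
# probabilities of shape (batch_size, num_rel_docs_per_query) and `click_labels` is a 0/1
# tensor of the same shape. This is a hypothetical illustration, not the repository's actual
# implementation.
def compute_loss_sketch(softmax_prob, click_labels, eps=1e-8):
    # Negative log-likelihood of the clicked documents, averaged over the batch.
    neg_log_prob = -torch.log(softmax_prob + eps) * click_labels
    return neg_log_prob.sum(1).mean()
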
def validate(self, dev_corpus):
    # Turn on evaluation mode which disables dropout.
    self.model.eval()

    print_every = self.config.print_every
    start = time.time()
    dev_batches = helper.batchify(dev_corpus.data, self.config.batch_size)
    print('number of dev batches = ', len(dev_batches))

    num_batches = len(dev_batches)
    n_correct, n_total = 0, 0
    for batch_no in range(1, num_batches + 1):
        dev_sentences1, sent_len1, dev_sentences2, sent_len2, dev_labels = helper.batch_to_tensors(
            dev_batches[batch_no - 1], self.dictionary, True)
        if self.config.cuda:
            dev_sentences1 = dev_sentences1.cuda()
            dev_sentences2 = dev_sentences2.cuda()
            dev_labels = dev_labels.cuda()

        assert dev_sentences1.size(0) == dev_sentences2.size(0)

        score = self.model(dev_sentences1, sent_len1, dev_sentences2, sent_len2)
        n_correct += (torch.max(score, 1)[1].view(dev_labels.size()).data == dev_labels.data).sum()
        n_total += len(dev_batches[batch_no - 1])
        print_acc = 100. * n_correct / n_total

        if batch_no % print_every == 0 or self.config.debug:
            p = 100.0
            print('%s (%d %d%%) (%.2f) %.2f' % (
                helper.show_progress(start, batch_no / num_batches), batch_no,
                batch_no / num_batches * 100, p, print_acc))

    return 100. * n_correct / n_total

def train(self, train_corpus):
    # Turn on training mode which enables dropout.
    self.model.train()

    # Splitting the data in batches
    train_batches = helper.batchify(train_corpus.data, self.config.batch_size)
    print('number of train batches = ', len(train_batches))

    start = time.time()
    print_acc_total = 0
    plot_acc_total = 0

    num_batches = len(train_batches)
    for batch_no in range(1, num_batches + 1):
        # Clearing out all previous gradient computations.
        self.optimizer.zero_grad()
        train_sentences1, sent_len1, train_sentences2, sent_len2, train_labels = helper.batch_to_tensors(
            train_batches[batch_no - 1], self.dictionary)
        if self.config.cuda:
            train_sentences1 = train_sentences1.cuda()
            train_sentences2 = train_sentences2.cuda()
            train_labels = train_labels.cuda()

        assert train_sentences1.size(0) == train_sentences2.size(0)

        score = self.model(train_sentences1, sent_len1, train_sentences2, sent_len2)
        n_correct = (torch.max(score, 1)[1].view(train_labels.size()).data == train_labels.data).sum()
        loss = self.criterion(score, train_labels)
        # Important if we are using nn.DataParallel()
        if loss.size(0) > 1:
            loss = loss.mean()
        loss.backward()

        # gradient clipping (off by default)
        shrink_factor = 1
        total_norm = 0
        for p in self.model.parameters():
            if p.requires_grad:
                p.grad.data.div_(train_sentences1.size(0))  # divide by the actual batch size
                total_norm += p.grad.data.norm() ** 2
        total_norm = numpy.sqrt(total_norm)
        if total_norm > self.config.clip:
            shrink_factor = self.config.clip / total_norm
        current_lr = self.optimizer.param_groups[0]['lr']  # current lr (no external "lr", for adam)
        self.optimizer.param_groups[0]['lr'] = current_lr * shrink_factor  # just for update
        self.optimizer.step()
        self.optimizer.param_groups[0]['lr'] = current_lr

        print_acc_total += 100. * n_correct / len(train_batches[batch_no - 1])
        plot_acc_total += 100. * n_correct / len(train_batches[batch_no - 1])

        if batch_no % self.config.print_every == 0:
            print_acc_avg = print_acc_total / self.config.print_every
            print_acc_total = 0
            print('%s (%d %d%%) %.2f' % (
                helper.show_progress(start, batch_no / num_batches), batch_no,
                batch_no / num_batches * 100, print_acc_avg))

        if batch_no % self.config.plot_every == 0:
            plot_acc_avg = plot_acc_total / self.config.plot_every
            self.train_accuracies.append(plot_acc_avg)
            plot_acc_total = 0

def evaluate(model, batches, dictionary, outfile=None, selection_time=0.9318):
    # selection_time=0.9318 for IMDB by the budget model
    # Turn on evaluation mode which disables dropout.
    model.eval()

    n_correct, n_total = 0, 0
    y_preds, y_true, output = [], [], []
    start = time.time()
    num_batches = len(batches)
    num_tokens_padded = 0
    selection_time = 0
    selected_tokens = 0
    for batch_no in range(len(batches)):
        test_sentences1, sent_len1, test_sentences2, sent_len2, test_labels = helper.batch_to_tensors(
            batches[batch_no], dictionary, True)
        if args.cuda:
            test_sentences1 = test_sentences1.cuda()
            test_sentences2 = test_sentences2.cuda()
            test_labels = test_labels.cuda()

        assert test_sentences1.size(0) == test_sentences2.size(0)

        selected_tokens += sum(sent_len1) + sum(sent_len2)
        num_tokens_padded += 2 * (force_min_sen_len * args.eval_batch_size)

        score = model(test_sentences1, sent_len1, test_sentences2, sent_len2)
        preds = torch.max(score, 1)[1]
        if outfile:
            predictions = preds.data.cpu().tolist()
            for i in range(len(batches[batch_no])):
                output.append([batches[batch_no][i].id, predictions[i]])
        else:
            y_preds.extend(preds.data.cpu().tolist())
            y_true.extend(test_labels.data.cpu().tolist())

        n_correct += (preds.view(test_labels.size()).data == test_labels.data).sum()
        n_total += len(batches[batch_no])

        if (batch_no + 1) % args.print_every == 0:
            padded_p = 100.0 * selected_tokens / num_tokens_padded
            print_acc_avg = 100. * n_correct / n_total
            print('%s (%d %d%%) (padded %.2f) %.2f' % (
                helper.show_progress(start, (batch_no + 1) / num_batches), (batch_no + 1),
                (batch_no + 1) / num_batches * 100, padded_p, print_acc_avg))

    now = time.time()
    s = now - start
    estimated_full_text_padded_time = s * num_tokens_padded / selected_tokens
    s += selection_time
    print('estimated full text time padded = %s' % (helper.convert_to_minutes(estimated_full_text_padded_time)))

    padded_p = 100.0 * selected_tokens / num_tokens_padded
    padded_speed_up = 1.0 * estimated_full_text_padded_time / s
    print_acc_avg = 100. * n_correct / n_total
    print('total: %s (%d %d%%) (padded %.2f) %.2f' % (
        helper.show_progress(start, (batch_no + 1) / num_batches), (batch_no + 1),
        (batch_no + 1) / num_batches * 100, padded_p, print_acc_avg))
    print('estimated padded speed up = %0.2f, selected text percentage speed up padded = %0.2f' % (
        padded_speed_up, 100.0 / padded_p))

    if outfile:
        target_names = ['entailment', 'neutral', 'contradiction']
        with open(outfile, 'w') as f:
            f.write('pairID,gold_label' + '\n')
            for item in output:
                f.write(str(item[0]) + ',' + target_names[item[1]] + '\n')
    else:
        return 100. * n_correct / n_total, \
               100. * f1_score(numpy.asarray(y_true), numpy.asarray(y_preds), average='weighted'), s

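# The speed-up estimate above scales the measured evaluation time by the ratio of tokens the
# full padded text would contain to the tokens actually selected and read. A worked example
# with assumed numbers (hypothetical, for illustration only):
#
#   measured_time     = 10.0    # seconds spent reading only the selected tokens
#   selected_tokens   = 20000
#   num_tokens_padded = 80000
#   estimated_full_text_time = measured_time * num_tokens_padded / selected_tokens  # = 40.0 s
#   padded_speed_up          = estimated_full_text_time / measured_time             # = 4.0x
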
def train(self, train_corpus, epoch):
    # Turn on training mode which enables dropout.
    self.model.train()

    # Splitting the data in batches
    shuffle = True
    # if self.config.task == 'sst': shuffle = False
    print(shuffle)
    train_batches = helper.batchify(train_corpus.data, self.config.batch_size, shuffle)
    print('number of train batches = ', len(train_batches))

    start = time.time()
    print_acc_total = 0
    plot_acc_total = 0

    num_batches = len(train_batches)
    for batch_no in range(1, num_batches + 1):
        # Clearing out all previous gradient computations.
        self.optimizer.zero_grad()
        train_sentences1, sent_len1, train_sentences2, sent_len2, train_labels = helper.batch_to_tensors(
            train_batches[batch_no - 1], self.dictionary)
        if self.config.cuda:
            train_sentences1 = train_sentences1.cuda()
            train_sentences2 = train_sentences2.cuda()
            train_labels = train_labels.cuda()

        assert train_sentences1.size(0) == train_sentences2.size(0)

        score = self.model(train_sentences1, sent_len1, train_sentences2, sent_len2)
        n_correct = (torch.max(score, 1)[1].view(train_labels.size()).data == train_labels.data).sum()
        # print(' score size ', score.size(), train_labels.size())
        loss = self.criterion(score, train_labels)

        ############################ custom new_loss ############################
        # z2 = z_pred.dimshuffle((0, 1, "x"))
        # logpz = - T.nnet.binary_crossentropy(probs, z2) * masks
        # logpz = self.logpz = logpz.reshape(x.shape)
        # probs = self.probs = probs.reshape(x.shape)
        # # batch
        # z = z_pred
        # self.zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
        # self.zdiff = T.sum(T.abs_(z[1:] - z[:-1]), axis=0, dtype=theano.config.floatX)
        # zsum = generator.zsum
        # zdiff = generator.zdiff
        # logpz = generator.logpz
        # coherent_factor = args.sparsity * args.coherent
        # loss = self.loss = T.mean(loss_vec)  # this is not needed as in cost_vec loss_vec is used
        # sparsity_cost = self.sparsity_cost = T.mean(zsum) * args.sparsity + \
        #                 T.mean(zdiff) * coherent_factor
        # cost_vec = loss_vec + zsum * args.sparsity + zdiff * coherent_factor
        # cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
        # self.obj = T.mean(cost_vec)
        ############################ custom new_loss ############################

        if loss.size(0) > 1:
            loss = loss.mean()
        # print('loss:', loss)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
        grad_norm = clip_grad_norm(filter(lambda p: p.requires_grad, self.model.parameters()),
                                   self.config.max_norm)
        # if epoch == 11:
        #     print(batch_no, grad_norm)
        self.optimizer.step()

        print_acc_total += 100. * n_correct / len(train_batches[batch_no - 1])
        plot_acc_total += 100. * n_correct / len(train_batches[batch_no - 1])

        if batch_no % self.config.print_every == 0:
            print_acc_avg = print_acc_total / self.config.print_every
            print_acc_total = 0
            print('%s (%d %d%%) %.2f' % (
                helper.show_progress(start, batch_no / num_batches), batch_no,
                batch_no / num_batches * 100, print_acc_avg))

        if batch_no % self.config.plot_every == 0:
            plot_acc_avg = plot_acc_total / self.config.plot_every
            self.train_accuracies.append(plot_acc_avg)
            plot_acc_total = 0

def train(self):
    # Turn on training mode which enables dropout.
    self.generator.train()

    # Splitting the data in batches
    batches, batch_labels = [], []
    for task_name, task in self.train_corpus.items():
        train_batches = helper.batchify(task.data, self.config.batch_size)
        batches.extend(train_batches)
        batch_labels.extend([task_name] * len(train_batches))
    combined = list(zip(batches, batch_labels))
    numpy.random.shuffle(combined)
    batches[:], batch_labels[:] = zip(*combined)
    print('number of train batches = ', len(batches))

    start = time.time()
    num_back, print_acc_total, plot_acc_total = 0, 0, 0
    num_batches = len(batches)
    for batch_no in range(1, num_batches + 1):
        if self.config.use_elmo:
            train_sentences1, sent_len1, train_sentences2, sent_len2, train_labels = helper.batch_to_elmo_input(
                batches[batch_no - 1], self.dictionary)
        else:
            train_sentences1, sent_len1, train_sentences2, sent_len2, train_labels = helper.batch_to_tensors(
                batches[batch_no - 1], self.dictionary)
        if self.config.cuda:
            train_sentences1 = train_sentences1.cuda()
            train_sentences2 = train_sentences2.cuda()
            train_labels = train_labels.cuda()

        assert train_sentences1.size(0) == train_sentences2.size(0)

        if self.config.adversarial:
            self.optimizerD.zero_grad()
            scores, diff_loss, shared_rep = self.generator(
                train_sentences1, sent_len1, train_sentences2, sent_len2, batch_labels[batch_no - 1])
            n_correct = (torch.max(scores, 1)[1].view(train_labels.size()).data == train_labels.data).sum()
            shared_sent_rep1 = shared_rep[0]
            shared_sent_rep2 = shared_rep[1]

            # Run the discriminator to distinguish tasks.
            task_prob1 = self.discriminator(shared_sent_rep1.detach())  # B X num_tasks
            task_prob2 = self.discriminator(shared_sent_rep2.detach())  # B X num_tasks
            comb_prob = torch.cat((task_prob1, task_prob2), 0)  # 2B X num_tasks
            task_prob = torch.sum(comb_prob, 0).squeeze()  # size = |num_tasks|
            adv_loss = -1 * task_prob[self.task_ids[batch_labels[batch_no - 1]]]
            adv_loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
            clip_grad_norm(filter(lambda p: p.requires_grad, self.discriminator.parameters()),
                           self.config.max_norm)
            self.optimizerD.step()

            self.optimizerG.zero_grad()
            cross_entropy_loss = self.criterion(scores, train_labels)
            # Run the discriminator to distinguish tasks.
            task_prob1 = self.discriminator(shared_sent_rep1)  # B X num_tasks
            task_prob2 = self.discriminator(shared_sent_rep2)  # B X num_tasks
            comb_prob = torch.cat((task_prob1, task_prob2), 0)  # 2B X num_tasks
            task_prob = torch.sum(comb_prob, 0).squeeze()  # size = |num_tasks|
            adv_loss = -1 * task_prob[self.task_ids[batch_labels[batch_no - 1]]]
            total_loss = cross_entropy_loss + self.config.beta * adv_loss + self.config.gamma * diff_loss
            # Important if we are using nn.DataParallel()
            if total_loss.size(0) > 1:
                total_loss = total_loss.mean()
            total_loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
            clip_grad_norm(filter(lambda p: p.requires_grad, self.generator.parameters()),
                           self.config.max_norm)
            self.optimizerG.step()
        else:
            self.optimizerG.zero_grad()
            scores = self.generator(train_sentences1, sent_len1, train_sentences2, sent_len2,
                                    batch_labels[batch_no - 1])
            n_correct = (torch.max(scores, 1)[1].view(train_labels.size()).data == train_labels.data).sum()
            loss = self.criterion(scores, train_labels)
            # Important if we are using nn.DataParallel()
            if loss.size(0) > 1:
                loss = loss.mean()
            loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
            clip_grad_norm(filter(lambda p: p.requires_grad, self.generator.parameters()),
                           self.config.max_norm)
            self.optimizerG.step()

        print_acc_total += 100. * n_correct / len(batches[batch_no - 1])
        plot_acc_total += 100. * n_correct / len(batches[batch_no - 1])

        if batch_no % self.config.print_every == 0:
            sys.stdout.write("\b" * num_back)
            sys.stdout.write(" " * num_back)
            sys.stdout.write("\b" * num_back)
            log_info = '%s (%d %d%%) %.2f%%' % (
                helper.show_progress(start, batch_no / num_batches), batch_no,
                batch_no / num_batches * 100, print_acc_total / batch_no)
            sys.stdout.write(log_info)
            sys.stdout.flush()
            num_back = len(log_info)

        if batch_no % self.config.plot_every == 0:
            plot_acc_avg = plot_acc_total / self.config.plot_every
            self.train_accuracies.append(plot_acc_avg)
            plot_acc_total = 0

    # This releases all cached memory so it becomes visible to other applications.
    torch.cuda.empty_cache()

def train(self, train_corpus, epoch):
    # Turn on training mode which enables dropout.
    self.model.train()

    # Splitting the data in batches
    shuffle = True
    # if self.config.task == 'sst': shuffle = False
    print(shuffle)
    train_batches = helper.batchify(train_corpus.data, self.config.batch_size, shuffle)
    print('number of train batches = ', len(train_batches))

    start = time.time()
    print_acc_total = 0
    plot_acc_total = 0

    num_batches = len(train_batches)
    for batch_no in range(1, num_batches + 1):
        # Clearing out all previous gradient computations.
        self.optimizer.zero_grad()
        train_sentences1, sent_len1, train_sentences2, sent_len2, train_labels = helper.batch_to_tensors(
            train_batches[batch_no - 1], self.dictionary)
        if self.config.cuda:
            train_sentences1 = train_sentences1.cuda()
            train_sentences2 = train_sentences2.cuda()
            train_labels = train_labels.cuda()

        assert train_sentences1.size(0) == train_sentences2.size(0)
        # print(' train label size: ', train_labels.size(), ' train data size: ', train_sentences1.size())
        # print(' labels: ', train_labels)

        score = self.model(train_sentences1)
        n_correct = (torch.max(score, 1)[1].view(train_labels.size()).data == train_labels.data).sum()
        # print(' score size ', score.size(), train_labels.size())
        loss = self.criterion(score, train_labels)
        if loss.size(0) > 1:
            loss = loss.mean()
        # print('loss:', loss)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
        grad_norm = clip_grad_norm(filter(lambda p: p.requires_grad, self.model.parameters()),
                                   self.config.max_norm)
        # if epoch == 11:
        #     print(batch_no, grad_norm)
        self.optimizer.step()

        print_acc_total += 100. * n_correct / len(train_batches[batch_no - 1])
        plot_acc_total += 100. * n_correct / len(train_batches[batch_no - 1])

        if batch_no % self.config.print_every == 0:
            print_acc_avg = print_acc_total / self.config.print_every
            print_acc_total = 0
            print('%s (%d %d%%) %.2f' % (
                helper.show_progress(start, batch_no / num_batches), batch_no,
                batch_no / num_batches * 100, print_acc_avg))

        if batch_no % self.config.plot_every == 0:
            plot_acc_avg = plot_acc_total / self.config.plot_every
            self.train_accuracies.append(plot_acc_avg)
            plot_acc_total = 0

def train(self, train_batches, dev_batches, epoch_no):
    # Turn on training mode which enables dropout.
    self.model.train()

    start = time.time()
    print_loss_total = 0
    plot_loss_total = 0

    num_batches = len(train_batches)
    print('epoch %d started' % epoch_no)
    for batch_no in range(1, num_batches + 1):
        # Clearing out all previous gradient computations.
        self.optimizer.zero_grad()
        train_sessions, length = helper.session_to_tensor(train_batches[batch_no - 1], self.dictionary)
        if self.config.cuda:
            train_sessions = train_sessions.cuda()
            length = length.cuda()

        loss = self.model(train_sessions, length)
        # Important if we are using nn.DataParallel()
        if loss.size(0) > 1:
            loss = torch.mean(loss)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
        clip_grad_norm(self.model.parameters(), self.config.clip)
        self.optimizer.step()

        print_loss_total += loss.data[0]
        plot_loss_total += loss.data[0]

        if batch_no % self.config.print_every == 0:
            print_loss_avg = print_loss_total / self.config.print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (
                helper.show_progress(start, batch_no / num_batches), batch_no,
                batch_no / num_batches * 100, print_loss_avg))

        if batch_no % self.config.plot_every == 0:
            plot_loss_avg = plot_loss_total / self.config.plot_every
            self.train_losses.append(plot_loss_avg)
            plot_loss_total = 0

        if batch_no % self.config.dev_every == 0:
            dev_loss = self.validate(dev_batches)
            self.dev_losses.append(dev_loss)
            print('validation loss = %.4f' % dev_loss)
            if self.best_dev_loss == -1 or self.best_dev_loss > dev_loss:
                self.best_dev_loss = dev_loss
                helper.save_checkpoint({
                    'epoch': epoch_no,
                    'state_dict': self.model.state_dict(),
                    'best_loss': self.best_dev_loss,
                    'optimizer': self.optimizer.state_dict(),
                }, self.config.save_path + 'model_best.pth.tar')
            else:
                self.times_no_improvement += 1
                # No improvement in validation loss for the last n times, so stop training.
                if self.times_no_improvement == 20:
                    self.stop = True
                    break

def train(self, train_dataset):
    batches_idx = helper.get_batches_idx(len(train_dataset), self.args.batch_size)
    print('number of train batches = ', len(batches_idx))

    start = time.time()
    print_loss_total = 0
    plot_loss_total = 0

    num_batches = len(batches_idx)
    for batch_no in range(1, num_batches + 1):  # 1, ..., num_batches
        batch_idx = batches_idx[batch_no - 1]
        batch_data = [train_dataset.dataset[i] for i in batch_idx]

        # Convert a batch of data into the model's input format.
        (hist_query_input, hist_doc_input, session_num, hist_query_num, hist_query_len,
         hist_click_num, hist_doc_len, cur_query_input, cur_doc_input, cur_query_num,
         cur_query_len, cur_click_num, cur_doc_len, query, q_len, doc, d_len, y,
         next_q, next_q_len, _) = helper.batch_to_tensor(batch_data, self.args.max_query_len,
                                                         self.args.max_doc_len)

        indices, slots_num = self.model.get_memory_input(session_num)
        feed_dict = {
            self.model.hist_query_input: hist_query_input,
            self.model.hist_doc_input: hist_doc_input,
            self.model.session_num: session_num,
            self.model.hist_query_num: hist_query_num,
            self.model.hist_query_len: hist_query_len,
            self.model.hist_click_num: hist_click_num,
            self.model.hist_doc_len: hist_doc_len,
            self.model.cur_query_input: cur_query_input,
            self.model.cur_doc_input: cur_doc_input,
            self.model.cur_query_num: cur_query_num,
            self.model.cur_query_len: cur_query_len,
            self.model.cur_click_num: cur_click_num,
            self.model.cur_doc_len: cur_doc_len,
            self.model.q: query,
            self.model.q_len: q_len,
            self.model.d: doc,
            self.model.d_len: d_len,
            self.model.y: y,  # 0/1
            self.model.indices: indices,
            self.model.slots_num: slots_num,
            self.model.next_q: next_q,
            self.model.next_q_len: next_q_len
        }

        # Compute the loss and update the parameters.
        loss_ = self.sess.run(self.model.loss, feed_dict=feed_dict)
        train_op_ = self.sess.run(self.model.train_op, feed_dict=feed_dict)

        print_loss_total += loss_
        plot_loss_total += loss_

        if batch_no % self.args.print_every == 0:
            print_loss_avg = print_loss_total / self.args.print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (
                helper.show_progress(start, batch_no / num_batches), batch_no,
                batch_no / num_batches * 100, print_loss_avg))

        if batch_no % self.args.plot_every == 0:
            plot_loss_avg = plot_loss_total / self.args.plot_every
            self.train_losses.append(plot_loss_avg)
            plot_loss_total = 0
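
# All of the routines above depend on a `helper` module that is not shown here. The sketch
# below illustrates, under stated assumptions, what two of the most frequently used utilities
# might look like: `helper.batchify`, which splits a list of examples into fixed-size batches,
# and `helper.show_progress`, which formats elapsed and estimated remaining time. These are
# hypothetical reconstructions for illustration only, not the repository's actual code.
import math
import random
import time


def batchify(data, batch_size, shuffle=True):
    # Split `data` (a list of examples) into consecutive batches of size `batch_size`.
    if shuffle:
        random.shuffle(data)
    return [data[i:i + batch_size] for i in range(0, len(data), batch_size)]


def convert_to_minutes(seconds):
    # Format a duration given in seconds as 'Mm Ss'.
    minutes = math.floor(seconds / 60)
    seconds -= minutes * 60
    return '%dm %ds' % (minutes, seconds)


def show_progress(start, progress):
    # Return elapsed time and estimated remaining time for a fraction `progress` of the work.
    now = time.time()
    elapsed = now - start
    estimated_total = elapsed / progress
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (convert_to_minutes(elapsed), convert_to_minutes(remaining))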