def forward(self, x, attention_mask=None, labels=None, labels_normal=None, lm_labels=None, labels_sent=None, labels_op=None):
    h_ae = self.fc_ae_1(x)  # Eq 4
    h_op = self.fc_op_1(x)  # Eq 5

    # AE and OE auxiliary tasks
    o_ae = self.fc_ae(F.relu(h_ae))
    o_op = self.fc_op(F.relu(h_op))
    p_ae = self.softmax(o_ae)  # Eq 6
    p_op = self.softmax(o_op)  # Eq 7

    # B: 1, O: 2. Find probability of a word being part of an aspect term
    p_ae = p_ae[:, :, 1] + p_ae[:, :, 2]  # (bsz, seq_len)
    p_ae = p_ae.unsqueeze(1)  # (bsz, 1, seq_len)

    # Find probability of a word being part of an opinion term
    p_op = p_op[:, :, 1] + p_op[:, :, 2]  # (bsz, seq_len)
    p_op = p_op.unsqueeze(1)  # (bsz, 1, seq_len)

    seq_len = x.size()[1]  # N
    zero_diag = -1e18 * torch.eye(seq_len, seq_len, requires_grad=False).to(self.config.device)

    idxs = torch.arange(0, seq_len, requires_grad=False).to(self.config.device)
    idxs = idxs.unsqueeze(1)  # (seq_len, 1)
    tmp = idxs * torch.ones(seq_len, seq_len, requires_grad=False).to(self.config.device)  # (seq_len, seq_len)
    dist_metric = torch.abs(tmp - tmp.transpose(0, 1)) + torch.eye(
        seq_len, seq_len, requires_grad=False).to(self.config.device)  # (seq_len, seq_len)
    dist_metric = 1 / dist_metric

    A = h_ae @ self.W @ h_op.transpose(1, 2)  # (bsz, seq_len, seq_len)
    A = A + zero_diag  # (bsz, seq_len, seq_len)

    # Score matrix Q, Eq 8
    A = A * dist_metric

    op_prime = self.softmax(A * p_op) @ h_op  # Eq 9 + 11
    ae_prime = self.softmax(A.transpose(1, 2) * p_ae) @ h_ae  # Eq 10 + 12

    c = torch.cat([h_ae, ae_prime, h_op, op_prime], dim=2)  # (bsz, seq_len, 4 * h), Eq 13
    o_prime = self.fc(c)  # Eq 14

    # Loss computations
    loss = 0
    active_loss = attention_mask.view(-1) == 1

    # Aspect tag predictions (AE)
    active_logits = o_ae.view(-1, self.config.num_normal_labels)[active_loss]
    active_labels = labels_normal.view(-1)[active_loss]
    loss += self.loss_weight * nn.MultiMarginLoss(margin=1)(active_logits, active_labels)

    # Opinion tag predictions (OE)
    active_logits = o_op.view(-1, self.config.num_normal_labels)[active_loss]
    active_labels = labels_op.view(-1)[active_loss]
    loss += self.loss_weight * nn.MultiMarginLoss(margin=1)(active_logits, active_labels)

    # Unified tag predictions (U)
    active_logits = o_prime.view(-1, self.config.num_labels)[active_loss]
    active_labels = labels.view(-1)[active_loss]
    loss += nn.MultiMarginLoss(margin=3)(active_logits, active_labels)

    return loss, o_prime
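The three loss terms above share one masking pattern: flatten the batch, keep only positions where attention_mask is 1, and apply the hinge loss to those token-level logits. A small self-contained illustration of that pattern (the shapes and tensors below are made up for demonstration and are not taken from the model):

import torch
import torch.nn as nn

bsz, seq_len, num_labels = 2, 5, 3
logits = torch.randn(bsz, seq_len, num_labels)
labels = torch.randint(0, num_labels, (bsz, seq_len))
attention_mask = torch.tensor([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]])

active_loss = attention_mask.view(-1) == 1             # keep only real (non-padding) tokens
active_logits = logits.view(-1, num_labels)[active_loss]
active_labels = labels.view(-1)[active_loss]
print(nn.MultiMarginLoss(margin=1)(active_logits, active_labels))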
loss = nn.ModuleDict([
    ['l1', nn.L1Loss()],
    ['nll', nn.NLLLoss()],
    ['kldiv', nn.KLDivLoss()],
    ['mse', nn.MSELoss()],
    ['bce', nn.BCELoss()],
    ['bce_with_logits', nn.BCEWithLogitsLoss()],
    ['cosine_embedding', nn.CosineEmbeddingLoss()],
    ['ctc', nn.CTCLoss()],
    ['hinge_embedding', nn.HingeEmbeddingLoss()],
    ['margin_ranking', nn.MarginRankingLoss()],
    ['multi_label_margin', nn.MultiLabelMarginLoss()],
    ['multi_label_soft_margin', nn.MultiLabelSoftMarginLoss()],
    ['multi_margin', nn.MultiMarginLoss()],
    ['smooth_l1', nn.SmoothL1Loss()],
    ['soft_margin', nn.SoftMarginLoss()],
    ['cross_entropy', nn.CrossEntropyLoss()],
    ['triplet_margin', nn.TripletMarginLoss()],
    ['poisson_nll', nn.PoissonNLLLoss()]
])


def _parse(
    identifier: typing.Union[str, typing.Type[nn.Module], nn.Module],
    dictionary: nn.ModuleDict,
    target: str
) -> nn.Module:
    """Parse loss and activation."""
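One plausible way to exercise the dictionary above is a plain key lookup, since nn.ModuleDict supports string indexing; judging from its signature, the _parse helper is only needed when the identifier may also be a class or an instance. A hypothetical usage sketch (scores and targets are invented names):

import torch

scores = torch.randn(4, 10)            # hypothetical (batch, num_classes) raw scores
targets = torch.randint(0, 10, (4,))   # hypothetical integer class targets
criterion = loss['multi_margin']       # fetch nn.MultiMarginLoss() by its string key
print(criterion(scores, targets))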
def fine_tune(self, X, Y):
    self.log("==============================================================")
    self.log("Supervised learning with input " + str(X.shape))
    self.log("batch_size = " + str(self.batch_size))
    self.log("num_epochs = " + str(self.num_epochs))
    self.log("init_lr = " + str(self.init_lr))
    self.log("l2_regu_weight_decay = " + str(self.l2_regu_weight_decay))
    self.log("lr_schedule_step_size = " + str(self.lr_schedule_step_size))
    self.log("lr_schedule_gamma = " + str(self.lr_schedule_gamma))
    self.log("use_class_weights = " + str(self.use_class_weights))
    self.log("is_regr = " + str(self.is_regr))
    self.log("--------------------------------------------------------------")

    start_time = datetime.now()

    # Loss function
    if self.is_regr:
        criterion = nn.MSELoss()
        #criterion = nn.SmoothL1Loss()
        if self.use_class_weights:
            self.log("Regression will ignore class weights")
    else:
        #criterion = nn.CrossEntropyLoss()
        criterion = nn.MultiMarginLoss()
        # Compute the weight of each class (because the dataset is imbalanced)
        if self.use_class_weights:
            class_weights = float(X.shape[0]) / (output_size * np.bincount(Y.squeeze()))
            class_weights = torch.FloatTensor(class_weights)
            if self.use_cuda:
                class_weights = class_weights.cuda()
            criterion = nn.CrossEntropyLoss(weight=class_weights)

    # Optimizer
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, self.model.parameters()),
                                 lr=self.init_lr, weight_decay=self.l2_regu_weight_decay)

    # Learning rate scheduler
    rule = lambda epoch: self.lr_schedule_gamma ** (epoch // self.lr_schedule_step_size)
    scheduler = LambdaLR(optimizer, lr_lambda=[rule])

    # Save original training data
    self.train = {"X": deepcopy(X), "Y": deepcopy(Y)}

    # Break data into batches
    num_of_left_overs = self.batch_size - (X.shape[0] % self.batch_size)
    X = np.append(X, X[0:num_of_left_overs], 0)
    Y = np.append(Y, Y[0:num_of_left_overs], 0)
    num_of_batches = X.shape[0] // self.batch_size
    X = np.split(X, num_of_batches, 0)
    Y = np.split(Y, num_of_batches, 0)

    # Train the Model
    for epoch in range(1, self.num_epochs + 1):
        X, Y = shuffle(X, Y)  # shuffle batches
        loss_all = []  # for saving the loss in each step
        scheduler.step()  # adjust learning rate

        # Loop through all batches
        for x, y in zip(X, Y):
            x = torch.FloatTensor(x)
            if self.is_regr:
                y = torch.FloatTensor(y)
            else:
                y = torch.LongTensor(y)
            if self.use_cuda:
                x, y = x.cuda(), y.cuda()
            x, y = Variable(x), Variable(y)
            optimizer.zero_grad()  # reset gradient
            outputs = self.model(x)  # forward propagation
            loss = criterion(outputs, y)  # compute loss
            loss.backward()  # backward propagation
            optimizer.step()  # optimize
            loss_all.append(loss.data[0])  # save loss for each step

        # Print the result for the entire epoch
        T_tr, P_tr = self.train["Y"], self.predict(self.train["X"])
        m_train = computeMetric(T_tr, P_tr, self.is_regr, flatten=True, simple=True, aggr_axis=True)
        if self.test is not None:
            T_te, P_te = self.test["Y"], self.predict(self.test["X"])
            m_test = computeMetric(T_te, P_te, self.is_regr, flatten=True, simple=True, aggr_axis=True)
        lr_now = optimizer.state_dict()["param_groups"][0]["lr"]
        avg_loss = np.mean(loss_all)
        if self.is_regr:
            if self.test is not None:
                self.log('[%2d/%d], LR: %.8f, Loss: %.8f, [mse, r2], [%2f, %2f], [%2f, %2f]'
                         % (epoch, self.num_epochs, lr_now, avg_loss,
                            m_train["mse"], m_train["r2"], m_test["mse"], m_test["r2"]))
            else:
                self.log('[%2d/%d], LR: %.8f, Loss: %.8f, [mse, r2], [%5d, %5d]'
                         % (epoch, self.num_epochs, lr_now, avg_loss,
                            m_train["mse"], m_train["r2"]))
        else:
            cm_names = " ".join(m_train["cm"][0])
            cm_train = " ".join(map(lambda x: '%5d' % (x), m_train["cm"][1]))
            if self.test is not None:
                cm_test = " ".join(map(lambda x: '%4d' % (x), m_test["cm"][1]))
                self.log('[%2d/%d], LR: %.8f, Loss: %.8f, [%s], [%s], [%s]'
                         % (epoch, self.num_epochs, lr_now, avg_loss, cm_names, cm_train, cm_test))
            else:
                self.log('[%2d/%d], LR: %.9f, Loss: %.9f, [%s], [%s]'
                         % (epoch, self.num_epochs, lr_now, avg_loss, cm_names, cm_train))

    self.log("--------------------------------------------------------------")
    self.log("From " + str(start_time) + " to " + str(datetime.now()))
    self.log("--------------------------------------------------------------")
    return self
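The inverse-frequency class weights used above when use_class_weights is set (n_samples / (n_classes * bincount)) can be sanity-checked on a toy label vector. A standalone sketch with made-up data, independent of the class above:

import numpy as np
import torch
import torch.nn as nn

Y = np.array([0, 0, 0, 1, 2, 2])                       # imbalanced toy labels
n_classes = 3
class_weights = len(Y) / (n_classes * np.bincount(Y))  # approx. [0.667, 2.0, 1.0]
criterion = nn.CrossEntropyLoss(weight=torch.FloatTensor(class_weights))
print(class_weights)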
def train():
    model = VGG11()
    if use_cuda:
        torch.cuda.set_device(gpu_id)
        model = model.cuda()

    data_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        # transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(15),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    train_loader, valid_loader = make_train_data_loader(train_data_path, data_transform)
    print('trainset len:', len(train_loader.dataset))
    print('train loader len:', len(train_loader))
    print('valid loader len:', len(valid_loader))
    print('=========================================')

    # optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)
    optimizer = torch.optim.SGD(params=model.parameters(), lr=0.001, momentum=0.9)
    SVMloss = nn.MultiMarginLoss()

    train_loss_list = []
    valid_loss_list = []
    train_accuracy_list = []
    best_accuracy = 0.0

    for epoch in range(num_epochs):
        print(f'Epoch: {epoch + 1}/{num_epochs}')
        print('-' * len(f'Epoch: {epoch + 1}/{num_epochs}'))
        train_loss = 0.0
        valid_loss = 0.0
        training_accuracy = 0.0
        predict_correct = 0

        model.train()  # restore training mode; model.eval() is set during validation below
        for data, label in train_loader:
            if use_cuda:
                data, label = data.cuda(), label.cuda()
            optimizer.zero_grad()
            output = model(data)
            _, prediction = torch.max(output.data, 1)
            loss = SVMloss(output, label)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * data.size(0)
            predict_correct += torch.sum(prediction == label.data)

        model.eval()
        for data, label in valid_loader:
            if use_cuda:
                data, label = data.cuda(), label.cuda()
            output = model(data)
            loss = SVMloss(output, label)
            valid_loss += loss.item() * data.size(0)

        train_loss = train_loss / float(np.floor(len(train_loader.dataset) * (1 - valid_size)))
        train_loss_list.append(train_loss)
        valid_loss = valid_loss / float(np.floor(len(valid_loader.dataset) * valid_size))
        valid_loss_list.append(valid_loss)
        training_accuracy = float(predict_correct) / float(len(train_loader.dataset))
        train_accuracy_list.append(training_accuracy)
        print(f'Training loss: {train_loss:.4f}\nValidation loss: {valid_loss:.4f}\nAccuracy: {training_accuracy:.4f}')

        if training_accuracy > best_accuracy:
            best_accuracy = training_accuracy
            # torch.save(model.state_dict(), weight_path)
            best_weight = copy.deepcopy(model.state_dict())
            print(f'Best accuracy update, current best weights saved')
            # print(f'best accuracy update: {best_accuracy:.4f}, current best weights saved')
        print('\n')

    # model.load_state_dict(best_weight)
    torch.save(best_weight, weight_path)
    print(f'best weight saved at {weight_path}')

    x1 = range(0, len(train_accuracy_list))
    x2 = range(0, len(train_loss_list))
    x3 = range(0, len(valid_loss_list))
    y1 = train_accuracy_list
    y2 = train_loss_list
    y3 = valid_loss_list

    plt.subplots_adjust(left=0.1, bottom=0.2, right=0.9, top=0.9, wspace=0.1, hspace=0.9)
    plt.subplot(2, 1, 1)
    plt.plot(x1, y1, 'm', linestyle='-', label='Training accuracy')
    plt.xlabel(u'epoches')
    plt.ylabel(u'Accuracy')
    plt.xlim(0, len(train_accuracy_list))
    plt.title('Train accuracy vs. epoches')
    plt.grid('on')
    plt.subplot(2, 1, 2)
    plt.plot(x2, y2, 'c', linestyle='-', label='train loss')
    plt.plot(x3, y3, 'y', linestyle='-', label='valid loss')
    plt.xlabel(u'epoches')
    plt.ylabel(u'Loss')
    plt.xlim(0, len(train_accuracy_list))
    plt.title('Train loss vs. valid loss')
    plt.grid('on')
    plt.legend(loc=1)
    plt.savefig("accuracy_loss.png")
    plt.show()
        h = self.dropout(h)  # (batch_size * hidden_size)
        return h

#%%
hidden_size = 100
learning_rate = 1e-3  # another option is 3e-4
wd = 0
n_filter = 4
n_negative = 20  # number of negative questions
#dropout = 0.1  # two more options 0.2 and 0.3

cnnmodel = CNN(hidden_size, n_filter)
lossfunction = nn.MultiMarginLoss(p=1, margin=0.2)
optimizer = optim.Adam(cnnmodel.parameters(), lr=learning_rate, weight_decay=wd)

#%%
import sys


def cnntrain(query_embedding, positive_embedding, negative_embedding):
    n_batch = len(query_embedding)
    optimizer.zero_grad()
    similarity_matrix = Variable(torch.zeros(n_batch, n_negative + 1))
    #query_vec = cnnmodel(Variable(torch.FloatTensor(query_embedding)))
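The cnntrain body is cut off above, but the setup suggests a standard max-margin ranking arrangement: each row of the similarity matrix holds one query's scores against n_negative + 1 candidate questions, and MultiMarginLoss(p=1, margin=0.2) pushes the positive candidate's score above every negative by the margin. A hedged, standalone sketch of that arrangement (it assumes the positive sits in column 0, a common convention that the truncated code does not confirm):

import torch
import torch.nn as nn

n_batch, n_negative = 8, 20
similarity_matrix = torch.randn(n_batch, n_negative + 1)  # column 0: positive, columns 1..20: negatives
targets = torch.zeros(n_batch, dtype=torch.long)          # index of the positive candidate in each row
ranking_loss = nn.MultiMarginLoss(p=1, margin=0.2)
print(ranking_loss(similarity_matrix, targets))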
def main():
    global args, best_auc
    args = parser.parse_args()
    cuda_available = torch.cuda.is_available()
    print args

    embedding_file = 'data/glove/glove.pruned.txt.gz'
    embedding_iter = Embedding.iterator(embedding_file)
    embed_size = 300
    embedding = Embedding(embed_size, embedding_iter)
    print 'Embeddings loaded.'

    android_corpus_file = 'data/android/corpus.tsv.gz'
    android_dataset = AndroidDataset(android_corpus_file)
    android_corpus = android_dataset.get_corpus()
    android_ids = embedding.corpus_to_ids(android_corpus)
    print 'Got Android corpus ids.'

    ubuntu_corpus_file = 'data/askubuntu/text_tokenized.txt.gz'
    ubuntu_dataset = UbuntuDataset(ubuntu_corpus_file)
    ubuntu_corpus = ubuntu_dataset.get_corpus()
    ubuntu_ids = embedding.corpus_to_ids(ubuntu_corpus)
    print 'Got AskUbuntu corpus ids.'

    padding_id = embedding.vocab_ids['<padding>']

    ubuntu_train_file = 'data/askubuntu/train_random.txt'
    ubuntu_train_data = ubuntu_dataset.read_annotations(ubuntu_train_file)

    dev_pos_file = 'data/android/dev.pos.txt'
    dev_neg_file = 'data/android/dev.neg.txt'
    android_dev_data = android_dataset.read_annotations(dev_pos_file, dev_neg_file)
    android_dev_batches = batch_utils.generate_eval_batches(
        android_ids, android_dev_data, padding_id)

    assert args.model in ['lstm', 'cnn']
    if args.model == 'lstm':
        model_encoder = LSTM(embed_size, args.hidden)
    else:
        model_encoder = CNN(embed_size, args.hidden)
    model_classifier = FFN(args.hidden)
    print model_encoder
    print model_classifier

    optimizer_encoder = torch.optim.Adam(model_encoder.parameters(), lr=args.elr)
    criterion_encoder = nn.MultiMarginLoss(margin=args.margin)
    optimizer_classifier = torch.optim.Adam(model_classifier.parameters(), lr=args.clr)
    criterion_classifier = nn.CrossEntropyLoss()

    if cuda_available:
        criterion_encoder = criterion_encoder.cuda()
        criterion_classifier = criterion_classifier.cuda()

    if args.load:
        if os.path.isfile(args.load):
            print 'Loading checkpoint.'
            checkpoint = torch.load(args.load)
            args.start_epoch = checkpoint['epoch']
            best_auc = checkpoint.get('best_auc', -1)
            model_encoder.load_state_dict(checkpoint['encoder_state_dict'])
            model_classifier.load_state_dict(checkpoint['classifier_state_dict'])
            print 'Loaded checkpoint at epoch {}.'.format(checkpoint['epoch'])
        else:
            print 'No checkpoint found here.'

    if args.eval:
        test_pos_file = 'data/android/test.pos.txt'
        test_neg_file = 'data/android/test.neg.txt'
        android_test_data = android_dataset.read_annotations(test_pos_file, test_neg_file)
        android_test_batches = batch_utils.generate_eval_batches(
            android_ids, android_test_data, padding_id)

        print 'Evaluating on dev set.'
        train_utils.evaluate_auc(args, model_encoder, embedding,
                                 android_dev_batches, padding_id)
        print 'Evaluating on test set.'
        train_utils.evaluate_auc(args, model_encoder, embedding,
                                 android_test_batches, padding_id)
        return

    for epoch in xrange(args.start_epoch, args.epochs):
        encoder_train_batches = batch_utils.generate_train_batches(
            ubuntu_ids, ubuntu_train_data, args.batch_size, padding_id)
        classifier_train_batches = \
            batch_utils.generate_classifier_train_batches(
                ubuntu_ids, android_ids, args.batch_size,
                len(encoder_train_batches), padding_id)

        train_utils.train_encoder_classifer(
            args, model_encoder, model_classifier, embedding,
            optimizer_encoder, optimizer_classifier,
            criterion_encoder, criterion_classifier,
            zip(encoder_train_batches, classifier_train_batches),
            padding_id, epoch, args.lmbda)

        auc = train_utils.evaluate_auc(args, model_encoder, embedding,
                                       android_dev_batches, padding_id)

        is_best = auc > best_auc
        best_auc = max(auc, best_auc)
        save(args, {
            'epoch': epoch + 1,
            'arch': 'lstm',
            'encoder_state_dict': model_encoder.state_dict(),
            'classifier_state_dict': model_classifier.state_dict(),
            'best_auc': best_auc,
        }, is_best)
def train_model(lambda_val, embedding_size, hidden_size, filter_width, max_or_mean,
                max_num_epochs, batch_size, learning_rate_1, learning_rate_2,
                loss_margin, training_checkpoint, dropout_prob, eval_batch_size):
    global load_model_path, train_data_ubuntu_1, train_data_ubuntu_2, train_data_android_2, source_questions
    global dev_pos_data, dev_neg_data, test_pos_data, test_neg_data, target_questions
    global dev_data, dev_label_dict, test_data, test_label_dict, opt_mrr, opt_model_params

    # Generate model
    cnn = CNN(embedding_size, hidden_size, filter_width, max_or_mean, dropout_prob)
    optimizer_1 = optim.Adam(cnn.parameters(), lr=learning_rate_1)
    criterion_1 = nn.MultiMarginLoss(margin=loss_margin)
    ffn = FFN(hidden_size)
    optimizer_2 = optim.Adam(ffn.parameters(), lr=learning_rate_2)
    criterion_2 = nn.functional.cross_entropy
    init_epoch = 1

    # Training
    print("***************************************")
    print("Starting run with following parameters:")
    print(" --lambda: %f" % (lambda_val))
    print(" --embedding size: %d" % (cnn.input_size))
    print(" --hidden size: %d" % (cnn.hidden_size))
    print(" --filter width: %d" % (cnn.n))
    print(" --dropout: %f" % (cnn.dropout_prob))
    print(" --pooling: %s" % (cnn.max_or_mean))
    print(" --initial epoch: %d" % (init_epoch))
    print(" --number of epochs: %d" % (max_num_epochs))
    print(" --batch size: %d" % (batch_size))
    print(" --learning rate 1: %f" % (learning_rate_1))
    print(" --learning rate 2: %f" % (learning_rate_2))
    print(" --loss margin: %f" % (loss_margin))

    start = time.time()
    current_loss = 0

    for iter in range(init_epoch, max_num_epochs + 1):
        current_loss += train(cnn, ffn, criterion_1, criterion_2, optimizer_1, optimizer_2,
                              train_data_ubuntu_1, (train_data_ubuntu_2, train_data_android_2),
                              (source_questions, target_questions), batch_size, lambda_val)
        if iter % training_checkpoint == 0:
            print("Epoch %d: Average Train Loss: %.5f, Time: %s" %
                  (iter, (current_loss / training_checkpoint), timeSince(start)))
            d_auc = evaluate_auc(cnn, dev_pos_data, dev_neg_data, target_questions, eval_batch_size)
            t_auc = evaluate_auc(cnn, test_pos_data, test_neg_data, target_questions, eval_batch_size)
            print("Dev AUC(0.05): %.2f" % (d_auc))
            print("Test AUC(0.05): %.2f" % (t_auc))
            current_loss = 0

    # Compute final results
    print("-------")
    print("FINAL RESULTS:")
    d_auc = evaluate_auc(cnn, dev_pos_data, dev_neg_data, target_questions, eval_batch_size)
    t_auc = evaluate_auc(cnn, test_pos_data, test_neg_data, target_questions, eval_batch_size)
    print("Training time: %s" % (timeSince(start)))
    print("Dev AUC(0.05): %.2f" % (d_auc))
    print("Test AUC(0.05): %.2f" % (t_auc))

    return (d_auc, t_auc)
def train(train_loader, model, criterion, optimizer, epoch): batch_time = AverageMeter() data_time = AverageMeter() losses = AverageMeter() top1 = AverageMeter() top3 = AverageMeter() top5 = AverageMeter() # switch to train mode model.train() end = time.time() for i, (input, target) in enumerate(train_loader): # measure data loading time data_time.update(time.time() - end) #print type(target.float()) if 'L1' in args.arch or args.L1 == 1 or args.labelboost > 1e-6 or args.focal > 0: targetTensor = np.zeros((input.size()[0], args.nclass)) for j in range(input.size()[0]): targetTensor[j, target[j]] = 1.0 #targetTensor = targetTensor[:input.size[0],:input.size[1]] targetTensor = torch.FloatTensor(targetTensor) targetTensor = targetTensor.cuda(async=True) target = target.cuda(async=True) target_var = torch.autograd.Variable(targetTensor) elif args.labelnocompete > 0: targetTensor = np.concatenate([ np.zeros((input.size()[0], args.nclass)) ,\ np.ones((input.size()[0], args.nclass))], \ axis=1) for j in range(input.size()[0]): targetTensor[j, target[j]] = 1.0 targetTensor[j, target[j] + args.nclass] = 0.0 targetTensor = torch.FloatTensor(targetTensor) targetTensor = targetTensor.cuda(async=True) target = target.cuda(async=True) target_var = torch.autograd.Variable(targetTensor) elif args.labelsm: targetTensor = np.zeros((input.size()[0], args.nclass)) for j in range(input.size()[0]): targetTensor[j, target[j]] = 1.0 targetTensor = (targetTensor * current_labelsm(epoch) + (1 - current_labelsm(epoch)) / args.nclass) targetTensor = torch.FloatTensor(targetTensor) targetTensor = targetTensor.cuda(async=True) target = target.cuda(async=True) target_var = torch.autograd.Variable(targetTensor) else: target = target.cuda(async=True) target_var = torch.autograd.Variable(target) input_var = torch.autograd.Variable(input) # compute output output = model(input_var) if args.labelsm: #print input.size(), output.size(), target_var.size() output = nn.LogSoftmax()(output) #print output.data[0] loss = torch.mean(torch.sum(torch.mul(-output, target_var), 1)) elif args.L1: output = nn.Softmax()(output) loss = nn.SmoothL1Loss()(output * args.nclass, target_var * args.nclass) elif args.MarginP > 0: loss = nn.MultiMarginLoss(p=args.MarginP, margin=args.MarginV)(output, target_var) elif abs(args.labelboost) > 1e-6: # Boosted CNN Implementation outq = nn.LogSoftmax()(output[:, :args.nclass]) outp = nn.Softmax()(output[:, :args.nclass]) #print "outp",(outp - outp[target]).data[0] # w = outp[target]#**(-1.0/args.nclass) # w = outp[target] #print outp.size(), target_var.size() #print (outp * target_var).data[0] w = (1.0 / args.nclass + torch.sum(outp * target_var, 1))**(-1.0 / args.labelboost) w = w / torch.sum(w) #w = torch.exp(( - output + outp[target]) * (-0.5)) #print "w",w.data[0] #print target_var.size(), (1 - torch.sum(w,1)).expand(input.size()[0], args.nclass).size() # w1 = w + torch.mul(target_var , ( - torch.sum(w,1) ).expand(input.size()[0], args.nclass) ) #print w1.data[0] #print torch.sum( torch.mul( -outq , w ) , 1 ).size() #print outq.size() #loss = torch.mean( torch.sum( torch.mul( -outq , (target_var + outp*args.labelboost)/(1.0 + args.labelboost) ) , 1 )) #loss = torch.mean( torch.sum( w , 1 ) ) loss = torch.sum( torch.mul(w, torch.sum(torch.mul(-outq, target_var), 1))) elif args.focal > 0: outq = nn.LogSoftmax()(output[:, :args.nclass]) outp = nn.Softmax()(output[:, :args.nclass]) OneMinusPToGamma = (1.0 - torch.sum(outp * target_var, 1))**2 LogP = torch.sum(-outq * target_var, 1) loss = 
torch.mean(torch.mul(OneMinusPToGamma, LogP)) elif args.labelnocompete > 0: ''' isout = output[:,:args.nclass] notout = output[:,args.nclass:args.nclass*2] islabel = target_var[:,:args.nclass] notlabel = target_var[:,args.nclass:args.nclass*2] outdiv = torch.log(torch.exp(isout)+torch.exp(notout)) isoutq = outdiv - isout notoutq = outdiv - notout loss = torch.mean(torch.sum(islabel * isoutq + notlabel * notoutq, 1)) ''' outq = nn.LogSoftmax()(torch.cat( [output[:, :args.nclass], -output[:, :args.nclass]], 1)) loss = torch.mean( torch.sum((-outq * target_var)[:, :args.nclass], 1)) """ outp = nn.Softmax()(output) #print outq.size(),outp.size(),target_var.size() loss = torch.mean(\ torch.sum(\ torch.mul(-outq, target_var * (1.0 + args.labelboost) - outp * (args.labelboost)) ,1)\ ) """ else: loss = criterion(output, target_var) # measure accuracy and record loss prec1, prec3, prec5 = accuracy(output.data[:, :args.nclass], target, topk=(1, 3, 5)) losses.update(loss.data[0], input.size(0)) top1.update(prec1[0], input.size(0)) top3.update(prec3[0], input.size(0)) top5.update(prec5[0], input.size(0)) # compute gradient and do SGD step optimizer.zero_grad() loss.backward() optimizer.step() # measure elapsed time batch_time.update(time.time() - end) end = time.time() if i % args.print_freq == 0: print( 'Epoch: [{0}][{1}/{2}]\t' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 'Prec@3 {top3.val:.3f} ({top3.avg:.3f})\t' 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( epoch, i, len(train_loader), batch_time=batch_time, data_time=data_time, loss=losses, top1=top1, top3=top3, top5=top5))
def validate(val_loader, model, criterion): batch_time = AverageMeter() losses = AverageMeter() top1 = AverageMeter() top3 = AverageMeter() top5 = AverageMeter() # switch to evaluate mode model.eval() end = time.time() for i, (input, target) in enumerate(val_loader): if 'L1' in args.arch or args.L1 == 1 or args.labelboost > 1e-6: targetTensor = np.zeros((input.size()[0], args.nclass)) for j in range(input.size()[0]): targetTensor[j, target[j]] = 1.0 #targetTensor = targetTensor[:input.size[0],:input.size[1]] targetTensor = torch.FloatTensor(targetTensor) targetTensor = targetTensor.cuda(async=True) target = target.cuda(async=True) target_var = torch.autograd.Variable(targetTensor) else: target = target.cuda(async=True) target_var = torch.autograd.Variable(target, volatile=True) input_var = torch.autograd.Variable(input, volatile=True) # compute output output = model(input_var) if args.L1: output = nn.Softmax()(output) loss = nn.SmoothL1Loss()(output * args.nclass, target_var * args.nclass) elif args.MarginP > 0: loss = nn.MultiMarginLoss(p=args.MarginP, margin=args.MarginV)(output, target_var) elif abs(args.labelboost) > 1e-6: outq = nn.LogSoftmax()(output[:, :args.nclass]) outp = nn.Softmax()(output[:, :args.nclass]) #print "outp",(outp - outp[target]).data[0] #w = torch.exp(( - output + outp[target]) * (-0.5)) #print "w",w.data[0] #print target_var.size(), (1 - torch.sum(w,1)).expand(input.size()[0], args.nclass).size() # w1 = w + torch.mul(target_var , ( - torch.sum(w,1) ).expand(input.size()[0], args.nclass) ) #print w1.data[0] #print torch.sum( torch.mul( -outq , w ) , 1 ).size() loss = torch.mean( torch.sum( torch.mul(-outq, (target_var + outp * args.labelboost) / (1.0 + args.labelboost)), 1)) #loss = torch.mean( torch.sum( w , 1 ) ) """ outp = nn.Softmax()(output) #print outq.size(),outp.size(),target_var.size() loss = torch.mean(\ torch.sum(\ torch.mul(-outq, target_var * (1.0 + args.labelboost) - outp * (args.labelboost)) ,1)\ ) """ else: loss = criterion(output, target_var) # measure accuracy and record loss prec1, prec3, prec5 = accuracy(output.data[:, :args.nclass], target, topk=(1, 3, 5)) losses.update(loss.data[0], input.size(0)) top1.update(prec1[0], input.size(0)) top3.update(prec3[0], input.size(0)) top5.update(prec5[0], input.size(0)) # measure elapsed time batch_time.update(time.time() - end) end = time.time() if i % args.print_freq == 0: print( 'Test: [{0}/{1}]\t' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 'Prec@3 {top3.val:.3f} ({top3.avg:.3f})\t' 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( i, len(val_loader), batch_time=batch_time, loss=losses, top1=top1, top3=top3, top5=top5)) print( ' * Prec@1 {top1.avg:.3f} Prec@3 {top3.avg:.3f} Prec@5 {top5.avg:.3f}'. format(top1=top1, top3=top3, top5=top5)) return top1.avg
def main(args): #torch.manual_seed(123) EMBEDDING_DIM = 200 HIDDEN_DIM = 250 num_epochs = 20 task = args.task granularity = args.granularity dict = {} dict_char_ngram = {} word_freq = {} fake_dict = {} oov = [] feature_maps = [50, 100, 150, 200, 200, 200, 200] kernels = [1, 2, 3, 4, 5, 6, 7] charcnn_embedding_size = 15 max_word_length = 20 c2w_mode = False character_ngrams = 3 character_ngrams_2 = None character_ngrams_overlap = False glove_mode = None update_inv_mode = None update_oov_mode = None combine_mode = None lm_mode = None word_mode = (glove_mode, update_inv_mode, update_oov_mode) if torch.cuda.is_available(): basepath = expanduser("~") + '/pytorch/DeepPairWiseWord' else: basepath = expanduser( "~") + '/Documents/research/pytorch/DeepPairWiseWord' if task == 'url': num_class = 2 trainset = readURLdata(basepath + '/data/url/train/', granularity) testset = readURLdata(basepath + '/data/url/test_9324/', granularity) elif task == 'quora': num_class = 2 trainset = readURLdata(basepath + '/data/quora/train/', granularity) testset = readURLdata(basepath + '/data/quora/test/', granularity) elif task == 'msrp': num_class = 2 trainset = readURLdata(basepath + '/data/msrp/train/', granularity) testset = readURLdata(basepath + '/data/msrp/test/', granularity) elif task == 'sick': num_class = 5 trainset = readSICKdata(basepath + '/data/sick/train/', granularity) devset = readSICKdata(basepath + '/data/sick/dev/', granularity) testset = readSICKdata(basepath + '/data/sick/test/', granularity) elif task == 'pit': num_class = 2 trainset = readPITdata(basepath + '/data/pit/train/', granularity) #devset = readPITdata(basepath+'/data/pit/dev/',granularity) testset = readPITdata(basepath + '/data/pit/test/', granularity) elif task == 'hindi': num_class = 2 trainset = read_Hindi_data(basepath + '/data/hindi/train/', granularity) testset = read_Hindi_data(basepath + '/data/hindi/test/', granularity) elif task == 'sts': num_class = 6 trainset = readSTSdata(basepath + '/data/sts/train/', granularity) testset = readSTSdata(basepath + '/data/sts/test/', granularity) elif task == 'snli': num_class = 3 trainset = readSNLIdata(basepath + '/data/snli/train/', granularity) testset = readSNLIdata(basepath + '/data/snli/test/', granularity) elif task == 'mnli': num_class = 3 trainset = readMNLIdata(basepath + '/data/mnli/train/', granularity) devset_m = readMNLIdata(basepath + '/data/mnli/dev_m/', granularity) devset_um = readMNLIdata(basepath + '/data/mnli/dev_um/', granularity) testset_m = readMNLIdata(basepath + '/data/mnli/test_m/', granularity) testset_um = readMNLIdata(basepath + '/data/mnli/test_um/', granularity) elif task == 'wiki': ''' _name_to_id = { 'counter-vandalism': 0, 'fact-update': 1, 'refactoring': 2, 'copy-editing': 3, 'other': 4, 'wikification': 5, 'vandalism': 6, 'simplification': 7, 'elaboration': 8, 'verifiability': 9, 'process': 10, 'clarification': 11, 'disambiguation': 12, 'point-of-view': 13 } ''' num_class = 14 data = pickle.load(open(basepath + "/data/wiki/data.cpickle", "rb")) left = [] right = [] label = [] id = [] for i in range(2976): id.append(data[i][0]) label.append([int(item) for item in data[i][3][0]]) left_sent = [item.encode('utf-8') for item in data[i][1][0]] right_sent = [item.encode('utf-8') for item in data[i][2][0]] shared = [] for item in left_sent: if item in right_sent: shared.append(item) for item in shared: if item in left_sent and item in right_sent: left_sent.remove(item) right_sent.remove(item) if len(left_sent) == 0: left_sent = ['<EMPTY-EDIT>'] if 
len(right_sent) == 0: right_sent = ['<EMPTY-EDIT>'] left.append(left_sent) right.append(right_sent) #print(left_sent) #print(right_sent) #print(id[0]) #print('*'*20) trainset = (left, right, label) #sys.exit() left = [] right = [] label = [] for i in range(2376, 2976): id.append(data[i][0]) label.append([int(item) for item in data[i][3][0]]) left_sent = [item.encode('utf-8') for item in data[i][1][0]] right_sent = [item.encode('utf-8') for item in data[i][2][0]] shared = [] for item in left_sent: if item in right_sent: shared.append(item) for item in shared: if item in left_sent and item in right_sent: left_sent.remove(item) right_sent.remove(item) if len(left_sent) == 0: left_sent = ['<EMPTY-EDIT>'] if len(right_sent) == 0: right_sent = ['<EMPTY-EDIT>'] left.append(left_sent) right.append(right_sent) testset = (left, right, label) elif task == 'wikiqa': num_class = 2 trainset = readURLdata(basepath + '/data/wikiqa/train/', granularity) testset = readURLdata(basepath + '/data/wikiqa/test/', granularity) elif task == 'trecqa': num_class = 2 trainset = readURLdata(basepath + '/data/trecqa/train-all/', granularity) testset = readURLdata(basepath + '/data/trecqa/raw-test/', granularity) else: print('wrong input for the first argument!') sys.exit() if granularity == 'word': tokens = [] count = 0 num_inv = 0 num_oov = 0 glove_mode = True update_inv_mode = True update_oov_mode = True word_mode = (glove_mode, update_inv_mode, update_oov_mode) if task == 'sick' or task == 'quora' or task == 'msrp': for line in open(basepath + '/data/' + task + '/vocab.txt'): tokens.append(line.strip()) dict = {} EMBEDDING_DIM = 300 #wv_dict, wv_arr, wv_size = load_word_vectors(basepath + '/VDPWI-NN-Torch/data/glove', 'glove.twitter.27B', EMBEDDING_DIM) wv_dict, wv_arr, wv_size = load_word_vectors( basepath + '/VDPWI-NN-Torch/data/glove', 'glove.840B', EMBEDDING_DIM) #wv_dict, wv_arr, wv_size = load_word_vectors(basepath + '/data/paragram/paragram_300_sl999/', 'paragram', EMBEDDING_DIM) #wv_dict={} #wv_arr={} for word in tokens: fake_dict[word] = torch.Tensor([ random.uniform(-0.05, 0.05) for i in range(EMBEDDING_DIM) ]) try: dict[word] = wv_arr[wv_dict[word]] num_inv += 1 except: num_oov += 1 #print(word) oov.append(word) dict[word] = torch.Tensor([ random.uniform(-0.05, 0.05) for i in range(EMBEDDING_DIM) ]) elif task == 'sts': for line in open(basepath + '/data/' + task + '/vocab.txt'): tokens.append(line.strip()) dict = {} #EMBEDDING_DIM = 200 #wv_dict, wv_arr, wv_size = load_word_vectors(basepath + '/VDPWI-NN-Torch/data/glove', 'glove.twitter.27B', EMBEDDING_DIM) #EMBEDDING_DIM = 300 #wv_dict, wv_arr, wv_size = load_word_vectors(basepath + '/VDPWI-NN-Torch/data/glove', 'glove.840B', EMBEDDING_DIM) EMBEDDING_DIM = 300 wv_dict, wv_arr, wv_size = load_word_vectors( basepath + '/data/paragram/paragram_300_sl999/', 'paragram', EMBEDDING_DIM) #wv_dict={} #wv_arr={} #oov = [] #for line in open(basepath + '/data/' + task + '/oov.txt'): # line = line.strip() # oov.append(line) #inv = [] #for line in open(basepath + '/data/' + task + '/inv_14000.txt'): # line = line.strip() # inv.append(line) # count=len(oov)+len(inv) #inv = tokens num_oov = 0 num_inv = 0 for word in tokens: fake_dict[word] = torch.Tensor([ random.uniform(-0.05, 0.05) for i in range(EMBEDDING_DIM) ]) try: dict[word] = wv_arr[wv_dict[word]] num_inv += 1 except: num_oov += 1 oov.append(word) dict[word] = torch.Tensor([ random.uniform(-0.05, 0.05) for i in range(EMBEDDING_DIM) ]) dict_char_ngram = pickle.load( open(basepath + '/data/' + task + 
'/char_dict.p', "rb")) word_freq = pickle.load( open(basepath + '/data/' + task + '/word_freq.p', "rb")) elif task == 'snli' or task == 'wikiqa' or task == 'trecqa' or task == 'mnli': for line in open(basepath + '/data/' + task + '/vocab.txt'): tokens.append(line.strip()) dict = {} #EMBEDDING_DIM = 200 #wv_dict, wv_arr, wv_size = load_word_vectors(basepath + '/VDPWI-NN-Torch/data/glove', 'glove.twitter.27B', EMBEDDING_DIM) EMBEDDING_DIM = 300 wv_dict, wv_arr, wv_size = load_word_vectors( basepath + '/VDPWI-NN-Torch/data/glove', 'glove.840B', EMBEDDING_DIM) #EMBEDDING_DIM = 300 #wv_dict, wv_arr, wv_size = load_word_vectors(basepath+'/data/paragram/paragram_300_sl999/', 'paragram', EMBEDDING_DIM) num_oov = 0 num_inv = 0 for word in tokens: fake_dict[word] = torch.Tensor([ random.uniform(-0.05, 0.05) for i in range(EMBEDDING_DIM) ]) try: dict[word] = wv_arr[wv_dict[word]] num_inv += 1 except: num_oov += 1 oov.append(word) dict[word] = torch.Tensor([ random.uniform(-0.05, 0.05) for i in range(EMBEDDING_DIM) ]) #dict_char_ngram = pickle.load(open(basepath + '/data/' + task + '/char_dict.p', "rb")) #word_freq = pickle.load(open(basepath + '/data/' + task + '/word_freq.p', "rb")) dict_char_ngram = {} word_freq = {} elif task == 'hindi': #words, embeddings = pickle.load(open(basepath+'/data/hindi/polyglot-hi.pkl', 'rb')) #print("Emebddings shape is {}".format(embeddings.shape)) #print words[777], embeddings[777] embeddings_file_bin = basepath + '/data/hindi/hi/hi.bin' model_bin = KeyedVectors.load(embeddings_file_bin) #print(words[777], model_bin[words[777]]) #sys.exit() for line in open(basepath + '/data/' + task + '/vocab.txt'): tokens.append(line.strip().decode('utf-8')) dict = {} EMBEDDING_DIM = 300 for word in tokens: fake_dict[word] = torch.Tensor([ random.uniform(-0.05, 0.05) for i in range(EMBEDDING_DIM) ]) try: dict[word] = model_bin[word] num_inv += 1 except: num_oov += 1 oov.append(word) dict[word] = torch.Tensor([ random.uniform(-0.05, 0.05) for i in range(EMBEDDING_DIM) ]) elif task == 'url' or task == 'pit': for line in open(basepath + '/data/' + task + '/vocab.txt'): tokens.append(line.strip()) # print(len(tokens)) dict = {} EMBEDDING_DIM = 200 wv_dict, wv_arr, wv_size = load_word_vectors( basepath + '/VDPWI-NN-Torch/data/glove', 'glove.twitter.27B', EMBEDDING_DIM) #EMBEDDING_DIM = 300 #wv_dict, wv_arr, wv_size = load_word_vectors(basepath + '/data/paragram/paragram_300_sl999/', 'paragram', EMBEDDING_DIM) #wv_dict={} #wv_arr={} # print(len(wv_dict)) #oov = [] #for line in open(basepath+'/data/'+task+'/oov.txt'): # line = line.strip() # oov.append(line) #inv=[] #for line in open(basepath+'/data/'+task+'/inv_4000.txt'): # line = line.strip() # inv.append(line) #count=len(oov)+len(inv) #inv = tokens num_oov = 0 num_inv = 0 for word in tokens: fake_dict[word] = torch.Tensor([ random.uniform(-0.05, 0.05) for i in range(EMBEDDING_DIM) ]) try: dict[word] = wv_arr[wv_dict[word]] num_inv += 1 except: num_oov += 1 oov.append(word) dict[word] = torch.Tensor([ random.uniform(-0.05, 0.05) for i in range(EMBEDDING_DIM) ]) dict_char_ngram = pickle.load( open(basepath + '/data/' + task + '/char_dict.p', "rb")) word_freq = pickle.load( open(basepath + '/data/' + task + '/word_freq.p', "rb")) print('finished loading word vector, there are ' + str(num_inv) + ' INV words and ' + str(num_oov) + ' OOV words.') print('current task: ' + task + ', glove mode = ' + str(glove_mode) + ', update_inv_mode = ' + str(update_inv_mode) + ', update_oov_mode = ' + str(update_oov_mode)) saved_file = 'current task: ' + 
task + ', glove mode = ' + str( glove_mode) + ', update_inv_mode = ' + str( update_inv_mode) + ', update_oov_mode = ' + str( update_oov_mode) + '.txt' #subprocess.call(['echo','finished loading word vector, there are ',str(num_inv),' INV words and ',str(len(oov)),' OOV words.']) elif granularity == 'char': # charcnn parameters feature_maps = [50, 100, 150, 200, 200, 200, 200] kernels = [1, 2, 3, 4, 5, 6, 7] charcnn_embedding_size = 15 max_word_length = 20 #c2w parameters lm_mode = False c2w_mode = False character_ngrams = 1 character_ngrams_overlap = True tokens = [] if task != 'wiki': if task == 'hindi': for line in open(basepath + '/data/' + task + '/vocab.txt'): tokens.append(line.strip().decode('utf-8')) tokens.append('<s>'.decode()) tokens.append('</s>'.decode()) tokens.append('oov'.decode()) else: for line in open(basepath + '/data/' + task + '/vocab.txt'): tokens.append(line.strip()) org_tokens = tokens[:] tokens.append('<s>') tokens.append('</s>') tokens.append('oov') word_freq = pickle.load( open(basepath + '/data/' + task + '/word_freq.p', "rb")) if c2w_mode: EMBEDDING_DIM = 200 else: EMBEDDING_DIM = 1100 if character_ngrams == 1: dict_char_ngram = pickle.load( open(basepath + '/data/' + task + '/char_dict.p', "rb")) elif character_ngrams == 2 and character_ngrams_overlap: dict_char_ngram = pickle.load( open(basepath + '/data/' + task + '/bigram_dict.p', "rb")) elif character_ngrams == 2 and not character_ngrams_overlap: dict_char_ngram = pickle.load( open(basepath + '/data/' + task + '/bigram_dict_no_overlap.p', "rb")) elif character_ngrams == 3 and character_ngrams_overlap: dict_char_ngram = pickle.load( open(basepath + '/data/' + task + '/trigram_dict.p', "rb")) elif character_ngrams == 3 and not character_ngrams_overlap: dict_char_ngram = pickle.load( open(basepath + '/data/' + task + '/trigram_dict_no_overlap.p', "rb")) print('current task: ' + task + ', lm mode: ' + str(lm_mode) + ', c2w mode: ' + str(c2w_mode) + ', n = ' + str(character_ngrams) + ', overlap = ' + str(character_ngrams_overlap) + '.') saved_file = 'current task: ' + task + ', lm mode: ' + str( lm_mode) + ', c2w mode: ' + str(c2w_mode) + ', n = ' + str( character_ngrams) + ', overlap = ' + str( character_ngrams_overlap) + '.txt' elif granularity == 'mix': tokens = [] num_oov = 0 num_inv = 0 for line in open(basepath + '/data/' + task + '/vocab.txt'): tokens.append(line.strip()) tokens.append('<s>') tokens.append('</s>') tokens.append('oov') # print(len(tokens)) dict = {} #oov=[] if task == 'sts': EMBEDDING_DIM = 300 wv_dict, wv_arr, wv_size = load_word_vectors( basepath + '/data/paragram/paragram_300_sl999/', 'paragram', EMBEDDING_DIM) #wv_dict, wv_arr, wv_size = load_word_vectors(basepath + '/VDPWI-NN-Torch/data/glove', 'glove.840B', EMBEDDING_DIM) else: EMBEDDING_DIM = 200 wv_dict, wv_arr, wv_size = load_word_vectors( basepath + '/VDPWI-NN-Torch/data/glove', 'glove.twitter.27B', EMBEDDING_DIM) ''' EMBEDDING_DIM = 300 wv_dict, wv_arr, wv_size = load_word_vectors(basepath + '/data/paragram/paragram_300_sl999/', 'paragram', EMBEDDING_DIM) ''' oov = [] for word in tokens: ''' if word in oov or word in inv: count+=1 dict[word] = torch.Tensor([0 for i in range(EMBEDDING_DIM)]) else: dict[word] = wv_arr[wv_dict[word]] num_inv+=1 ''' try: dict[word] = wv_arr[wv_dict[word]] num_inv += 1 except: num_oov += 1 oov.append(word) # print(word) dict[word] = torch.Tensor([ random.uniform(-0.05, 0.05) for i in range(EMBEDDING_DIM) ]) #dict[word] = torch.Tensor([0 for i in range(EMBEDDING_DIM)]) lm_mode = False combine_mode = 
'g_0.75' # 'concat', 'g_0.25', 'g_0.50', 'g_0.75', 'adaptive', 'attention', 'backoff' # c2w parameters c2w_mode = False character_ngrams = 1 #character_ngrams_2 = 3 character_ngrams_overlap = False if character_ngrams == 1: dict_char_ngram = pickle.load( open(basepath + '/data/' + task + '/char_dict.p', "rb")) elif character_ngrams == 2 and character_ngrams_overlap: dict_char_ngram = pickle.load( open(basepath + '/data/' + task + '/bigram_dict.p', "rb")) elif character_ngrams == 2 and not character_ngrams_overlap: dict_char_ngram = pickle.load( open(basepath + '/data/' + task + '/bigram_dict_no_overlap.p', "rb")) elif character_ngrams == 3 and character_ngrams_overlap: dict_char_ngram = pickle.load( open(basepath + '/data/' + task + '/trigram_dict.p', "rb")) elif character_ngrams == 3 and not character_ngrams_overlap: dict_char_ngram = pickle.load( open(basepath + '/data/' + task + '/trigram_dict_no_overlap.p', "rb")) ''' if character_ngrams_2 == 1: dict_char_ngram_2 = pickle.load(open(basepath + '/data/' + task + '/char_dict.p', "rb")) elif character_ngrams_2 == 2 and character_ngrams_overlap: dict_char_ngram_2 = pickle.load(open(basepath + '/data/' + task + '/bigram_dict.p', "rb")) elif character_ngrams_2 == 2 and not character_ngrams_overlap: dict_char_ngram_2 = pickle.load(open(basepath + '/data/' + task + '/bigram_dict_no_overlap.p', "rb")) elif character_ngrams_2 == 3 and character_ngrams_overlap: dict_char_ngram_2 = pickle.load(open(basepath + '/data/' + task + '/trigram_dict.p', "rb")) elif character_ngrams_2 == 3 and not character_ngrams_overlap: dict_char_ngram_2 = pickle.load(open(basepath + '/data/' + task + '/trigram_dict_no_overlap.p', "rb")) ''' word_freq = pickle.load( open(basepath + '/data/' + task + '/word_freq.p', "rb")) print('current task: ' + task + ', lm mode: ' + str(lm_mode) + ', combination mode: ' + combine_mode + ', c2w mode: ' + str(c2w_mode) + ', n = ' + str(character_ngrams) + ', overlap = ' + str(character_ngrams_overlap) + '.') print('finished loading word & char table, there are ' + str(num_inv) + ' INV words and ' + str(num_oov) + ' OOV words.') elif granularity == 'cross': oov = [] dict_char = [] tokens = [] word_freq = [] overlap = True if overlap: dict_ngram = pickle.load( open(basepath + '/data/' + task + '/cross_trigram_dict.p', "rb")) else: dict_ngram = pickle.load( open( basepath + '/data/' + task + '/cross_trigram_dict_no_overlap.p', "rb")) else: print('wrong input for the second argument!') sys.exit() model = DeepPairWiseWord(EMBEDDING_DIM, HIDDEN_DIM, 1, task, granularity, num_class, dict, fake_dict, dict_char_ngram, oov, tokens, word_freq, feature_maps, kernels, charcnn_embedding_size, max_word_length, character_ngrams, c2w_mode, character_ngrams_overlap, word_mode, combine_mode, lm_mode) #, corpus) #print(get_n_params(model)) #sys.exit() #print(model.lm_train_data) #sys.exit() #premodel=DeepPairWiseWord(EMBEDDING_DIM,HIDDEN_DIM,1,task,granularity,num_class,dict,dict_char,oov) #premodel.load_state_dict(torch.load('model_char_only.pkl')) #premodel=torch.load('model_char_only.pkl') #model.embedding=premodel.embedding #model.lstm_c2w=premodel.lstm_c2w #model.df=premodel.df #model.db=premodel.db #model.bias=premodel.bias if torch.cuda.is_available(): model = model.cuda() lsents, rsents, labels = trainset #print(len(lsents)) #threshold=40000 #lsents = lsents[:threshold] #rsents = rsents[:threshold] #labels = labels[:threshold] # Loss and Optimizer if task == 'sick' or task == 'sts' or task == 'snli': indices = torch.randperm(len(lsents)) 
print('indices:') print(indices[:10]) #for line in open('./data/sick/order.txt'): # indices.append(int(line.strip()) - 1) criterion = nn.KLDivLoss() if torch.cuda.is_available(): criterion = criterion.cuda() elif task == 'url' or task == 'pit' or task == 'hindi' or task == 'quora' or task == 'msrp' or task == 'wikiqa' or task == 'trecqa' or task == 'mnli': ''' indices = torch.randperm(len(trainset[0])) with open('./data/'+task+'/order.txt','w') as f: for item in indices: f.writelines(str(item)+'\n') ''' #indices = [] #for line in open('./data/'+task+'/order.txt'): # indices.append(int(line.strip())) indices = torch.randperm(len(lsents)) #print('indices:') #print(indices[:10]) criterion = nn.MultiMarginLoss(p=1, margin=1.0, weight=None, size_average=True) if torch.cuda.is_available(): criterion = criterion.cuda() elif task == 'wiki': indices = torch.randperm(len(lsents)) print('indices:') print(indices[:10]) criterion = nn.MultiLabelSoftMarginLoss() if torch.cuda.is_available(): criterion = criterion.cuda() optimizer = torch.optim.RMSprop( model.parameters(), lr=0.0001 ) #, momentum=0.1, weight_decay=0.05)#,momentum=0.9,weight_decay=0.95) #optimizer = torch.optim.Adam(model.parameters(), lr=0.0001) # Train the Model #print(oov) print('start training') #subprocess.call(['echo','start training']) gold = [] gold_um = [] if task == 'url': for line in open(basepath + '/data/' + task + '/test_9324/sim.txt'): gold.append(int(line.strip())) elif task == 'snli': for line in open(basepath + '/data/' + task + '/test/sim.txt'): gold.append(line.strip()) elif task == 'trecqa': for line in open(basepath + '/data/' + task + '/raw-test/sim.txt'): gold.append(float(line.strip())) elif task == 'mnli': pass ''' for line in open(basepath+'/data/' + task + '/dev_m/sim.txt'): gold.append(float(['neutral', 'entailment','contradiction'].index(line.strip()))) for line in open(basepath+'/data/' + task + '/dev_um/sim.txt'): gold_um.append(float(['neutral', 'entailment','contradiction'].index(line.strip()))) ''' else: for line in open(basepath + '/data/' + task + '/test/sim.txt'): gold.append(float(line.strip())) max_result = -1 max_result_um = -1 batch_size = 32 report_interval = 50000 for epoch in range(num_epochs): print('--' * 20) model.train() optimizer.zero_grad() start_time = time.time() data_loss = 0 indices = torch.randperm(len(lsents)) train_correct = 0 #print(len(indices)) for index, i in enumerate(indices): #print(index) #start_time = time.time() if granularity == 'word': sentA = lsents[i] sentB = rsents[i] ''' #print(lsents[i]) try: sentA = torch.cat((dict[word].view(1, EMBEDDING_DIM) for word in lsents[i]), 0) sentA = Variable(sentA)#.cuda() #print(lsents[i]) #print(sentA) #print(rsents[i]) sentB = torch.cat((dict[word].view(1, EMBEDDING_DIM) for word in rsents[i]), 0) sentB = Variable(sentB)#.cuda() except: print(lsents[i]) print(rsents[i]) sys.exit() #print(rsents[i]) #print(sentB) #sys.exit() if torch.cuda.is_available(): sentA=sentA.cuda() sentB=sentB.cuda() sentA = torch.unsqueeze(sentA, 0).view(-1, 1, EMBEDDING_DIM) sentB = torch.unsqueeze(sentB, 0).view(-1, 1, EMBEDDING_DIM) # label=torch.unsqueeze(label,0) ''' elif granularity == 'char' or granularity == 'mix' or granularity == 'cross': #sentA=[] #sentB=[] #for word in lsents[i]: # sentA.append([dict[char] for char in word]) #for word in rsents[i]: # sentB.append([dict[char] for char in word]) #print(i) sentA = lsents[i] sentB = rsents[i] if task == 'sick' or task == 'sts' or task == 'snli' or task == 'wiki': label = 
Variable(torch.Tensor(labels[i])) else: label = Variable(torch.LongTensor(labels[i])) #.cuda() if torch.cuda.is_available(): label = label.cuda() # Forward + Backward + Optimize #elapsed_time = time.time() - start_time #print('data preparation time: '+str(timedelta(seconds=elapsed_time))) #print(sentA) #print(sentB) #print(id[i]) #print('*'*20) output, extra_loss = model(sentA, sentB, index) #tmp_output = np.exp(output.data[0].cpu().numpy()) #print index, 'gold: ', labels[i][0], 'predict: ', np.argmax(tmp_output) #print(extra_loss) loss = criterion(output, label) + extra_loss loss.backward() data_loss += loss.data[0] output = np.exp(output.data[0].cpu().numpy()) if labels[i][0] == np.argmax(output): train_correct += 1 #print(loss-extra_loss) #print('*'*20) if (index + 1) % batch_size == 0: optimizer.step() optimizer.zero_grad() if (index + 1) % report_interval == 0: msg = '%d completed epochs, %d batches' % (epoch, index + 1) msg += '\t train batch loss: %f' % (data_loss / (index + 1)) train_acc = train_correct / (index + 1) msg += '\t train accuracy: %f' % train_acc print(msg) if (index + 1) % (int(len(lsents) / 2)) == 0: #print ('Epoch [%d/%d], Iter [%d/%d] Loss: %.6f' # % (epoch + 1, num_epochs, index + 1, len(lsents) // 1, data_loss))#loss.data[0])) #subprocess.call(['echo','Epoch ',str(epoch+1),'Loss: ',str(data_loss)]) #break #data_loss = 0 #torch.save(model.state_dict(), 'model.pkl') #model.load_state_dict(torch.load('model_char_only.pkl')) if task == 'sick' or task == 'sts' or task == 'snli' or task == 'wiki': model.eval() test_lsents, test_rsents, test_labels = testset predicted = [] tmp_result = 0 #gold=[] #for line in open('./data/sick/test/sim.txt'): # gold.append(float(line.strip())) for test_i in range(len(test_lsents)): if granularity == 'word': ''' sentA = torch.cat((dict[word].view(1, EMBEDDING_DIM) for word in test_lsents[test_i]), 0) sentA = Variable(sentA) # print(sentA) sentB = torch.cat((dict[word].view(1, EMBEDDING_DIM) for word in test_rsents[test_i]), 0) sentB = Variable(sentB) if torch.cuda.is_available(): sentA = sentA.cuda() sentB = sentB.cuda() #label = torch.unsqueeze(label, 0) sentA = torch.unsqueeze(sentA, 0).view(-1, 1, EMBEDDING_DIM) sentB = torch.unsqueeze(sentB, 0).view(-1, 1, EMBEDDING_DIM) ''' sentA = test_lsents[test_i] sentB = test_rsents[test_i] elif granularity == 'char' or granularity == 'mix': sentA = test_lsents[test_i] sentB = test_rsents[test_i] raw_output, _ = model(sentA, sentB, index) #print(output) if task == 'sick': output = raw_output output = np.exp(output.data[0].cpu().numpy()) predicted.append(1 * output[0] + 2 * output[1] + 3 * output[2] + 4 * output[3] + 5 * output[4]) elif task == 'snli': output = raw_output output = np.exp(output.data[0].cpu().numpy()) output = [output[0], output[1], output[2]] tmp_output = output.index(max(output)) predicted.append(tmp_output) if test_labels[test_i].index( max(test_labels[test_i])) == tmp_output: tmp_result += 1 elif task == 'wiki': output = torch.sigmoid(raw_output).data > 0.5 output = output.cpu() predicted = list(output.numpy()[0]) if predicted == test_labels[test_i]: tmp_result += 1 else: output = raw_output output = np.exp(output.data[0].cpu().numpy()) predicted.append(0 * output[0] + 1 * output[1] + 2 * output[2] + 3 * output[3] + 4 * output[4] + 5 * output[5]) #print(predicted) #print(gold) if task == 'sick': result = pearson(predicted, gold) print('Test Correlation: %.6f' % result) if result > max_result: max_result = result elif task == 'snli' or task == 'wiki': result = tmp_result / 
len(test_lsents) print('Test Accuracy: %.6f' % result) if result > max_result: max_result = result else: result1 = pearson(predicted[0:450], gold[0:450]) result2 = pearson(predicted[450:750], gold[450:750]) result3 = pearson(predicted[750:1500], gold[750:1500]) result4 = pearson(predicted[1500:2250], gold[1500:2250]) result5 = pearson(predicted[2250:3000], gold[2250:3000]) result6 = pearson(predicted[3000:3750], gold[3000:3750]) print( 'deft-forum: %.6f, deft-news: %.6f, headlines: %.6f, images: %.6f, OnWN: %.6f, tweet-news: %.6f' % (result1, result2, result3, result4, result5, result6)) wt_mean = 0.12 * result1 + 0.08 * result2 + 0.2 * result3 + 0.2 * result4 + 0.2 * result5 + 0.2 * result6 print('weighted mean: %.6f' % wt_mean) if wt_mean > max_result: max_result = wt_mean if task == 'sts': with open(basepath + '/data/sts/sts_PWIM_prob.txt', 'w') as f: for item in predicted: f.writelines(str(item) + '\n') #else: # with open('SICK_with_paragram_result.txt', 'w') as f: # for item in predicted: # f.writelines(str(item)+'\n') else: model.eval() msg = '%d completed epochs, %d batches' % (epoch, index + 1) if task == 'mnli': test_lsents, test_rsents, test_labels = devset_m else: test_lsents, test_rsents, test_labels = testset predicted = [] correct = 0 #gold=gold[:3000] #print(len(gold)) for test_i in range(len(test_lsents)): # start_time = time.time() if granularity == 'word': sentA = test_lsents[test_i] sentB = test_rsents[test_i] ''' sentA = torch.cat((dict[word].view(1, EMBEDDING_DIM) for word in test_lsents[test_i]), 0) sentA = Variable(sentA)#.cuda() # print(sentA) sentB = torch.cat((dict[word].view(1, EMBEDDING_DIM) for word in test_rsents[test_i]), 0) sentB = Variable(sentB)#.cuda() # print(sentB) if torch.cuda.is_available(): sentA=sentA.cuda() sentB=sentB.cuda() sentA = torch.unsqueeze(sentA, 0).view(-1, 1, EMBEDDING_DIM) sentB = torch.unsqueeze(sentB, 0).view(-1, 1, EMBEDDING_DIM) # label=torch.unsqueeze(label,0) ''' elif granularity == 'char' or granularity == 'mix': sentA = test_lsents[test_i] sentB = test_rsents[test_i] output, _ = model(sentA, sentB, index) #print(output) output = np.exp(output.data[0].cpu().numpy()) if test_labels[test_i][0] == np.argmax(output): correct += 1 predicted.append(output[1]) #result=float(correct)/len(test_lsents) #print('Test Accuracy: %.4f'% result) #result_acc, result_f1=URL_maxF1_eval(predict_result=predicted,test_data_label=gold) result = correct / len(test_lsents) msg += '\t dev m accuracy: %f' % result print(msg) if result > max_result: max_result = result test_lsents, test_rsents, test_labels = testset_m predicted = [] for test_i in range(len(test_lsents)): # start_time = time.time() if granularity == 'word': sentA = test_lsents[test_i] sentB = test_rsents[test_i] output, _ = model(sentA, sentB, index) output = np.exp(output.data[0].cpu().numpy()) predicted.append(np.argmax(output)) with open(basepath + '/sub_m.csv', 'w+') as f: label_dict = [ 'neutral', 'entailment', 'contradiction' ] f.write("pairID,gold_label\n") for i, k in enumerate(predicted): f.write( str(i + 9847) + "," + label_dict[k] + "\n") #with open(basepath+'/PWIM_prob_result_'+task, 'w') as f: # for item in predicted: # f.writelines(str(item)+'\n') if task == 'mnli': msg = '%d completed epochs, %d batches' % (epoch, index + 1) test_lsents, test_rsents, test_labels = devset_um predicted = [] correct = 0 for test_i in range(len(test_lsents)): # start_time = time.time() if granularity == 'word': sentA = test_lsents[test_i] sentB = test_rsents[test_i] output, _ = model(sentA, 
sentB, index) # print(output) output = np.exp(output.data[0].cpu().numpy()) if test_labels[test_i][0] == np.argmax(output): correct += 1 predicted.append(output[1]) #result_acc, result_f1 = URL_maxF1_eval(predict_result=predicted, test_data_label=gold_um) result_acc = correct / len(test_lsents) msg += '\t dev um accuracy: %f' % result_acc print(msg) if result_acc > max_result_um: max_result_um = result_acc test_lsents, test_rsents, test_labels = testset_um predicted = [] for test_i in range(len(test_lsents)): # start_time = time.time() if granularity == 'word': sentA = test_lsents[test_i] sentB = test_rsents[test_i] output, _ = model(sentA, sentB, index) output = np.exp(output.data[0].cpu().numpy()) predicted.append(np.argmax(output)) with open(basepath + '/sub_um.csv', 'w+') as f: label_dict = [ 'neutral', 'entailment', 'contradiction' ] f.write("pairID,gold_label\n") for i, k in enumerate(predicted): f.write( str(i) + "," + label_dict[k] + "\n") #with open('current task: '+task+', lm mode: '+str(lm_mode)+', combination mode: '+combine_mode+', c2w mode: '+str(c2w_mode)+', n = '+str(character_ngrams)+', overlap = '+str(character_ngrams_overlap)+'.txt','w') as f: # for item in predicted: # f.writelines(str(item)+'\n') #torch.save(model, 'model_URL_unigram_CNN.pkl') #torch.save(model, 'model_word_inv_18k.pkl') #torch.save(model, 'model_word_inv_3k.pkl') #torch.save(model, 'model_char_only.pkl') #torch.save(model, 'model_word_only_pit.pkl') #torch.save(model, 'model_word_char_backoff.pkl') #torch.save(model, 'model_word_char_g_0.5.pkl') #torch.save(model, 'model_word_char_adaptive.pkl') #torch.save(model, 'model_word_char_attention.pkl') #with open('model_word_inv_0k_result.txt', 'w') as f: #with open('sts_model_word_only_inv_17k_result.txt', 'w') as f: #with open('model_word_inv_3k_result.txt', 'w') as f: #with open('model_char_only_result.txt', 'w') as f: #with open('model_word_only_result_pit.txt', 'w') as f: #with open('model_word_char_g_0.5_result.txt', 'w') as f: #with open('model_word_char_backoff_result.txt', 'w') as f: #with open('model_word_char_adaptive.txt', 'w') as f: #with open('model_word_char_attention_result.txt','w') as f: # for item in predicted: # f.writelines(str(item)+'\n') ''' h = Variable(torch.zeros(2, 1, model.embedding_dim)) # 2 for bidirection c = Variable(torch.zeros(2, 1, model.embedding_dim)) if torch.cuda.is_available(): h = h.cuda() c = c.cuda() subword_embedding={} for word in org_tokens: tmp_indices = model.generate_word_indices(word) if not model.c2w_mode: if len(tmp_indices) < 20: tmp_indices = tmp_indices + [0 for i in range(model.charcnn_max_word_length - len(tmp_indices))] else: tmp_indices = tmp_indices[0:20] if model.c2w_mode: output = model.c2w_cell([tmp_indices], h, c) else: output = model.charCNN_cell([tmp_indices]) subword_embedding[word]=output.data[0].cpu().numpy() pickle.dump(subword_embedding, open('URL_subword_lm_embedding.p', "wb")) ''' elapsed_time = time.time() - start_time print('Epoch ' + str(epoch + 1) + ' finished within ' + str(timedelta(seconds=elapsed_time)) + ', and current time:' + str(datetime.now())) print('Best result until now: %.6f' % max_result) print('Best um result until now: %.6f' % max_result_um) #subprocess.call(['echo','Epoch ' , str(epoch + 1) , ' finished within ' , str(timedelta(seconds=elapsed_time)),', and current time:', str(datetime.now())]) #subprocess.call(['echo','Best result until now: ',str(max_result)]) model.train()
i_1 = torch.log(1 / (1 + torch.exp(-inputs[0, 1])))
i_2 = torch.log(1 / (1 + torch.exp(-inputs[0, 2])))
loss_h = (i_0 + i_1 + i_2) / -3
print(loss_h)

# ---------------------------------------------- 14 Multi Margin Loss -----------------------------------------
flag = 0
# flag = 1
if flag:
    x = torch.tensor([[0.1, 0.2, 0.7], [0.2, 0.5, 0.3]])
    y = torch.tensor([1, 2], dtype=torch.long)

    loss_f = nn.MultiMarginLoss(reduction='none')
    loss = loss_f(x, y)

    print("Multi Margin Loss: ", loss)

# --------------------------------- compute by hand
flag = 0
# flag = 1
if flag:
    x = x[0]
    margin = 1

    i_0 = margin - (x[1] - x[0])
    # i_1 = margin - (x[1] - x[1])
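A self-contained sketch of my own (not part of the original snippet) that finishes the by-hand check against nn.MultiMarginLoss: each sample's loss is the sum over non-target classes of max(0, margin - (x[y] - x[i])), divided by the number of classes C. The max(0, .) clamp is part of the definition, although it happens not to change anything for these particular numbers.

import torch
import torch.nn as nn

x = torch.tensor([[0.1, 0.2, 0.7], [0.2, 0.5, 0.3]])
y = torch.tensor([1, 2], dtype=torch.long)
print(nn.MultiMarginLoss(reduction='none')(x, y))  # tensor([0.8000, 0.7000])

margin = 1.0
for xi, yi in zip(x, y):
    # sum over i != y of max(0, margin - (x[y] - x[i])), divided by C = len(xi)
    terms = [max(0.0, margin - (xi[yi] - xi[j]).item()) for j in range(len(xi)) if j != yi]
    print(sum(terms) / len(xi))  # 0.8 for the first sample, 0.7 for the second (up to float rounding)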
        out_params.append(param)
        out_names.append(name)
    else:
        in_params.append(param)
        in_names.append(name)

in_size, out_size = [x.size() for x in in_params], [x.size() for x in out_params]
in_sum, out_sum = sum([np.prod(x) for x in in_size]), sum([np.prod(x) for x in out_size])

print "IN : {} params".format(in_sum)
#print print_params(in_names, in_size)
print "OUT : {} params".format(out_sum)
#print print_params(out_names, out_size)
print "TOTAL : {} params".format(in_sum + out_sum)

loss_fn = {'xent': nn.CrossEntropyLoss(), 'mse': nn.MSELoss(),
           'mrl': nn.MarginRankingLoss(), 'mlml': nn.MultiLabelMarginLoss(),
           'mml': nn.MultiMarginLoss()}
tt = torch
if not args.cpu:
    loss_fn = {k: v.cuda() for (k, v) in loss_fn.iteritems()}
    tt = torch.cuda

optimizer = torch.optim.Adam(in_params, lr=args.lr)

out_data = {'train': {'x': [], 'y': []},
            'valid': {'x': [], 'y': []},
            'bleu': {'x': [], 'y': []},
            'best_valid': {'x': [], 'y': []}}
best_epoch = -1
best_bleu = {"valid": {0: 0}, "test": {0: 0}}
# Load data
print("LOADING DATA...")
embedding = create_embedding_dict(word_embedding_path)
questions = create_question_dict(question_path, embedding, hidden_size)
train_data = read_training_data(train_data_path)
dev_data, dev_label_dict, dev_scores = read_eval_data(dev_data_path)
test_data, test_label_dict, test_scores = read_eval_data(test_data_path)
if DEBUG:
    train_data = train_data[:300]  # ONLY FOR DEBUGGING, REMOVE LINE TO RUN ON ALL TRAINING DATA

# Create model
rnn = RNN(n_features, hidden_size, n_layers, batch_size=22)
optimizer = optim.Adam(rnn.parameters(), lr=learning_rate)
criterion = nn.MultiMarginLoss(margin=0.2)

# Training
print("Starting run with batch_size: %d, hidden size: %d, learning rate: %.4f"
      % (outer_batch_size, hidden_size, learning_rate))
start = time.time()
current_loss = 0
for iter in range(1, n_epochs + 1):
    avg_loss = train(rnn, criterion, optimizer, train_data, questions, hidden_size,
#Displays an example of the code to make sure that everything is running correctly.
dataiter = iter(training_loader)
images, labels = next(dataiter)
images.size()

#Displays an example of the code to make sure that everything is running correctly.
image_grid = torchvision.utils.make_grid(images, normalize=True)
plt.imshow(np.transpose(image_grid.numpy(), (1, 2, 0)), interpolation='nearest')
plt.show()

#Instantiate the ConvolutionalNeuralNetwork()
cnn = ConvolutionalNeuralNetwork()
cnn.to(device=DEVICE)

#Select the loss function
loss_function = nn.MultiMarginLoss()

#Select the learning rate
learning_rate = 0.01

#Select the optimizer
optimizer = torch.optim.SGD(cnn.parameters(), lr=learning_rate)

#Visualizes the network architecture.
make_dot(cnn(images.to(device=DEVICE)), params=dict(cnn.named_parameters()))

#iterations
iter = 0
epochs = 2000

#There is a training and testing set. If you want to run the testing set, comment out the training set. If you want
#to run the training set, comment out the testing set. If one of the two is not commented out, it will take much
def train():
    data_transforms = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    train_data = PlantSeedlingDataset(root_dir='train', transform=data_transforms)
    data_loader = DataLoader(train_data, batch_size=32, shuffle=True)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = VGG11(num_classes=train_data.num_classes).to(device)
    model.train()
    criterion = nn.MultiMarginLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    train_loss, train_acc = [], []
    best_model_params = copy.deepcopy(model.state_dict())
    best_acc = 0
    num_epochs = 80

    for epoch in range(num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, num_epochs))
        running_train_loss = 0.0
        running_train_acc = 0

        for i, data in enumerate(data_loader):
            images, labels = data[0].to(device), data[1].to(device)
            optimizer.zero_grad()

            # forward
            outputs = model(images)
            _, preds = torch.max(outputs.data, 1)
            loss = criterion(outputs, labels)

            # backward
            loss.backward()
            optimizer.step()

            running_train_loss += loss.item() * images.size(0)
            running_train_acc += torch.sum(preds == labels)

        print('Training Loss: {:.4f}, Training Accuracy: {:.4f}'.format(
            running_train_loss / len(train_data),
            torch.true_divide(running_train_acc, len(train_data))))
        train_loss.append(running_train_loss / len(train_data))
        train_acc.append(torch.true_divide(running_train_acc, len(train_data)))

        if running_train_acc > best_acc:
            best_acc = running_train_acc
            best_model_params = copy.deepcopy(model.state_dict())

    model.load_state_dict(best_model_params)
    torch.save(model, 'VGG11_model_SVM.pth')

    plt.title("Loss Curve")
    plt.plot(range(num_epochs), train_loss, color='red', label="Training loss")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.savefig("loss_curve_SVM.png")
    plt.show()

    plt.title("Accuracy Curve")
    plt.plot(range(num_epochs), train_acc, color='red', label="Training Accuracy")
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")
    plt.savefig("accuracy_curve_SVM.png")
    plt.show()
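As in the training loop above, nn.MultiMarginLoss consumes raw class scores and integer class labels, the same call signature as nn.CrossEntropyLoss, so swapping the criterion turns the same classifier head into a multi-class hinge ("SVM-style") objective. A minimal illustrative sketch; the shapes below are made up, not taken from the dataset above:

import torch
import torch.nn as nn

outputs = torch.randn(4, 12)               # raw scores for a batch of 4 over 12 classes (illustrative)
labels = torch.randint(0, 12, (4,))

print(nn.MultiMarginLoss()(outputs, labels))    # multi-class hinge, averaged over the batch
print(nn.CrossEntropyLoss()(outputs, labels))   # same inputs, softmax/NLL objective instead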
out['r42'] = F.relu(self.conv4_2(out['r41'])) out['r43'] = F.relu(self.conv4_3(out['r42'])) out['r44'] = F.relu(self.conv4_4(out['r43'])) out['p4'] = self.pool4(out['r44']) out['r51'] = F.relu(self.conv5_1(out['p4'])) out['r52'] = F.relu(self.conv5_2(out['r51'])) out['r53'] = F.relu(self.conv5_3(out['r52'])) out['r54'] = F.relu(self.conv5_4(out['r53'])) out['p5'] = self.pool5(out['r54']) return [out[key] for key in out_keys] img_neighbor = np.load('img_neighbors_pairs_train_test.npy') celoss = nn.CrossEntropyLoss() marginLoss = nn.MultiMarginLoss(margin=1) # gram matrix and loss class GramMatrix(nn.Module): def forward(self, input): b, c, h, w = input.size() F = input.view(b, c, h * w) G = torch.bmm(F, F.transpose(1, 2)) G.div_(h * w) return G class GramMSELoss(nn.Module): def forward(self, input, target): out = torch.log(nn.MSELoss()(GramMatrix()(input), target))
def transfer_classification(config): class_criterion = nn.MultiMarginLoss() loss_config = config["loss"] transfer_criterion = loss.loss_dict[loss_config["name"]] if "params" not in loss_config: loss_config["params"] = {} class_num = 6 ## set base networks net_config = config["network"] base_network_s = network.network_dict[net_config["name_s"]]() base_network_t = network.network_dict[net_config["name_t"]]() P_network_s = network.network_dict[net_config["name_s"]]() P_network_t = network.network_dict[net_config["name_t"]]() base_space_reverse = network.network_dict["base_space"]() generator_mmd = network.network_dict["generator_mmd"]() discriminator_mmd = network.network_dict["MMD_discriminator"]() classifier_layer_t = nn.Sequential( nn.Linear(generator_mmd.output_num(), class_num), ) classifier_layer_s = nn.Sequential( nn.Linear(generator_mmd.output_num(), class_num), ) reconstruct_common = nn.Sequential( nn.Linear(base_space_reverse.output_num(), base_space_reverse.output_num(), bias=False), ) reconstruct_s = nn.Sequential( nn.Linear(base_space_reverse.output_num(), source_dim, bias=False), ) reconstruct_t = nn.Sequential( nn.Linear(base_space_reverse.output_num(), target_dim, bias=False), ) use_gpu = torch.cuda.is_available() if use_gpu: classifier_layer_t = classifier_layer_t.cuda() classifier_layer_s = classifier_layer_s.cuda() base_space_reverse = base_space_reverse.cuda() discriminator_mmd = discriminator_mmd.cuda() generator_mmd = generator_mmd.cuda() base_network_t = base_network_t.cuda() base_network_s = base_network_s.cuda() P_network_s = P_network_s.cuda() P_network_t = P_network_t.cuda() reconstruct_s = reconstruct_s.cuda() reconstruct_t = reconstruct_t.cuda() reconstruct_common = reconstruct_common.cuda() ## collect parameters parameter_list = [{ "params": classifier_layer_s.parameters(), "lr": 1 }, { "params": classifier_layer_t.parameters(), "lr": 1 }, { "params": base_network_s.parameters(), "lr": 1 }, { "params": base_network_t.parameters(), "lr": 1 }, { "params": base_space_reverse.parameters(), "lr": 1 }, { "params": reconstruct_s.parameters(), "lr": 1 }, { "params": reconstruct_t.parameters(), "lr": 1 }, { "params": reconstruct_common.parameters(), "lr": 1 }, { "params": P_network_s.parameters(), "lr": 1 }, { "params": P_network_t.parameters(), "lr": 1 }] parameter_mmd_list = [{"params": discriminator_mmd.parameters(), "lr": 1}] parameter_mmd_gen_list = [{"params": generator_mmd.parameters(), "lr": 1}] assert base_network_s.output_num() == base_network_t.output_num() ## set optimizer optimizer_config = config["optimizer"] optimizer = optim_dict[optimizer_config["type"]]( parameter_list, **(optimizer_config["optim_params"])) optimizer_mmd = optim_dict[optimizer_config["type"]]( parameter_mmd_list, **(optimizer_config["optim_params"])) optimizer_mmd_gen = optim_dict[optimizer_config["type"]]( parameter_mmd_gen_list, **(optimizer_config["optim_params"])) param_lr = [] for param_group in optimizer.param_groups: param_lr.append(param_group["lr"]) param_lr_mmd = [] for param_group in optimizer_mmd.param_groups: param_lr_mmd.append(param_group["lr"]) param_lr_mmd_gen = [] for param_group in optimizer_mmd_gen.param_groups: param_lr_mmd_gen.append(param_group["lr"]) schedule_param = optimizer_config["lr_param"] lr_scheduler = lr_schedule.schedule_dict[optimizer_config["lr_type"]] ## train transfer_loss = classifier_loss_t = classifier_loss_s = 0 acc_list = [] for epoch in range(config["num_iterations"]): ## test in the train if epoch % config["test_interval"] == 0: 
classifier_layer_t.train(False) classifier_layer_s.train(False) base_space_reverse.train(False) discriminator_mmd.train(False) generator_mmd.train(False) base_network_t.train(False) base_network_s.train(False) P_network_s.train(False) P_network_t.train(False) reconstruct_s.train(False) reconstruct_t.train(False) reconstruct_common.train(False) # For visualization purpose #visual_Data_t(nn.Sequential(base_network_t, base_space_reverse,generator_mmd), gpu=use_gpu) #visual_Data_s(nn.Sequential(base_network_s, base_space_reverse,generator_mmd), gpu=use_gpu) acc, valid_acc, test_acc = text_classification_test(nn.Sequential( base_network_t, base_space_reverse, generator_mmd, classifier_layer_t), gpu=use_gpu) with open( 'results/1_batch30_%s_10_0.001_0.003_%s.txt' % (source_lang, str(number)), 'a+') as f: f.write(str(acc)) f.write('\n') acc_list.append([acc]) print(acc) for i in range(config["n_batches"]): ## train one iter optimizer = lr_scheduler(param_lr, optimizer, i, **schedule_param) optimizer_mmd = lr_scheduler(param_lr_mmd, optimizer_mmd, i, **schedule_param) optimizer_mmd_gen = lr_scheduler(param_lr_mmd_gen, optimizer_mmd_gen, i, **schedule_param) optimizer.zero_grad() optimizer_mmd.zero_grad() optimizer_mmd_gen.zero_grad() target_len = target_y.size source_len = source_y.size n_batches = config["n_batches"] local_Xs, local_ys = source_X[ i * n_batches:(i + 1) * n_batches, ], source_y[i * n_batches:(i + 1) * n_batches, ] local_Xt, local_yt = target_X[ i * n_batches:(i + 1) * n_batches, ], target_y[i * n_batches:(i + 1) * n_batches, ] local_Xs, local_ys = unison_shuffled_copies(local_Xs, local_ys) local_Xt, local_yt = unison_shuffled_copies(local_Xt, local_yt) if len(local_Xt) < n_batches: needed = n_batches - len(local_Xt) list_loc_t = [1] * needed + [0] * (len(target_X) - needed) shuffle(list_loc_t) filters = [x == 1 for x in list_loc_t] new_needed_samples_t = target_X[filters] new_needed_labels_t = target_y[filters] local_Xt = np.concatenate((new_needed_samples_t, local_Xt), axis=0) local_yt = np.concatenate((new_needed_labels_t, local_yt), axis=0) if len(local_Xs) < n_batches: needed = n_batches - len(local_Xs) list_loc_s = [1] * needed + [0] * (len(source_X) - needed) shuffle(list_loc_s) filters = [x == 1 for x in list_loc_s] new_needed_samples_s = source_X[filters] new_needed_labels_s = source_y[filters] local_Xs = np.concatenate((new_needed_samples_s, local_Xs), axis=0) local_ys = np.concatenate((new_needed_labels_s, local_ys), axis=0) local_Xs = torch.tensor(local_Xs, dtype=torch.float) local_ys = torch.tensor(local_ys, dtype=torch.long) local_Xt = torch.tensor(local_Xt, dtype=torch.float) local_yt = torch.tensor(local_yt, dtype=torch.long) content_target_tensor = torch.tensor(target_X, dtype=torch.float) if use_gpu: inputs_source, labels_source, inputs_target, labels_target, content_target = Variable( local_Xs).cuda(), Variable(local_ys).cuda(), Variable( target_X_train).cuda(), Variable(target_y_train).cuda( ), Variable(content_target_tensor).cuda() else: inputs_source, labels_source, inputs_target, labels_target = Variable( local_Xs), Variable(local_ys), Variable( target_X_train), Variable(target_y_train) features_s = base_network_s(inputs_source) feature_s_basespace = base_space_reverse(features_s) aligned_features_s = generator_mmd(feature_s_basespace) outputs_s = classifier_layer_t(aligned_features_s) classifier_loss_s = class_criterion( outputs_s, labels_source.reshape(n_batches, )) features_t = base_network_t(inputs_target) feature_t_basespace = 
base_space_reverse(features_t) prejected_source = P_network_s(inputs_source) prejected_target = P_network_t(inputs_target) reconstructed_s = reconstruct_common(feature_s_basespace) reconstructed_t = reconstruct_common(feature_t_basespace) reconstruct_loss_s = rec_loss_cal(reconstructed_s, prejected_source) reconstruct_loss_t = rec_loss_cal(reconstructed_t, prejected_target) l1_regularization = torch.norm( feature_s_basespace, 1) + torch.norm(feature_t_basespace, 1) aligned_features_t = generator_mmd(feature_t_basespace) features_t_c = base_network_t(content_target) feature_t_c_basespace = base_space_reverse(features_t_c) aligned_features_c_t = generator_mmd(feature_t_c_basespace) outputs_t = classifier_layer_t(aligned_features_t) classifier_loss_t = class_criterion( outputs_t, labels_target.reshape(len(labels_target), )) feature_s_mmd = discriminator_mmd(aligned_features_s) feature_t_mmd = discriminator_mmd(aligned_features_c_t) transfer_loss = transfer_criterion(feature_s_mmd, feature_t_mmd, **loss_config["params"]) for w_i in base_space_reverse.parameters(): A_F_2 = torch.norm(torch.transpose(w_i, 0, 1), 2) for w_i in base_network_s.parameters(): temp_value = (torch.norm( (torch.mm(w_i, torch.transpose(w_i, 0, 1)) - eyeData), 2)) B_s_reg = torch.mul(temp_value, temp_value) for w_i in base_network_t.parameters(): temp_value = (torch.norm( (torch.mm(w_i, torch.transpose(w_i, 0, 1)) - eyeData), 2)) B_t_reg = torch.mul(temp_value, temp_value) for w_i in reconstruct_s.parameters(): temp_value = (torch.norm( (torch.mm(torch.transpose(w_i, 0, 1), w_i) - eyeData), 2)) P_s_reg = torch.mul(temp_value, temp_value) P_s_F_2 = torch.norm(torch.transpose(w_i, 0, 1), 2) for w_i in reconstruct_t.parameters(): temp_value = (torch.norm( (torch.mm(torch.transpose(w_i, 0, 1), w_i) - eyeData), 2)) P_t_reg = torch.mul(temp_value, temp_value) P_t_F_2 = torch.norm(torch.transpose(w_i, 0, 1), 2) for w_i in P_network_s.parameters(): temp_value = (torch.norm( (torch.mm(w_i, torch.transpose(w_i, 0, 1)) - eyeData), 2)) Project_s_reg = torch.mul(temp_value, temp_value) for w_i in P_network_t.parameters(): temp_value = (torch.norm( (torch.mm(w_i, torch.transpose(w_i, 0, 1)) - eyeData), 2)) Project_t_reg = torch.mul(temp_value, temp_value) for w_i in reconstruct_common.parameters(): D_F_2 = torch.norm(torch.transpose(w_i, 0, 1), 2) classifier_layer_t.train(True) classifier_layer_s.train(True) base_space_reverse.train(True) discriminator_mmd.train(False) generator_mmd.train(True) base_network_t.train(True) base_network_s.train(True) reconstruct_s.train(True) reconstruct_t.train(True) reconstruct_common.train(True) coef = 1e-8 total_loss = 10*classifier_loss_t+ classifier_loss_s+ coef*reconstruct_loss_t\ +coef*reconstruct_loss_s+coef*A_F_2+coef*Project_s_reg+coef*Project_t_reg\ +coef*D_F_2+coef*B_s_reg+coef*B_t_reg total_loss.backward(retain_graph=True) optimizer.step() base_space_reverse.apply(clipper) classifier_layer_t.train(False) classifier_layer_s.train(False) base_space_reverse.train(False) discriminator_mmd.train(True) generator_mmd.train(False) base_network_t.train(False) base_network_s.train(False) reconstruct_s.train(False) reconstruct_t.train(False) reconstruct_common.train(True) transfer_loss_reverse = -transfer_loss transfer_loss_reverse.backward(retain_graph=True) optimizer_mmd.step() classifier_layer_t.train(False) classifier_layer_s.train(False) base_space_reverse.train(False) discriminator_mmd.train(False) generator_mmd.train(True) base_network_t.train(False) base_network_s.train(False) 
reconstruct_s.train(False) reconstruct_t.train(False) reconstruct_common.train(False) transfer_loss_ = transfer_loss transfer_loss_.backward() optimizer_mmd_gen.step()
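One detail worth noting about class_criterion in the snippet above: MultiMarginLoss expects its target to be a 1-D tensor of class indices, which is why the labels are flattened with .reshape(...) before the call. A small hedged sketch with made-up sizes:

import torch
import torch.nn as nn

outputs = torch.randn(30, 6)              # batch of 30 scores over class_num = 6 (illustrative)
labels = torch.randint(0, 6, (30, 1))     # labels stored with a trailing singleton dimension
print(nn.MultiMarginLoss()(outputs, labels.reshape(30)))  # flatten to shape (30,) first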
def run_epoch(data, is_training, encoder_model_optimizer, domain_model_optimizer, args): ''' Train model for one pass of train data, and return loss, acccuracy ''' encoder_model, encoder_optimizer = encoder_model_optimizer domain_model, domain_optimizer = domain_model_optimizer data_loader = torch.utils.data.DataLoader(data, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, drop_last=True) losses = [] if is_training: encoder_model.train() domain_model.train() else: encoder_model.eval() #nll_loss = nn.NLLLoss() #y_true = [] #y_scores = [] auc_met = meter.AUCMeter() for batch in tqdm(data_loader): cosine_similarity = nn.CosineSimilarity(dim=0, eps=1e-6) criterion = nn.MultiMarginLoss(margin=0.3) #pdb.set_trace() if is_training: encoder_optimizer.zero_grad() domain_optimizer.zero_grad() ###source question encoder#### if is_training: samples = batch['samples'] else: samples = batch #output - batch of samples, where every sample is 2d tensor of avg hidden states hidden_rep = runEncoderOnQuestions(samples, encoder_model, args) #Calculate cosine similarities here and construct X_scores #expected datastructure of hidden_rep = batchsize x number_of_q x hidden_size cs_tensor = autograd.Variable( torch.FloatTensor(hidden_rep.size(0), hidden_rep.size(1) - 1)) if args.cuda: cs_tensor = cs_tensor.cuda() #calculate cosine similarity for every query vs. neg q pair for j in range(1, hidden_rep.size(1)): for i in range(hidden_rep.size(0)): cs_tensor[i, j - 1] = cosine_similarity(hidden_rep[i, 0, ], hidden_rep[i, j, ]) X_scores = torch.stack(cs_tensor, 0) y_targets = autograd.Variable( torch.zeros(hidden_rep.size(0)).type(torch.LongTensor)) if args.cuda: y_targets = y_targets.cuda() if is_training: #####domain classifier##### cross_d_questions = batch['question'] avg_hidden_rep = runEncoderOnQuestions(cross_d_questions, encoder_model, args) predicted_domains = domain_model(avg_hidden_rep) true_domains = autograd.Variable( cross_d_questions['domain']).squeeze(1) if args.cuda: true_domains = true_domains.cuda() domain_classifier_loss = F.nll_loss(predicted_domains, true_domains) print "Domain loss in batch", domain_classifier_loss.data #calculate loss encoder_loss = criterion(X_scores, y_targets) print "Encoder loss in batch", encoder_loss.data ''' if encoder_loss.cpu().data.numpy().item() == 0: new_lambda = -new_lambda else: new_lambda = args.lambda_d * 10**(int(math.log10(encoder_loss.cpu().data.numpy().item())) - int(math.log10(domain_classifier_loss.cpu().data.numpy().item()))) print "new lambda is ", new_lambda ''' task_loss = encoder_loss - args.lambda_d * domain_classifier_loss print "Task loss in batch", task_loss.data print "\n\n" task_loss.backward() encoder_optimizer.step() domain_optimizer.step() losses.append(task_loss.cpu().data[0]) else: for i in range(args.batch_size): for j in range(20): y_true = 0 if j == 0: y_true = 1 x = cs_tensor[i, j].data if args.cuda: x = x.cpu().numpy() else: x = x.numpy() auc_met.add(x, y_true) # Calculate epoch level scores if is_training: avg_loss = np.mean(losses) print('Average Train loss: {:.6f}'.format(avg_loss)) print() else: print "AUC:", auc_met.value(0.05)
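The encoder loss above follows a common retrieval pattern: each row of X_scores holds the cosine similarity between a query and its candidates, the positive candidate is assumed to sit in column 0, and the target is therefore a vector of zeros, so MultiMarginLoss pushes the positive's score above every negative by the margin. A small self-contained sketch of that setup with made-up scores:

import torch
import torch.nn as nn

scores = torch.tensor([[0.9, 0.3, 0.7],     # query 1: [positive, negative, negative]
                       [0.2, 0.6, 0.1]])    # query 2
targets = torch.zeros(scores.size(0), dtype=torch.long)   # positive candidate is always column 0
print(nn.MultiMarginLoss(margin=0.3)(scores, targets))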
def train(self): load_start_time = time.time() # get the ubuntu data (labeled ub) ub_questions = self.ub_preprocessor.get_question_dict() ub_candidate_ids = self.ub_preprocessor.get_candidate_ids() ub_ids_batches = self.ub_preprocessor.split_into_batches( ub_candidate_ids.keys(), params.batch_size) if params.use_dom_ad: # get the android data (labeled an) an_questions = self.an_preprocessor.get_question_dict() an_id_pairs = self.an_preprocessor.get_all_id_pairs() # batch the ids an_ids_batches = self.an_preprocessor.split_into_batches( an_id_pairs) # discriminator labels (ubuntu --> 0, android --> 1) # when forwarding thru discriminator, first half if ubuntu, second half if android total_questions_per_batch = self.get_total_questions_per_batch() discr_targets = torch.cat([ torch.ones(total_questions_per_batch / 2), torch.zeros(total_questions_per_batch / 2) ]) discr_targets = Variable(torch.FloatTensor(discr_targets), requires_grad=False) # loss for discriminator bcel = nn.BCELoss() # 2 different optimizers optimizer2 = optim.Adam([{ 'params': self.encoder_net.parameters(), 'lr': params.lambda_reg * params.neg_lr }, { 'params': self.discr_net.parameters() }], lr=params.forward_lr) # loss for classifier mml = nn.MultiMarginLoss(margin=params.margin) # 2 different optimizers optimizer1 = optim.Adam(params=self.encoder_net.parameters(), lr=params.forward_lr) load_total_minutes = (time.time() - load_start_time) / 60.0 print('load_total_time = %f' % (load_total_minutes)) # start looping through batches last_time = time.time() start_time = time.time() total_time = 0.0 if params.use_dom_ad: n_batches = min(len(ub_ids_batches), len(an_ids_batches)) else: n_batches = len(ub_ids_batches) for i_batch in range(n_batches): ub_ids_batch = ub_ids_batches[i_batch] if params.use_dom_ad: an_ids_batch = an_ids_batches[i_batch] # get the input sequences ub_title_seqs, ub_body_seqs = self.get_ub_title_and_body_seqs( ub_questions, ub_candidate_ids, ub_ids_batch) if params.use_dom_ad: an_title_seqs, an_body_seqs = self.get_an_title_and_body_seqs( an_questions, an_ids_batch) # get all the word embedding vectors ub_x_titles = [ self.ub_preprocessor.sequence_to_vec(seq) for seq in ub_title_seqs ] ub_x_bodies = [ self.ub_preprocessor.sequence_to_vec(seq) for seq in ub_body_seqs ] if params.use_dom_ad: an_x_titles = [ self.an_preprocessor.sequence_to_vec(seq) for seq in an_title_seqs ] an_x_bodies = [ self.an_preprocessor.sequence_to_vec(seq) for seq in an_body_seqs ] # get the lengths of all the sequences ub_lens_titles = [ self.ub_preprocessor.get_seq_len(seq) for seq in ub_title_seqs ] ub_lens_bodies = [ self.ub_preprocessor.get_seq_len(seq) for seq in ub_body_seqs ] if params.use_dom_ad: an_lens_titles = [ self.an_preprocessor.get_seq_len(seq) for seq in an_title_seqs ] an_lens_bodies = [ self.an_preprocessor.get_seq_len(seq) for seq in an_body_seqs ] # run the ubuntu data forward through the cnn model ub_output_titles = self.run_through_encoder( ub_x_titles, ub_lens_titles) ub_output_bodies = self.run_through_encoder( ub_x_bodies, ub_lens_bodies) if params.use_dom_ad: # run the android data forward through the cnn model an_output_titles = self.run_through_encoder( an_x_titles, an_lens_titles) an_output_bodies = self.run_through_encoder( an_x_bodies, an_lens_bodies) # average the representations ub_out_avg = (ub_output_titles + ub_output_bodies).div(2) if params.use_dom_ad: an_out_avg = (an_output_titles + an_output_bodies).div(2) # now we have the internal feature representations # these features will go to the 
classifier (just cosine similarity and loss1) # and the features will go through the discriminator network (ending with loss2) # do the classification and loss1 for just the ubuntu data ub_train_instances = torch.chunk(ub_out_avg, len(ub_ids_batch)) ub_cos_scores, ub_targets = self.get_cosine_scores_target_data( ub_train_instances) loss1 = mml(ub_cos_scores, ub_targets) # do discrimination and loss2 for both ubuntu and android if params.use_dom_ad: # concatenate both ub and an both_out_avg = torch.cat([ub_out_avg, an_out_avg]) # run through discriminator out_discr = self.run_through_discr(both_out_avg) # calculate loss2 # print(out_discr.size()) # print(discr_targets.size()) loss2 = bcel(out_discr, discr_targets) # create the total loss total_loss = loss1 - params.lambda_reg * loss2 # now back propagate both optimizers optimizer1.zero_grad() if params.use_dom_ad: optimizer2.zero_grad() total_loss.backward() optimizer1.step() if params.use_dom_ad: optimizer2.step() mod_size = 100.0 i_batch_print = i_batch + 1 if (i_batch_print % mod_size) == 0: print( '---------------------------------------------|------------------|' ) if params.use_dom_ad: print( 'batch %d out of %d . . . loss per batch =| %s |' % (i_batch_print, n_batches, list(total_loss.data)[0])) else: print( 'batch %d out of %d . . . loss per batch =| %s |' % (i_batch_print, n_batches, list(loss1.data)[0])) print( '---------------------------------------------|------------------|' ) print('loss1 = %f' % (list(loss1.data)[0])) if params.use_dom_ad: print('loss2 = %f' % (list(loss2.data)[0])) total_time = time.time() - start_time print('training for %f minutes so far' % (total_time / 60.0)) pred_time = (total_time / i_batch_print) * n_batches / 60.0 print('training on track to take %f minutes' % (pred_time)) last_time = time.time() torch.save(self.encoder_net, params.save_encoder_path) if params.use_dom_ad: torch.save(self.discr_net, params.save_discr_path) if params.use_dom_ad: print('models saved at %s and %s' % (params.save_encoder_path, params.save_discr_path)) else: print('model saved at %s' % (params.save_encoder_path)) print('no discriminator used because params.use_dom_ad == False') print('this means that the android dataset was not used here')
def __init__(self, weight):
    super(CustomCombinedLoss, self).__init__()
    self._weight = weight
    self._criterion_choice = nn.MultiMarginLoss(size_average=False, margin=0.5)
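size_average and reduce are the legacy reduction flags; recent PyTorch releases deprecate them in favor of the reduction argument, and size_average=False (with the default reduce=True) corresponds to reduction='sum'. A one-line equivalent, assuming a recent PyTorch version:

import torch.nn as nn

# equivalent to nn.MultiMarginLoss(size_average=False, margin=0.5) on recent PyTorch
criterion = nn.MultiMarginLoss(margin=0.5, reduction='sum')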
def train_(self, test_desc, group_name=None): """ Training procedure """ p = progressbar.ProgressBar() random.seed(SEED) loss_func_tran = nn.MultiMarginLoss() loss_func_nucl = nn.MultiMarginLoss() loss_func_rel = nn.CrossEntropyLoss() constraint_loss = nn.CrossEntropyLoss() optimizer = optim.Adam(self.model.parameters(), lr=LEARNING_RATE_spinn, weight_decay=l2_penalty) iter_count = batch_count = 0 loss_tran = loss_nucl = loss_rel = loss_nr = 0. p.start(self.skip_steps) pro_idx = 1 for epoch in range(EPOCH_ALL): random.shuffle(self.train_trees) for tree in self.train_trees: self.model.train() iter_count += 1 session_gold = self.session(tree) tran_scores, nucl_scores, rel_scores, nr_scores = None, None, None, None tran_labels, nucl_labels, rel_labels, nr_labels = [], [], [], [] for transition_rel in self.oracle(tree): tran, nucl, rel = self.tran_rel_parser( transition_rel) # 解析 # 1. nucl and rel: scores and labels if rel is not None: if TRAIN_NR: tmp_nr_score = self.model.score_nr(session_gold) nr_scores = self.concat_torch( nr_scores, tmp_nr_score) nr_labels.append(nr2ids[nucl + "-" + rel]) if TRAIN_NUCL_CONSTRAINT: tmp_nucl_score = self.model.score_nucl( session_gold) nucl_scores = self.concat_torch( nucl_scores, tmp_nucl_score) nucl_labels.append(nucl2ids[nucl]) else: tmp_nucl_score = self.model.score_nucl( session_gold) nucl_scores = self.concat_torch( nucl_scores, tmp_nucl_score) nucl_labels.append(nucl2ids[nucl]) tmp_rel_score = self.model.score_rel(session_gold) rel_scores = self.concat_torch( rel_scores, tmp_rel_score) rel_labels.append(coarse2ids[rel]) # 2. tran: scores and labels tmp_tran_score = self.model.score_tran(session_gold) tran_labels.append(action2ids[tran]) tran_scores = self.concat_torch(tran_scores, tmp_tran_score) session_gold, angle_prop_all = self.model( session_gold, transition_rel) loss_tran += loss_func_tran(tran_scores, torch.Tensor(tran_labels).long()) if TRAIN_NR: loss_nr = loss_nr + constraint_loss( nr_scores, torch.Tensor(nr_labels).long()) if TRAIN_NUCL_CONSTRAINT: loss_nucl += loss_func_nucl( nucl_scores, torch.Tensor(nucl_labels).long()) else: loss_nucl += loss_func_nucl( nucl_scores, torch.Tensor(nucl_labels).long()) loss_rel = loss_rel + loss_func_rel( rel_scores, torch.Tensor(rel_labels).long()) # batch learn if iter_count % BATCH_SIZE_spinn == 0 and iter_count > 0: p.update(pro_idx) pro_idx += 1 batch_count += 1 optimizer.zero_grad() loss_tran.backward(retain_graph=True) optimizer.step() if TRAIN_NR: if TRAIN_NUCL_CONSTRAINT: loss_nr = loss_nr + CONSTRAINT_LAMBDA * loss_nucl optimizer.zero_grad() loss_nr.backward() optimizer.step() else: optimizer.zero_grad() loss_nucl.backward(retain_graph=True) optimizer.step() optimizer.zero_grad() loss_rel.backward() optimizer.step() loss_tran, loss_nucl, loss_rel, loss_nr = 0., 0., 0., 0. if batch_count % self.skip_steps == 0: p.finish() better = self.evaluate( trees_eval_path=self.dev_trees_path, type_="dev", save_per=True) if better: self.evaluate(trees_eval_path=self.test_trees_path, type_="test", save_per=False) self.report(epoch, iter_count, test_desc, group_name) if batch_count > self.skip_boundary and self.skip_steps > SKIP_STEP_min: self.skip_steps -= SKIP_REDUCE_UNIT self.skip_boundary += SKIP_BOUNDARY p.start(self.skip_steps) pro_idx = 1
def main(args): data_loader, dataset = get_loaderTrain(args.data_dir, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, drop_last=False, args=args) data_loaderValid, datasetValid = get_loaderValid( args.data_dir, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, drop_last=False, args=args) data_size = dataset.get_data_size() num_classes = dataset.get_num_classes() instance_size = dataset.get_instance_size() # Build the model model = fc_model(input_size=instance_size, num_classes=num_classes, dropout=args.dropout) # create optimizer params = list(model.parameters()) optimizer = torch.optim.Adam(params, betas=(0.9, 0.98), eps=1e-9, lr=args.learning_rate) # multi-class hinge loss label_crit = nn.MultiMarginLoss(reduce=True) model = model.to(device) model.train() print("model created & starting training ...\n\n") # Training script for epoch in range(args.num_epochs): total_correct_preds = 0.0 total = 1e-10 loss = 0.0 # step loop for step, (image_input, class_idxs) in enumerate(data_loader): #print("The size of tensor: ",(image_input.size())) # move all data loaded from dataloader to gpu class_idxs = class_idxs.to(device) image_input = image_input.to(device) # feed-forward data in the model output = model(image_input) # 32 * 150528 --> 32 * 11 # compute losses state_loss = label_crit(output, class_idxs) # --> 32 * 1 # aggregate loss for logging loss += state_loss.item() # back-propagate the loss in the model & optimize model.zero_grad() state_loss.backward() optimizer.step() # accuracy computation _, pred_idx = torch.max(output, dim=1) total_correct_preds += torch.sum(pred_idx == class_idxs).item() total += output.size(0) # epoch accuracy & loss accuracy = round(total_correct_preds / total, 2) loss = round(loss / total, 2) # you can save the model here at specific epochs (ckpt) to load and evaluate the model on the val set print('\repoch {}: accuracy: {}, loss: {}'.format( epoch, accuracy, loss), end="") x = validate(model, data_loaderValid, datasetValid) save_model(model, epoch, optimizer, loss, x) print()
def forward(self, logits, target):
    criterion = nn.MultiMarginLoss(p=2, margin=0, weight=self.weight, size_average=False)
    return criterion(logits, target)
['glu', nn.GLU()], ]) loss = nn.ModuleDict( [['l1', nn.L1Loss()], ['nll', nn.NLLLoss()], ['kldiv', nn.KLDivLoss()], ['mse', nn.MSELoss()], ['bce', nn.BCELoss()], ['bce_with_logits', nn.BCEWithLogitsLoss()], ['cosine_embedding', nn.CosineEmbeddingLoss()], ['ctc', nn.CTCLoss()], ['hinge_embedding', nn.HingeEmbeddingLoss()], ['margin_ranking', nn.MarginRankingLoss()], ['multi_label_margin', nn.MultiLabelMarginLoss()], ['multi_label_soft_margin', nn.MultiLabelSoftMarginLoss()], ['multi_margin', nn.MultiMarginLoss()], ['smooth_l1', nn.SmoothL1Loss()], ['soft_margin', nn.SoftMarginLoss()], ['cross_entropy', nn.CrossEntropyLoss()], ['triplet_margin', nn.TripletMarginLoss()], ['poisson_nll', nn.PoissonNLLLoss()]]) optimizer = dict({ 'adadelta': optim.Adadelta, 'adagrad': optim.Adagrad, 'adam': optim.Adam, 'sparse_adam': optim.SparseAdam, 'adamax': optim.Adamax, 'asgd': optim.ASGD, 'lbfgs': optim.LBFGS, 'rmsprop': optim.RMSprop,
def __init__(self):
    super(loss, self).__init__()
    self.lossrating = tnn.CrossEntropyLoss()
    self.losscategory = tnn.MultiMarginLoss()
def __init__(self, args): super(SimpleQA, self).__init__() if args.word_pretrained is None: self.word_embedding = nn.Embedding(args.n_words, args.word_dim, args.padding_idx) else: self.word_embedding = nn.Embedding.from_pretrained( args.word_pretrained, freeze=args.freeze) if args.use_gcn: self.gcns = nn.ModuleList() for g in args.relation_graphs: gcn = RGCN(g, args.n_relations, args.sub_relation_dim, args.sub_relation_dim, args.relation_pretrained, args.num_hidden_layers, args.rgcn_dropout, args.norm_type, use_cuda=True) self.gcns.append(gcn) else: if args.relation_pretrained is None: self.relation_embedding = nn.Embedding(args.n_relations, args.relation_dim, args.padding_idx) else: self.relation_embedding = nn.Embedding.from_pretrained( args.relation_pretrained, freeze=False) self.word_encoder = LSTMEncoder(input_size=args.word_dim, hidden_size=args.hidden_dim, num_layers=1, dropout=0.0, batch_first=True, bidirectional=True) self.question_encoder = LSTMEncoder(input_size=2 * args.hidden_dim, hidden_size=args.hidden_dim, num_layers=1, dropout=0.0, batch_first=True, bidirectional=True) self.gate = GateNetwork(2 * args.hidden_dim) self.loss_fn = nn.MultiMarginLoss(margin=args.margin) self.optimizer = torch.optim.Adam(self.parameters(), lr=args.lr) self.ns = args.ns self.score_function = nn.CosineSimilarity(dim=2) self.all_relation_words = args.all_relation_words self.n_relations = args.n_relations self.args = args global global_step global_step = 0
def loss_fn(weight):
    criterion = nn.MultiMarginLoss(weight=weight)
    return criterion
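A hedged usage sketch for the factory above: one typical reason to pass weight to MultiMarginLoss is class imbalance, with per-class weights derived from inverse label frequency. The label counts below are made up for illustration:

import torch
import torch.nn as nn

labels = torch.tensor([0, 0, 0, 1, 2, 2])          # toy imbalanced label set
counts = torch.bincount(labels).float()
weight = counts.sum() / (len(counts) * counts)     # inverse-frequency class weights, one per class
criterion = loss_fn(weight)                        # i.e. nn.MultiMarginLoss(weight=weight)

logits = torch.randn(4, 3)
targets = torch.tensor([0, 1, 2, 2])
print(criterion(logits, targets))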
def train_model(use_lstm=True): if use_lstm: print_and_write("Training the LSTM model with the GPU:" if USE_GPU else "Training the LSTM model:") else: print_and_write("Training the CNN model with the GPU:" if USE_GPU else "Training the CNN model") get_id_to_text() embeddings = get_word_embeddings() model = LSTMQA(embeddings) if use_lstm else CNNQA(embeddings) if USE_GPU: model.cuda(GPU_NUM) loss_function = nn.MultiMarginLoss( margin=0.2) # TODO: what about size_average? optimizer = optim.Adam(filter(lambda x: x.requires_grad, model.parameters()), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) orig_time = time() for epoch in range(NUM_EPOCHS): samples = get_training_data( ) # recalculate this every epoch to get new random selections num_samples = len(samples) num_batches = int(math.ceil(1. * num_samples / BATCH_SIZE)) total_loss = 0 # used for debugging for i in range(num_batches): # Get the samples ready batch = samples[i * BATCH_SIZE:(i + 1) * BATCH_SIZE] # If this is the last batch, then need to pad the batch to get the same shape as expected if i == num_batches - 1 and num_samples % BATCH_SIZE != 0: batch = np.concatenate( (batch, np.full(((i + 1) * BATCH_SIZE - num_samples, 22), "0")), axis=0) # Convert from numpy arrays to tensors title_tensor, title_lengths = get_tensor_from_batch(batch, use_title=True) body_tensor, body_lengths = get_tensor_from_batch(batch, use_title=False) # Reset the model optimizer.zero_grad() # Run our forward pass and get the entire sequence of hidden states model.hidden = model.init_hidden() title_hidden = model(title_tensor) title_encoding = get_encodings(title_hidden, title_lengths, use_lstm=use_lstm) model.hidden = model.init_hidden() body_hidden = model(body_tensor) body_encoding = get_encodings(body_hidden, body_lengths, use_lstm=use_lstm) # Compute loss, gradients, update parameters # Could potentially do something about the last batch, but prolly won't affect training that much X, y = generate_score_matrix(title_encoding, body_encoding) loss = loss_function(X, y) total_loss += loss.data[0] loss.backward() optimizer.step() # every so while, check the dev accuracy # if i % 10 == 0: # print_and_write("For batch number " + str(i) + " it has taken " + str(time() - orig_time) + " seconds and has loss " + str(total_loss)) # if i > 0 and i % 100 == 0: # evaluate_model(model, use_lstm=use_lstm) print_and_write("For epoch number " + str(epoch) + " it has taken " + str(time() - orig_time) + " seconds and has loss " + str(total_loss)) evaluate_model(model, use_lstm=use_lstm) evaluate_model(model, use_test_data=True, use_lstm=use_lstm) if SAVE_MODELS: save_checkpoint(epoch, model, optimizer, use_lstm) #gc.collect() return model
def _init_model(self, states=None): """Initialize model, override to change model setup.""" opt = self.opt kwargs = opt_to_kwargs(opt) self.model = SteroidSeq2seq(len(self.dict), opt['embeddingsize'], opt['hiddensize'], padding_idx=self.NULL_IDX, start_idx=self.START_IDX, longest_label=states.get( 'longest_label', 1), **kwargs) if (opt.get('dict_tokenizer') == 'bpe' and opt['embedding_type'] != 'random'): print('skipping preinitialization of embeddings for bpe') elif not states and opt['embedding_type'] != 'random': # `not states`: only set up embeddings if not loading model self._copy_embeddings(self.model.decoder.lt.weight, opt['embedding_type']) if opt['lookuptable'] in ['unique', 'dec_out']: # also set encoder lt, since it's not shared self._copy_embeddings(self.model.encoder.lt.weight, opt['embedding_type'], log=False) if opt['embedding_type'].endswith('fixed'): print('Seq2seq: fixing embedding weights.') self.model.decoder.lt.weight.requires_grad = False self.model.encoder.lt.weight.requires_grad = False if opt['lookuptable'] in ['dec_out', 'all']: self.model.decoder.e2s.weight.requires_grad = False self.id = 'SteroidSeq2seq' self.metrics['rank_loss'] = 0.0 self.metrics['total_batches'] = 0.0 self.metrics['overlap'] = 0 self.overlap_count = { 'predicted': 0, 'ranked0': 0, 'ranked1': 0, 'ranked2': 0, 'ranked3': 0, 'ranked4': 0 } # for word overlap numerator self.num_predicted_count = 0 # for word overlap denominator self.injpred_selected_count = 0 self.pred_count = 0 self.iter_cand = opt['iter_cand'] self.count_overlaps = opt['count_overlaps'] self.howtorank = opt['howtorank'] self.min_hamming_dist = opt['min_hamming_dist'] if opt['cand_type'] == 'all': self.cand_type = ['current_labels', 'history'] else: self.cand_type = [opt['cand_type']] self.model.post_ranker = nn.ModuleList() rank_hidden = self.opt.get('rankhiddensize', 512) rank_activation = getattr(nn, self.opt['rank_activation']) self.model.post_ranker.append( nn.Linear(self.opt['hiddensize'], rank_hidden)) self.model.post_ranker.append(rank_activation()) for i in range(self.opt.get('ranknl', 2) - 2): self.model.post_ranker.append(nn.Linear(rank_hidden, rank_hidden)) self.model.post_ranker.append(rank_activation()) self.model.post_ranker.append(nn.Linear(rank_hidden, 1)) if states: # set loaded states if applicable self.model.load_state_dict(states['model'], strict=self.opt['strict_load']) if self.use_cuda: self.model.cuda() if self.opt['rankloss'] == 'margin': self.rank_criterion = nn.MultiMarginLoss( margin=self.opt['margin'], reduction=self.opt['ranklossreduce']) elif self.opt['rankloss'] == 'ce': self.rank_criterion = nn.CrossEntropyLoss( reduction=self.opt['ranklossreduce']) self.inject = self.opt['dump_all_preds'] assert self.opt.get( 'person_tokens', False) is True, 'We extract past labels using person tokens'
def mml_class_loss(pred_class, gt_class):
    _loss = nn.MultiMarginLoss(reduce=False)
    labels = gt_class.nonzero()[:, 1]
    l = _loss(pred_class, labels)
    l = l.mean()
    return l
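The helper above assumes gt_class is one-hot per row, so gt_class.nonzero()[:, 1] recovers the integer class index for each sample (the second column of nonzero() is the column position of the single 1). A tiny check of that assumption:

import torch

gt_class = torch.tensor([[0, 1, 0], [1, 0, 0]])
print(gt_class.nonzero()[:, 1])    # tensor([1, 0])
print(gt_class.argmax(dim=1))      # same indices for one-hot rows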