def train(self, emails, val_data, w2v, epochs=10, save_model=True):
    """Train the model with one RMSprop update per valid email.

    In every epoch each row of ``emails`` is handed to ``self.predict``,
    which returns a ``(loss, valid)`` pair. Only rows reported as valid
    trigger a backward pass and an optimizer step. After the last epoch the
    model is optionally saved, then the user embeddings are extracted,
    stored through ``utils.save_user_embeddings`` and passed to
    ``plots.plot_with_tsne``.

    Args:
        emails: Indexable collection of emails; row ``i`` is read as
            ``emails[i, :]`` (presumably a 2-D numpy array -- TODO confirm).
        val_data: Not referenced anywhere in this method.
        w2v: Word-embedding helper, forwarded unchanged to ``self.predict``.
        epochs: Number of passes over ``emails``.
        save_model: When true, the model is written through ``self.save``
            to ``constants.RUN_ID + '_model.pth'``.
    """
    # NOTE(review): the original source had lost its indentation. The
    # nesting below (loss accumulated only for valid rows, model save and
    # embedding export once after all epochs, export independent of
    # ``save_model``) is reconstructed from the code's meaning -- confirm.
    optimizer = optim.RMSprop(self.parameters(), lr=1e-3, alpha=0.99,
                              momentum=0.0)

    for epoch in range(epochs):
        epoch_loss = 0.0
        start = time.time()

        # loop over each mail
        for i in range(len(emails)):
            optimizer.zero_grad()
            loss, valid = self.predict(emails[i, :], w2v)
            if not valid:
                continue
            # propagate the loss backward and compute the gradient
            loss.backward()
            # change weights based on gradient value
            optimizer.step()
            epoch_loss += loss.data.numpy()

        end = time.time()
        # Single-argument print() emits the same text under Python 2 and 3.
        print('time taken for epoch: ' + str(end - start))
        print('loss in epoch ' + str(epoch) + ' = ' + str(epoch_loss))

    if save_model:
        file_name = constants.RUN_ID + '_model.pth'
        self.save(file_name)

    email_ids, embs = self.extract_user_embeddings()
    utils.save_user_embeddings(email_ids, embs)
    plots.plot_with_tsne(email_ids, embs, display_hover=False)
def train(self, emails, w2v, epochs=10, save_model=True):
    """Train on (sender, receiver) pairs against each email's representation.

    ``w2v.get_email_reps(emails, average=True)`` supplies one representation
    per email. For every receiver of an email, ``self.forward`` is called
    with the sender and receiver ids, and the MSE between its output and the
    email representation drives one RMSprop step. Emails without a
    representation and pairs where either id is ``None`` are skipped. Each
    trained pair also increments ``self.emailid_train_freq`` for both
    addresses. After the last epoch the model is optionally saved, then the
    user embeddings are extracted, stored and passed to
    ``plots.plot_with_tsne``.

    Args:
        emails: Indexable collection of emails; columns are addressed as
            ``emails[i, constants.SENDER_EMAIL]`` and
            ``emails[i, constants.RECEIVER_EMAILS]`` (presumably a 2-D numpy
            array -- TODO confirm). Receivers are a ``'|'``-separated string.
        w2v: Object providing ``get_email_reps``.
        epochs: Number of passes over ``emails``.
        save_model: When true, the model is written through ``self.save``
            to ``constants.RUN_ID + '_model.pth'``.
    """
    # NOTE(review): the original source had lost its indentation. The
    # nesting below (one optimizer step per receiver, model save and
    # embedding export once after all epochs, export independent of
    # ``save_model``) is reconstructed from the code's meaning -- confirm.
    loss_criteria = nn.MSELoss()
    optimizer = optim.RMSprop(self.parameters(), lr=0.0001, alpha=0.99,
                              momentum=0.0)
    # optimizer = optim.Adam(self.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
    email_reps = w2v.get_email_reps(emails, average=True)

    for epoch in range(epochs):
        # Single-argument print() emits the same text under Python 2 and 3;
        # the doubled space reproduces the original
        # ``print 'running epoch ', epoch`` output exactly.
        print('running epoch  ' + str(epoch))
        start = time.time()
        epoch_loss = 0.0

        for i in range(len(emails)):
            # Read the sender address once per email instead of per receiver.
            sender_email = emails[i, constants.SENDER_EMAIL]
            sender_id = utils.get_userid(sender_email)

            # if no word_rep was found for any of the words in the email,
            # ignore this case
            email_rep = email_reps[i]
            if email_rep is None:
                continue

            recv_list = emails[i, constants.RECEIVER_EMAILS].split('|')
            for recv in recv_list:
                optimizer.zero_grad()
                recv_id = utils.get_userid(recv)
                # if sender or receiver is not an enron email id, we ignore
                # this data point
                if sender_id is None or recv_id is None:
                    continue

                # valid sender/receiver pair: update both frequencies
                # (sequentially, so sender == recv is counted twice, as in
                # the original)
                freq = self.emailid_train_freq
                freq[sender_email] = freq.get(sender_email, 0) + 1
                freq[recv] = freq.get(recv, 0) + 1

                # do the forward pass
                pred_email_rep = self.forward(
                    autograd.Variable(torch.LongTensor([sender_id])),
                    autograd.Variable(torch.LongTensor([recv_id])))
                # compute the loss
                loss = loss_criteria(
                    pred_email_rep,
                    autograd.Variable(torch.from_numpy(email_rep)))
                # propagate the loss backward and compute the gradient
                loss.backward()
                # change weights based on gradient value
                optimizer.step()
                epoch_loss += loss.data.numpy()

        end = time.time()
        print('time taken  ' + str(end - start))
        print('loss in epoch ' + str(epoch) + ' = ' + str(epoch_loss))

    if save_model:
        file_name = constants.RUN_ID + '_model.pth'
        self.save(file_name)

    email_ids, embs = self.extract_user_embeddings()
    utils.save_user_embeddings(email_ids, embs)
    # utils.get_similar_users(email_ids, embs)
    plots.plot_with_tsne(email_ids, embs, display_hover=False)
def train(self, emails, val_data, w2v, epochs=10, save_model=True):
    """Train on real emails (label 1) and negative samples (label 0).

    Negative emails come from ``dal.get_negative_emails(emails,
    fraction=1.0)``. For each index ``i`` the method first trains on
    ``emails[i, :]`` with the positive label and then on
    ``neg_emails[i, :]`` with the negative label. ``self.predict`` returns a
    ``(loss, valid)`` pair, and only valid samples trigger a backward pass
    and an RMSprop step. After the last epoch the model is optionally saved,
    then the user embeddings are extracted, stored and passed to
    ``plots.plot_with_tsne``.

    Args:
        emails: Indexable collection of emails; row ``i`` is read as
            ``emails[i, :]`` (presumably a 2-D numpy array -- TODO confirm).
        val_data: Not referenced anywhere in this method.
        w2v: Word-embedding helper, forwarded unchanged to ``self.predict``.
        epochs: Number of passes over ``emails``.
        save_model: When true, the model is written through ``self.save``
            to ``constants.RUN_ID + '_model.pth'``.
    """
    # NOTE(review): the original source had lost its indentation. The
    # nesting below (loss accumulated only for valid samples, model save and
    # embedding export once after all epochs, export independent of
    # ``save_model``) is reconstructed from the code's meaning -- confirm.
    optimizer = optim.RMSprop(self.parameters(), lr=0.001, alpha=0.99,
                              momentum=0.0)
    # labels for correct mails
    pos_label = autograd.Variable(torch.LongTensor([1]))
    # labels for incorrect mails
    neg_label = autograd.Variable(torch.LongTensor([0]))
    # Indexed with the same ``i`` as ``emails`` below, so it is assumed to
    # hold at least ``len(emails)`` rows -- TODO confirm.
    neg_emails = dal.get_negative_emails(emails, fraction=1.0)

    # Positive sample first, then the negative one, exactly as the original
    # two copy-pasted update steps did.
    passes = ((emails, pos_label), (neg_emails, neg_label))

    for epoch in range(epochs):
        epoch_loss = 0.0
        start = time.time()

        for i in range(len(emails)):
            for rows, label in passes:
                optimizer.zero_grad()
                loss, valid = self.predict(rows[i, :], w2v, label=label,
                                           training_mode=True)
                if not valid:
                    continue
                # propagate the loss backward and compute the gradient
                loss.backward()
                # change weights based on gradient value
                optimizer.step()
                epoch_loss += loss.data.numpy()

        end = time.time()
        # Single-argument print() emits the same text under Python 2 and 3;
        # the doubled space reproduces the original
        # ``print 'time taken for epoch : ', x`` output exactly.
        print('time taken for epoch :  ' + str(end - start))
        print('loss in epoch ' + str(epoch) + ' = ' + str(epoch_loss))

    if save_model:
        file_name = constants.RUN_ID + '_model.pth'
        self.save(file_name)

    email_ids, embs = self.extract_user_embeddings()
    utils.save_user_embeddings(email_ids, embs)
    # utils.get_similar_users(email_ids, embs)
    plots.plot_with_tsne(email_ids, embs, display_hover=False)