def train(features, fea_len, split_frac, out_file, save=False, save_folder=None):
    '''
    Train a ChainEncoder/Predictor pair and log the test accuracy per step.

    Runs 12000 Adam steps on mini-batches of 100 training pairs drawn from a
    ``Dataset``. After every step the current model is evaluated on the
    test pairs and the accuracy is written to ``out_file``, one value per
    line, flushed immediately.

    hyperparameters:
      features    -- forwarded unchanged to ``Dataset``
      fea_len     -- feature length; forwarded to ``ChainEncoder``,
                     ``Predictor`` and ``encode``
      split_frac  -- forwarded to ``Dataset`` (the original notes call this
                     the amount of training data)

    other arguments:
      out_file    -- a path (``str``), opened for writing, or an already
                     open writable file object. It is closed before this
                     function returns or raises, in both cases.
      save        -- when true, the encoder and predictor state dicts are
                     saved every 50 iterations (starting at iteration 0).
      save_folder -- directory receiving ``<iter>_enc.model`` and
                     ``<iter>_pred.model``; required when ``save`` is true.
                     A trailing ``/`` is accepted.

    Raises ValueError if ``save`` is true but ``save_folder`` is None.
    Returns None.
    '''
    # Function-scope import so this block does not rely on the file header.
    import os

    # Fail before any work is done instead of after the first training step.
    if save and save_folder is None:
        raise ValueError('save_folder is required when save=True')

    if isinstance(out_file, str):
        out_file = open(out_file, 'w')
    try:
        d = Dataset(features, split_frac, 1, gpu)

        print('defining architecture')
        enc = ChainEncoder(d.get_v_fea_len(), d.get_e_fea_len(), fea_len, 'last')
        predictor = Predictor(fea_len)
        # NLLLoss consumes log-probabilities, so `predictor` presumably ends
        # in a log-softmax -- confirm against the Predictor definition.
        loss = nn.NLLLoss()
        if gpu:
            enc.cuda()
            predictor.cuda()
            loss.cuda()
        # A single optimizer updates both modules jointly.
        optimizer = optim.Adam(
            list(enc.parameters()) + list(predictor.parameters()))

        print('training')
        (test_v_features, test_e_features,
         test_A_pls, test_B_pls, test_y) = d.get_test_pairs()
        # The test labels never change: convert them to numpy once.
        test_y = test_y.data.cpu().numpy()

        for train_iter in range(12000):
            # ---- one optimisation step on a fresh mini-batch ----
            v_features, e_features, A_pls, B_pls, y = d.get_train_pairs(100)
            enc.zero_grad()
            predictor.zero_grad()
            A_code, B_code = encode(enc, fea_len, v_features, e_features,
                                    A_pls, B_pls)
            softmax_output = predictor(A_code, B_code)
            loss_val = loss(softmax_output, y)
            loss_val.backward()
            optimizer.step()

            # ---- evaluation on the test pairs ----
            enc.zero_grad()
            predictor.zero_grad()
            test_A_code, test_B_code = encode(
                enc, fea_len, test_v_features, test_e_features,
                test_A_pls, test_B_pls)
            softmax_output = predictor(test_A_code,
                                       test_B_code).data.cpu().numpy()
            test_y_pred = softmax_output.argmax(axis=1)
            # Explicit float conversion: int / int would truncate to 0 or 1
            # under Python 2 when true division is not enabled.
            cur_acc = float((test_y_pred == test_y).sum()) / len(test_y)
            out_file.write('%f\n' % cur_acc)
            out_file.flush()

            # ---- periodic checkpoint ----
            if save and train_iter % 50 == 0:
                # os.path.join copes with a trailing '/' on save_folder.
                torch.save(enc.state_dict(),
                           os.path.join(save_folder,
                                        '%i_enc.model' % train_iter))
                torch.save(predictor.state_dict(),
                           os.path.join(save_folder,
                                        '%i_pred.model' % train_iter))
    finally:
        # Release the handle even when a training step raises.
        out_file.close()
def train(features, fea_len, split_frac, out_file):
    '''
    Train a ChainEncoder/Predictor pair and log the test accuracy per step.

    Runs 4000 Adam steps on mini-batches of 1000 training chain pairs drawn
    from a ``Dataset``. After every step the current model is evaluated on
    the test pairs; the accuracy is printed and written to ``out_file``,
    one value per line. Every 50 iterations (starting at iteration 0) the
    encoder and predictor state dicts are saved to
    ``ckpt/<iter>_encoder.model`` and ``ckpt/<iter>_predictor.model``; the
    ``ckpt`` directory is not created here.

    arguments:
      features    -- forwarded unchanged to ``Dataset``
      fea_len     -- forwarded to ``ChainEncoder`` and ``Predictor``
      split_frac  -- forwarded unchanged to ``Dataset``
      out_file    -- a path (``str``), opened for writing, or an already
                     open writable file object. It is closed before this
                     function returns or raises, in both cases.

    Returns None.
    '''
    if isinstance(out_file, str):
        out_file = open(out_file, 'w')
    try:
        d = Dataset(features, split_frac, gpu)

        print('defining architecture')
        enc = ChainEncoder(d.get_v_fea_len(), d.get_e_fea_len(), fea_len, 'last')
        predictor = Predictor(fea_len)
        # NLLLoss consumes log-probabilities, so `predictor` presumably ends
        # in a log-softmax -- confirm against the Predictor definition.
        loss = nn.NLLLoss()
        if gpu:
            enc.cuda()
            predictor.cuda()
            loss.cuda()
        # A single optimizer updates both modules jointly.
        optimizer = optim.Adam(
            list(enc.parameters()) + list(predictor.parameters()))

        print('training')
        test_chain_A, test_chain_B, test_y = d.get_test_pairs()
        # The test labels never change: convert them to numpy once.
        test_y = test_y.data.cpu().numpy()

        for train_iter in range(4000):
            # ---- one optimisation step on a fresh mini-batch ----
            chains_A, chains_B, y = d.get_train_pairs(1000)
            enc.zero_grad()
            predictor.zero_grad()
            output_A = enc(chains_A)
            output_B = enc(chains_B)
            softmax_output = predictor(output_A, output_B)
            loss_val = loss(softmax_output, y)
            loss_val.backward()
            optimizer.step()

            # ---- evaluation on the test pairs ----
            enc.zero_grad()
            predictor.zero_grad()
            output_test_A = enc(test_chain_A)
            output_test_B = enc(test_chain_B)
            softmax_output = predictor(output_test_A,
                                       output_test_B).data.cpu().numpy()
            test_y_pred = softmax_output.argmax(axis=1)
            # Explicit float conversion: int / int would truncate to 0 or 1
            # under Python 2 when true division is not enabled.
            cur_acc = float((test_y_pred == test_y).sum()) / len(test_y)
            # Single-argument print emits the same text on Python 2 and 3.
            print('test acc: %s' % cur_acc)
            out_file.write('%f\n' % cur_acc)

            # ---- periodic checkpoint ----
            if train_iter % 50 == 0:
                torch.save(enc.state_dict(),
                           'ckpt/%i_encoder.model' % train_iter)
                torch.save(predictor.state_dict(),
                           'ckpt/%i_predictor.model' % train_iter)
    finally:
        # Release the handle even when a training step raises.
        out_file.close()
# Bookkeeping for the training run. Only total_epoch_num is updated in the
# part of the loop shown here; the remaining trackers are presumably filled
# in by later code -- verify.
best_test3_acc = 0.0
best_epoch_num = 0
total_epoch_num = 0
all_losses = []
all_acc_1 = []
all_acc_2 = []
all_acc_3 = []

# NOTE(review): range(1, num_epochs) yields num_epochs - 1 iterations --
# confirm that skipping one epoch is intended.
for epoch in range(1, num_epochs):
    total_epoch_num += 1
    # Fresh blocks of training ids for this epoch; each block is one batch.
    shuffled_id_blocks = get_shuffled_ids(_data['tr'], batch_size)
    running_loss = 0.0
    predictor.train()  # switch the module to training mode

    for id_block in shuffled_id_blocks:
        predictor.zero_grad()
        # Zero tensors sized by the number of ids in this block; presumably
        # the initial LSTM hidden and cell states -- confirm against predictor.
        h0 = torch.zeros(num_of_layers * num_of_directions, id_block.shape[0], lstm_dim)
        c0 = torch.zeros(num_of_layers * num_of_directions, id_block.shape[0], lstm_dim)
        batch_input, batch_len, batch_label = make_batch(
            _data['tr'], _label['tr'], id_block)
        output = predictor(batch_input, batch_len, h0, c0)
        loss = criterion(output, batch_label)
        # Weight the batch loss by batch_input.size(0) so that the division
        # after the loop yields a per-sample figure (assumes criterion
        # returns a batch mean and dim 0 is the batch axis -- TODO confirm).
        running_loss += loss.item() * batch_input.size(0)
        loss.backward()
        # Clip the gradient norm to `clip` before the update; the returned
        # norm is deliberately discarded.
        _ = torch.nn.utils.clip_grad_norm_(predictor.parameters(), clip)
        optimizer.step()

    # Normalise the accumulated loss by the size of the first axis of the
    # training data.
    running_loss = running_loss / _data['tr'].shape[0]