def train(num_epochs, config, data_loader, multigpu=False):
    """Train a network for num_epochs, evaluating every 100 epochs.

    Builds the network and optimizer from config, trains on batches from
    data_loader, and every 100th epoch evaluates on both the held-out test
    loader and an odd-one-out (ooo) loader. The best-scoring model (by test
    accuracy) is checkpointed to 'best.model.testing'. Training stops early
    once best test accuracy reaches 0.98.

    Returns:
        (best_model, scores) where scores is a list of (epoch, test_acc)
        pairs collected at each evaluation point.
    """

    def maybe_evaluate(model, epoch, prev_best, prev_best_acc):
        # Evaluate every 100th epoch; returns (best_model, best_acc, test_acc),
        # where test_acc is None on non-evaluation epochs.
        best_model = prev_best
        best_test_acc = prev_best_acc
        test_acc = None
        if epoch % 100 == 99:
            ooo_acc = evaluate(model, ooo_loader)
            test_acc = evaluate(model, test_loader)
            print('epoch {} test: {:.2f}; ooo: {:.2f}'.format(
                epoch, test_acc, ooo_acc))
            # Fix: this comparison must stay inside the evaluation branch —
            # otherwise test_acc is None on most epochs and `None > float`
            # raises TypeError on Python 3.
            if test_acc > prev_best_acc:
                best_test_acc = test_acc
                best_model = model
                print('saving new model')
                torch.save(best_model, 'best.model.testing')
        return best_model, best_test_acc, test_acc

    def maybe_report_time():
        # Disabled debug timing report (note the `False and` guard).
        if False and epoch % 100 == 0 and epoch > 0:
            finish_time = time.process_time()
            time_per_epoch = (finish_time - start_time) / epoch
            print('Average time per epoch: {:.2} sec'.format(time_per_epoch))

    puzzle_gen = config.create_puzzle_generator()
    ooo_dataset = OddOneOutDataset(puzzle_gen, 5, 'data/ooo/living.tsv')
    ooo_loader = OddOneOutDataloader(ooo_dataset).get_loaders()[0]
    # Fix: time.clock() was removed in Python 3.8; process_time() matches
    # the other train() variant in this file.
    start_time = time.process_time()
    net_factory = config.create_network_factory()
    model = net_factory(data_loader.input_size(), data_loader.output_size())
    if multigpu and torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
        model = nn.DataParallel(model)
    model = cudaify(model)
    loss_function = nn.NLLLoss()
    optimizer = config.create_optimizer_factory()(model.parameters())
    best_model = None
    best_test_acc = -1.0
    scores = []
    for epoch in range(num_epochs):
        model.train()
        # NOTE(review): gradients are zeroed once per epoch, not per batch,
        # so they accumulate across batches within an epoch — confirm this
        # is intentional.
        model.zero_grad()
        loader, test_loader = data_loader.get_loaders(epoch)
        for data, response in loader:
            input_matrix = cudaify(data)
            log_probs = model(input_matrix)
            loss = loss_function(log_probs, cudaify(response))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()
        best_model, best_test_acc, test_acc = maybe_evaluate(
            model, epoch, best_model, best_test_acc)
        if test_acc is not None:
            scores.append((epoch, test_acc))
        if best_test_acc >= .98:
            break
        maybe_report_time()
    return best_model, scores
def train(num_epochs, config, data_loader, multigpu=False):
    """Train a network for num_epochs, evaluating every 100 epochs.

    Builds the network and optimizer from config and trains on batches from
    data_loader. Every 100th epoch the model is scored on the test loader;
    the best model so far is checkpointed to 'best.model.testing'. Training
    stops early once best test accuracy reaches 0.95.

    Returns:
        (best_model, scores) where scores is a list of (epoch, test_acc)
        pairs collected at each evaluation point.
    """

    def maybe_evaluate(prev_best, prev_best_acc):
        # Evaluate every 100th epoch; returns (best, best_acc, test_acc),
        # where test_acc is None on non-evaluation epochs.
        best = prev_best
        best_accuracy = prev_best_acc
        test_accuracy = None
        if epoch % 100 == 99:
            test_accuracy = evaluate(model, test_loader)
            print('epoch {} test: {:.2f}'.format(epoch, test_accuracy))
            if test_accuracy > prev_best_acc:
                best_accuracy = test_accuracy
                best = model
                print('saving new model')
                torch.save(best, 'best.model.testing')
        return best, best_accuracy, test_accuracy

    def maybe_report_time():
        # Disabled debug timing report (note the `False and` guard).
        if False and epoch % 100 == 0 and epoch > 0:
            finish_time = time.process_time()
            time_per_epoch = (finish_time - start_time) / epoch
            print('Average time per epoch: {:.2} sec'.format(time_per_epoch))

    start_time = time.process_time()
    net_factory = config.create_network_factory()
    model = net_factory(data_loader.input_size(), data_loader.output_size())
    if multigpu and torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
        model = nn.DataParallel(model)
    model = cudaify(model)
    loss_function = nn.NLLLoss()
    optimizer = config.create_optimizer_factory()(model.parameters())
    # Debug print setting: idempotent global state, so set it once here
    # instead of once per batch inside the training loop.
    torch.set_printoptions(profile="full")
    best_model = None
    best_test_acc = -1.0
    scores = []
    for epoch in range(num_epochs):
        # Fix: this progress print referenced `epoch` before the loop in the
        # original (NameError); it belongs inside the epoch loop.
        print('epoch: {}'.format(epoch))
        model.train()
        # NOTE(review): gradients are zeroed once per epoch, not per batch,
        # so they accumulate across batches within an epoch — confirm this
        # is intentional.
        model.zero_grad()
        loader, test_loader = data_loader.get_loaders(epoch)
        for data, response in loader:
            input_matrix = cudaify(data)
            log_probs = model(input_matrix)
            loss = loss_function(log_probs, cudaify(response))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()
        best_model, best_test_acc, test_acc = maybe_evaluate(best_model,
                                                            best_test_acc)
        if test_acc is not None:
            scores.append((epoch, test_acc))
        if best_test_acc >= .95:
            break
        maybe_report_time()
    return best_model, scores
def make_puzzle_matrix(self, tok_puzzles):
    '''
    Build a one-hot input matrix for a batch of tokenized puzzles.

    For each choice, the first num_tok - 1 tokens keep their own one-hot
    vectors; any tokens beyond that are merged (element-wise sum) into the
    final slot. Each choice's flattened encoding is then zero-padded so it
    occupies exactly num_tok * len(vocab) columns.

    TODO: Is it possible to get rid of the topmost for-loop using
    torch tensor ops??
    '''
    rows = []
    for choices, _ in tok_puzzles:
        row = []
        for choice in choices:
            vecs = [one_hot(tok, self.vocab) for tok in choice]
            tail = self.num_tok - 1
            if len(vecs) > tail:
                # collapse all overflow tokens into the last slot
                vecs[tail] = [sum(components)
                              for components in zip(*vecs[tail:])]
                vecs = vecs[:self.num_tok]
            flattened = [component for vec in vecs for component in vec]
            padding = [0] * (self.num_tok * len(self.vocab) - len(flattened))
            row += flattened + padding
        rows.append(row)
    return cudaify(FloatTensor(rows))
def predict(model, input_tensor):
    """Return the argmax class index (dim 1) for each row of input_tensor.

    Runs the model in eval mode with gradients disabled.
    """
    with torch.no_grad():
        model.eval()
        log_probs = model(cudaify(input_tensor))
        return log_probs.argmax(dim=1)
def make_puzzle_matrix(self, puzzles):
    """Encode a batch of puzzles as a float matrix, one row per puzzle.

    Each row is the concatenation of the one-hot encodings of that
    puzzle's choices.
    """
    rows = []
    for choices, _ in puzzles:
        encoding = []
        for choice in choices:
            encoding.extend(one_hot(str(choice), self.get_vocab()))
        rows.append(encoding)
    return cudaify(FloatTensor(rows))
def make_puzzle_targets(labels):
    """Convert a list of integer labels into a (device-placed) LongTensor."""
    targets = LongTensor(labels)
    return cudaify(targets)
def make_puzzle_target(label):
    """Wrap a single integer label in a one-element (device-placed) LongTensor."""
    target = LongTensor([label])
    return cudaify(target)
def make_puzzle_vector(puzzle, vocab):
    """Encode a single puzzle as a 1 x N float row vector.

    Concatenates the one-hot encodings of the puzzle's choices, then
    reshapes the result into a single-row matrix.
    """
    choices, _ = puzzle
    encoding = []
    for choice in choices:
        encoding.extend(one_hot(str(choice), vocab))
    return cudaify(FloatTensor(encoding).view(1, -1))
def compare_tensors(self, t1, t2):
    """Assert that two tensors have the same shape and near-equal values."""
    lhs = cudaify(t1)
    rhs = cudaify(t2)
    assert lhs.shape == rhs.shape
    assert torch.allclose(lhs, rhs)