def test_backward(self):
    """Run two SGD steps and check which parameter slices received gradient.

    After the optimisation steps the test inspects the gradients of the
    model's ``_D``, ``_W`` and ``_O`` parameters: a slice is expected to
    have a non-zero gradient sum only if its index took part in the batch.
    """
    cost_func = NegativeSampling()
    optimizer = torch.optim.SGD(self.model.parameters(), lr=0.001)
    for _ in range(2):
        x = self.model.forward(
            self.context_ids, self.doc_ids, self.target_noise_ids)
        x = cost_func.forward(x)
        self.model.zero_grad()
        x.backward()
        optimizer.step()

    # Row 0 of the document matrix must be untouched, rows 1 and 2 must
    # have been updated (presumably doc 0 is absent from self.doc_ids --
    # the fixture is not visible here, confirm against setUp).
    # The non-zero checks compare against 0: comparing the sum against 1 or
    # 2 would pass for practically any gradient and verify nothing.
    self.assertEqual(torch.sum(self.model._D.grad[0, :].data), 0)
    self.assertNotEqual(torch.sum(self.model._D.grad[1, :].data), 0)
    self.assertNotEqual(torch.sum(self.model._D.grad[2, :].data), 0)

    context_ids = self.context_ids.numpy().flatten()
    target_noise_ids = self.target_noise_ids.numpy().flatten()

    # NOTE(review): only word ids 0..10 are checked; confirm this covers the
    # whole vocabulary of the fixture model.
    for word_id in range(11):
        # _W is indexed by word id on its first axis.
        if word_id in context_ids:
            self.assertNotEqual(
                torch.sum(self.model._W.grad[word_id, :].data), 0)
        else:
            self.assertEqual(
                torch.sum(self.model._W.grad[word_id, :].data), 0)

        # _O is indexed by word id on its second axis.
        if word_id in target_noise_ids:
            self.assertNotEqual(
                torch.sum(self.model._O.grad[:, word_id].data), 0)
        else:
            self.assertEqual(
                torch.sum(self.model._O.grad[:, word_id].data), 0)
def test_backward(self):
    """Take two SGD steps, then verify where gradient did and did not flow.

    Checks the gradient sums of rows 0-2 of ``_D`` and, for word ids 0..14,
    the matching row of ``_W`` and column of ``_O``.
    """
    loss_fn = NegativeSampling()
    sgd = torch.optim.SGD(self.model.parameters(), lr=0.001)

    for _step in range(2):
        scores = self.model.forward(
            self.context_ids, self.doc_ids, self.target_noise_ids)
        batch_loss = loss_fn.forward(scores)
        self.model.zero_grad()
        batch_loss.backward()
        sgd.step()

    # Document matrix: row 0 must have zero gradient, rows 1 and 2 must not.
    self.assertEqual(torch.sum(self.model._D.grad[0, :].data), 0)
    for doc_row in (1, 2):
        self.assertNotEqual(
            torch.sum(self.model._D.grad[doc_row, :].data), 0)

    seen_as_context = self.context_ids.numpy().flatten()
    seen_as_output = self.target_noise_ids.numpy().flatten()

    for word_id in range(15):
        # Pick the assertion first, then evaluate the gradient sum, so the
        # evaluation order matches a plain if/else on membership.
        if word_id in seen_as_context:
            check_w = self.assertNotEqual
        else:
            check_w = self.assertEqual
        check_w(torch.sum(self.model._W.grad[word_id, :].data), 0)

        if word_id in seen_as_output:
            check_o = self.assertNotEqual
        else:
            check_o = self.assertEqual
        check_o(torch.sum(self.model._O.grad[:, word_id].data), 0)
class NegativeSamplingTest(TestCase):
    """Smoke test for the ``NegativeSampling`` loss."""

    def setUp(self):
        """Create the loss object used by the tests."""
        self.loss_f = NegativeSampling()

    def test_forward(self):
        """The loss of a small, fixed score matrix must be non-negative."""
        # todo: test actual value
        scores = torch.FloatTensor([[12.1, 1.3, 6.5],
                                    [18.9, 2.1, 9.4]])
        loss = self.loss_f.forward(scores)
        # Read the value with .item(): indexing a 0-dim tensor with [0]
        # raises IndexError on current PyTorch releases.
        # Assumes forward() returns a single-element tensor -- TODO confirm.
        self.assertGreaterEqual(loss.item(), 0)
def _run(data_processor, data_file_name, dataset, data_generator, num_batches,
         vocabulary_size, number_examples, context_size, num_noise_words,
         vec_dim, num_epochs, batch_size, lr, model_ver, vec_combine_method,
         save_all):
    """Train a DistributedMemory model with negative sampling and save it.

    For every epoch, ``num_batches`` batches are pulled from
    ``data_generator``; each one goes through a forward pass, the
    NegativeSampling loss, a backward pass and an Adam step, with the
    duration of each phase printed. At the end of the epoch the mean batch
    loss is computed and a checkpoint is written under ``MODELS_DIR``.

    Args:
        data_processor: Not used by this function.
        data_file_name: Source data file name; its last four characters are
            dropped when building the checkpoint file name (presumably the
            extension -- confirm with the caller).
        dataset: Only ``len(dataset)`` is used, as the number of documents.
        data_generator: Iterator; each ``next()`` must yield a batch with
            ``context_ids``, ``doc_ids`` and ``target_noise_ids`` attributes.
        num_batches: Number of batches consumed per epoch.
        vocabulary_size: Passed to the model as ``num_words``.
        number_examples: Passed to the progress bar as ``total_examples``.
        context_size: Only used in the checkpoint file name.
        num_noise_words: Only used in the checkpoint file name.
        vec_dim: Embedding dimension passed to the model; also part of the
            checkpoint file name.
        num_epochs: Number of epochs to run.
        batch_size: Passed to the progress bar and the checkpoint file name.
        lr: Learning rate for Adam; also part of the checkpoint file name.
        model_ver: Only used in the checkpoint file name.
        vec_combine_method: Only used in the checkpoint file name.
        save_all: If true, a checkpoint is written after every epoch.
            Otherwise a checkpoint is written only when the epoch loss
            improves on the best so far, and the previously written
            checkpoint is deleted first.

    Benchmark notes kept from the original author:

        Averagely, the time consumption (max_generated_batches = 5):
          CPU: backward time: 600~650 ms, sampling time: 1 ms,
               forward time: 5~7 ms
          GPU: backward time: 3 ms, sampling time: 72 ms,
               forward time: 1~2 ms
          Should rewrite sampling to speed up on GPU.

        DocTag2Vec on CPU: 121882 words/s, 8 workers;
          processing one document time = 650~850 ms;
          training on 173403030 raw words (68590824 effective words)
          took 646.2s, 106138 effective words/s.

        Data generation -- the major bottleneck is still generation, maybe
        due to the lock:
          GPU (Desktop):
            generating batch time: 1200~2001 ms,
              (1508425387839, 1508425389840)
            transfer batch to Torch: 1 ms, (1508425389840, 1508425389841)
            #worker = 1: 300~600 words/s
            #worker = 8: 600~4000 words/s (around 2500 often)
            After changing to torch.sampler, getting worse; data-prepare
            time is not stable.
          CPU (Mac), #worker = 8:
            generating batch time: 1200~1527 ms,
              (1508424953768, 1508424955295)
            transfer batch to Torch: 1 ms, (1508424955295, 1508424955296)
            Generating one example time: 2~5 ms,
              (1508458881118, 1508458881122)
            Generating one document time: 50~400 ms,
              (1508458881118, 1508458881122)
            Generating one batch time: 650~700 ms,
              (1508458880690, 1508458881122)
            After changing to torch.sampler:
              Generating one example time: 4~7 ms
              Generating one batch time: 900~1200 ms
    """
    model = DistributedMemory(
        vec_dim, num_docs=len(dataset), num_words=vocabulary_size)
    cost_func = NegativeSampling()
    optimizer = Adam(params=model.parameters(), lr=lr)
    logger = logging.getLogger('root')

    if torch.cuda.is_available():
        model.cuda()
        logger.info("Running on GPU - CUDA")
    else:
        logger.info("Running on CPU")

    logger.info("Dataset comprised of {:d} documents.".format(len(dataset)))
    logger.info("Vocabulary size is {:d}.\n".format(vocabulary_size))
    logger.info("Training started.")

    best_loss = float_info.max
    prev_model_file_path = ""
    progbar = Progbar(num_batches, batch_size=batch_size,
                      total_examples=number_examples)

    for epoch_i in range(num_epochs):
        epoch_start_time = time.time()
        loss = []

        for batch_i in range(num_batches):
            start_time = current_milli_time()
            batch = next(data_generator)
            current_time = current_milli_time()
            print('data-prepare time: %d ms' %
                  (round(current_time - start_time)))

            start_time = current_milli_time()
            x = model.forward(
                batch.context_ids, batch.doc_ids, batch.target_noise_ids)
            x = cost_func.forward(x)
            # .item() extracts the Python number from a single-element
            # tensor; `x.data[0]` raises IndexError on a 0-dim loss tensor
            # with current PyTorch releases.
            loss.append(x.item())
            print('forward time: %d ms' %
                  round(current_milli_time() - start_time))

            start_time = current_milli_time()
            model.zero_grad()
            x.backward()
            optimizer.step()
            print('backward time: %d ms' %
                  round(current_milli_time() - start_time))

            progbar.update(epoch_i, batch_i)

        # end of epoch
        loss = torch.mean(torch.FloatTensor(loss))
        is_best_loss = loss < best_loss
        best_loss = min(loss, best_loss)
        progbar.update(epoch_i, batch_i,
                       [('loss', loss), ('best_loss', best_loss)])

        model_file_name = MODEL_NAME.format(
            data_file_name[:-4],
            model_ver,
            vec_combine_method,
            context_size,
            num_noise_words,
            vec_dim,
            batch_size,
            lr,
            epoch_i + 1,
            loss)
        model_file_path = join(MODELS_DIR, model_file_name)
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(MODELS_DIR, exist_ok=True)

        state = {
            'epoch': epoch_i + 1,
            'model_state_dict': model.state_dict(),
            'best_loss': best_loss,
            'optimizer_state_dict': optimizer.state_dict()
        }

        if save_all:
            torch.save(state, model_file_path)
        elif is_best_loss:
            # Keep only the best checkpoint on disk. On the first save the
            # previous path is "", so the removal fails and is ignored.
            try:
                remove(prev_model_file_path)
            except FileNotFoundError:
                pass
            torch.save(state, model_file_path)
            prev_model_file_path = model_file_path

        epoch_total_time = round(time.time() - epoch_start_time)
        logger.info(" ({:d}s) - loss: {:.4f}".format(epoch_total_time, loss))
def _run(data_processor, data_file_name, dataset, data_generator, num_batches,
         vocabulary_size, number_examples, context_size, num_noise_words,
         vec_dim, num_epochs, batch_size, lr, model_ver, vec_combine_method,
         save_all):
    """Run the DistributedMemory training loop and checkpoint each epoch.

    Every epoch consumes ``num_batches`` batches from ``data_generator``.
    Each batch is pushed through the model and the NegativeSampling loss,
    back-propagated and applied with Adam, and the time spent preparing
    data, in the forward pass and in the backward pass is printed. After
    the last batch the mean loss of the epoch is computed and a checkpoint
    may be written under ``MODELS_DIR``.

    Args:
        data_processor: Not used by this function.
        data_file_name: Source data file name; the last four characters are
            cut off for the checkpoint file name (presumably a file
            extension -- confirm with the caller).
        dataset: Only its length is used (number of documents).
        data_generator: Iterator yielding batches that expose
            ``context_ids``, ``doc_ids`` and ``target_noise_ids``.
        num_batches: Batches per epoch.
        vocabulary_size: Number of words given to the model.
        number_examples: Given to the progress bar as ``total_examples``.
        context_size: Appears only in the checkpoint file name.
        num_noise_words: Appears only in the checkpoint file name.
        vec_dim: Embedding dimension; also in the checkpoint file name.
        num_epochs: Epochs to train.
        batch_size: Given to the progress bar; also in the file name.
        lr: Adam learning rate; also in the checkpoint file name.
        model_ver: Appears only in the checkpoint file name.
        vec_combine_method: Appears only in the checkpoint file name.
        save_all: When true, save after every epoch. When false, save only
            on a new best epoch loss and delete the previous checkpoint.

    Benchmark notes kept from the original author:

        Averagely, the time consumption (max_generated_batches = 5):
          CPU: backward time: 600~650 ms, sampling time: 1 ms,
               forward time: 5~7 ms
          GPU: backward time: 3 ms, sampling time: 72 ms,
               forward time: 1~2 ms
          Should rewrite sampling to speed up on GPU.

        DocTag2Vec on CPU: 121882 words/s, 8 workers;
          processing one document time = 650~850 ms;
          training on 173403030 raw words (68590824 effective words)
          took 646.2s, 106138 effective words/s.

        Data generation -- the major bottleneck is still generation, maybe
        due to the lock:
          GPU (Desktop):
            generating batch time: 1200~2001 ms,
              (1508425387839, 1508425389840)
            transfer batch to Torch: 1 ms, (1508425389840, 1508425389841)
            #worker = 1: 300~600 words/s
            #worker = 8: 600~4000 words/s (around 2500 often)
            After changing to torch.sampler, getting worse; data-prepare
            time is not stable.
          CPU (Mac), #worker = 8:
            generating batch time: 1200~1527 ms,
              (1508424953768, 1508424955295)
            transfer batch to Torch: 1 ms, (1508424955295, 1508424955296)
            Generating one example time: 2~5 ms,
              (1508458881118, 1508458881122)
            Generating one document time: 50~400 ms,
              (1508458881118, 1508458881122)
            Generating one batch time: 650~700 ms,
              (1508458880690, 1508458881122)
            After changing to torch.sampler:
              Generating one example time: 4~7 ms
              Generating one batch time: 900~1200 ms
    """
    model = DistributedMemory(
        vec_dim,
        num_docs=len(dataset),
        num_words=vocabulary_size)
    cost_func = NegativeSampling()
    optimizer = Adam(params=model.parameters(), lr=lr)
    logger = logging.getLogger('root')

    if torch.cuda.is_available():
        model.cuda()
        logger.info("Running on GPU - CUDA")
    else:
        logger.info("Running on CPU")

    logger.info("Dataset comprised of {:d} documents.".format(len(dataset)))
    logger.info("Vocabulary size is {:d}.\n".format(vocabulary_size))
    logger.info("Training started.")

    best_loss = float_info.max
    prev_model_file_path = ""
    progbar = Progbar(num_batches, batch_size=batch_size,
                      total_examples=number_examples)

    for epoch_i in range(num_epochs):
        epoch_start_time = time.time()
        loss = []

        for batch_i in range(num_batches):
            # Phase 1: fetch the next batch.
            start_time = current_milli_time()
            batch = next(data_generator)
            current_time = current_milli_time()
            print('data-prepare time: %d ms' %
                  (round(current_time - start_time)))

            # Phase 2: forward pass and loss.
            start_time = current_milli_time()
            x = model.forward(
                batch.context_ids,
                batch.doc_ids,
                batch.target_noise_ids)
            x = cost_func.forward(x)
            # Use .item() rather than `x.data[0]`: indexing a 0-dim loss
            # tensor raises IndexError on current PyTorch releases.
            loss.append(x.item())
            print('forward time: %d ms' %
                  round(current_milli_time() - start_time))

            # Phase 3: backward pass and parameter update.
            start_time = current_milli_time()
            model.zero_grad()
            x.backward()
            optimizer.step()
            print('backward time: %d ms' %
                  round(current_milli_time() - start_time))

            progbar.update(epoch_i, batch_i)

        # end of epoch
        loss = torch.mean(torch.FloatTensor(loss))
        is_best_loss = loss < best_loss
        best_loss = min(loss, best_loss)
        progbar.update(epoch_i, batch_i,
                       [('loss', loss), ('best_loss', best_loss)])

        model_file_name = MODEL_NAME.format(
            data_file_name[:-4],
            model_ver,
            vec_combine_method,
            context_size,
            num_noise_words,
            vec_dim,
            batch_size,
            lr,
            epoch_i + 1,
            loss)
        model_file_path = join(MODELS_DIR, model_file_name)
        # Create the output directory if needed; exist_ok removes the race
        # between a separate existence check and the creation.
        os.makedirs(MODELS_DIR, exist_ok=True)

        state = {
            'epoch': epoch_i + 1,
            'model_state_dict': model.state_dict(),
            'best_loss': best_loss,
            'optimizer_state_dict': optimizer.state_dict()
        }

        if save_all:
            torch.save(state, model_file_path)
        elif is_best_loss:
            # Replace the previous best checkpoint. The initial path is "",
            # for which the removal fails and is deliberately ignored.
            try:
                remove(prev_model_file_path)
            except FileNotFoundError:
                pass
            torch.save(state, model_file_path)
            prev_model_file_path = model_file_path

        epoch_total_time = round(time.time() - epoch_start_time)
        logger.info(" ({:d}s) - loss: {:.4f}".format(epoch_total_time, loss))
def _run(data_file_name, dataset, data_generator, num_batches,
         vocabulary_size, context_size, num_noise_words, vec_dim,
         num_epochs, batch_size, lr, model_ver, vec_combine_method,
         save_all, generate_plot, model_ver_is_dbow):
    """Train a DBOW or DM model with negative sampling, saving per epoch.

    ``model_ver_is_dbow`` selects the model class and the inputs fed to its
    forward pass. Each epoch draws ``num_batches`` batches from
    ``data_generator``, optimises with Adam, then hands the epoch's state
    to ``save_training_state`` and prints the elapsed time and mean loss.
    """
    model_cls = DBOW if model_ver_is_dbow else DM
    model = model_cls(
        vec_dim, num_docs=len(dataset), num_words=vocabulary_size)
    cost_func = NegativeSampling()
    optimizer = Adam(params=model.parameters(), lr=lr)

    if torch.cuda.is_available():
        model.cuda()

    print("Dataset comprised of {:d} documents.".format(len(dataset)))
    print("Vocabulary size is {:d}.\n".format(vocabulary_size))
    print("Training started.")

    best_loss = float("inf")
    prev_model_file_path = None

    for epoch_i in range(num_epochs):
        epoch_started_at = time.time()
        batch_losses = []

        for batch_i in range(num_batches):
            batch = next(data_generator)
            if torch.cuda.is_available():
                batch.cuda_()

            # DBOW takes no context words; DM takes them as first input.
            if model_ver_is_dbow:
                inputs = (batch.doc_ids, batch.target_noise_ids)
            else:
                inputs = (batch.context_ids,
                          batch.doc_ids,
                          batch.target_noise_ids)

            scores = model.forward(*inputs)
            batch_loss = cost_func.forward(scores)
            batch_losses.append(batch_loss.item())

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()
            _print_progress(epoch_i, batch_i, num_batches)

        # end of epoch
        epoch_loss = torch.mean(torch.FloatTensor(batch_losses))
        is_best_loss = epoch_loss < best_loss
        best_loss = min(epoch_loss, best_loss)

        state = dict(
            epoch=epoch_i + 1,
            model_state_dict=model.state_dict(),
            best_loss=best_loss,
            optimizer_state_dict=optimizer.state_dict(),
        )

        prev_model_file_path = save_training_state(
            data_file_name,
            model_ver,
            vec_combine_method,
            context_size,
            num_noise_words,
            vec_dim,
            batch_size,
            lr,
            epoch_i,
            epoch_loss,
            state,
            save_all,
            generate_plot,
            is_best_loss,
            prev_model_file_path,
            model_ver_is_dbow)

        epoch_total_time = round(time.time() - epoch_started_at)
        print(" ({:d}s) - loss: {:.4f}".format(epoch_total_time, epoch_loss))
def _run(data_file_name, dataset, data_generator, num_batches,
         vocabulary_size, context_size, num_noise_words, vec_dim,
         num_epochs, batch_size, lr, model_ver, vec_combine_method,
         save_all):
    """Train a DistributedMemory model with Nesterov SGD and checkpoint it.

    Each epoch pulls ``num_batches`` batches from ``data_generator``, runs
    forward pass, NegativeSampling loss, backward pass and an optimizer
    step per batch, then computes the mean batch loss and may write a
    checkpoint under ``MODELS_DIR``.

    Args:
        data_file_name: Source data file name; its last four characters are
            dropped for the checkpoint file name (presumably the file
            extension -- confirm with the caller).
        dataset: Only ``len(dataset)`` is used, as the number of documents.
        data_generator: Iterator yielding batches with ``context_ids``,
            ``doc_ids`` and ``target_noise_ids`` attributes.
        num_batches: Batches consumed per epoch.
        vocabulary_size: Passed to the model as ``num_words``.
        context_size: Only used in the checkpoint file name.
        num_noise_words: Only used in the checkpoint file name.
        vec_dim: Embedding dimension; also part of the checkpoint name.
        num_epochs: Number of epochs to run.
        batch_size: Only used in the checkpoint file name.
        lr: SGD learning rate; also part of the checkpoint file name.
        model_ver: Only used in the checkpoint file name.
        vec_combine_method: Only used in the checkpoint file name.
        save_all: If true, save after every epoch; otherwise save only when
            the epoch loss is a new best, deleting the previous checkpoint.
    """
    model = DistributedMemory(
        vec_dim,
        num_docs=len(dataset),
        num_words=vocabulary_size)

    cost_func = NegativeSampling()
    optimizer = SGD(model.parameters(), lr=lr, momentum=0.9, nesterov=True)

    if torch.cuda.is_available():
        model.cuda()

    print("Dataset comprised of {:d} documents.".format(len(dataset)))
    print("Vocabulary size is {:d}.\n".format(vocabulary_size))
    print("Training started.")

    best_loss = float_info.max
    prev_model_file_path = ""

    for epoch_i in range(num_epochs):
        epoch_start_time = time.time()
        loss = []

        for batch_i in range(num_batches):
            batch = next(data_generator)
            x = model.forward(
                batch.context_ids,
                batch.doc_ids,
                batch.target_noise_ids)

            x = cost_func.forward(x)
            # .item() extracts the Python number from a single-element
            # tensor; `x.data[0]` raises IndexError on a 0-dim loss tensor
            # with current PyTorch releases.
            loss.append(x.item())

            model.zero_grad()
            x.backward()
            optimizer.step()
            _print_progress(epoch_i, batch_i, num_batches)

        # end of epoch
        loss = torch.mean(torch.FloatTensor(loss))
        is_best_loss = loss < best_loss
        best_loss = min(loss, best_loss)

        model_file_name = MODEL_NAME.format(
            data_file_name[:-4],
            model_ver,
            vec_combine_method,
            context_size,
            num_noise_words,
            vec_dim,
            batch_size,
            lr,
            epoch_i + 1,
            loss)
        model_file_path = join(MODELS_DIR, model_file_name)
        state = {
            'epoch': epoch_i + 1,
            'model_state_dict': model.state_dict(),
            'best_loss': best_loss,
            'optimizer_state_dict': optimizer.state_dict()
        }

        if save_all:
            torch.save(state, model_file_path)
        elif is_best_loss:
            # Keep only the best checkpoint on disk. On the first save the
            # previous path is "", so the removal fails and is ignored.
            try:
                remove(prev_model_file_path)
            except FileNotFoundError:
                pass
            torch.save(state, model_file_path)
            prev_model_file_path = model_file_path

        epoch_total_time = round(time.time() - epoch_start_time)
        print(" ({:d}s) - loss: {:.4f}".format(epoch_total_time, loss))
def setUp(self):
    """Attach a new NegativeSampling instance to the test case as ``loss_f``."""
    loss_function = NegativeSampling()
    self.loss_f = loss_function