    def test_backward(self):
        cost_func = NegativeSampling()
        optimizer = torch.optim.SGD(self.model.parameters(), lr=0.001)
        for _ in range(2):
            x = self.model.forward(
                self.context_ids, self.doc_ids, self.target_noise_ids)
            x = cost_func.forward(x)
            self.model.zero_grad()
            x.backward()
            optimizer.step()

        self.assertEqual(torch.sum(self.model._D.grad[0, :].data), 0)
        self.assertNotEqual(torch.sum(self.model._D.grad[1, :].data), 0)
        self.assertNotEqual(torch.sum(self.model._D.grad[2, :].data), 0)

        context_ids = self.context_ids.numpy().flatten()
        target_noise_ids = self.target_noise_ids.numpy().flatten()

        for word_id in range(11):
            if word_id in context_ids:
                self.assertNotEqual(
                    torch.sum(self.model._W.grad[word_id, :].data), 0)
            else:
                self.assertEqual(
                    torch.sum(self.model._W.grad[word_id, :].data), 0)

            if word_id in target_noise_ids:
                self.assertNotEqual(
                    torch.sum(self.model._O.grad[:, word_id].data), 0)
            else:
                self.assertEqual(
                    torch.sum(self.model._O.grad[:, word_id].data), 0)
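
The gradient checks above assume a Distributed Memory model exposing a document matrix _D, an input word matrix _W, and an output matrix _O, where only the rows/columns touched by a batch receive non-zero gradients. A minimal sketch of such a model, assuming forward returns one score per target/noise word; this is an illustration, not necessarily the package's actual implementation:

import torch
import torch.nn as nn


class DistributedMemorySketch(nn.Module):
    """Minimal PV-DM sketch consistent with the parameter names used in the test."""

    def __init__(self, vec_dim, num_docs, num_words):
        super().__init__()
        self._D = nn.Parameter(torch.randn(num_docs, vec_dim))   # document vectors
        self._W = nn.Parameter(torch.randn(num_words, vec_dim))  # input word vectors
        self._O = nn.Parameter(torch.randn(vec_dim, num_words))  # output (scoring) weights

    def forward(self, context_ids, doc_ids, target_noise_ids):
        # combine the document vector with the summed context word vectors
        x = self._D[doc_ids, :] + torch.sum(self._W[context_ids, :], dim=1)
        # score the target word (column 0) and the sampled noise words
        return torch.bmm(x.unsqueeze(1),
                         self._O[:, target_noise_ids].permute(1, 0, 2)).squeeze(1)

Because only indexed rows of _D and _W (and indexed columns of _O) participate in the forward pass, the remaining rows and columns keep zero gradients, which is exactly what the assertions above verify.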
Example #2
    def test_backward(self):
        cost_func = NegativeSampling()
        optimizer = torch.optim.SGD(self.model.parameters(), lr=0.001)
        for _ in range(2):
            x = self.model.forward(
                self.context_ids, self.doc_ids, self.target_noise_ids)
            x = cost_func.forward(x)
            self.model.zero_grad()
            x.backward()
            optimizer.step()

        self.assertEqual(torch.sum(self.model._D.grad[0, :].data), 0)
        self.assertNotEqual(torch.sum(self.model._D.grad[1, :].data), 0)
        self.assertNotEqual(torch.sum(self.model._D.grad[2, :].data), 0)

        context_ids = self.context_ids.numpy().flatten()
        target_noise_ids = self.target_noise_ids.numpy().flatten()

        for word_id in range(15):
            if word_id in context_ids:
                self.assertNotEqual(
                    torch.sum(self.model._W.grad[word_id, :].data), 0)
            else:
                self.assertEqual(
                    torch.sum(self.model._W.grad[word_id, :].data), 0)

            if word_id in target_noise_ids:
                self.assertNotEqual(
                    torch.sum(self.model._O.grad[:, word_id].data), 0)
            else:
                self.assertEqual(
                    torch.sum(self.model._O.grad[:, word_id].data), 0)
Example #3
class NegativeSamplingTest(TestCase):
    def setUp(self):
        self.loss_f = NegativeSampling()

    def test_forward(self):
        # todo: test actual value
        scores = torch.FloatTensor([[12.1, 1.3, 6.5], [18.9, 2.1, 9.4]])
        loss = self.loss_f.forward(scores)
        self.assertTrue(loss.data[0] >= 0)
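
For reference, the objective exercised here can be written as a small module. This is a sketch assuming the scores tensor has the target score in column 0 and the noise scores in the remaining columns; it is not necessarily the package's exact implementation:

import torch
import torch.nn as nn


class NegativeSamplingSketch(nn.Module):
    """Sketch of -mean(log sigmoid(s_target) + sum_k log sigmoid(-s_noise_k))."""

    def __init__(self):
        super().__init__()
        self._log_sigmoid = nn.LogSigmoid()

    def forward(self, scores):
        batch_size = scores.size(0)
        positive = self._log_sigmoid(scores[:, 0])                       # target scores
        negative = torch.sum(self._log_sigmoid(-scores[:, 1:]), dim=1)   # noise scores
        return -torch.sum(positive + negative) / batch_size              # always >= 0

Since log sigmoid is never positive, the negated mean is always non-negative, which is what test_forward asserts.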
Example #4
def _run(data_processor, data_file_name, dataset, data_generator, num_batches,
         vocabulary_size, number_examples, context_size, num_noise_words,
         vec_dim, num_epochs, batch_size, lr, model_ver, vec_combine_method,
         save_all):
    '''
    Average time consumption with max_generated_batches = 5:
        CPU:
            backward time: 600~650 ms
            sampling time: 1 ms
            forward time:  5~7 ms
        GPU:
            backward time: 3 ms
            sampling time: 72 ms
            forward time:  1~2 ms
    Sampling should be rewritten to speed it up on the GPU.

    DocTag2Vec on CPU, for comparison:
        121882 words/s with 8 workers
        processing one document takes 650~850 ms
        training on 173403030 raw words (68590824 effective words) took 646.2 s,
        i.e. 106138 effective words/s

    Data generation is still the major bottleneck, possibly because of the lock.
    The timestamp pairs are epoch milliseconds from current_milli_time (a sketch
    of that helper follows this example):
        GPU (Desktop):
            generating batch time: 1200~2001 ms, (1508425387839, 1508425389840)
            transfer batch to Torch: 1 ms, (1508425389840, 1508425389841)
            #workers = 1: 300~600 words/s
            #workers = 8: 600~4000 words/s (often around 2500)
            after switching to torch.sampler it got worse; data-prepare time is not stable
        CPU (Mac):
            #workers = 8:
                generating batch time: 1200~1527 ms, (1508424953768, 1508424955295)
                transfer batch to Torch: 1 ms, (1508424955295, 1508424955296)
                generating one example time: 2~5 ms, (1508458881118, 1508458881122)
                generating one document time: 50~400 ms, (1508458881118, 1508458881122)
                generating one batch time: 650~700 ms, (1508458880690, 1508458881122)
            after switching to torch.sampler:
                generating one example time: 4~7 ms
                generating one batch time: 900~1200 ms
    '''

    model = DistributedMemory(vec_dim,
                              num_docs=len(dataset),
                              num_words=vocabulary_size)

    cost_func = NegativeSampling()
    optimizer = Adam(params=model.parameters(), lr=lr)
    logger = logging.getLogger('root')

    if torch.cuda.is_available():
        model.cuda()
        logger.info("Running on GPU - CUDA")
    else:
        logger.info("Running on CPU")

    logger.info("Dataset comprised of {:d} documents.".format(len(dataset)))
    logger.info("Vocabulary size is {:d}.\n".format(vocabulary_size))
    logger.info("Training started.")

    best_loss = float_info.max
    prev_model_file_path = ""

    progbar = Progbar(num_batches,
                      batch_size=batch_size,
                      total_examples=number_examples)

    for epoch_i in range(num_epochs):
        epoch_start_time = time.time()
        loss = []

        for batch_i in range(num_batches):
            start_time = current_milli_time()
            batch = next(data_generator)
            current_time = current_milli_time()
            print('data-prepare time: %d ms' %
                  (round(current_time - start_time)))

            start_time = current_milli_time()
            x = model.forward(batch.context_ids, batch.doc_ids,
                              batch.target_noise_ids)
            x = cost_func.forward(x)
            loss.append(x.data[0])
            print('forward time: %d ms' %
                  round(current_milli_time() - start_time))

            start_time = current_milli_time()
            model.zero_grad()
            x.backward()
            optimizer.step()
            print('backward time: %d ms' %
                  round(current_milli_time() - start_time))

            progbar.update(epoch_i, batch_i)
            # _print_progress(epoch_i, batch_i, num_batches)

        # end of epoch
        loss = torch.mean(torch.FloatTensor(loss))
        is_best_loss = loss < best_loss
        best_loss = min(loss, best_loss)
        progbar.update(epoch_i, batch_i, [('loss', loss),
                                          ('best_loss', best_loss)])

        model_file_name = MODEL_NAME.format(data_file_name[:-4], model_ver,
                                            vec_combine_method, context_size,
                                            num_noise_words, vec_dim,
                                            batch_size, lr, epoch_i + 1, loss)
        model_file_path = join(MODELS_DIR, model_file_name)
        if not os.path.exists(MODELS_DIR):
            os.makedirs(MODELS_DIR)
        state = {
            'epoch': epoch_i + 1,
            'model_state_dict': model.state_dict(),
            'best_loss': best_loss,
            'optimizer_state_dict': optimizer.state_dict()
        }
        if save_all:
            torch.save(state, model_file_path)
        elif is_best_loss:
            try:
                remove(prev_model_file_path)
            except FileNotFoundError:
                pass
            torch.save(state, model_file_path)
            prev_model_file_path = model_file_path

        epoch_total_time = round(time.time() - epoch_start_time)
        logger.info(" ({:d}s) - loss: {:.4f}".format(epoch_total_time, loss))
Example #5
def _run(data_processor,
         data_file_name,
         dataset,
         data_generator,
         num_batches,
         vocabulary_size,
         number_examples,
         context_size,
         num_noise_words,
         vec_dim,
         num_epochs,
         batch_size,
         lr,
         model_ver,
         vec_combine_method,
         save_all):
    '''
    Average time consumption with max_generated_batches = 5:
        CPU:
            backward time: 600~650 ms
            sampling time: 1 ms
            forward time:  5~7 ms
        GPU:
            backward time: 3 ms
            sampling time: 72 ms
            forward time:  1~2 ms
    Sampling should be rewritten to speed it up on the GPU.

    DocTag2Vec on CPU, for comparison:
        121882 words/s with 8 workers
        processing one document takes 650~850 ms
        training on 173403030 raw words (68590824 effective words) took 646.2 s,
        i.e. 106138 effective words/s

    Data generation is still the major bottleneck, possibly because of the lock.
    The timestamp pairs are epoch milliseconds from current_milli_time:
        GPU (Desktop):
            generating batch time: 1200~2001 ms, (1508425387839, 1508425389840)
            transfer batch to Torch: 1 ms, (1508425389840, 1508425389841)
            #workers = 1: 300~600 words/s
            #workers = 8: 600~4000 words/s (often around 2500)
            after switching to torch.sampler it got worse; data-prepare time is not stable
        CPU (Mac):
            #workers = 8:
                generating batch time: 1200~1527 ms, (1508424953768, 1508424955295)
                transfer batch to Torch: 1 ms, (1508424955295, 1508424955296)
                generating one example time: 2~5 ms, (1508458881118, 1508458881122)
                generating one document time: 50~400 ms, (1508458881118, 1508458881122)
                generating one batch time: 650~700 ms, (1508458880690, 1508458881122)
            after switching to torch.sampler:
                generating one example time: 4~7 ms
                generating one batch time: 900~1200 ms
    '''

    model = DistributedMemory(
        vec_dim,
        num_docs=len(dataset),
        num_words=vocabulary_size)

    cost_func = NegativeSampling()
    optimizer = Adam(params=model.parameters(), lr=lr)
    logger = logging.getLogger('root')

    if torch.cuda.is_available():
        model.cuda()
        logger.info("Running on GPU - CUDA")
    else:
        logger.info("Running on CPU")

    logger.info("Dataset comprised of {:d} documents.".format(len(dataset)))
    logger.info("Vocabulary size is {:d}.\n".format(vocabulary_size))
    logger.info("Training started.")

    best_loss = float_info.max
    prev_model_file_path = ""

    progbar = Progbar(num_batches,
                      batch_size=batch_size,
                      total_examples=number_examples)

    for epoch_i in range(num_epochs):
        epoch_start_time = time.time()
        loss = []

        for batch_i in range(num_batches):
            start_time = current_milli_time()
            batch = next(data_generator)
            current_time = current_milli_time()
            print('data-prepare time: %d ms' % (round(current_time - start_time)))

            start_time = current_milli_time()
            x = model.forward(
                batch.context_ids,
                batch.doc_ids,
                batch.target_noise_ids)
            x = cost_func.forward(x)
            loss.append(x.data[0])
            print('forward time: %d ms' % round(current_milli_time() - start_time))

            start_time = current_milli_time()
            model.zero_grad()
            x.backward()
            optimizer.step()
            print('backward time: %d ms' % round(current_milli_time() - start_time))

            progbar.update(epoch_i, batch_i)
            # _print_progress(epoch_i, batch_i, num_batches)

        # end of epoch
        loss = torch.mean(torch.FloatTensor(loss))
        is_best_loss = loss < best_loss
        best_loss = min(loss, best_loss)
        progbar.update(epoch_i, batch_i, [('loss', loss), ('best_loss', best_loss)])

        model_file_name = MODEL_NAME.format(
            data_file_name[:-4],
            model_ver,
            vec_combine_method,
            context_size,
            num_noise_words,
            vec_dim,
            batch_size,
            lr,
            epoch_i + 1,
            loss)
        model_file_path = join(MODELS_DIR, model_file_name)
        if not os.path.exists(MODELS_DIR):
            os.makedirs(MODELS_DIR)
        state = {
            'epoch': epoch_i + 1,
            'model_state_dict': model.state_dict(),
            'best_loss': best_loss,
            'optimizer_state_dict': optimizer.state_dict()
        }
        if save_all:
            torch.save(state, model_file_path)
        elif is_best_loss:
            try:
                remove(prev_model_file_path)
            except FileNotFoundError:
                pass
            torch.save(state, model_file_path)
            prev_model_file_path = model_file_path

        epoch_total_time = round(time.time() - epoch_start_time)
        logger.info(" ({:d}s) - loss: {:.4f}".format(epoch_total_time, loss))
Example #6
def _run(data_file_name, dataset, data_generator, num_batches, vocabulary_size,
         context_size, num_noise_words, vec_dim, num_epochs, batch_size, lr,
         model_ver, vec_combine_method, save_all, generate_plot,
         model_ver_is_dbow):

    if model_ver_is_dbow:
        model = DBOW(vec_dim, num_docs=len(dataset), num_words=vocabulary_size)
    else:
        model = DM(vec_dim, num_docs=len(dataset), num_words=vocabulary_size)

    cost_func = NegativeSampling()
    optimizer = Adam(params=model.parameters(), lr=lr)

    if torch.cuda.is_available():
        model.cuda()

    print("Dataset comprised of {:d} documents.".format(len(dataset)))
    print("Vocabulary size is {:d}.\n".format(vocabulary_size))
    print("Training started.")

    best_loss = float("inf")
    prev_model_file_path = None

    for epoch_i in range(num_epochs):
        epoch_start_time = time.time()
        loss = []

        for batch_i in range(num_batches):
            batch = next(data_generator)
            if torch.cuda.is_available():
                batch.cuda_()

            if model_ver_is_dbow:
                x = model.forward(batch.doc_ids, batch.target_noise_ids)
            else:
                x = model.forward(batch.context_ids, batch.doc_ids,
                                  batch.target_noise_ids)

            x = cost_func.forward(x)

            loss.append(x.item())
            model.zero_grad()
            x.backward()
            optimizer.step()
            _print_progress(epoch_i, batch_i, num_batches)

        # end of epoch
        loss = torch.mean(torch.FloatTensor(loss))
        is_best_loss = loss < best_loss
        best_loss = min(loss, best_loss)

        state = {
            'epoch': epoch_i + 1,
            'model_state_dict': model.state_dict(),
            'best_loss': best_loss,
            'optimizer_state_dict': optimizer.state_dict()
        }

        prev_model_file_path = save_training_state(
            data_file_name, model_ver, vec_combine_method, context_size,
            num_noise_words, vec_dim, batch_size, lr, epoch_i, loss, state,
            save_all, generate_plot, is_best_loss, prev_model_file_path,
            model_ver_is_dbow)

        epoch_total_time = round(time.time() - epoch_start_time)
        print(" ({:d}s) - loss: {:.4f}".format(epoch_total_time, loss))
Example #7
def _run(data_file_name,
         dataset,
         data_generator,
         num_batches,
         vocabulary_size,
         context_size,
         num_noise_words,
         vec_dim,
         num_epochs,
         batch_size,
         lr,
         model_ver,
         vec_combine_method,
         save_all):

    model = DistributedMemory(
        vec_dim,
        num_docs=len(dataset),
        num_words=vocabulary_size)

    cost_func = NegativeSampling()
    optimizer = SGD(model.parameters(), lr=lr, momentum=0.9, nesterov=True)

    if torch.cuda.is_available():
        model.cuda()

    print("Dataset comprised of {:d} documents.".format(len(dataset)))
    print("Vocabulary size is {:d}.\n".format(vocabulary_size))
    print("Training started.")

    best_loss = float_info.max
    prev_model_file_path = ""

    for epoch_i in range(num_epochs):
        epoch_start_time = time.time()
        loss = []

        for batch_i in range(num_batches):
            batch = next(data_generator)
            x = model.forward(
                batch.context_ids,
                batch.doc_ids,
                batch.target_noise_ids)
            x = cost_func.forward(x)
            loss.append(x.data[0])
            model.zero_grad()
            x.backward()
            optimizer.step()
            _print_progress(epoch_i, batch_i, num_batches)

        # end of epoch
        loss = torch.mean(torch.FloatTensor(loss))
        is_best_loss = loss < best_loss
        best_loss = min(loss, best_loss)

        model_file_name = MODEL_NAME.format(
            data_file_name[:-4],
            model_ver,
            vec_combine_method,
            context_size,
            num_noise_words,
            vec_dim,
            batch_size,
            lr,
            epoch_i + 1,
            loss)
        model_file_path = join(MODELS_DIR, model_file_name)
        state = {
            'epoch': epoch_i + 1,
            'model_state_dict': model.state_dict(),
            'best_loss': best_loss,
            'optimizer_state_dict': optimizer.state_dict()
        }
        if save_all:
            torch.save(state, model_file_path)
        elif is_best_loss:
            try:
                remove(prev_model_file_path)
            except FileNotFoundError:
                pass
            torch.save(state, model_file_path)
            prev_model_file_path = model_file_path

        epoch_total_time = round(time.time() - epoch_start_time)
        print(" ({:d}s) - loss: {:.4f}".format(epoch_total_time, loss))
Example #8
    def setUp(self):
        self.loss_f = NegativeSampling()