Example #1
class Word2VecTrainer:
    def __init__(self,
                 input_file,
                 output_file,
                 emb_dimension=10,
                 batch_size=32,
                 window_size=5,
                 iterations=3,
                 initial_lr=0.001,
                 min_count=12):

        self.data = DataReader(input_file, min_count)
        dataset = Word2vecDataset(self.data, window_size)
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=0,
                                     collate_fn=dataset.collate)

        self.output_file_name = output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()

    def train(self):

        for iteration in range(self.iterations):

            print("\n\n\nIteration: " + str(iteration + 1))
            optimizer = optim.SparseAdam(self.skip_gram_model.parameters(),
                                         lr=self.initial_lr)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, len(self.dataloader))

            running_loss = 0.0
            for i, sample_batched in enumerate(tqdm(self.dataloader)):

                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)

                    optimizer.zero_grad()
                    loss = self.skip_gram_model(pos_u, pos_v, neg_v)
                    loss.backward()
                    optimizer.step()
                    scheduler.step()

                    running_loss = running_loss * 0.9 + loss.item() * 0.1

            print(" Loss: " + str(running_loss))
            self.skip_gram_model.save_embedding(self.data.id2word,
                                                self.output_file_name)
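
A minimal usage sketch for Example #1. The imports below are the external dependencies the class relies on; the commented project imports and the file paths are assumptions, since DataReader, Word2vecDataset and SkipGramModel come from the surrounding project.

# Hypothetical driver for the trainer above; paths are placeholders.
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm
# from data_reader import DataReader, Word2vecDataset  # assumed project modules
# from model import SkipGramModel

if __name__ == '__main__':
    w2v = Word2VecTrainer(input_file="corpus.txt",       # assumed: one sentence per line
                          output_file="embeddings.vec",
                          emb_dimension=10,
                          iterations=3)
    w2v.train()
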
Example #2
class Word2VecTrainer:
    def __init__(self,
                 input_file,
                 output_file,
                 emb_dimension=100,
                 batch_size=32,
                 window_size=5,
                 iterations=3,
                 initial_lr=0.001,
                 min_count=12,
                 reg=None,
                 display=False,
                 end_of_step=None):
        self.data = DataReader(input_file, min_count)
        dataset = Word2vecDataset(self.data, window_size)
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=0,
                                     collate_fn=dataset.collate)

        self.output_file_name = output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.iterations = iterations
        self.current_iteration = 0
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.reg = reg
        self.history = {'main': []}
        self.display = display
        self.end_of_step = end_of_step

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()

        self.word2vec = {}

    def _update_word2vec_dict(self):
        # Copy one centre-word vector per vocabulary word into self.word2vec,
        # doing the lookups on the CPU, then move the layer back to the device.
        u_embeddings = self.skip_gram_model.u_embeddings.cpu()
        words = self.data.words

        for word in words:
            wid = self.data.word2id[word]
            v = u_embeddings(torch.LongTensor([wid])).detach().numpy()[0]
            self.word2vec[word] = v

        self.skip_gram_model.u_embeddings.to(self.device)

    def _display_progress(self, dots_0=150, dots_1=30):
        # Plot the loss history: the full run downsampled to `dots_0` points
        # (top axis) and the last `dots_1` raw values (bottom axis).
        fig, ax = plt.subplots(2, 1, figsize=(20, 10))
        ax[0].title.set_text('Iteration: {}'.format(self.current_iteration +
                                                    1))
        n = len(self.history['main'])
        d = max(n // dots_0, 1)
        for key in self.history:
            p_0 = [
                np.mean(self.history[key][i * d:(i + 1) * d])
                for i in range(min(dots_0, n))
            ]
            p_1 = self.history[key][-dots_1:]
            ax[0].plot(p_0)
            ax[1].plot(p_1)
        ax[0].legend(self.history.keys())
        ax[1].legend(self.history.keys())
        plt.show()
        clear_output(True)

    def train(self):
        for iteration in range(self.current_iteration, self.iterations):
            optimizer = optim.SparseAdam(self.skip_gram_model.parameters(),
                                         lr=self.initial_lr)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, len(self.dataloader))

            running_loss = 0.0
            for i, sample_batched in enumerate(self.dataloader):
                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)

                    optimizer.zero_grad()
                    loss = self.skip_gram_model(pos_u, pos_v, neg_v)
                    self.history['main'].append(loss.item())
                    if self.reg:
                        loss += self.reg(self, pos_u, pos_v, neg_v)
                    loss.backward()
                    optimizer.step()
                    scheduler.step()

                    running_loss = running_loss * 0.9 + loss.item() * 0.1

                    if self.end_of_step:
                        self.end_of_step(i)
            if self.display:
                self._display_progress()
            else:
                print("Iteration: {}, Loss: {}".format(iteration,
                                                       running_loss))

            self.skip_gram_model.save_embedding(self.data.id2word,
                                                self.output_file_name)
            self.current_iteration += 1
        self._update_word2vec_dict()
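
A sketch of how the extra hooks in Example #2 (reg, end_of_step, display) might be wired up. The regularizer and callback below are illustrative assumptions, not part of the original code; they only rely on the call signatures visible in train() and on the u_embeddings attribute used in _update_word2vec_dict().

# Hypothetical hooks for the trainer above.
def l2_reg(trainer, pos_u, pos_v, neg_v, weight=1e-4):
    # Small L2 penalty on the centre-word embeddings of the current batch;
    # train() adds the returned value to the skip-gram loss.
    emb = trainer.skip_gram_model.u_embeddings(pos_u)
    return weight * emb.pow(2).sum()

def log_step(step):
    # Called as end_of_step(i) after every processed batch.
    if step > 0 and step % 1000 == 0:
        print("finished step", step)

trainer = Word2VecTrainer("corpus.txt", "embeddings.vec",
                          reg=l2_reg, end_of_step=log_step, display=False)
trainer.train()
print(len(trainer.word2vec), "vectors collected")  # filled by _update_word2vec_dict()
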
Example #3
class Word2Vec:
    def __init__(self,
                 log_filename: str,
                 output_filename: str,
                 embedding_dimension: int = 100,
                 batch_size: int = 128,
                 iteration: int = 1,
                 initial_lr: float = 0.025,
                 min_count: int = 5,
                 sub_sampling_t: float = 1e-5,
                 neg_sampling_t: float = 0.75,
                 neg_sample_count: int = 5,
                 half_window_size: int = 2,
                 read_data_method: str = 'memory'):
        """
        init func

        """
        self.data = DataHanlder(log_filename=log_filename,
                                batch_size=batch_size,
                                min_count=min_count,
                                sub_sampling_t=sub_sampling_t,
                                neg_sampling_t=neg_sampling_t,
                                neg_sample_count=neg_sample_count,
                                half_window_size=half_window_size,
                                read_data_method=read_data_method)
        self.output_filename = output_filename
        self.embedding_dimension = embedding_dimension
        self.batch_size = batch_size
        self.half_window_size = half_window_size
        self.iter = iteration
        self.initial_lr = initial_lr
        self.sg_model = SkipGramModel(len(self.data.vocab),
                                      self.embedding_dimension)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.sg_model.cuda()
        self.optimizer = optim.SGD(self.sg_model.parameters(),
                                   lr=self.initial_lr)

    def train(self):
        i = 0
        # Approximate number of (centre, context) pairs: each word contributes
        # up to 2 * half_window_size pairs, minus the pairs lost at sentence
        # boundaries ((1 + 2 + ... + half_window_size) * 2 per sentence).
        # This is only a rough estimate, used to schedule the lr decay below.
        approx_pair = 2 * self.half_window_size * self.data.total_word_count - \
                      (1 + self.half_window_size) * self.half_window_size * self.data.sentence_len
        batch_count = self.iter * approx_pair / self.batch_size
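        # Hypothetical worked example of the estimate above (numbers invented):
        # with half_window_size=2, total_word_count=1,000,000 and
        # sentence_len=50,000, approx_pair = 2*2*1,000,000 - 3*2*50,000
        # = 3,700,000 pairs, so batch_count ~ 3,700,000 * iter / batch_size.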
        for pos_u, pos_v, neg_samples in self.data.gen_batch():
            i += 1
            if self.data.sentence_cursor > self.data.sentence_len * self.iter:
                # The corpus has been scanned self.iter times: stop training.
                break
            pos_u = torch.LongTensor(pos_u)
            pos_v = torch.LongTensor(pos_v)
            neg_v = torch.LongTensor(neg_samples)
            if self.use_cuda:
                pos_u, pos_v, neg_v = [t.cuda() for t in (pos_u, pos_v, neg_v)]

            self.optimizer.zero_grad()
            loss = self.sg_model(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            if i % 100 == 0:
                print("step: %d, Loss: %0.8f, lr: %0.6f" %
                      (i, loss.item(), self.optimizer.param_groups[0]['lr']))
            if i % (100000 // self.batch_size) == 0:
                # Linearly decay the learning rate over the estimated number of
                # batches, floored at 1e-4 of the initial value so it never
                # goes negative once i exceeds the approximate batch_count.
                lr = self.initial_lr * max(1e-4, 1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr

        self.sg_model.save_embedding(self.data.id2word, self.output_filename,
                                     self.use_cuda)
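
An illustrative call for Example #3; the file names and hyperparameters are placeholders, and DataHanlder / SkipGramModel are again assumed to come from the surrounding project.

# Hypothetical usage of the SGD-based Word2Vec class above.
w2v = Word2Vec(log_filename="corpus.txt",
               output_filename="embeddings.vec",
               embedding_dimension=100,
               half_window_size=2,
               read_data_method='memory')  # default shown in __init__
w2v.train()
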
Example #4
class Word2VecTrainer:
    def __init__(self,
                 input_file,
                 output_file,
                 emb_dimension=100,
                 batch_size=32,
                 window_size=5,
                 iterations=3,
                 initial_lr=0.001,
                 min_count=12,
                 num_workers=0,
                 collate_fn='custom',
                 iprint=500,
                 t=1e-3,
                 ns_exponent=0.75,
                 optimizer='adam',
                 optimizer_kwargs=None,
                 warm_start_model=None,
                 lr_schedule=True,
                 sparse=True):

        self.data = DataReader(input_file,
                               min_count,
                               t=t,
                               ns_exponent=ns_exponent)
        dataset = Word2vecDataset(self.data, window_size)
        if collate_fn == 'custom':
            collate_fn = dataset.collate
        else:
            collate_fn = None
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=num_workers,
                                     collate_fn=collate_fn,
                                     worker_init_fn=dataset.worker_init_fn)

        self.output_file_name = output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.iprint = iprint
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size,
                                             self.emb_dimension,
                                             sparse=sparse)

        if warm_start_model is not None:
            self.skip_gram_model.load_state_dict(torch.load(warm_start_model),
                                                 strict=False)
        self.optimizer = optimizer
        if optimizer_kwargs is None:
            optimizer_kwargs = {}
        self.optimizer_kwargs = optimizer_kwargs
        self.lr_schedule = lr_schedule
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()

    def train(self):
        if self.optimizer == 'adam':
            optimizer = optim.Adam(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr,
                                   **self.optimizer_kwargs)
        elif self.optimizer == 'sparse_adam':
            optimizer = optim.SparseAdam(self.skip_gram_model.parameters(),
                                         lr=self.initial_lr,
                                         **self.optimizer_kwargs)
        elif self.optimizer == 'sgd':
            optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                  lr=self.initial_lr,
                                  **self.optimizer_kwargs)
        elif self.optimizer == 'asgd':
            optimizer = optim.ASGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr,
                                   **self.optimizer_kwargs)
        elif self.optimizer == 'adagrad':
            optimizer = optim.Adagrad(self.skip_gram_model.parameters(),
                                      lr=self.initial_lr,
                                      **self.optimizer_kwargs)
        else:
            raise ValueError('Unknown optimizer: {}'.format(self.optimizer))

        for iteration in range(self.iterations):

            print("\n\n\nIteration: " + str(iteration + 1))

            if self.lr_schedule:
                scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                    optimizer, len(self.dataloader))
            running_loss = 0.0
            # Log roughly 20 times per epoch (this local value overrides the
            # iprint argument passed to __init__).
            iprint = max(len(self.dataloader) // 20, 1)
            for i, sample_batched in enumerate(tqdm(self.dataloader)):

                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)

                    optimizer.zero_grad()
                    loss = self.skip_gram_model(pos_u, pos_v, neg_v)
                    loss.backward()
                    optimizer.step()
                    if self.lr_schedule:
                        scheduler.step()

                    running_loss = running_loss * (
                        1 - 5 / iprint) + loss.item() * (5 / iprint)
                    if i > 0 and i % iprint == 0:
                        print(" Loss: " + str(running_loss) + ' lr: ' + str([
                            param_group['lr']
                            for param_group in optimizer.param_groups
                        ]))
            print(" Loss: " + str(running_loss))

            self.skip_gram_model.save_embedding(self.data.id2word,
                                                self.output_file_name)
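
A sketch of the optimizer-selection and warm-start workflow that Example #4 exposes; the checkpoint and output paths are placeholders.

# Hypothetical two-stage run: SparseAdam first, then a warm start with SGD.
import torch

first = Word2VecTrainer("corpus.txt", "run1.vec",
                        optimizer='sparse_adam', sparse=True)
first.train()
torch.save(first.skip_gram_model.state_dict(), "run1.pt")

resumed = Word2VecTrainer("corpus.txt", "run2.vec",
                          warm_start_model="run1.pt",  # loaded with strict=False
                          optimizer='sgd', lr_schedule=False, sparse=True)
resumed.train()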