Example #1
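All of the snippets below are class definitions taken from larger projects: they rely on module-level imports and on project-specific helpers such as InputData and SkipGramModel (and, in later examples, CBOW, InputVector, and evaluation utilities) that are not reproduced here. A minimal import header of the kind they assume, with the helper imports left as commented placeholders because the original module layout is not shown:

import torch
import torch.optim as optim
from torch.autograd import Variable
from tqdm import tqdm

# Project-specific helpers, assumed to be importable from the surrounding project:
# from input_data import InputData
# from model import SkipGramModel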
class Word2Vec:
    def __init__(self,
                 input_file_name,
                 output_file_name):
        self.min_count = 5
        self.emb_dimension = 100
        self.batch_size = 64
        self.window_size = 5
        self.iteration = 1
        self.initial_lr = 0.001
        self.data = InputData(input_file_name, self.min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension, self.batch_size, self.window_size,
                                             self.iteration, self.initial_lr, self.min_count)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(
            self.skip_gram_model.parameters(), lr=self.initial_lr)

    def train(self):
        """Multiple training.
        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = Variable(torch.LongTensor(pos_u))
            pos_v = Variable(torch.LongTensor(pos_v))
            neg_v = Variable(torch.LongTensor(neg_v))
            if self.use_cuda:
                pos_u = pos_u.cuda()
                pos_v = pos_v.cuda()
                neg_v = neg_v.cuda()

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description("Loss: %0.8f, lr: %0.6f" %
                                        (loss.item(),
                                         self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(
            self.data.id2word, self.output_file_name, self.use_cuda)
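A minimal usage sketch for this class, assuming a whitespace-tokenized corpus file and the project-local helpers shown above (the file names are placeholders):

w2v = Word2Vec(input_file_name='corpus.txt', output_file_name='embeddings.txt')
w2v.train()

Note that the learning rate decays linearly: roughly every 100000 pairs the loop resets it to initial_lr * (1 - i / batch_count), so halfway through training the rate is about half of initial_lr.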
Example #2
class Word2Vec:
    def __init__(self, input_file_name, output_file_name, emb_dimension=100, batch_size=50,
                 window_size=5, iteration=5, initial_lr=0.025, neg_num=5, min_count=5):

        self.data = InputData(input_file_name, min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.neg_num = neg_num
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(), lr=self.initial_lr)

    def train(self):

        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        count = int(batch_count) // 3
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)

            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, self.neg_num)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = Variable(torch.LongTensor(pos_u)).cuda()
            pos_v = Variable(torch.LongTensor(pos_v)).cuda()
            neg_v = Variable(torch.LongTensor(neg_v)).cuda()
            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description("Loss: %0.8f, lr: %0.6f" %
                                        (loss.item(),
                                         self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
            if i != 0 and i % count == 0:
                self.skip_gram_model.save_embedding(self.data.id2word, self.output_file_name + str(i))
        self.skip_gram_model.save_embedding(self.data.id2word, self.output_file_name + 'final')
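This variant checkpoints the embeddings during training: with count = batch_count // 3, the loop writes intermediate files at i = count and i = 2 * count, plus a final file at the end. For example, if batch_count works out to 3000, the run produces output_file_name + '1000', output_file_name + '2000', and output_file_name + 'final'.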
Example #3
class Word2Vec:
    def __init__(self,
                 input_file_name,
                 output_file_name,
                 emb_dimension=100,
                 batch_size=50,
                 window_size=5,
                 iteration=1,
                 initial_lr=0.025,
                 min_count=1):
        """Initilize class parameters.

        Args:
            input_file_name: Name of a text data from file. Each line is a sentence splited with space.
            output_file_name: Name of the final embedding file.
            emb_dimention: Embedding dimention, typically from 50 to 500.
            batch_size: The count of word pairs for one forward.
            window_size: Max skip length between words.
            iteration: Control the multiple training iterations.
            initial_lr: Initial learning rate.
            min_count: The minimal word frequency, words with lower frequency will be filtered.

        Returns:
            None.
        """
        self.data = InputData(input_file_name, min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)

    def train(self):
        """Multiple training.

        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        # self.skip_gram_model.save_embedding(
        #     self.data.id2word, 'begin_embedding.txt', self.use_cuda)
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = Variable(torch.LongTensor(pos_u))
            pos_v = Variable(torch.LongTensor(pos_v))
            neg_v = Variable(torch.LongTensor(neg_v))
            if self.use_cuda:
                pos_u = pos_u.cuda()
                pos_v = pos_v.cuda()
                neg_v = neg_v.cuda()

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name,
                                            self.use_cuda)
Example #4
class Word2Vec:
    def __init__(
        self,
        input_file_name,
        input_wvectors,
        input_cvectors,
        input_ps,
        input_ns,
        output_file_name,
        emb_dimension=100,
        batch_size=50,
        window_size=5,
        kn=20,
        iteration=1,
        initial_lr=0.001,
        clip=1.0,
        min_count=30,
        batch_num_to_valid=100000,
    ):
        """Initilize class parameters.

        Args:
            input_file_name: Name of a text data from file. Each line is a sentence splited with space.
            input_vectors: Pretrained vector
            input_psns: Pretrained positive sample & negative sample
            output_file_name: Name of the final embedding file.
            emb_dimention: Embedding dimention, typically from 50 to 500.
            batch_size: The count of word pairs for one forward.
            window_size: Max skip length between words.
            kn: k neighbors.
            iteration: Control the multiple training iterations.
            initial_lr: Initial learning rate.
            min_count: The minimal word frequency, words with lower frequency will be filtered.

        Returns:
            None.
        """
        self.data = InputData(input_file_name, min_count)
        self.pre_wvectors = InputVector(input_wvectors)
        self.pre_cvectors = InputVector(input_cvectors)
        self.ps_w = load_from_pkl(input_ps)
        self.ns_w = load_from_pkl(input_ns)
        self.ps = convert_word_to_id(self.ps_w, self.data.word2id)
        self.ns = convert_word_to_id(self.ns_w, self.data.word2id)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.kn = kn
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.clip = clip
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension,
                                             self.pre_wvectors,
                                             self.pre_cvectors)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)
        self.batch_num_to_valid = batch_num_to_valid

    def train(self, similarity_test_paths, synset_paths, analogy_paths):
        """Multiple training.

        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        # self.skip_gram_model.save_embedding(
        #     self.data.id2word, 'begin_embedding.txt', self.use_cuda)

        best_scores = dict()
        tmp_emb_dir = os.path.join(tempfile.gettempdir(), 'embedding')
        tmp_emb_path = os.path.join(
            tmp_emb_dir,
            ''.join(random.sample(string.ascii_letters + string.digits, 16)))

        for epoch in range(self.iteration):
            for i in process_bar:
                pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                      self.window_size)
                pos_u, mask_pos_u = self.data.get_ps_batch(
                    pos_pairs, self.ps, self.kn)
                neg_u, mask_neg_u = self.data.get_ns_batch(
                    pos_pairs, self.ns, self.kn)
                pair_u = [pair[0] for pair in pos_pairs]
                pair_v = [pair[1] for pair in pos_pairs]

                pair_u = Variable(torch.LongTensor(pair_u))
                pair_v = Variable(torch.LongTensor(pair_v))
                pos_u = Variable(torch.LongTensor(pos_u))
                mask_pos_u = Variable(torch.FloatTensor(mask_pos_u))
                neg_u = Variable(torch.LongTensor(neg_u))
                mask_neg_u = Variable(torch.FloatTensor(mask_neg_u))
                if self.use_cuda:
                    pair_u = pair_u.cuda()
                    pair_v = pair_v.cuda()
                    pos_u = pos_u.cuda()
                    mask_pos_u = mask_pos_u.cuda()
                    neg_u = neg_u.cuda()
                    mask_neg_u = mask_neg_u.cuda()

                self.optimizer.zero_grad()
                '''
                param = self.skip_gram_model.parameters()
                tmp = []
                try:
                    while True:
                        tmp.append(param.__next__())
                except:
                    pass
                '''
                loss = self.skip_gram_model.forward(pair_u, pair_v, pos_u,
                                                    mask_pos_u, neg_u,
                                                    mask_neg_u)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(
                    self.skip_gram_model.parameters(), self.clip)
                self.optimizer.step()

                process_bar.set_description(
                    "Loss: %0.8f, lr: %0.6f" %
                    (loss.item(), self.optimizer.param_groups[0]['lr']))
                if i * self.batch_size % 100000 == 0:
                    lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                    for param_group in self.optimizer.param_groups:
                        param_group['lr'] = lr

                if i % self.batch_num_to_valid == 0:
                    logging.info('epoch%d_batch%d, evaluating...' % (epoch, i))
                    self.save_embedding(self.data.id2word, tmp_emb_path,
                                        self.use_cuda)

                    best_scores, save_flag = evaluation(
                        tmp_emb_path, similarity_test_paths, synset_paths,
                        analogy_paths, best_scores)
                    if save_flag:
                        emb_save_path = self.output_file_name + "_epoch%d_batch%d" % (
                            epoch, i)
                        shutil.move(tmp_emb_path, emb_save_path)
                        logging.info('Save current embedding to %s' %
                                     emb_save_path)

            self.skip_gram_model.save_embedding(self.data.id2word,
                                                self.output_file_name,
                                                self.use_cuda)
            logging.info('final evaluating...')
            self.save_embedding(self.data.id2word, tmp_emb_path, self.use_cuda)
            best_scores, save_flag = evaluation(tmp_emb_path,
                                                similarity_test_paths,
                                                synset_paths, analogy_paths,
                                                best_scores)
            if save_flag:
                emb_save_path = self.output_file_name + "_epoch%d" % epoch
                shutil.move(tmp_emb_path, emb_save_path)
                logging.info('Save current embedding to %s' % emb_save_path)

    def save_embedding(self, id2word, file_name, use_cuda):
        """Save all embeddings to file.

        Because this class only records word ids, the mapping from id to word has to be passed in from outside.

        Args:
            id2word: map from word id to word.
            file_name: file name.
        Returns:
            None.
        """
        if use_cuda:
            embedding = self.skip_gram_model.u_embeddings.weight.cpu(
            ).data.numpy()
        else:
            embedding = self.skip_gram_model.u_embeddings.weight.data.numpy()
        with open(file_name, 'w') as fout:
            fout.write('%d %d\n' % (len(id2word), self.emb_dimension))
            for wid, w in id2word.items():
                e = embedding[wid]
                e = ' '.join(map(str, e))
                fout.write('%s %s\n' % (w, e))
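The save_embedding method above writes the plain-text word2vec format: a header line with the vocabulary size and embedding dimension, then one line per word with its vector components. A minimal reader for that format, shown only as a sketch (the function name is not part of the original project):

def load_embedding(file_name):
    """Read a word2vec-style text file into a {word: vector} dict."""
    embeddings = {}
    with open(file_name) as fin:
        vocab_size, dim = map(int, fin.readline().split())
        for line in fin:
            parts = line.rstrip().split(' ')
            word, vector = parts[0], [float(x) for x in parts[1:]]
            embeddings[word] = vector
    return embeddings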
Example #5
class Word2Vec:
    def __init__(self,
                 input_file_name,
                 output_file_name,
                 emb_dimension=100,
                 batch_size=100,
                 window_size=5,
                 iteration=5,
                 initial_lr=0.025,
                 min_count=5,
                 using_hs=False,
                 using_neg=False,
                 context_size=2,
                 hidden_size=128,
                 cbow=None,
                 skip_gram=None):

        print("\nInput File loading......\n")
        self.data = InputData(input_file_name, min_count)
        print("\nInput File loaded.\n")
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.context_size = context_size
        self.hidden_size = hidden_size
        self.using_hs = using_hs
        self.using_neg = using_neg
        self.cbow = cbow
        self.skip_gram = skip_gram
        if self.skip_gram:
            self.skip_gram_model = SkipGramModel(self.emb_size,
                                                 self.emb_dimension)
            print("skip_gram_model", self.skip_gram_model)
            self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                       lr=self.initial_lr)
        if self.cbow:
            self.cbow_model = CBOW(self.emb_size, self.emb_dimension)
            print("CBOW_model", self.cbow_model)
            self.optimizer = optim.SGD(self.cbow_model.parameters(),
                                       lr=self.initial_lr)

    def skip_gram_train(self):
        """Multiple training.

        Returns:
            None.
        """
        print("Skip_Gram Training......")
        pair_count = self.data.evaluate_pair_count(self.window_size)
        print("pair_count", pair_count)
        batch_count = self.iteration * pair_count / self.batch_size
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            'skip_gram_begin_embedding.txt')
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_pairs_by_huffman(
                    pos_pairs)
            else:
                pos_pairs, neg_pairs = self.data.get_pairs_by_neg_sampling(
                    pos_pairs, 5)

            pos_u = [int(pair[0]) for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [int(pair[0]) for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            print("Loss: %0.8f, lr: %0.6f" %
                  (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        print("Skip_Gram Trained and Saving File......")
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name)
        print("Skip_Gram Trained and Saved File.")

    def cbow_train(self):
        print("CBOW Training......")
        self.cbow_model.save_embedding(self.data.id2word,
                                       'cbow_begin_embedding.txt')
        pos_all_pairs = self.data.get_cbow_batch_all_pairs(
            self.batch_size, self.context_size)
        pair_count = len(pos_all_pairs)
        process_bar = tqdm(range(int(pair_count / self.batch_size)))
        for _ in process_bar:
            pos_pairs = self.data.get_cbow_batch_pairs(self.batch_size,
                                                       self.window_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_huffman(
                    pos_pairs)
            else:
                pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_neg_sampling(
                    pos_pairs, self.context_size)

            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [pair[0] for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]

            self.optimizer.zero_grad()
            loss = self.cbow_model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()
        print("CBOW Trained and Saving File......")
        self.cbow_model.save_embedding(self.data.id2word,
                                       self.output_file_name)
        print("CBOW Trained and Saved File.")
Example #6
class Word2Vec:
    def __init__(self,
                 input_file_name,
                 output_file_name,
                 emb_dimension=100,
                 batch_size=100,
                 window_size=5,
                 iteration=5,
                 initial_lr=0.025,
                 min_count=5,
                 using_hs=False,
                 using_neg=False,
                 context_size=2,
                 hidden_size=128,
                 cbow=None,
                 skip_gram=None):
        """Initilize class parameters.

        Args:
            input_file_name: Name of a text data from file. Each line is a sentence splited with space.
            output_file_name: Name of the final embedding file.
            emb_dimention: Embedding dimention, typically from 50 to 500.
            batch_size: The count of word pairs for one forward.
            window_size: Max skip length between words.
            iteration: Control the multiple training iterations.
            initial_lr: Initial learning rate.
            min_count: The minimal word frequency, words with lower frequency will be filtered.
            using_hs: Whether using hierarchical softmax.

        Returns:
            None.
        """
        print("\nInput File loading......\n")
        self.data = InputData(input_file_name, min_count)
        print("\nInput File loaded.\n")
        print("Input Data", self.data)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        print("emb_size", self.emb_size)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.context_size = context_size
        self.hidden_size = hidden_size
        self.using_hs = using_hs
        self.using_neg = using_neg
        self.cbow = cbow
        self.skip_gram = skip_gram
        if self.skip_gram:
            self.skip_gram_model = SkipGramModel(self.emb_size,
                                                 self.emb_dimension)
            print("skip_gram_model", self.skip_gram_model)
            self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                       lr=self.initial_lr)
        if self.cbow:
            # self.cbow_model = CBOW(self.emb_size, self.context_size, self.emb_dimension, self.hidden_size)
            self.cbow_model = CBOW(self.emb_size, self.emb_dimension)
            print("CBOW_model", self.cbow_model)
            self.optimizer = optim.SGD(self.cbow_model.parameters(),
                                       lr=self.initial_lr)

    # @profile
    def skip_gram_train(self):
        """Multiple training.

        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        print("pair_count", pair_count)
        batch_count = self.iteration * pair_count / self.batch_size
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            'skip_gram_begin_embedding.txt')
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_pairs_by_huffman(
                    pos_pairs)
            else:
                pos_pairs, neg_pairs = self.data.get_pairs_by_neg_sampling(
                    pos_pairs, 5)

            pos_u = [int(pair[0]) for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [int(pair[0]) for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            print("Loss: %0.8f, lr: %0.6f" %
                  (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name)

    def cbow_train(self):
        print("CBOW Training......")
        pair_count = self.data.evaluate_pair_count(self.context_size * 2 + 1)
        print("pair_count", pair_count)
        batch_count = self.iteration * pair_count / self.batch_size
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        self.cbow_model.save_embedding(self.data.id2word,
                                       'cbow_begin_embedding.txt')
        for i in process_bar:
            pos_pairs = self.data.get_cbow_batch_all_pairs(
                self.batch_size, self.context_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_huffman(
                    pos_pairs)
            else:
                pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_neg_sampling(
                    pos_pairs, self.context_size)

            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [pair[0] for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]

            self.optimizer.zero_grad()
            loss = self.cbow_model.forward(pos_u, pos_v, neg_u, neg_v)
            # loss = self.cbow_model.forwards(pos_v, pos_u, neg_v, neg_u)
            loss.backward()
            self.optimizer.step()
            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            print("Loss: %0.8f, lr: %0.6f" %
                  (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        print("CBOW Trained and Saving File......")
        self.cbow_model.save_embedding(self.data.id2word,
                                       self.output_file_name)
        print("CBOW Trained and Saved File.")
Example #7
class Word2Vec:
    def __init__(self,
                 output_file_name,
                 output_sense_name,
                 emb_dimension=128,
                 K=5,
                 batch_size=1,
                 window_size=5,
                 iteration=1,
                 initial_lr=0.1,
                 createClusterLambda=1.5,
                 min_count=0):
        """Initilize class parameters.
        Args:
            input_file_name: Name of a text data from file. Each line is a sentence splited with space.
            output_file_name: Name of the final embedding file.
            emb_dimention: Embedding dimention, typically from 50 to 500.
            batch_size: The count of word pairs for one forward.
            window_size: Max skip length between words.
            iteration: Control the multiple training iterations.
            initial_lr: Initial learning rate.
            min_count: The minimal word frequency, words with lower frequency will be filtered.
        Returns:
            None.
        """
        self.data = InputData(min_count)
        self.output_file_name = output_file_name
        self.output_sense_name = output_sense_name
        self.emb_size = len(self.data.node2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.K = K
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.createClusterLambda = createClusterLambda
        self.skip_gram_model = SkipGramModel(self.emb_size, self.K,
                                             self.emb_dimension)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)

    def train(self):
        """Multiple training.
        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        total_pos_pairs = self.data.get_node_pairs(self.window_size)
        print("training\n")
        for t in process_bar:
            pos_pairs = total_pos_pairs[t]
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            # right=[]
            cnt = 0
            curword = pos_u[cnt]
            contextwords = []
            contextwords_cuda = []
            while cnt < len(pos_u):
                contextwords.append(pos_v[cnt])
                contextwords_cuda.append(pos_v[cnt])
                cnt += 1
            contextembedding = torch.zeros(self.emb_dimension)
            contextwords_cuda = Variable(torch.LongTensor(contextwords_cuda))
            if self.use_cuda:
                contextwords_cuda = contextwords_cuda.cuda()
            emb_v = self.skip_gram_model.v_embeddings(contextwords_cuda)
            if self.use_cuda:
                emb_v_data = emb_v.cpu().data
            else:
                emb_v_data = emb_v.data
            for i in range(len(contextwords)):
                contextembedding += emb_v_data[i]
                # torch.add(contextembedding,emb_v_data[i,:],out=emb_v_data_total)
            emb_v_data_avg = contextembedding / (len(contextwords))
            # torch.div(emb_v_data_total,len(contextwords),out=emb_v_data_avg)
            minDist = np.inf
            rightsense = 0
            mu = torch.Tensor(self.emb_dimension)
            if self.skip_gram_model.num_sense[curword] == self.K:
                nC = self.K
            else:
                nC = self.skip_gram_model.num_sense[curword] + 1
            prob = torch.Tensor(nC)
            for k in range(self.skip_gram_model.num_sense[curword]):
                torch.div(self.skip_gram_model.clusterCenter[curword, k, :],
                          self.skip_gram_model.clusterCount[curword][k],
                          out=mu)
                x_norm = torch.norm(emb_v_data_avg, p=2)
                y_norm = torch.norm(mu, p=2)
                summ = 0
                for p in range(self.emb_dimension):
                    summ += emb_v_data_avg[p] * mu[p]
                dist = 1 - summ / (x_norm * y_norm)
                prob[k] = dist
                if dist < minDist:
                    minDist = dist
                    rightsense = k
            if self.skip_gram_model.num_sense[curword] < self.K:
                if self.createClusterLambda < minDist:
                    prob[self.skip_gram_model.
                         num_sense[curword]] = self.createClusterLambda
                    rightsense = self.skip_gram_model.num_sense[curword]
                    self.skip_gram_model.num_sense[curword] += 1
            for i in range(self.emb_dimension):
                self.skip_gram_model.clusterCenter[curword][rightsense][
                    i] += emb_v_data_avg[i]
            self.skip_gram_model.clusterCount[curword][rightsense] += 1
            # for i in range(len(contextwords)):
            #    right.append(rightsense)

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v,
                                                rightsense, self.use_cuda)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            if t * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * t / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(self.data.id2node,
                                            self.output_file_name,
                                            self.output_sense_name,
                                            self.use_cuda)
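The per-dimension loops in the training loop above assign the averaged context embedding to the nearest sense cluster by cosine distance. The same assignment can be written in vectorized form; a sketch that assumes the per-word tensors already used above (cluster centres as running sums, per-sense counts, and the current number of senses) are passed in as torch tensors:

import torch

def pick_sense(context_avg, cluster_center, cluster_count, num_sense):
    # context_avg: (emb_dimension,) averaged context embedding for the current word
    # cluster_center: (K, emb_dimension) running sums of assigned context embeddings
    # cluster_count: (K,) number of contexts assigned to each sense so far
    centers = cluster_center[:num_sense] / cluster_count[:num_sense].unsqueeze(1)
    cos = torch.nn.functional.cosine_similarity(context_avg.unsqueeze(0), centers, dim=1)
    dist = 1.0 - cos  # cosine distance, as computed element-wise in the loop above
    min_dist, right_sense = torch.min(dist, dim=0)
    return right_sense.item(), min_dist.item()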
Example #8
class Word2Vec:
    def __init__(self, wikidump_filename, output_text_filename, emb_dimension,
                 batch_size, window_size, iteration, initial_lr, min_count):

        self.data = InputData(wikidump_filename, min_count,
                              output_text_filename)
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)

    def train(self):
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = Variable(torch.LongTensor(pos_u))
            pos_v = Variable(torch.LongTensor(pos_v))
            neg_v = Variable(torch.LongTensor(neg_v))
            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr

    def calculate_probability(self, word1, word2):
        embeddings = self.skip_gram_model.u_embeddings.weight.data.numpy()
        embedding_1 = embeddings[self.data.word2id[word1]]
        embedding_2 = embeddings[self.data.word2id[word2]]

        numerator = np.exp(np.sum(embedding_1 * embedding_2))
        denominator = np.sum(np.exp(
            np.sum(np.multiply(embedding_1, embeddings), axis=1)),
                             axis=0)

        return (numerator / denominator)

    def wordsim353_spearman(self, input_filename):
        target_word = []
        context_word = []
        human_scores = []
        with open(input_filename) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            next(csv_reader, None)  # skip the header row
            for row in csv_reader:
                target_word.append(row[0])
                context_word.append(row[1])
                human_scores.append(float(row[2]))
            ws353_pairs = len(human_scores)

        for pair in range(0, ws353_pairs):
            if (target_word[pair] not in self.data.word2id):
                raise Exception('Target word not in model vocab: ',
                                target_word[pair])
            if (context_word[pair] not in self.data.word2id):
                raise Exception('Context word not in model vocab: ',
                                context_word[pair])

        human_rankings = ss.rankdata(human_scores)

        machine_scores = []
        for pair in range(0, len(human_scores)):
            machine_scores.append(
                self.calculate_probability(target_word[pair],
                                           context_word[pair]))
        machine_rankings = ss.rankdata(machine_scores)

        human_scores_dict = dict()
        machine_scores_dict = dict()
        for pair in range(0, len(human_scores)):
            human_scores_dict[pair] = human_rankings[pair]
            machine_scores_dict[pair] = machine_rankings[pair]

        return spearman.spearman_correlation(human_scores_dict,
                                             machine_scores_dict)
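The calculate_probability method in this example evaluates a full softmax over the vocabulary, p(word2 | word1) = exp(u1 . u2) / sum_w exp(u1 . u_w), using only the input embeddings; this costs one pass over the whole vocabulary per query and the exponentials can overflow for large dot products. A numerically more stable standalone sketch (function name and arguments are illustrative, not from the original):

import numpy as np

def softmax_probability(embeddings, word2id, word1, word2):
    # embeddings: (vocab_size, emb_dimension) numpy array of input vectors
    u1 = embeddings[word2id[word1]]
    logits = embeddings @ u1          # dot product of word1 with every word vector
    logits -= logits.max()            # shift by the max logit for numerical stability
    probs = np.exp(logits)
    probs /= probs.sum()
    return probs[word2id[word2]]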