Example #1
class Word2Vec:
    def __init__(self,
                 input_file_name,
                 output_file_name):
        self.min_count = 5
        self.emb_dimension = 100
        self.batch_size = 64
        self.window_size = 5
        self.iteration = 1
        self.initial_lr = 0.001
        self.data = InputData(input_file_name, self.min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension, self.batch_size, self.window_size,
                                             self.iteration, self.initial_lr, self.min_count)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(
            self.skip_gram_model.parameters(), lr=self.initial_lr)

    def train(self):
        """Multiple training.
        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = Variable(torch.LongTensor(pos_u))
            pos_v = Variable(torch.LongTensor(pos_v))
            neg_v = Variable(torch.LongTensor(neg_v))
            if self.use_cuda:
                pos_u = pos_u.cuda()
                pos_v = pos_v.cuda()
                neg_v = neg_v.cuda()

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description("Loss: %0.8f, lr: %0.6f" %
                                        (loss.data,
                                         self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(
            self.data.id2word, self.output_file_name, self.use_cuda)
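A minimal usage sketch for this variant; the file names are hypothetical, and InputData and SkipGramModel must be importable from the same project:

# Hypothetical invocation; all hyperparameters are fixed inside __init__.
w2v = Word2Vec(input_file_name='corpus.txt', output_file_name='embeddings.txt')
w2v.train()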
Example #2
class Word2Vec:
    def __init__(self, input_file_name, output_file_name, emb_dimension=100, batch_size=50,
                 window_size=5, iteration=5, initial_lr=0.025, neg_num=5, min_count=5):

        self.data = InputData(input_file_name, min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.neg_num = neg_num
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(), lr=self.initial_lr)

    def train(self):

        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        count = int(batch_count) // 3
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)

            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, self.neg_num)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = Variable(torch.LongTensor(pos_u)).cuda()
            pos_v = Variable(torch.LongTensor(pos_v)).cuda()
            neg_v = Variable(torch.LongTensor(neg_v)).cuda()
            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description("Loss: %0.8f, lr: %0.6f" %
                                        (loss.item(),
                                         self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
            if i != 0 and i % count == 0:
                self.skip_gram_model.save_embedding(self.data.id2word, self.output_file_name + str(i))
        self.skip_gram_model.save_embedding(self.data.id2word, self.output_file_name + 'final')
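A hedged usage sketch for this variant (paths are hypothetical). Note that it calls .cuda() unconditionally, so a CUDA-capable device is required, and it writes intermediate checkpoints by appending the batch index to output_file_name:

# Hypothetical invocation; requires a CUDA-capable GPU.
w2v = Word2Vec('corpus.txt', 'embeddings_', emb_dimension=100, iteration=5, neg_num=5)
w2v.train()  # writes embeddings_<i> checkpoints and embeddings_final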
Example #3
class Word2Vec:
    def __init__(self,
                 input_file_name,
                 output_file_name,
                 emb_dimension=100,
                 batch_size=50,
                 window_size=5,
                 iteration=1,
                 initial_lr=0.025,
                 min_count=1):
        """Initilize class parameters.

        Args:
            input_file_name: Name of a text data from file. Each line is a sentence splited with space.
            output_file_name: Name of the final embedding file.
            emb_dimention: Embedding dimention, typically from 50 to 500.
            batch_size: The count of word pairs for one forward.
            window_size: Max skip length between words.
            iteration: Control the multiple training iterations.
            initial_lr: Initial learning rate.
            min_count: The minimal word frequency, words with lower frequency will be filtered.

        Returns:
            None.
        """
        self.data = InputData(input_file_name, min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)

    def train(self):
        """Multiple training.

        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        # self.skip_gram_model.save_embedding(
        #     self.data.id2word, 'begin_embedding.txt', self.use_cuda)
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = Variable(torch.LongTensor(pos_u))
            pos_v = Variable(torch.LongTensor(pos_v))
            neg_v = Variable(torch.LongTensor(neg_v))
            if self.use_cuda:
                pos_u = pos_u.cuda()
                pos_v = pos_v.cuda()
                neg_v = neg_v.cuda()

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name,
                                            self.use_cuda)
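The docstring above lists the tunable parameters; a short sketch of overriding them when constructing the model (values and file names are illustrative, not taken from the source):

w2v = Word2Vec(input_file_name='corpus.txt',
               output_file_name='embeddings.txt',
               emb_dimension=200,
               batch_size=64,
               iteration=3,
               min_count=5)
w2v.train()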
Example #4
class Word2Vec:

    def __init__(
        self,
        input_path,
        output_dir,
        wordsim_path,
        dimension=100,
        batch_size=50,  # assumed default; the original referenced an undefined external batch_size
        window_size=5,
        epoch_count=1,
        initial_lr=1e-6,
        min_count=5,
    ):
        self.data = InputData(input_path, min_count)
        self.output_dir = output_dir
        self.vocabulary_size = len(self.data.id_from_word)
        self.dimension = dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.epoch_count = epoch_count
        self.initial_lr = initial_lr
        self.model = SkipGramModel(self.vocabulary_size, self.dimension)
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
        self.model = nn.DataParallel(self.model.to(self.device))
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.initial_lr)

        if wordsim_path:
            self.wordsim_verification_tuples = []
            with open(wordsim_path, 'r') as f:
                f.readline()  # Abandon header
                for line in f:
                    word1, word2, actual_similarity = line.split(',')
                    self.wordsim_verification_tuples.append(
                        (word1, word2, float(actual_similarity))
                    )
        else:
            self.wordsim_verification_tuples = None

    def train(self):
        pair_count = self.data.get_pair_count(self.window_size)
        batch_count = self.epoch_count * pair_count / self.batch_size
        best_rho = float('-inf')
        for i in tqdm(range(int(batch_count)), total=batch_count):
            self.model.train()
            pos_pairs = self.data.get_batch_pairs(
                self.batch_size, self.window_size
            )
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = torch.tensor(pos_u, device=self.device)
            pos_v = torch.tensor(pos_v, device=self.device)
            neg_v = torch.tensor(neg_v, device=self.device)

            self.optimizer.zero_grad()
            loss = self.model(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            if i % 250 == 0:
                self.model.eval()
                rho = self.model.module.get_wordsim_rho(
                    self.wordsim_verification_tuples, self.data.id_from_word,
                    self.data.word_from_id
                )
                print(
                    f'Loss: {loss.item()},'
                    f' lr: {self.optimizer.param_groups[0]["lr"]},'
                    f' rho: {rho}'
                )
                dump_embedding(
                    self.model.module.get_embedding(
                        self.data.id_from_word, self.data.word_from_id
                    ),
                    self.model.module.dimension,
                    self.data.word_from_id,
                    os.path.join(self.output_dir, 'latest.txt'),
                )
                if rho > best_rho:
                    dump_embedding(
                        self.model.module.get_embedding(
                            self.data.id_from_word, self.data.word_from_id
                        ),
                        self.model.module.dimension,
                        self.data.word_from_id,
                        os.path.join(self.output_dir, f'{i}_{rho}.txt')
                    )
                    best_rho = rho

            # warm up
            if i < 10000:
                lr = self.initial_lr * i / 10000
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
            elif i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
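A hedged sketch of driving this variant; wordsim_path is expected to be a CSV with a header row followed by word1,word2,score lines (as read in __init__), and output_dir should exist before training so the embedding dumps can be written. All paths are hypothetical:

import os

output_dir = 'runs/word2vec'  # hypothetical output directory
os.makedirs(output_dir, exist_ok=True)
w2v = Word2Vec(input_path='corpus.txt',
               output_dir=output_dir,
               wordsim_path='wordsim353.csv',
               dimension=100,
               batch_size=50,
               epoch_count=1)
w2v.train()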
Example #5
class Word2Vec:
    def __init__(self,
                 output_file_name,
                 output_sense_name,
                 emb_dimension=128,
                 K=5,
                 batch_size=1,
                 window_size=5,
                 iteration=1,
                 initial_lr=0.1,
                 createClusterLambda=1.5,
                 min_count=0):
        """Initilize class parameters.
        Args:
            input_file_name: Name of a text data from file. Each line is a sentence splited with space.
            output_file_name: Name of the final embedding file.
            emb_dimention: Embedding dimention, typically from 50 to 500.
            batch_size: The count of word pairs for one forward.
            window_size: Max skip length between words.
            iteration: Control the multiple training iterations.
            initial_lr: Initial learning rate.
            min_count: The minimal word frequency, words with lower frequency will be filtered.
        Returns:
            None.
        """
        self.data = InputData(min_count)
        self.output_file_name = output_file_name
        self.output_sense_name = output_sense_name
        self.emb_size = len(self.data.node2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.K = K
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.createClusterLambda = createClusterLambda
        self.skip_gram_model = SkipGramModel(self.emb_size, self.K,
                                             self.emb_dimension)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)

    def train(self):
        """Multiple training.
        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        total_pos_pairs = self.data.get_node_pairs(self.window_size)
        print("training\n")
        for t in process_bar:
            pos_pairs = total_pos_pairs[t]
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            # right=[]
            # The sense assignment below uses a single centre word per batch (pos_u[0]).
            curword = pos_u[0]
            contextwords = list(pos_v)
            contextwords_cuda = list(pos_v)
            contextembedding = torch.zeros(self.emb_dimension)
            contextwords_cuda = Variable(torch.LongTensor(contextwords_cuda))
            if self.use_cuda:
                contextwords_cuda = contextwords_cuda.cuda()
            emb_v = self.skip_gram_model.v_embeddings(contextwords_cuda)
            if self.use_cuda:
                emb_v_data = emb_v.cpu().data
            else:
                emb_v_data = emb_v.data
            for i in range(len(contextwords)):
                contextembedding += emb_v_data[i]
                # torch.add(contextembedding,emb_v_data[i,:],out=emb_v_data_total)
            emb_v_data_avg = contextembedding / (len(contextwords))
            # torch.div(emb_v_data_total,len(contextwords),out=emb_v_data_avg)
            minDist = np.inf
            rightsense = 0
            mu = torch.Tensor(self.emb_dimension)
            if self.skip_gram_model.num_sense[curword] == self.K:
                nC = self.K
            else:
                nC = self.skip_gram_model.num_sense[curword] + 1
            prob = torch.Tensor(nC)
            for k in range(self.skip_gram_model.num_sense[curword]):
                torch.div(self.skip_gram_model.clusterCenter[curword, k, :],
                          self.skip_gram_model.clusterCount[curword][k],
                          out=mu)
                x_norm = torch.norm(emb_v_data_avg, p=2)
                y_norm = torch.norm(mu, p=2)
                summ = 0
                for p in range(self.emb_dimension):
                    summ += emb_v_data_avg[p] * mu[p]
                dist = 1 - summ / (x_norm * y_norm)
                prob[k] = dist
                if dist < minDist:
                    minDist = dist
                    rightsense = k
            if self.skip_gram_model.num_sense[curword] < self.K:
                if self.createClusterLambda < minDist:
                    prob[self.skip_gram_model.
                         num_sense[curword]] = self.createClusterLambda
                    rightsense = self.skip_gram_model.num_sense[curword]
                    self.skip_gram_model.num_sense[curword] += 1
            for i in range(self.emb_dimension):
                self.skip_gram_model.clusterCenter[curword][rightsense][
                    i] += emb_v_data_avg[i]
            self.skip_gram_model.clusterCount[curword][rightsense] += 1
            # for i in range(len(contextwords)):
            #    right.append(rightsense)

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v,
                                                rightsense, self.use_cuda)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            if t * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * t / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(self.data.id2node,
                                            self.output_file_name,
                                            self.output_sense_name,
                                            self.use_cuda)
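This variant learns up to K sense vectors per word: for each centre word it averages the context embeddings, assigns the context to the nearest sense cluster by cosine distance, and opens a new cluster when that distance exceeds createClusterLambda. A hedged usage sketch follows; note that InputData here takes only min_count, so the corpus location is resolved inside that class, and the file names are hypothetical:

w2v = Word2Vec(output_file_name='sense_embeddings.txt',
               output_sense_name='sense_assignments.txt',
               emb_dimension=128,
               K=5,
               createClusterLambda=1.5)
w2v.train()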
Example #6
class Word2Vec:
    def __init__(self, wikidump_filename, output_text_filename, emb_dimension,
                 batch_size, window_size, iteration, initial_lr, min_count):

        self.data = InputData(wikidump_filename, min_count,
                              output_text_filename)
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)

    def train(self):
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = Variable(torch.LongTensor(pos_u))
            pos_v = Variable(torch.LongTensor(pos_v))
            neg_v = Variable(torch.LongTensor(neg_v))
            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr

    def calculate_probability(self, word1, word2):
        embeddings = self.skip_gram_model.u_embeddings.weight.data.numpy()
        embedding_1 = embeddings[self.data.word2id[word1]]
        embedding_2 = embeddings[self.data.word2id[word2]]

        numerator = np.exp(np.sum(embedding_1 * embedding_2))
        denominator = np.sum(np.exp(
            np.sum(np.multiply(embedding_1, embeddings), axis=1)),
                             axis=0)

        return (numerator / denominator)

    def wordsim353_spearman(self, input_filename):
        target_word = []
        context_word = []
        human_scores = []
        with open(input_filename) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            next(csv_reader)  # skip the WordSim-353 header row
            ws353_pairs = 0
            for row in csv_reader:
                target_word.append(row[0])
                context_word.append(row[1])
                human_scores.append(float(row[2]))
                ws353_pairs += 1

        for pair in range(0, ws353_pairs):
            if (target_word[pair] not in self.data.word2id):
                raise Exception('Target word not in model vocab: ',
                                target_word[pair])
            if (context_word[pair] not in self.data.word2id):
                raise Exception('Context word not in model vocab: ',
                                context_word[pair])

        human_rankings = ss.rankdata(human_scores)

        machine_scores = []
        for pair in range(0, len(human_scores)):
            machine_scores.append(
                self.calculate_probability(target_word[pair],
                                           context_word[pair]))
        machine_rankings = ss.rankdata(machine_scores)

        human_scores_dict = dict()
        machine_scores_dict = dict()
        for pair in range(0, len(human_scores)):
            human_scores_dict[pair] = human_rankings[pair]
            machine_scores_dict[pair] = machine_rankings[pair]

        return spearman.spearman_correlation(human_scores_dict,
                                             machine_scores_dict)
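calculate_probability evaluates the full softmax p(word2 | word1) = exp(u_1 . u_2) / sum_w exp(u_1 . u_w) over the input embeddings, and wordsim353_spearman ranks those probabilities against human similarity judgements via Spearman correlation (raising if any WordSim-353 word is missing from the vocabulary). A hedged end-to-end sketch, with hypothetical file names:

w2v = Word2Vec(wikidump_filename='enwiki-latest-pages-articles.xml.bz2',
               output_text_filename='wiki_text.txt',
               emb_dimension=100,
               batch_size=50,
               window_size=5,
               iteration=1,
               initial_lr=0.025,
               min_count=5)
w2v.train()
print(w2v.wordsim353_spearman('wordsim353.csv'))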