Example #1
def find_most_similar_words(model=None, word_vec_path=None, word=None):
    # === Load dictionary
    dict_dir = os.path.join(get_data_dir(), "book_dict")

    word_cnt_dict_path = os.path.join(dict_dir, "word_cnt_dict.pkl")
    word_cnt_dict = load_dictionary(dict_path=word_cnt_dict_path)
    print(len(word_cnt_dict))

    word2id_dict_path = os.path.join(dict_dir, "word2id_dict.pkl")
    word2id_dict = load_dictionary(dict_path=word2id_dict_path)
    print(len(word2id_dict))

    id2word_dict_path = os.path.join(dict_dir, "id2word_dict.pkl")
    id2word_dict = load_dictionary(dict_path=id2word_dict_path)
    print(len(id2word_dict))

    if word not in word2id_dict:
        print('%s not in dict' % word)
        return None

    word_idx = word2id_dict[word]

    # === Load word_vec
    word_vecs = []
    with open(word_vec_path, 'r') as f:
        for line in f:
            buf = line[:-1].split('\t')
            vec = np.array([float(x) for x in buf[1].split(',')])
            word_vecs.append(vec)

    word_vecs = np.array(word_vecs)

    # === Find
    word_vec = word_vecs[word_idx]
    print('word_idx', word_idx)

    sims = np.dot(word_vecs, word_vec)
    ranks = np.argsort(-sims)
    print('ranks', ranks[:20])
    print('scores', sims[ranks[:20]])
    sim_words = [id2word_dict[idx] for idx in ranks[:20]]
    print("Top sim words of '%s' are: " % word)
    print(sim_words)
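
The ranking above uses raw dot products, so words with large-norm vectors tend to dominate. A cosine-similarity variant is a small change; below is a minimal sketch, assuming word_vecs, word_idx and id2word_dict have been loaded exactly as in the function above (this is not part of the original script).

# Sketch: cosine similarity instead of raw dot products
norms = np.linalg.norm(word_vecs, axis=1, keepdims=True)
unit_vecs = word_vecs / np.clip(norms, 1e-8, None)  # guard against zero vectors
cos_sims = np.dot(unit_vecs, unit_vecs[word_idx])
cos_ranks = np.argsort(-cos_sims)
print([id2word_dict[idx] for idx in cos_ranks[:20]])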
Example #2
    def sample(self, num_sample=None, method="random"):
        if method == "random":
            return self._random_sample(num_sample)
        elif method == "weighted":
            return self._weighted_sample(num_sample)
        else:
            raise ValueError("Unknown sample method: %s" % method)

    def _random_sample(self, num_sample):
        return np.random.choice(self.word_idxes, num_sample)

    def _weighted_sample(self, num_sample):
        return np.random.choice(self.word_idxes, num_sample, p=self.probs)


if __name__ == '__main__':
    dict_dir = os.path.join(get_data_dir(), "book_dict")

    word_cnt_dict_path = os.path.join(dict_dir, "word_cnt_dict.pkl")
    word_cnt_dict = load_dictionary(dict_path=word_cnt_dict_path)
    print(len(word_cnt_dict))

    word2id_dict_path = os.path.join(dict_dir, "word2id_dict.pkl")
    word2id_dict = load_dictionary(dict_path=word2id_dict_path)
    print(len(word2id_dict))

    id2word_dict_path = os.path.join(dict_dir, "id2word_dict.pkl")
    id2word_dict = load_dictionary(dict_path=id2word_dict_path)
    print(len(id2word_dict))

    #word_cnt_dict = {"word": 100, 2: 200, 3: 300}
    #word2id_dict = {"machine": 0, 2: 1, 3: 2}
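
The sampler whose sample()/_weighted_sample() methods appear above needs word_idxes and probs attributes; below is a minimal sketch of a constructor that builds them from word_cnt_dict with the usual word2vec 3/4-power smoothing. The class name NegativeSampler and this constructor are assumptions, not shown in the original.

import numpy as np

# Hypothetical sampler setup (sketch); keep only words that made it into word2id_dict
class NegativeSampler:
    def __init__(self, word_cnt_dict, word2id_dict, power=0.75):
        words = [w for w in word_cnt_dict if w in word2id_dict]
        self.word_idxes = np.array([word2id_dict[w] for w in words])
        cnts = np.array([word_cnt_dict[w] for w in words], dtype=np.float64)
        weights = cnts ** power              # classic 3/4-power unigram smoothing
        self.probs = weights / weights.sum()

    # sample / _random_sample / _weighted_sample as defined in Example #2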
Example #3
    # === Find
    word_vec = word_vecs[word_idx]
    print('word_idx', word_idx)

    sims = np.dot(word_vecs, word_vec)
    ranks = np.argsort(-sims)
    print('ranks', ranks[:20])
    print('scores', sims[ranks[:20]])
    sim_words = [id2word_dict[idx] for idx in ranks[:20]]
    print("Top sim words of '%s' are: " % word)
    print(sim_words)


if __name__ == '__main__':
    checkpoint_dir = os.path.join(get_model_dir(), "word2vec")
    word_vec_path = os.path.join(get_data_dir(), "word_vectors")

    vocab_size = 10001 # ptb, min_cnt = 5
    window_size = 5
    num_neg = 5
    embedding_dim = 64

    # === Load model (checkpoint_dir is already set above)
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

    w2v = Word2vec(vocab_size=vocab_size,
                   window_size=window_size,
                   num_neg=num_neg,
                   embedding_dim=embedding_dim)
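
The example stops after constructing the model; a minimal sketch of the next steps, assuming the checkpoint was saved with tf.train.Checkpoint as in the training script (Example #6) and that find_most_similar_words accepts this model:

    # Sketch: restore the latest checkpoint and query a sample word
    checkpoint = tf.train.Checkpoint(model=w2v)
    checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir)).expect_partial()

    find_most_similar_words(model=w2v, word_vec_path=word_vec_path, word="computer")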
Example #4
                    yield contexts, target, negatives

    dataset = tf.data.Dataset.from_generator(
        generator,
        output_shapes=((window_size*2, ), (1, ), (num_neg, )),
        output_types=(tf.int32, tf.int32, tf.int32)
    )

    return dataset.repeat(count=epochs)\
        .shuffle(buffer_size=shuffle_buffer_size)\
        .batch(batch_size=batch_size)


if __name__ == "__main__":
    train_path = os.path.join(get_data_dir(), "ptb.train.txt")
    val_path = os.path.join(get_data_dir(), "ptb.valid.txt")

    dict_dir = os.path.join(get_data_dir(), "book_dict")

    """
    train_dataset = get_dataset(input_path=train_path,
                                dict_dir=dict_dir)
    print("===Train===")
    for data in train_dataset.take(2):
        print(data)
    """

    model = Word2vec()
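
A quick sanity check on the input pipeline is to pull a single batch and confirm the shapes match the output_shapes declared in from_generator, i.e. (batch_size, window_size*2), (batch_size, 1) and (batch_size, num_neg). A minimal sketch, reusing the commented-out get_dataset call above:

    # Sketch: inspect one batch from the generator-backed dataset
    train_dataset = get_dataset(input_path=train_path, dict_dir=dict_dir)
    for contexts, target, negatives in train_dataset.take(1):
        print(contexts.shape, target.shape, negatives.shape)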
Example #5
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:

            """
            if len(line[:-1].split(' ')) < 7: # window_size*2+1
                continue
            """

            ratio = np.random.random()
            if ratio < train_ratio:
                fw_train.write(line)
            else:
                fw_val.write(line)

    fw_train.close()
    fw_val.close()


if __name__ == '__main__':
    data_dir = get_data_dir()
    data_path = os.path.join(data_dir, "book_text.txt")
    train_path = os.path.join(data_dir, "book_text_train.txt")
    val_path = os.path.join(data_dir, "book_text_val.txt")
    train_ratio = 0.8

    train_val_split(data_path=data_path,
                    train_path=train_path,
                    val_path=val_path,
                    train_ratio=train_ratio)

    print("Split done! train_path: %s" % train_path)
Example #6
def train_word2vec():
    vocab_size = 10001  # min_cnt=5, ptb
    total_num_train = 971657
    total_num_val = 77130

    shuffle_buffer_size = 2048 * 2
    epochs = 10
    batch_size = 128
    window_size = 5
    num_neg = 5
    embedding_dim = 64  # To tune

    train_path = os.path.join(get_data_dir(), "ptb.train.txt")
    val_path = os.path.join(get_data_dir(), "ptb.valid.txt")
    dict_dir = os.path.join(get_data_dir(), "book_dict")

    train_dataset = get_dataset(input_path=train_path,
                                dict_dir=dict_dir,
                                shuffle_buffer_size=shuffle_buffer_size,
                                epochs=epochs,
                                batch_size=batch_size,
                                window_size=window_size,
                                num_neg=num_neg)

    val_dataset = get_dataset(input_path=val_path,
                              dict_dir=dict_dir,
                              shuffle_buffer_size=shuffle_buffer_size,
                              epochs=epochs,
                              batch_size=batch_size,
                              window_size=window_size,
                              num_neg=num_neg)

    optimizer = tf.keras.optimizers.Adam(0.001)

    model = Word2vec(vocab_size=vocab_size,
                     window_size=window_size,
                     num_neg=num_neg,
                     embedding_dim=embedding_dim)

    checkpoint_dir = os.path.join(get_model_dir(), "word2vec")
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)

    total_train_batch = total_num_train // batch_size + 1

    # === Train
    start = time.time()
    for epoch in range(epochs):
        total_loss = 0
        batch_loss = 0

        #total_train_batch = 101 # just for debug

        epoch_start = time.time()
        i = 0
        for batch_idx, (contexts, target,
                        negatives) in zip(range(total_train_batch),
                                          train_dataset):
            i += 1
            cur_loss = train_step(model, optimizer, contexts, target,
                                  negatives)
            batch_loss += cur_loss

            if i % 100 == 0:
                batch_end = time.time()
                batch_last = batch_end - start
                print(
                    "Epoch: %d/%d, batch: %d/%d, batch_loss: %.4f, cur_loss: %.4f, lasts: %.2fs"
                    % (epoch + 1, epochs, batch_idx + 1, total_train_batch,
                       batch_loss / (batch_idx + 1), cur_loss, batch_last))

        assert i > 0
        batch_loss /= i

        total_loss += batch_loss
        epoch_end = time.time()
        epoch_last = epoch_end - epoch_start
        # total_loss is reset at the top of each epoch, so it holds this epoch's
        # mean batch loss; dividing it by (epoch + 1) would under-report the loss
        print("Epoch: %d/%d, loss: %.4f, lasts: %.2fs" %
              (epoch + 1, epochs, total_loss, epoch_last))

        # === Test sim
        """
        # [vocab_size, embedding_dim]
        weights = model.output_embedding_layer.get_weights()
        weights = np.array(weights[0])
        print(weights.shape)
        # computer: 236
        #sample_word_idx = 236

        # [embedding_dim, ]
        sample = weights[236]

        scores = np.dot(weights, sample)
        rank = np.argsort(-scores)
        top = rank[:20]
        print("top", top)
        print("score", scores[top])
        """

        checkpoint.save(file_prefix=checkpoint_prefix)
        #print(model.output_embedding_layer.get_weights())
        #print(get_word_representation(model=model))

    end = time.time()
    last = end - start
    print("Lasts %.2fs" % last)
Example #7
                continue

            # Strip the trailing newline so the last token on the line is not
            # counted as a separate "word\n" entry
            for word in line[:-1].split(' '):
                if word not in word_cnt_dict:
                    word_cnt_dict[word] = 1
                else:
                    word_cnt_dict[word] += 1

    # === Save word_cnt_dict
    with open(dict_path, 'wb') as fw:
        pickle.dump(word_cnt_dict, fw)


if __name__ == "__main__":
    #text_path = os.path.join(get_data_dir(), "book_text.txt")
    text_path = os.path.join(get_data_dir(), "ptb_train_val.txt")
    dict_dir = os.path.join(get_data_dir(), "book_dict")

    word_cnt_dict_path = os.path.join(dict_dir, "word_cnt_dict.pkl")

    count_word(text_path=text_path, dict_path=word_cnt_dict_path)

    build_dictionary(dict_dir=dict_dir, min_word_count=5)

    word_cnt_dict = load_dictionary(dict_path=word_cnt_dict_path)
    print(len(word_cnt_dict))

    word2id_dict_path = os.path.join(dict_dir, "word2id_dict.pkl")
    word2id_dict = load_dictionary(dict_path=word2id_dict_path)
    print(len(word2id_dict))
    id2word_dict_path = os.path.join(dict_dir, "id2word_dict.pkl")
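
build_dictionary is called above but its body is not shown; a minimal sketch of a min-count vocabulary build that would produce the word2id_dict.pkl / id2word_dict.pkl files loaded elsewhere (the exact id assignment and any special-token handling are assumptions):

# Hypothetical build_dictionary (sketch)
def build_dictionary(dict_dir=None, min_word_count=5):
    word_cnt_dict = load_dictionary(
        dict_path=os.path.join(dict_dir, "word_cnt_dict.pkl"))

    word2id_dict, id2word_dict = {}, {}
    for word, cnt in word_cnt_dict.items():
        if cnt < min_word_count:
            continue
        idx = len(word2id_dict)
        word2id_dict[word] = idx
        id2word_dict[idx] = word

    with open(os.path.join(dict_dir, "word2id_dict.pkl"), 'wb') as fw:
        pickle.dump(word2id_dict, fw)
    with open(os.path.join(dict_dir, "id2word_dict.pkl"), 'wb') as fw:
        pickle.dump(id2word_dict, fw)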
Example #8
        training_num = 0

        for line in f:
            for word in line[:-1].split(' '):
                if word not in word_cnt_dict:
                    continue
                cnt = word_cnt_dict[word]

                if cnt < min_word_cnt:
                    continue

                training_num += 1

        return training_num


if __name__ == '__main__':
    #train_path = os.path.join(get_data_dir(), "shuf_train.txt")
    train_path = os.path.join(get_data_dir(), "ptb.valid.txt")
    word_cnt_dict_path = os.path.join(get_data_dir(), "book_dict",
                                      "word_cnt_dict.pkl")
    min_word_cnt = 5

    word_cnt_dict = load_dictionary(dict_path=word_cnt_dict_path)
    print(len(word_cnt_dict))

    training_num = count_training_num(input_path=train_path,
                                      word_cnt_dict=word_cnt_dict,
                                      min_word_cnt=min_word_cnt)
    print(training_num)
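
The number returned here is what fills total_num_train / total_num_val in the training script (Example #6); a quick sketch of the implied per-epoch batch count, using the batch_size from that script:

    # Sketch: relate the token count to the batch count used in the training loop
    batch_size = 128
    total_train_batch = training_num // batch_size + 1
    print("total_train_batch:", total_train_batch)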