from functools import reduce


def build_doc_dataset(docs, vocabulary_size=50000):
    '''
    Build the dictionary and replace rare words with the UNK token.

    Parameters
    ----------
    docs: list of token lists; each token list represents a sentence
    vocabulary_size: maximum number of top-occurring tokens to keep;
        rarer tokens are replaced by 'UNK'
    '''
    # concat_lists and build_dataset are helpers assumed to be defined
    # elsewhere in the surrounding project.
    words = reduce(concat_lists, docs)

    doc_ids = []  # collect document (sentence) indices, one per token
    for i, doc in enumerate(docs):
        doc_ids.extend([i] * len(doc))

    word_ids, count, dictionary, reverse_dictionary = build_dataset(
        words, vocabulary_size=vocabulary_size)

    return doc_ids, word_ids, count, dictionary, reverse_dictionary
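Note: concat_lists and build_dataset are not shown above; they are assumed to come from the surrounding project. A minimal, hypothetical sketch (assuming concat_lists is plain list concatenation) of how the flattened word stream and the parallel doc_ids list line up:

from functools import reduce

def concat_lists(a, b):
    # Assumed behaviour: simple list concatenation, used to flatten the documents.
    return a + b

docs = [['the', 'cat', 'sat'], ['dogs', 'bark']]
words = reduce(concat_lists, docs)   # ['the', 'cat', 'sat', 'dogs', 'bark']

# One document index per token, so doc_ids[k] says which sentence words[k] came from.
doc_ids = []
for i, doc in enumerate(docs):
    doc_ids.extend([i] * len(doc))   # [0, 0, 0, 1, 1]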
Example #3
from itertools import chain

import tensorflow as tf
from six.moves import xrange
from sklearn.manifold import TSNE

from word2vec import read_data, build_dataset, plot_with_labels, cosine_similarity
from word2vec.skip_gram_model import generate_batch, generate_graph

if __name__ == '__main__':

    vocabulary_size = 10000

    words = read_data()
    words = [x for sublist in words for x in sublist]
    data, count, dictionary, reverse_dictionary = build_dataset(
        vocabulary_size, words)
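    # Returned values (following the usual word2vec-style build_dataset convention):
    # data               - corpus as a list of word indices
    # count              - (word, frequency) pairs, with 'UNK' first
    # dictionary         - word -> index
    # reverse_dictionary - index -> word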

    del words  # free memory

    batch_size = 128
    embedding_size = 128  # Dimension of the embedding vector.
    skip_window = 1  # How many words to consider left and right.
    num_skips = 2  # How many times to reuse an input to generate a label.
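    # Example: with skip_window = 1 and num_skips = 2, the sentence
    # "the quick brown fox" produces the (input -> label) pairs
    # (quick -> the), (quick -> brown), (brown -> quick), (brown -> fox):
    # each centre word is reused num_skips times, with labels drawn from
    # at most skip_window positions to the left and right.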

    valid_size = 9  # Number of hand-picked words to evaluate similarity on.
    valid_window = 100  # Only pick dev samples in the head of the distribution.
    num_sampled = 64  # Number of negative examples to sample.
    # Input data: hand-picked evaluation words from the Chinese corpus, roughly
    # "director", "verbal expression", "observation", "bedside", "nursing",
    # "patient", "research achievements", "provincial/ministerial level", "recruit".
    valid_word = ['主任', '语言表达', '观察', '床头', '护理', '病人', '研究成果', '省部级', '引进']
    valid_examples = [dictionary[li] for li in valid_word]
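    # Note: dictionary[li] raises KeyError if a hand-picked word is not among the
    # top vocabulary_size tokens; dictionary.get(li, 0) (index 0 is typically 'UNK'
    # in this style of build_dataset) is a more forgiving lookup.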