Example #1
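The code below also relies on a few imports and hyperparameters that are not shown in this listing. The repository modules (text8, batch, models, graph_optimisation, training.utils, dataset.utils) are assumed to be importable; the numeric values are only plausible defaults, with vocabulary_size matching the 50000-word vocabulary used further down.

import random
import numpy as np

# Assumed values, not part of the original listing.
data_folder = 'data'        # Folder where the text8 corpus is downloaded/cached.
vocabulary_size = 50000     # Keep the 50000 most frequent words.
embedding_size = 128        # Dimensionality of the word vectors.
batch_size_skip = 128       # Batch size for the skip-gram model.
batch_size_cbow = 128       # Batch size for the CBOW model.
learning_rate = 1.0         # Initial learning rate for the optimizer.
num_steps = 100001          # Training steps for the embedding model.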
skip_window = 1 # How many words to consider left and right.
num_skips = 2 # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled = 64 # Number of negative examples to sample.

num_points = 400

# Preparing or loading the dataset.
_, data, count, dictionary, reverse_dictionary = \
    text8.prepare_dataset(vocabulary_size, data_folder)

# Initialising the batch generators.
skipgram_batches = batch.SkipgramBatchGenerator(data, batch_size_skip, num_skips, skip_window)
cbow_batches = batch.CBOWBatchGenerator(data, batch_size_cbow, skip_window)
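For reference, a minimal, self-contained sketch of how a skip-gram batch is typically built: each centre word is reused num_skips times as the input, paired with labels drawn from a window of skip_window words on either side. This is an illustration of the technique, not the repository's SkipgramBatchGenerator.

def sketch_skipgram_batch(data, index, batch_size, num_skips, skip_window):
    # Illustrative only: num_skips (input, label) pairs per centre word.
    assert batch_size % num_skips == 0
    batch = np.ndarray(shape=(batch_size,), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window ... centre ... skip_window ]
    for i in range(batch_size // num_skips):
        window = [data[(index + j) % len(data)] for j in range(span)]
        # Pick num_skips distinct context positions around the centre word.
        contexts = random.sample(
            [j for j in range(span) if j != skip_window], num_skips)
        for k, j in enumerate(contexts):
            batch[i * num_skips + k] = window[skip_window]
            labels[i * num_skips + k, 0] = window[j]
        index += 1
    return batch, labels, index

# e.g. b, l, _ = sketch_skipgram_batch(data, 0, 8, num_skips, skip_window)
# [reverse_dictionary[w] for w in b] shows the repeated centre words.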

# Skipgram.
tf_graph, optimizer, loss, normalized_embeddings, similarity = \
    models.skipgram_model(vocabulary_size, embedding_size, batch_size_skip, num_sampled,
        valid_examples, learning_rate)
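models.skipgram_model is expected to pair an embedding lookup with a sampled loss so that only num_sampled negative classes are evaluated per step instead of the full vocabulary. A minimal TF1-style sketch of that idea (an assumed illustration, not the repository's exact graph):

import math
import tensorflow as tf

def sketch_skipgram_graph(vocab_size, emb_size, batch_size, num_sampled):
    # Illustrative graph only; the real model also exposes an optimizer,
    # normalized embeddings and a validation-similarity op.
    graph = tf.Graph()
    with graph.as_default():
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        embeddings = tf.Variable(
            tf.random_uniform([vocab_size, emb_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
        weights = tf.Variable(
            tf.truncated_normal([vocab_size, emb_size],
                                stddev=1.0 / math.sqrt(emb_size)))
        biases = tf.Variable(tf.zeros([vocab_size]))
        # Sampled softmax: draw num_sampled negative classes per batch.
        loss = tf.reduce_mean(
            tf.nn.sampled_softmax_loss(weights=weights, biases=biases,
                                       labels=train_labels, inputs=embed,
                                       num_sampled=num_sampled,
                                       num_classes=vocab_size))
    return graph, loss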

print('Training using skipgram model...')
normalized_embeddings = graph_optimisation.run_embedding(tf_graph, optimizer, loss, similarity,
    normalized_embeddings, skipgram_batches, valid_examples, reverse_dictionary, num_steps)

plot_words = [reverse_dictionary[i] for i in range(1, num_points + 1)]
training.utils.plot_embedding(normalized_embeddings[1:num_points+1],
                              plot_words)  # Assumed: labels for the plotted points.
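With unit-norm embeddings, cosine similarity reduces to a dot product, so nearest neighbours can be read directly from the matrix. A small sketch, assuming run_embedding returned the trained embeddings as a NumPy array (as the slicing above suggests):

def nearest_words(word, embeddings, dictionary, reverse_dictionary, top_k=8):
    # Illustrative helper, not part of the repository.
    sim = np.dot(embeddings, embeddings[dictionary[word]])
    nearest = (-sim).argsort()[1:top_k + 1]  # Skip the word itself.
    return [reverse_dictionary[i] for i in nearest]

# e.g. nearest_words('three', normalized_embeddings, dictionary, reverse_dictionary)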

Example #2

learning_rate_decay_factor = 0.8
num_steps = 5000

verbose_frequency = 200
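Two constants used below are not shown in this listing either; assumed, illustrative values (vocabulary_size and data_folder are reused from above):

max_word_length = 20      # Assumed: words longer than this are dropped.
num_valid_words = 1000    # Assumed: number of words held out for validation.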

def create_set(words):
    data = []
    for w in words:
        in_word = [dataset.utils.seq2seq_char2id(c) for c in w]
        # The output should be the reversed word and the
        # "end of sequence" symbol.
        out_word = in_word[::-1] + [dataset.utils.EOS_ID]
        data.append((in_word, out_word))
    return data
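For a word such as "cat", create_set pairs the character ids of "cat" with the ids of the reversed word "tac" followed by the end-of-sequence symbol:

# create_set(["cat"]) ->
#   [([id('c'), id('a'), id('t')],
#     [id('t'), id('a'), id('c'), dataset.utils.EOS_ID])]
# where id(c) stands for dataset.utils.seq2seq_char2id(c).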

_, _, _, dictionary, _ = text8.prepare_dataset(vocabulary_size, data_folder)

# Removing the UNK token and any words longer than the maximum allowed length.
dictionary.pop("UNK", None)
too_long = {w for w in dictionary.keys() if len(w) > max_word_length}
for w in too_long:
    dictionary.pop(w, None)

# Creating a simple dataset from the 50000 words (or fewer) in the vocabulary.
words = list(dictionary.keys())

train_words = words[num_valid_words:]
valid_words = words[:num_valid_words]

train_data = create_set(train_words)
valid_data = create_set(valid_words)
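A quick sanity check that every target really is the reversed input followed by EOS:

# Illustrative check only.
for in_word, out_word in valid_data[:3]:
    assert out_word[:-1] == in_word[::-1]
    assert out_word[-1] == dataset.utils.EOS_ID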