import random

import numpy as np

# (text8, batch, models, graph_optimisation and training are project-local
# modules, assumed to be imported earlier in the script.)

skip_window = 1  # How many words to consider left and right.
num_skips = 2    # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16      # Random set of words to evaluate similarity on.
valid_window = 100   # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled = 64     # Number of negative examples to sample.
num_points = 400     # Number of embeddings to visualise.

# Preparing or loading the dataset.
_, data, count, dictionary, reverse_dictionary = \
    text8.prepare_dataset(vocabulary_size, data_folder)

# Initialising batch generators.
skipgram_batches = batch.SkipgramBatchGenerator(data, batch_size_skip,
                                                num_skips, skip_window)
cbow_batches = batch.CBOWBatchGenerator(data, batch_size_cbow, skip_window)

# Skipgram.
tf_graph, optimizer, loss, normalized_embeddings, similarity = \
    models.skipgram_model(vocabulary_size, embedding_size, batch_size_skip,
                          num_sampled, valid_examples, learning_rate)

print('Training using skipgram model...')
normalized_embeddings = graph_optimisation.run_embedding(
    tf_graph, optimizer, loss, similarity, normalized_embeddings,
    skipgram_batches, valid_examples, reverse_dictionary, num_steps)

# NOTE: the call below was truncated in the source; the second argument is an
# assumed reconstruction (labels for the first num_points embeddings, skipping
# the UNK token at index 0).
training.utils.plot_embedding(normalized_embeddings[1:num_points+1],
                              [reverse_dictionary[i]
                               for i in range(1, num_points + 1)])
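# Illustration (a minimal sketch, not part of the pipeline): how skip-gram
# (input, label) pairs are formed. Each centre word is reused num_skips times,
# paired with context words drawn from up to skip_window positions on either
# side; this mirrors what SkipgramBatchGenerator above yields, in batches.
def toy_skipgram_pairs(sequence, skip_window, num_skips):
    pairs = []
    for i, centre in enumerate(sequence):
        # Context positions inside the window, excluding the centre itself.
        context = [j for j in range(max(0, i - skip_window),
                                    min(len(sequence), i + skip_window + 1))
                   if j != i]
        for j in random.sample(context, min(num_skips, len(context))):
            pairs.append((centre, sequence[j]))
    return pairs

# e.g. toy_skipgram_pairs(['the', 'quick', 'brown', 'fox'], 1, 2) yields pairs
# such as ('quick', 'the') and ('quick', 'brown').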
learning_rate_decay_factor = 0.8  # Multiplicative decay for the learning rate.
num_steps = 5000                  # Number of training steps.
verbose_frequency = 200           # How often to report training progress.


def create_set(words):
    data = []
    for w in words:
        in_word = [dataset.utils.seq2seq_char2id(c) for c in w]
        # The output should be the reversed word and the
        # "end of sequence" symbol.
        out_word = in_word[::-1] + [dataset.utils.EOS_ID]
        data.append((in_word, out_word))
    return data


_, _, _, dictionary, _ = text8.prepare_dataset(vocabulary_size, data_folder)

# Removing the UNK token and words that are longer than the max length allowed.
dictionary.pop("UNK", None)
too_long = {w for w in dictionary.keys() if len(w) > max_word_length}
for w in too_long:
    dictionary.pop(w, None)

# Creating a simple dataset from the 50000 words (or fewer) in the vocabulary.
words = list(dictionary.keys())
train_words = words[num_valid_words:]
valid_words = words[:num_valid_words]
train_data = create_set(train_words)
valid_data = create_set(valid_words)
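# Illustration (a minimal sketch, not part of the pipeline): what create_set
# produces for a single word, using dataset.utils.seq2seq_char2id and
# dataset.utils.EOS_ID from above. The output is the reversed character IDs
# followed by the end-of-sequence marker:
#   in_word  = [id('c'), id('a'), id('t')]
#   out_word = [id('t'), id('a'), id('c'), EOS_ID]
example_in, example_out = create_set(["cat"])[0]
assert example_out[:-1] == example_in[::-1]
assert example_out[-1] == dataset.utils.EOS_ID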