def load_data_as_sentences(path, word_to_num):
    """
    Converts the training data to an array of integer arrays.
    args:
        path: string pointing to the training data
        word_to_num: a dictionary from string words to integers
    returns:
        The raw loaded documents, and an array of integer arrays in which
        each array is a sentence and each integer is a word.
    """
    docs_data = du.load_dataset(path)
    S_data = du.docs_to_indices(docs_data, word_to_num)
    return docs_data, S_data
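# A minimal usage sketch for load_data_as_sentences (kept as a comment so it
# is not executed here). It assumes the same 'du' helper module used above and
# a word_to_num mapping built from the vocabulary file; the path is the PTB
# training file used elsewhere in this script.
#
#   docs_train, S_train = load_data_as_sentences('data/lm/ptb-train.txt', word_to_num)
#   # docs_train: the raw tokenized documents
#   # S_train:    the same sentences as arrays of vocabulary indices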
vocabsize = 2000
num_to_word = dict(enumerate(vocab.index[:vocabsize]))
word_to_num = du.invert_dict(num_to_word)

##
# Below needed for 'adj_loss': DO NOT CHANGE
fraction_lost = float(sum([vocab['count'][word] for word in vocab.index
                           if (not word in word_to_num) and (not word == "UUUNKKK")]))
fraction_lost /= sum([vocab['count'][word] for word in vocab.index
                      if (not word == "UUUNKKK")])
print "Retained %d words from %d (%.02f%% of all tokens)" % (vocabsize, len(vocab), 100*(1-fraction_lost))

# Load the training set
docs = du.load_dataset('data/lm/ptb-train.txt')
S_train = du.docs_to_indices(docs, word_to_num)
X_train, Y_train = du.seqs_to_lmXY(S_train)

# Load the dev set (for tuning hyperparameters)
docs = du.load_dataset('data/lm/ptb-dev.txt')
S_dev = du.docs_to_indices(docs, word_to_num)
X_dev, Y_dev = du.seqs_to_lmXY(S_dev)

# Load the test set (final evaluation only)
docs = du.load_dataset('data/lm/ptb-test.txt')
S_test = du.docs_to_indices(docs, word_to_num)
X_test, Y_test = du.seqs_to_lmXY(S_test)

# Display some sample data
#print " ".join(d[0] for d in docs[7])
#print S_test[7]
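# Hedged sketch, not the actual du implementation: du.seqs_to_lmXY is assumed
# to produce standard next-word-prediction pairs, where each input sequence is
# the sentence without its last token and each target is the sentence shifted
# left by one. The helper below is a self-contained illustration of that
# convention; its name is hypothetical and it is not used elsewhere.
def _example_seq_to_lm_xy(seq):
    # seq is a list of word indices, e.g. [0, 12, 7, 1]
    x = seq[:-1]   # inputs:  all tokens except the last
    y = seq[1:]    # targets: all tokens except the first (the next word at each step)
    return x, y

# e.g. _example_seq_to_lm_xy([0, 12, 7, 1]) -> ([0, 12, 7], [12, 7, 1])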
vocab = pd.read_table(
    "data/lm/vocab.ptb.txt",
    header=None,
    sep="\s+",
    index_col=0,
    names=['count', 'freq'],
)

# Choose how many top words to keep
vocabsize = 2000
num_to_word = dict(enumerate(vocab.index[:vocabsize]))
word_to_num = du.invert_dict(num_to_word)

# Load the training set
docs_train = du.load_dataset('data/lm/ptb-train.txt')
S_train = du.docs_to_indices(docs_train, word_to_num)

# Load the dev set
docs_dev = du.load_dataset('data/lm/ptb-dev.txt')
S_dev = du.docs_to_indices(docs_dev, word_to_num)


def train_ngrams(dataset):
    """
    Takes an array of index arrays, where each index corresponds to a word.
    Returns trigram, bigram, unigram and total token counts.
    """
    trigram_counts = dict()
    bigram_counts = dict()
    unigram_counts = dict()
    token_count = 0
    ### YOUR CODE HERE
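# One possible way to fill in the counting above (a sketch, not necessarily
# the intended solution): walk each sentence once, updating unigram counts per
# token, bigram counts per adjacent pair, and trigram counts per adjacent
# triple, with n-grams keyed by tuples of word indices. The function name is
# hypothetical and kept separate from the scaffold above.
def example_train_ngrams(dataset):
    trigram_counts = dict()
    bigram_counts = dict()
    unigram_counts = dict()
    token_count = 0
    for sentence in dataset:
        for i, word in enumerate(sentence):
            # Every token contributes one unigram observation.
            unigram_counts[word] = unigram_counts.get(word, 0) + 1
            token_count += 1
            if i >= 1:
                # Adjacent pair (previous word, current word).
                bigram = (sentence[i - 1], word)
                bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1
            if i >= 2:
                # Adjacent triple ending at the current word.
                trigram = (sentence[i - 2], sentence[i - 1], word)
                trigram_counts[trigram] = trigram_counts.get(trigram, 0) + 1
    return trigram_counts, bigram_counts, unigram_counts, token_count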
num_to_word = dict(enumerate(vocab.index[:vocabsize]))
word_to_num = du.invert_dict(num_to_word)

fraction_lost = float(
    sum([
        vocab['count'][word] for word in vocab.index
        if (not word in word_to_num) and (not word == "UUUNKKK")
    ]))
fraction_lost /= sum([
    vocab['count'][word] for word in vocab.index if (not word == "UUUNKKK")
])
print "Retained %d words from %d (%.02f%% of all tokens)" % (
    vocabsize, len(vocab), 100 * (1 - fraction_lost))

docs = du.load_dataset('data/lm/ptb-train.txt')
S_train = du.docs_to_indices(docs, word_to_num)
X_train, Y_train = du.seqs_to_lmXY(S_train)

docs = du.load_dataset('data/lm/ptb-dev.txt')
S_dev = du.docs_to_indices(docs, word_to_num)
X_dev, Y_dev = du.seqs_to_lmXY(S_dev)

docs = du.load_dataset('data/lm/ptb-test.txt')
S_test = du.docs_to_indices(docs, word_to_num)
X_test, Y_test = du.seqs_to_lmXY(S_test)

#print " ".join(d[0] for d in docs[7])
#print " ".join(num_to_word[i] for i in S_test[7])

# For random samples from N(mu, sigma^2), use:
#   sigma * np.random.randn(...) + mu
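# A small illustration of the Gaussian-sampling note above. The assumed use
# case is randomly initializing a weight matrix; the shape and the values of
# mu and sigma here are hypothetical, not prescribed by this script.
import numpy as np

mu, sigma = 0.0, 0.1
# Draws a 50x100 matrix whose entries are i.i.d. samples from N(mu, sigma^2).
example_weights = sigma * np.random.randn(50, 100) + mu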