Example #1
    word_to_num = du.invert_dict(num_to_word)

    # Fraction of the total token count lost by truncating the vocabulary
    # (words outside word_to_num), excluding the unknown-word token.
    fraction_lost = float(
        sum([
            vocab['count'][word] for word in vocab.index
            if word not in word_to_num and word != "UUUNKKK"
        ]))
    fraction_lost /= sum([
        vocab['count'][word] for word in vocab.index if word != "UUUNKKK"
    ])
    print "Retained %d words from %d (%.02f%% of all tokens)" % (
        vocabsize, len(vocab), 100 * (1 - fraction_lost))

    docs = du.load_dataset('data/lm/ptb-train.txt')
    S_train = du.docs_to_indices(docs, word_to_num)
    X_train, Y_train = du.seqs_to_lmXY(S_train)
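    # docs_to_indices presumably maps each token to its vocabulary index
    # (falling back to the "UUUNKKK" index for unknown words), and
    # seqs_to_lmXY presumably splits each sequence into (input, next-word)
    # pairs; both are assumptions about the du helpers, not confirmed here.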

    docs = du.load_dataset('data/lm/ptb-dev.txt')
    S_dev = du.docs_to_indices(docs, word_to_num)
    X_dev, Y_dev = du.seqs_to_lmXY(S_dev)

    docs = du.load_dataset('data/lm/ptb-test.txt')
    S_test = du.docs_to_indices(docs, word_to_num)
    X_test, Y_test = du.seqs_to_lmXY(S_test)

    #print " ".join(d[0] for d in docs[7])
    #print " ".join(num_to_word[i] for i in S_test[7])

    #For random samples from N(mu, sigma^2), use:
    #    sigma * np.random.randn(...) + mu
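    # For instance, a weight matrix initialized from N(0, 0.1^2) would be
    # (hypothetical names and shapes, shown only to illustrate the recipe):
    #     W = 0.1 * np.random.randn(hdim, hdim) + 0.0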
Example #2
num_to_word = dict(enumerate(vocab.index[:vocabsize]))
word_to_num = du.invert_dict(num_to_word)
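# A minimal sketch of what du.invert_dict likely does (an assumption,
# not the actual implementation): swap keys and values so that
# word_to_num maps each word back to its index:
#     def invert_dict(d):
#         return dict((v, k) for k, v in d.iteritems())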
##
# Below needed for 'adj_loss': DO NOT CHANGE
fraction_lost = float(sum([vocab['count'][word] for word in vocab.index
                           if word not in word_to_num
                               and word != "UUUNKKK"]))
fraction_lost /= sum([vocab['count'][word] for word in vocab.index
                      if word != "UUUNKKK"])
print "Retained %d words from %d (%.02f%% of all tokens)" % (vocabsize, len(vocab),
                                                             100*(1-fraction_lost))

# Load the training set
docs = du.load_dataset('data/lm/ptb-train.txt')
S_train = du.docs_to_indices(docs, word_to_num)
X_train, Y_train = du.seqs_to_lmXY(S_train)

# Load the dev set (for tuning hyperparameters)
docs = du.load_dataset('data/lm/ptb-dev.txt')
S_dev = du.docs_to_indices(docs, word_to_num)
X_dev, Y_dev = du.seqs_to_lmXY(S_dev)

# Load the test set (final evaluation only)
docs = du.load_dataset('data/lm/ptb-test.txt')
S_test = du.docs_to_indices(docs, word_to_num)
X_test, Y_test = du.seqs_to_lmXY(S_test)
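# A plausible sketch of seqs_to_lmXY (an assumption about the helper):
# for language modeling, X is each sequence minus its last token and
# Y is the same sequence shifted left by one:
#     def seqs_to_lmXY(seqs):
#         X = [s[:-1] for s in seqs]
#         Y = [s[1:] for s in seqs]
#         return X, Y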

# Display some sample data
#print " ".join(d[0] for d in docs[7])
#print S_test[7]
#
Example #3
if __name__ == "__main__":

	# Load the vocabulary
	vocab = pd.read_table("data/dictionary", header=None, sep=r"\s+",
			      index_col=0, names=['count', 'freq'])
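	# The file is assumed to hold one word per line followed by
	# whitespace-separated 'count' and 'freq' columns (hence sep=r"\s+"
	# and index_col=0); this is an inference from the read_table call.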
	# Choose how many top words to keep
	vocabsize = len(vocab)
	print 'vocabulary size %d' % vocabsize
	#vocabsize = 2000
	num_to_word = dict(enumerate(vocab.index[:vocabsize]))
	word_to_num = du.invert_dict(num_to_word)
	print 'load dictionary done'
	docs = du.load_dataset('data/rnn_input_train')
	S_train = du.docs_to_indices(docs, word_to_num)
	X_train, Y_train = du.seqs_to_lmXY(S_train)

	docs = du.load_dataset('data/rnn_input_test')
	S_dev = du.docs_to_indices(docs, word_to_num)
	X_dev, Y_dev = du.seqs_to_lmXY(S_dev)
	#X_train = X_train[:3000]
	#Y_train = Y_train[:3000]
	print 'load data done'
	print 'number of training examples %d' % len(Y_train)

	method = "RNNPTONE"
	hdim = 40 # dimension of hidden layer = dimension of word vectors
	#random.seed(10)
	nepoch = 1
	N = nepoch * len(Y_train)
	k = 5 # minibatch size
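	# Hypothetical continuation (model and method names are assumptions,
	# not from this snippet): N total examples processed in minibatches
	# of size k, e.g.
	#     for _ in xrange(N // k):
	#         batch = np.random.randint(0, len(Y_train), k)
	#         model.train_minibatch([X_train[i] for i in batch],
	#                               [Y_train[i] for i in batch])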