def load_data(self, debug=False): """Loads starter word-vectors and train/dev/test data.""" # Load the starter word vectors self.wv, word_to_num, num_to_word = ner.load_wv( 'data/ner/vocab.txt', 'data/ner/wordVectors.txt') tagnames = ['O', 'LOC', 'MISC', 'ORG', 'PER'] self.num_to_tag = dict(enumerate(tagnames)) tag_to_num = {v:k for k,v in self.num_to_tag.iteritems()} # Load the training set docs = du.load_dataset('data/ner/train') self.X_train, self.y_train = du.docs_to_windows( docs, word_to_num, tag_to_num, wsize=self.config.window_size) if debug: self.X_train = self.X_train[:1024] self.y_train = self.y_train[:1024] # Load the dev set (for tuning hyperparameters) docs = du.load_dataset('data/ner/dev') self.X_dev, self.y_dev = du.docs_to_windows( docs, word_to_num, tag_to_num, wsize=self.config.window_size) if debug: self.X_dev = self.X_dev[:1024] self.y_dev = self.y_dev[:1024] # Load the test set (dummy labels only) docs = du.load_dataset('data/ner/test.masked') self.X_test, self.y_test = du.docs_to_windows( docs, word_to_num, tag_to_num, wsize=self.config.window_size)
def load_data(self, debug=False): """Loads starter word-vectors and train/dev/test data.""" # Load the starter word vectors self.wv, word_to_num, num_to_word = ner.load_wv( 'data/ner/vocab.txt', 'data/ner/wordVectors.txt') tagnames = ['O', 'LOC', 'MISC', 'ORG', 'PER'] self.num_to_tag = dict(enumerate(tagnames)) tag_to_num = {v: k for k, v in self.num_to_tag.iteritems()} # Load the training set docs = du.load_dataset('data/ner/train') self.X_train, self.y_train = du.docs_to_windows( docs, word_to_num, tag_to_num, wsize=self.config.window_size) if debug: self.X_train = self.X_train[:1024] self.y_train = self.y_train[:1024] # Load the dev set (for tuning hyperparameters) docs = du.load_dataset('data/ner/dev') self.X_dev, self.y_dev = du.docs_to_windows( docs, word_to_num, tag_to_num, wsize=self.config.window_size) if debug: self.X_dev = self.X_dev[:1024] self.y_dev = self.y_dev[:1024] # Load the test set (dummy labels only) docs = du.load_dataset('data/ner/test.masked') self.X_test, self.y_test = du.docs_to_windows( docs, word_to_num, tag_to_num, wsize=self.config.window_size)
def load_data(self, debug=False, search=False):
    """Loads starter word-vectors and train/dev/test data."""
    # Load the starter word vectors
    path_vocab = 'data/ner/vocab.txt'
    path_wordVectors = 'data/ner/wordVectors.txt'
    path_train = 'data/ner/train'
    path_dev = 'data/ner/dev'
    path_test = 'data/ner/test.masked'
    if search:
        # Resolve paths relative to the directory containing this module
        currentdir = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        path_vocab = currentdir + "/" + path_vocab
        path_wordVectors = currentdir + "/" + path_wordVectors
        path_train = currentdir + "/" + path_train
        path_dev = currentdir + "/" + path_dev
        path_test = currentdir + "/" + path_test
    self.wv, word_to_num, num_to_word = ner.load_wv(
        path_vocab, path_wordVectors)
    tagnames = ['O', 'LOC', 'MISC', 'ORG', 'PER']
    self.num_to_tag = dict(enumerate(tagnames))
    tag_to_num = {v: k for k, v in self.num_to_tag.iteritems()}

    # Load the training set
    docs = du.load_dataset(path_train)
    self.X_train, self.y_train = du.docs_to_windows(
        docs, word_to_num, tag_to_num, wsize=self.config.window_size)
    if debug:
        self.X_train = self.X_train[:1024]
        self.y_train = self.y_train[:1024]

    # Load the dev set (for tuning hyperparameters)
    docs = du.load_dataset(path_dev)
    self.X_dev, self.y_dev = du.docs_to_windows(
        docs, word_to_num, tag_to_num, wsize=self.config.window_size)
    if debug:
        self.X_dev = self.X_dev[:1024]
        self.y_dev = self.y_dev[:1024]

    # Load the test set (dummy labels only)
    docs = du.load_dataset(path_test)
    self.X_test, self.y_test = du.docs_to_windows(
        docs, word_to_num, tag_to_num, wsize=self.config.window_size)
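# Usage note for the `search` flag above: with search=True the data paths are
# resolved relative to the directory containing the module rather than the
# current working directory, so the loader works when invoked from elsewhere.
# The enclosing class is not shown, so `NERModel` here is hypothetical:
#
#   model = NERModel(config)
#   model.load_data(debug=False, search=True)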
def load_data_as_sentences(path, word_to_num):
    """
    Converts the training data to an array of integer arrays.

    args:
        path: string pointing to the training data
        word_to_num: a dictionary from string words to integers

    returns:
        docs_data: the raw loaded documents
        S_data: an array of integer arrays; each array is a sentence and
            each integer is a word index
    """
    docs_data = du.load_dataset(path)
    S_data = du.docs_to_indices(docs_data, word_to_num)
    return docs_data, S_data
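# A usage sketch for load_data_as_sentences, assuming word_to_num has been
# built from the starter vocabulary as in the other snippets (the data path
# is illustrative):
#
#   wv, word_to_num, num_to_word = ner.load_wv('data/ner/vocab.txt',
#                                              'data/ner/wordVectors.txt')
#   docs, S = load_data_as_sentences('data/lm/ptb-train.txt', word_to_num)
#   print S[0]  # the first sentence as an array of word indices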
def load_data(self, debug=False):
    self.wv, word_to_num, num_to_word = ner.load_wv(
        'data/ner/vocab.txt', 'data/ner/wordVectors.txt')
    tagnames = ['O', 'LOC', 'MISC', 'ORG', 'PER']
    self.num_to_tag = dict(enumerate(tagnames))
    tag_to_num = {v: k for k, v in self.num_to_tag.iteritems()}

    docs = du.load_dataset('data/ner/train')
    self.X_train, self.y_train = du.docs_to_windows(
        docs, word_to_num, tag_to_num, wsize=self.config.window_size)
    if debug:
        self.X_train = self.X_train[:1024]
        self.y_train = self.y_train[:1024]

    docs = du.load_dataset('data/ner/dev')
    self.X_dev, self.y_dev = du.docs_to_windows(
        docs, word_to_num, tag_to_num, wsize=self.config.window_size)
    if debug:
        self.X_dev = self.X_dev[:1024]
        self.y_dev = self.y_dev[:1024]

    docs = du.load_dataset('data/ner/test.masked')
    self.X_test, self.y_test = du.docs_to_windows(
        docs, word_to_num, tag_to_num, wsize=self.config.window_size)
def load_data(self, debug=False):
    self.wv, word2num, num2word = ner.load_wv('data/ner/vocab.txt',
                                              'data/ner/wordVectors.txt')
    self.wv = self.wv.astype(np.float32)
    tags = ["O", "LOC", "MISC", "ORG", "PER"]
    self.num2tag = dict(enumerate(tags))
    tag2num = dict(zip(self.num2tag.values(), self.num2tag.keys()))

    docs = du.load_dataset('data/ner/train')
    self.X_train, self.y_train = du.docs_to_windows(
        docs, word2num, tag2num, wsize=self.config.window_size)
    if debug:
        self.X_train = self.X_train[:1024]
        self.y_train = self.y_train[:1024]

    docs = du.load_dataset('data/ner/dev')
    self.X_dev, self.y_dev = du.docs_to_windows(
        docs, word2num, tag2num, wsize=self.config.window_size)
    if debug:
        self.X_dev = self.X_dev[:1024]
        self.y_dev = self.y_dev[:1024]

    docs = du.load_dataset('data/ner/test.masked')
    self.X_test, self.y_test = du.docs_to_windows(
        docs, word2num, tag2num, wsize=self.config.window_size)
class EssayGraderModel(Model):
    def load_data(self, debug=False):
        """Loads starter word-vectors and train/dev/test data."""
        # Load the starter word vectors
        self.num_uniquewords = None  # to receive

        # Load the training set
        self.X_train, self.y_train = None, None  # to receive
        if debug:
            self.X_train = self.X_train[:1024]
            self.y_train = self.y_train[:1024]

        # Load the dev set (for tuning hyperparameters)
        docs = du.load_dataset('data/ner/dev')
        self.X_dev, self.y_dev = None, None  # to receive
        if debug:
            self.X_dev = self.X_dev[:1024]
            self.y_dev = self.y_dev[:1024]

        # Load the test set (dummy labels only)
        self.X_test, self.y_test = None, None  # to receive
import data_utils.utils as du
import data_utils.ner as ner

# Load the starter word vectors
wv, word_to_num, num_to_word = ner.load_wv('data/ner/vocab.txt',
                                           'data/ner/wordVectors.txt')
tagnames = ["O", "LOC", "MISC", "ORG", "PER"]
num_to_tag = dict(enumerate(tagnames))
tag_to_num = du.invert_dict(num_to_tag)

# Set window size
windowsize = 3

# Load the training set
docs = du.load_dataset('data/ner/train')
X_train, y_train = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                      wsize=windowsize)

# Load the dev set (for tuning hyperparameters)
docs = du.load_dataset('data/ner/dev')
X_dev, y_dev = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                  wsize=windowsize)

# Load the test set (dummy labels only)
docs = du.load_dataset('data/ner/test.masked')
X_test, y_test = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                    wsize=windowsize)
# Load the vocabulary
vocab = pd.read_table("data/lm/vocab.ptb.txt", header=None, sep="\s+",
                      index_col=0, names=['count', 'freq'])

# Choose how many top words to keep
vocabsize = 2000
num_to_word = dict(enumerate(vocab.index[:vocabsize]))
word_to_num = du.invert_dict(num_to_word)

# Load the training set
docs_train = du.load_dataset('data/lm/ptb-train.txt')
S_train = du.docs_to_indices(docs_train, word_to_num)

docs_dev = du.load_dataset('data/lm/ptb-dev.txt')
S_dev = du.docs_to_indices(docs_dev, word_to_num)


def train_ngrams(dataset):
    """
    Takes an array of integer arrays, where each integer is a word index.
    Returns trigram, bigram, unigram, and total token counts.
    """
    trigram_counts = dict()
    bigram_counts = dict()
    unigram_counts = dict()
    token_count = 0
    ### YOUR CODE HERE
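# A minimal sketch of the counting loop left as an exercise above (an
# assumption, not the assignment's reference solution): walk each sentence
# and tally unigrams, bigrams, and trigrams using tuple keys.
def train_ngrams_sketch(dataset):
    trigram_counts = dict()
    bigram_counts = dict()
    unigram_counts = dict()
    token_count = 0
    for sentence in dataset:
        for i, w in enumerate(sentence):
            token_count += 1
            unigram_counts[w] = unigram_counts.get(w, 0) + 1
            if i >= 1:
                pair = (sentence[i - 1], w)
                bigram_counts[pair] = bigram_counts.get(pair, 0) + 1
            if i >= 2:
                triple = (sentence[i - 2], sentence[i - 1], w)
                trigram_counts[triple] = trigram_counts.get(triple, 0) + 1
    return trigram_counts, bigram_counts, unigram_counts, token_count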
import itertools
from numpy import *
from multiprocessing import Pool
import random as rdm

random.seed(10)  # numpy's random module, pulled in via `from numpy import *`

wv, word_to_num, num_to_word = ner.load_wv('data/ner/vocab.txt',
                                           'data/ner/wordVectors.txt')
tagnames = ["O", "LOC", "MISC", "ORG", "PER"]
num_to_tag = dict(enumerate(tagnames))
tag_to_num = du.invert_dict(num_to_tag)

windowsize = 3

docs = du.load_dataset('data/ner/train')
X_train, y_train = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                      wsize=windowsize)

docs = du.load_dataset('data/ner/dev')
X_dev, y_dev = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                  wsize=windowsize)

docs = du.load_dataset('data/ner/test.masked')
X_test, y_test = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                    wsize=windowsize)

nepoch = 5
N = nepoch * len(y_train)
k = 5  # minibatch size

schedules = ["epoch", "N", "mini_batch"]
sche_params = []
for sche_name in schedules:
random.seed(10)
print random_weight_matrix(3, 5)

# Load the starter word vectors
wv, word_to_num, num_to_word = ner.load_wv('data/ner/vocab.txt',
                                           'data/ner/wordVectors.txt')
# wv - matrix with word vectors, N x D
tagnames = ["O", "LOC", "MISC", "ORG", "PER"]
num_to_tag = dict(enumerate(tagnames))
tag_to_num = du.invert_dict(num_to_tag)

windowsize = 3

# Load the training set
docs = du.load_dataset('data/ner/train')
# [[sentence1 = [w1, tag1], [w2, tag2]...], [sentence2], ...]
X_train, y_train = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                      wsize=windowsize)
print "Shape of data X = %s y = %s" % (X_train.shape, y_train.shape)
print X_train[0:3], y_train[0:3]

# Load the dev set (for tuning hyperparameters)
docs = du.load_dataset('data/ner/dev')
X_dev, y_dev = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                  wsize=windowsize)

# Load the test set (dummy labels only)
docs = du.load_dataset('data/ner/test.masked')
X_test, y_test = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                    wsize=windowsize)

# To avoid re-inventing the wheel, we provide a base class that handles a lot
# of the drudgery of managing parameters and running gradient descent. It's
# based on the classifier API used by
vocabsize = 6000
num_to_word = dict(enumerate(vocab.index[:vocabsize]))
word_to_num = du.invert_dict(num_to_word)

fraction_lost = float(sum([vocab['count'][word] for word in vocab.index
                           if (not word in word_to_num)
                           and (not word == "UUUNKKK")]))
fraction_lost /= sum([vocab['count'][word] for word in vocab.index
                      if (not word == "UUUNKKK")])
print "Retained %d words from %d (%.02f%% of all tokens)" % (
    vocabsize, len(vocab), 100 * (1 - fraction_lost))

docs = du.load_dataset('data/lm/ptb-train.txt')
S_train = du.docs_to_indices(docs, word_to_num)
X_train, Y_train = du.seqs_to_lmXY(S_train)

docs = du.load_dataset('data/lm/ptb-dev.txt')
S_dev = du.docs_to_indices(docs, word_to_num)
X_dev, Y_dev = du.seqs_to_lmXY(S_dev)

docs = du.load_dataset('data/lm/ptb-test.txt')
S_test = du.docs_to_indices(docs, word_to_num)
X_test, Y_test = du.seqs_to_lmXY(S_test)

#print " ".join(d[0] for d in docs[7])
#print " ".join(num_to_word[i] for i in S_test[7])

#For random samples from N(mu, sigma^2), use:
    return new


if __name__ == "__main__":
    # Load the vocabulary
    vocab = pd.read_table("data/dictionary", header=None, sep="\s+",
                          index_col=0, names=['count', 'freq'])

    # Choose how many top words to keep
    vocabsize = len(vocab)
    print 'vocabulary size %d' % vocabsize
    #vocabsize = 2000
    num_to_word = dict(enumerate(vocab.index[:vocabsize]))
    word_to_num = du.invert_dict(num_to_word)
    print 'load dictionary done'

    docs = du.load_dataset('data/rnn_input_train')
    S_train = du.docs_to_indices(docs, word_to_num)
    X_train, Y_train = du.seqs_to_lmXY(S_train)

    docs = du.load_dataset('data/rnn_input_test')
    S_train = du.docs_to_indices(docs, word_to_num)
    X_dev, Y_dev = du.seqs_to_lmXY(S_train)

    #X_train = X_train[:3000]
    #Y_train = Y_train[:3000]
    print 'load data done'
    print 'number of training data %d' % len(Y_train)

    method = "RNNPTONE"
    hdim = 40  # dimension of hidden layer = dimension of word vectors
    #random.seed(10)
    nepoch = 1
vocab = pd.read_table("data/lm/vocab.ptb.txt", header=None, sep="\s+",
                      index_col=0, names=["count", "freq"])

# Choose how many top words to keep
vocabsize = 2000
num_to_word = dict(enumerate(vocab.index[:vocabsize]))
word_to_num = du.invert_dict(num_to_word)

##
# Below needed for 'adj_loss': DO NOT CHANGE
fraction_lost = float(sum([vocab["count"][word] for word in vocab.index
                           if (not word in word_to_num)
                           and (not word == "UUUNKKK")]))
fraction_lost /= sum([vocab["count"][word] for word in vocab.index
                      if (not word == "UUUNKKK")])
print "Retained %d words from %d (%.02f%% of all tokens)" % (
    vocabsize, len(vocab), 100 * (1 - fraction_lost))

# Load the training set
docs = du.load_dataset("data/lm/ptb-train.txt")
S_train = du.docs_to_indices(docs, word_to_num)
X_train, Y_train = du.seqs_to_lmXY(S_train)

# Load the dev set (for tuning hyperparameters)
docs = du.load_dataset("data/lm/ptb-dev.txt")
S_dev = du.docs_to_indices(docs, word_to_num)
X_dev, Y_dev = du.seqs_to_lmXY(S_dev)

# Load the test set (final evaluation only)
docs = du.load_dataset("data/lm/ptb-test.txt")
S_test = du.docs_to_indices(docs, word_to_num)
X_test, Y_test = du.seqs_to_lmXY(S_test)

# Display some sample data
print " ".join(d[0] for d in docs[7])
# Choose how many top words to keep
vocabsize = 2000
num_to_word = dict(enumerate(vocab.index[:vocabsize]))
word_to_num = du.invert_dict(num_to_word)

##
# Below needed for 'adj_loss': DO NOT CHANGE
fraction_lost = float(sum([vocab['count'][word] for word in vocab.index
                           if (not word in word_to_num)
                           and (not word == "UUUNKKK")]))
fraction_lost /= sum([vocab['count'][word] for word in vocab.index
                      if (not word == "UUUNKKK")])
print "Retained %d words from %d (%.02f%% of all tokens)" % (
    vocabsize, len(vocab), 100 * (1 - fraction_lost))

# Load the training set
docs = du.load_dataset('data/lm/ptb-train.txt')
S_train = du.docs_to_indices(docs, word_to_num)
X_train, Y_train = du.seqs_to_lmXY(S_train)

# Load the dev set (for tuning hyperparameters)
docs = du.load_dataset('data/lm/ptb-dev.txt')
S_dev = du.docs_to_indices(docs, word_to_num)
X_dev, Y_dev = du.seqs_to_lmXY(S_dev)

# Load the test set (final evaluation only)
docs = du.load_dataset('data/lm/ptb-test.txt')
S_test = du.docs_to_indices(docs, word_to_num)
X_test, Y_test = du.seqs_to_lmXY(S_test)

# Display some sample data
#print " ".join(d[0] for d in docs[7])
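# `du.seqs_to_lmXY` is used throughout these snippets but not defined here.
# For a next-word language model, Y is X shifted left by one position, so a
# minimal sketch of the expected behavior (an assumption about du's
# implementation, not its source) is:
def seqs_to_lmXY_sketch(seqs):
    X = [s[:-1] for s in seqs]  # input: all words but the last
    Y = [s[1:] for s in seqs]   # target: the next word at each position
    return X, Y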
def main():
    # Load the starter word vectors
    wv, word_to_num, num_to_word = ner.load_wv('data/ner/vocab.txt',
                                               'data/ner/wordVectors.txt')
    tagnames = ["O", "LOC", "MISC", "ORG", "PER"]
    num_to_tag = dict(enumerate(tagnames))
    tag_to_num = du.invert_dict(num_to_tag)

    # Set window size
    windowsize = 3

    # Load the training set
    docs = du.load_dataset('data/ner/train')
    X_train, y_train = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                          wsize=windowsize)

    # Load the dev set (for tuning hyperparameters)
    docs = du.load_dataset('data/ner/dev')
    X_dev, y_dev = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                      wsize=windowsize)

    # Load the test set (dummy labels only)
    docs = du.load_dataset('data/ner/test.masked')
    X_test, y_test = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                        wsize=windowsize)

    clf = WindowMLP(wv, windowsize=windowsize, dims=[None, 100, 5],
                    reg=0.001, alpha=0.01)
    train_size = X_train.shape[0]
    """
    costs = pickle.load(open("costs.dat", "rb"))
    clf = pickle.load(open("clf.dat", "rb"))
    """
    nepoch = 5
    N = nepoch * len(y_train)
    k = 5  # minibatch size

    costs = clf.train_sgd(X_train, y_train,
                          idxiter=random_mini(k, N, train_size),
                          printevery=10000, costevery=10000)

    pickle.dump(clf, open("clf.dat", "wb"))
    pickle.dump(costs, open("costs.dat", "wb"))

    plot_learning_curve(clf, costs)

    # Predict labels on the dev set
    yp = clf.predict(X_dev)
    # Save predictions to a file, one per line
    ner.save_predictions(yp, "dev.predicted")
    full_report(y_dev, yp, tagnames)       # full report, helpful diagnostics
    eval_performance(y_dev, yp, tagnames)  # performance: optimize this F1

    # L: V x 50
    # W[:,50:100]: 100 x 50
    responses = clf.sparams.L.dot(clf.params.W[:, 50:100].T)  # V x 100
    index = np.argsort(responses, axis=0)[::-1]

    neurons = [1, 3, 4, 6, 8]  # change this to your chosen neurons
    for i in neurons:
        print "Neuron %d" % i
        top_words = [num_to_word[k] for k in index[:10, i]]
        top_scores = [responses[k, i] for k in index[:10, i]]
        print_scores(top_scores, top_words)
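# `random_mini` is called in main() above but not defined in this snippet.
# A minimal sketch under the assumption that it yields N/k minibatches of k
# random training-set indices each (name and signature taken from the call
# site; the real scheduler may differ):
import numpy as np

def random_mini(k, N, train_size):
    # One batch of k uniformly sampled indices per iteration, N examples total
    for _ in range(N // k):
        yield np.random.randint(0, train_size, size=k)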
# Load the starter word vectors
wv, word_to_num, num_to_word = ner.load_wv('data/ner/vocab.txt',
                                           'data/ner/wordVectors.txt')
# wv: (100232, 50); word_to_num: dict with 100232 entries
tagnames = ["O", "LOC", "MISC", "ORG", "PER"]
num_to_tag = dict(enumerate(tagnames))
tag_to_num = du.invert_dict(num_to_tag)

# Set window size
windowsize = 3

# Load the training set
docs = du.load_dataset('data/ner/train')
X_train, y_train = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                      wsize=windowsize)

# Load the dev set (for tuning hyperparameters)
docs = du.load_dataset('data/ner/dev')
X_dev, y_dev = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                  wsize=windowsize)

# Load the test set (dummy labels only)
docs = du.load_dataset('data/ner/test.masked')
X_test, y_test = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                    wsize=windowsize)