Example #1
  def load_data(self, debug=False):
    """Loads starter word-vectors and train/dev/test data."""
    # Load the starter word vectors
    self.wv, word_to_num, num_to_word = ner.load_wv(
      'data/ner/vocab.txt', 'data/ner/wordVectors.txt')
    tagnames = ['O', 'LOC', 'MISC', 'ORG', 'PER']
    self.num_to_tag = dict(enumerate(tagnames))
    tag_to_num = {v:k for k,v in self.num_to_tag.iteritems()}

    # Load the training set
    docs = du.load_dataset('data/ner/train')
    self.X_train, self.y_train = du.docs_to_windows(
        docs, word_to_num, tag_to_num, wsize=self.config.window_size)
    if debug:
      self.X_train = self.X_train[:1024]
      self.y_train = self.y_train[:1024]

    # Load the dev set (for tuning hyperparameters)
    docs = du.load_dataset('data/ner/dev')
    self.X_dev, self.y_dev = du.docs_to_windows(
        docs, word_to_num, tag_to_num, wsize=self.config.window_size)
    if debug:
      self.X_dev = self.X_dev[:1024]
      self.y_dev = self.y_dev[:1024]

    # Load the test set (dummy labels only)
    docs = du.load_dataset('data/ner/test.masked')
    self.X_test, self.y_test = du.docs_to_windows(
        docs, word_to_num, tag_to_num, wsize=self.config.window_size)
Example #2
    def load_data(self, debug=False):
        """Loads starter word-vectors and train/dev/test data."""
        # Load the starter word vectors
        self.wv, word_to_num, num_to_word = ner.load_wv(
            'data/ner/vocab.txt', 'data/ner/wordVectors.txt')
        tagnames = ['O', 'LOC', 'MISC', 'ORG', 'PER']
        self.num_to_tag = dict(enumerate(tagnames))
        tag_to_num = {v: k for k, v in self.num_to_tag.iteritems()}

        # Load the training set
        docs = du.load_dataset('data/ner/train')
        self.X_train, self.y_train = du.docs_to_windows(
            docs, word_to_num, tag_to_num, wsize=self.config.window_size)
        if debug:
            self.X_train = self.X_train[:1024]
            self.y_train = self.y_train[:1024]

        # Load the dev set (for tuning hyperparameters)
        docs = du.load_dataset('data/ner/dev')
        self.X_dev, self.y_dev = du.docs_to_windows(
            docs, word_to_num, tag_to_num, wsize=self.config.window_size)
        if debug:
            self.X_dev = self.X_dev[:1024]
            self.y_dev = self.y_dev[:1024]

        # Load the test set (dummy labels only)
        docs = du.load_dataset('data/ner/test.masked')
        self.X_test, self.y_test = du.docs_to_windows(
            docs, word_to_num, tag_to_num, wsize=self.config.window_size)
Example #3
    def load_data(self, debug=False, search=False):
        """Loads starter word-vectors and train/dev/test data."""
        # Load the starter word vectors
        path_vocab = 'data/ner/vocab.txt'
        path_wordVectors = 'data/ner/wordVectors.txt'
        path_train = 'data/ner/train'
        path_dev = 'data/ner/dev'
        path_test = 'data/ner/test.masked'
        if search:
            currentdir = os.path.dirname(
                os.path.abspath(inspect.getfile(inspect.currentframe())))
            path_vocab = currentdir + "/" + path_vocab
            path_wordVectors = currentdir + "/" + path_wordVectors
            path_train = currentdir + "/" + path_train
            path_dev = currentdir + "/" + path_dev
            path_test = currentdir + "/" + path_test
        self.wv, word_to_num, num_to_word = ner.load_wv(
            path_vocab, path_wordVectors)
        tagnames = ['O', 'LOC', 'MISC', 'ORG', 'PER']
        self.num_to_tag = dict(enumerate(tagnames))
        tag_to_num = {v: k for k, v in self.num_to_tag.iteritems()}

        # Load the training set
        docs = du.load_dataset(path_train)
        self.X_train, self.y_train = du.docs_to_windows(
            docs, word_to_num, tag_to_num, wsize=self.config.window_size)
        if debug:
            self.X_train = self.X_train[:1024]
            self.y_train = self.y_train[:1024]

        # Load the dev set (for tuning hyperparameters)
        docs = du.load_dataset(path_dev)
        self.X_dev, self.y_dev = du.docs_to_windows(
            docs, word_to_num, tag_to_num, wsize=self.config.window_size)
        if debug:
            self.X_dev = self.X_dev[:1024]
            self.y_dev = self.y_dev[:1024]

        # Load the test set (dummy labels only)
        docs = du.load_dataset(path_test)
        self.X_test, self.y_test = du.docs_to_windows(
            docs, word_to_num, tag_to_num, wsize=self.config.window_size)
Example #4
def load_data_as_sentences(path, word_to_num):
    """
    Converts the training data to an array of integer arrays.
      args: 
        path: string pointing to the training data
        word_to_num: A dictionary from string words to integers
      returns:
        An array of integer arrays. Each array is a sentence and each 
        integer is a word.
    """
    docs_data = du.load_dataset(path)
    S_data = du.docs_to_indices(docs_data, word_to_num)
    return docs_data, S_data
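
A minimal usage sketch for the function above, assuming the data_utils imports and PTB paths that appear in the later examples (the vocabulary size here is illustrative):

import pandas as pd
import data_utils.utils as du

# Build word <-> index mappings from the vocabulary file, as in the
# language-model examples further down.
vocab = pd.read_table('data/lm/vocab.ptb.txt', header=None, sep="\s+",
                      index_col=0, names=['count', 'freq'])
num_to_word = dict(enumerate(vocab.index[:2000]))
word_to_num = du.invert_dict(num_to_word)

# Each element of S_train is one sentence encoded as an array of word indices.
docs_train, S_train = load_data_as_sentences('data/lm/ptb-train.txt', word_to_num)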
Example #5
    def load_data(self, debug=False):
        self.wv, word_to_num, num_to_word = ner.load_wv(
            'data/ner/vocab.txt', 'data/ner/wordVectors.txt')
        tagnames = ['O', 'LOC', 'MISC', 'ORG', 'PER']
        self.num_to_tag = dict(enumerate(tagnames))
        tag_to_num = {v: k for k, v in self.num_to_tag.iteritems()}

        docs = du.load_dataset('data/ner/train')
        self.X_train, self.y_train = du.docs_to_windows(
            docs, word_to_num, tag_to_num, wsize=self.config.window_size)
        if debug:
            self.X_train = self.X_train[:1024]
            self.y_train = self.y_train[:1024]
        docs = du.load_dataset('data/ner/dev')
        self.X_dev, self.y_dev = du.docs_to_windows(
            docs, word_to_num, tag_to_num, wsize=self.config.window_size)
        if debug:
            self.X_dev = self.X_dev[:1024]
            self.y_dev = self.y_dev[:1024]
        docs = du.load_dataset('data/ner/test.masked')
        self.X_test, self.y_test = du.docs_to_windows(
            docs, word_to_num, tag_to_num, wsize=self.config.window_size)
Example #6
    def load_data(self, debug=False):
        self.wv, word2num, num2word = ner.load_wv('data/ner/vocab.txt',
                                                  'data/ner/wordVectors.txt')
        self.wv = self.wv.astype(np.float32)
        tags = ["O", "LOC", "MISC", "ORG", "PER"]
        self.num2tag = dict(enumerate(tags))
        tag2num = dict(zip(self.num2tag.values(), self.num2tag.keys()))
        docs = du.load_dataset('data/ner/train')
        self.X_train, self.y_train = du.docs_to_windows(
            docs, word2num, tag2num, wsize=self.config.window_size)
        if debug:
            self.X_train = self.X_train[:1024]
            self.y_train = self.y_train[:1024]

        docs = du.load_dataset('data/ner/dev')
        self.X_dev, self.y_dev = du.docs_to_windows(
            docs, word2num, tag2num, wsize=self.config.window_size)
        if debug:
            self.X_dev = self.X_dev[:1024]
            self.y_dev = self.y_dev[:1024]

        docs = du.load_dataset('data/ner/test.masked')
        self.X_test, self.y_test = du.docs_to_windows(
            docs, word2num, tag2num, wsize=self.config.window_size)
Example #7
class EssayGraderModel(Model):

  def load_data(self, debug=False):
     """Loads starter word-vectors and train/dev/test data."""
    # Load the starter word vectors
    self.num_uniquewords #to recieve
    # Load the training set
    self.X_train, self.y_train #to receive
    if debug:
      self.X_train = self.X_train[:1024]
      self.y_train = self.y_train[:1024]

    # Load the dev set (for tuning hyperparameters)
    docs = du.load_dataset('data/ner/dev')
    self.X_dev, self.y_dev  # to receive
    if debug:
      self.X_dev = self.X_dev[:1024]
      self.y_dev = self.y_dev[:1024]

    # Load the test set (dummy labels only)
    self.X_test, self.y_test #to receive
Example #8

import data_utils.utils as du
import data_utils.ner as ner

# Load the starter word vectors
wv, word_to_num, num_to_word = ner.load_wv('data/ner/vocab.txt',
                                           'data/ner/wordVectors.txt')
tagnames = ["O", "LOC", "MISC", "ORG", "PER"]
num_to_tag = dict(enumerate(tagnames))
tag_to_num = du.invert_dict(num_to_tag)

# Set window size
windowsize = 3

# Load the training set
docs = du.load_dataset('data/ner/train')
X_train, y_train = du.docs_to_windows(docs,
                                      word_to_num,
                                      tag_to_num,
                                      wsize=windowsize)

# Load the dev set (for tuning hyperparameters)
docs = du.load_dataset('data/ner/dev')
X_dev, y_dev = du.docs_to_windows(docs,
                                  word_to_num,
                                  tag_to_num,
                                  wsize=windowsize)

# Load the test set (dummy labels only)
docs = du.load_dataset('data/ner/test.masked')
X_test, y_test = du.docs_to_windows(docs,
                                    word_to_num,
                                    tag_to_num,
                                    wsize=windowsize)
Example #9
# Load the vocabulary
vocab = pd.read_table(
    "data/lm/vocab.ptb.txt",
    header=None,
    sep="\s+",
    index_col=0,
    names=['count', 'freq'],
)

# Choose how many top words to keep
vocabsize = 2000
num_to_word = dict(enumerate(vocab.index[:vocabsize]))
word_to_num = du.invert_dict(num_to_word)

# Load the training set
docs_train = du.load_dataset('data/lm/ptb-train.txt')
S_train = du.docs_to_indices(docs_train, word_to_num)
docs_dev = du.load_dataset('data/lm/ptb-dev.txt')
S_dev = du.docs_to_indices(docs_dev, word_to_num)


def train_ngrams(dataset):
    """
        Gets an array of arrays of indexes, each one corresponds to a word.
        Returns trigram, bigram, unigram and total counts.
    """
    trigram_counts = dict()
    bigram_counts = dict()
    unigram_counts = dict()
    token_count = 0
    ### YOUR CODE HERE
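
The counting logic is left as an exercise in the original ("YOUR CODE HERE"). A minimal sketch of one way to fill it in, not the original author's solution, assuming each element of dataset is an array of word indices as produced by du.docs_to_indices:

    for sentence in dataset:
        for i, word in enumerate(sentence):
            token_count += 1
            unigram_counts[word] = unigram_counts.get(word, 0) + 1
            if i >= 1:
                bigram = (sentence[i - 1], word)
                bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1
            if i >= 2:
                trigram = (sentence[i - 2], sentence[i - 1], word)
                trigram_counts[trigram] = trigram_counts.get(trigram, 0) + 1
    return trigram_counts, bigram_counts, unigram_counts, token_count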
Example #10

import itertools
from numpy import *
from multiprocessing import Pool
import random as rdm
# data_utils imports used below (as in the other examples)
import data_utils.utils as du
import data_utils.ner as ner

random.seed(10)

wv, word_to_num, num_to_word = ner.load_wv('data/ner/vocab.txt',
                                           'data/ner/wordVectors.txt')

tagnames = ["O", "LOC", "MISC", "ORG", "PER"]
num_to_tag = dict(enumerate(tagnames))
tag_to_num = du.invert_dict(num_to_tag)

windowsize = 3
docs = du.load_dataset('data/ner/train')
X_train, y_train = du.docs_to_windows(docs, word_to_num, tag_to_num, wsize=windowsize)

docs = du.load_dataset('data/ner/dev')
X_dev, y_dev = du.docs_to_windows(docs, word_to_num, tag_to_num, wsize=windowsize)

docs = du.load_dataset('data/ner/test.masked')
X_test, y_test = du.docs_to_windows(docs, word_to_num, tag_to_num, wsize=windowsize)


nepoch = 5
N = nepoch * len(y_train)
k = 5 # minibatch size
schedules = ["epoch", "N", "mini_batch"]
sche_params = []
for sche_name in schedules:
Example #11
File: ner1.py Project: framr/ml
    random.seed(10)
    print random_weight_matrix(3,5)


    # Load the starter word vectors
    wv, word_to_num, num_to_word = ner.load_wv('data/ner/vocab.txt',
                                           'data/ner/wordVectors.txt')
    # wv - matrix with word vectors N x D
    tagnames = ["O", "LOC", "MISC", "ORG", "PER"]
    num_to_tag = dict(enumerate(tagnames))
    tag_to_num = du.invert_dict(num_to_tag)

    windowsize = 3

    # Load the training set
    docs = du.load_dataset('data/ner/train') # [[sentence1 = [w1, tag1], [w2, tag2]...], [sentence2], ...]
    X_train, y_train = du.docs_to_windows(docs, word_to_num, tag_to_num, wsize=windowsize)
    print "Shape of data X = %s y = %s" % (X_train.shape, y_train.shape)
    print X_train[0:3], y_train[0:3]

    # Load the dev set (for tuning hyperparameters)
    docs = du.load_dataset('data/ner/dev')
    X_dev, y_dev = du.docs_to_windows(docs, word_to_num, tag_to_num, wsize=windowsize)

    # Load the test set (dummy labels only)
    docs = du.load_dataset('data/ner/test.masked')
    X_test, y_test = du.docs_to_windows(docs, word_to_num, tag_to_num, wsize=windowsize)


    # To avoid re-inventing the wheel, we provide a base class that handles a lot of the drudgery of 
    # managing parameters and running gradient descent. It's based on the classifier API used by 
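
The shape printout in this example suggests that du.docs_to_windows emits one row per tagged token: windowsize word indices centred on that token, paired with the tag index of the centre word. A small sanity check under that assumption, reusing the variables defined above:

    # Illustrative check, not part of the original example.
    assert X_train.shape == (len(y_train), windowsize)
    print "Sample window:", [num_to_word[w] for w in X_train[0]], "-> tag:", num_to_tag[y_train[0]]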
Example #12
    vocabsize = 6000
    num_to_word = dict(enumerate(vocab.index[:vocabsize]))
    word_to_num = du.invert_dict(num_to_word)

    fraction_lost = float(
        sum([
            vocab['count'][word] for word in vocab.index
            if (not word in word_to_num) and (not word == "UUUNKKK")
        ]))
    fraction_lost /= sum([
        vocab['count'][word] for word in vocab.index if (not word == "UUUNKKK")
    ])
    print "Retained %d words from %d (%.02f%% of all tokens)" % (
        vocabsize, len(vocab), 100 * (1 - fraction_lost))

    docs = du.load_dataset('data/lm/ptb-train.txt')
    S_train = du.docs_to_indices(docs, word_to_num)
    X_train, Y_train = du.seqs_to_lmXY(S_train)

    docs = du.load_dataset('data/lm/ptb-dev.txt')
    S_dev = du.docs_to_indices(docs, word_to_num)
    X_dev, Y_dev = du.seqs_to_lmXY(S_dev)

    docs = du.load_dataset('data/lm/ptb-test.txt')
    S_test = du.docs_to_indices(docs, word_to_num)
    X_test, Y_test = du.seqs_to_lmXY(S_test)

    #print " ".join(d[0] for d in docs[7])
    #print " ".join(num_to_word[i] for i in S_test[7])

    #For random samples from N(mu, sigma^2), use:
Example #13
	return new


if __name__ == "__main__":

	# Load the vocabulary
	vocab = pd.read_table("data/dictionary", header=None, sep="\s+",
			     index_col=0, names=['count', 'freq'], )
	# Choose how many top words to keep
	vocabsize = len(vocab)
	print 'vocabulary size %d' % vocabsize
	#vocabsize = 2000
	num_to_word = dict(enumerate(vocab.index[:vocabsize]))
	word_to_num = du.invert_dict(num_to_word)
	print 'load dictionary done'
	docs = du.load_dataset('data/rnn_input_train')
	S_train = du.docs_to_indices(docs, word_to_num)
	X_train, Y_train = du.seqs_to_lmXY(S_train)

	docs = du.load_dataset('data/rnn_input_test')
	S_train = du.docs_to_indices(docs, word_to_num)
	X_dev, Y_dev = du.seqs_to_lmXY(S_train)
	#X_train = X_train[:3000]
	#Y_train = Y_train[:3000]
	print 'load data done'
	print 'number of training data %d' % len(Y_train)

	method = "RNNPTONE"
	hdim = 40 # dimension of hidden layer = dimension of word vectors
	#random.seed(10)
	nepoch = 1
Example #14

vocab = pd.read_table("data/lm/vocab.ptb.txt", header=None, sep="\s+", index_col=0, names=["count", "freq"])

# Choose how many top words to keep
vocabsize = 2000
num_to_word = dict(enumerate(vocab.index[:vocabsize]))
word_to_num = du.invert_dict(num_to_word)
##
# Below needed for 'adj_loss': DO NOT CHANGE
fraction_lost = float(
    sum([vocab["count"][word] for word in vocab.index if (not word in word_to_num) and (not word == "UUUNKKK")])
)
fraction_lost /= sum([vocab["count"][word] for word in vocab.index if (not word == "UUUNKKK")])
print "Retained %d words from %d (%.02f%% of all tokens)" % (vocabsize, len(vocab), 100 * (1 - fraction_lost))

# Load the training set
docs = du.load_dataset("data/lm/ptb-train.txt")
S_train = du.docs_to_indices(docs, word_to_num)
X_train, Y_train = du.seqs_to_lmXY(S_train)

# Load the dev set (for tuning hyperparameters)
docs = du.load_dataset("data/lm/ptb-dev.txt")
S_dev = du.docs_to_indices(docs, word_to_num)
X_dev, Y_dev = du.seqs_to_lmXY(S_dev)

# Load the test set (final evaluation only)
docs = du.load_dataset("data/lm/ptb-test.txt")
S_test = du.docs_to_indices(docs, word_to_num)
X_test, Y_test = du.seqs_to_lmXY(S_test)

# Display some sample data
print " ".join(d[0] for d in docs[7])
Example #15
# Choose how many top words to keep
vocabsize = 2000
num_to_word = dict(enumerate(vocab.index[:vocabsize]))
word_to_num = du.invert_dict(num_to_word)
##
# Below needed for 'adj_loss': DO NOT CHANGE
fraction_lost = float(sum([vocab['count'][word] for word in vocab.index
                           if (not word in word_to_num)
                               and (not word == "UUUNKKK")]))
fraction_lost /= sum([vocab['count'][word] for word in vocab.index
                      if (not word == "UUUNKKK")])
print "Retained %d words from %d (%.02f%% of all tokens)" % (vocabsize, len(vocab),
                                                             100*(1-fraction_lost))

# Load the training set
docs = du.load_dataset('data/lm/ptb-train.txt')
S_train = du.docs_to_indices(docs, word_to_num)
X_train, Y_train = du.seqs_to_lmXY(S_train)

# Load the dev set (for tuning hyperparameters)
docs = du.load_dataset('data/lm/ptb-dev.txt')
S_dev = du.docs_to_indices(docs, word_to_num)
X_dev, Y_dev = du.seqs_to_lmXY(S_dev)

# Load the test set (final evaluation only)
docs = du.load_dataset('data/lm/ptb-test.txt')
S_test = du.docs_to_indices(docs, word_to_num)
X_test, Y_test = du.seqs_to_lmXY(S_test)

# Display some sample data
#print " ".join(d[0] for d in docs[7])
Example #16
def main():
    # Load the starter word vectors
    wv, word_to_num, num_to_word = ner.load_wv('data/ner/vocab.txt',
                                               'data/ner/wordVectors.txt')
    tagnames = ["O", "LOC", "MISC", "ORG", "PER"]
    num_to_tag = dict(enumerate(tagnames))
    tag_to_num = du.invert_dict(num_to_tag)

    # Set window size
    windowsize = 3

    # Load the training set
    docs = du.load_dataset('data/ner/train')
    X_train, y_train = du.docs_to_windows(docs,
                                          word_to_num,
                                          tag_to_num,
                                          wsize=windowsize)

    # Load the dev set (for tuning hyperparameters)
    docs = du.load_dataset('data/ner/dev')
    X_dev, y_dev = du.docs_to_windows(docs,
                                      word_to_num,
                                      tag_to_num,
                                      wsize=windowsize)

    # Load the test set (dummy labels only)
    docs = du.load_dataset('data/ner/test.masked')
    X_test, y_test = du.docs_to_windows(docs,
                                        word_to_num,
                                        tag_to_num,
                                        wsize=windowsize)
    clf = WindowMLP(wv,
                    windowsize=windowsize,
                    dims=[None, 100, 5],
                    reg=0.001,
                    alpha=0.01)
    train_size = X_train.shape[0]
    """
    costs = pickle.load(open("costs.dat", "rb"))
    clf = pickle.load(open("clf.dat", "rb"))
    """
    nepoch = 5
    N = nepoch * len(y_train)
    k = 5  # minibatch size
    costs = clf.train_sgd(X_train,
                          y_train,
                          idxiter=random_mini(k, N, train_size),
                          printevery=10000,
                          costevery=10000)

    pickle.dump(clf, open("clf.dat", "wb"))
    pickle.dump(costs, open("costs.dat", "wb"))
    plot_learning_curve(clf, costs)
    # Predict labels on the dev set
    yp = clf.predict(X_dev)
    # Save predictions to a file, one per line
    ner.save_predictions(yp, "dev.predicted")
    full_report(y_dev, yp, tagnames)  # full report, helpful diagnostics
    eval_performance(y_dev, yp, tagnames)  # performance: optimize this F1
    # L: V x 50
    # W[:,50:100]: 100 x 50
    responses = clf.sparams.L.dot(clf.params.W[:, 50:100].T)  # V x 100
    index = np.argsort(responses, axis=0)[::-1]

    neurons = [1, 3, 4, 6, 8]  # change this to your chosen neurons
    for i in neurons:
        print "Neuron %d" % i
        top_words = [num_to_word[k] for k in index[:10, i]]
        top_scores = [responses[k, i] for k in index[:10, i]]
        print_scores(top_scores, top_words)
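
random_mini is not defined in this excerpt; a plausible sketch of such a minibatch index generator, assuming train_sgd accepts an iterable of index arrays (the name and exact behaviour are assumptions, not taken from the source):

import numpy as np

def random_mini(batch_size, total, train_size):
    # Yield total/batch_size minibatches of random training indices.
    for _ in xrange(total / batch_size):
        yield np.random.randint(0, train_size, batch_size)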
Example #17
# In[4]:

# Load the starter word vectors
wv, word_to_num, num_to_word = ner.load_wv('data/ner/vocab.txt',
                                           'data/ner/wordVectors.txt')
#wv:(100232,50)
#word_to_num:dict,100232
tagnames = ["O", "LOC", "MISC", "ORG", "PER"]
num_to_tag = dict(enumerate(tagnames))
tag_to_num = du.invert_dict(num_to_tag)

# Set window size
windowsize = 3

# Load the training set
docs = du.load_dataset('data/ner/train')
X_train, y_train = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                      wsize=windowsize)

# Load the dev set (for tuning hyperparameters)
docs = du.load_dataset('data/ner/dev')
X_dev, y_dev = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                  wsize=windowsize)

# Load the test set (dummy labels only)
docs = du.load_dataset('data/ner/test.masked')
X_test, y_test = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                    wsize=windowsize)


# In[5]: