Example #1
def load_wv(vocabfile, wvfile):
    wv = loadtxt(wvfile, dtype=float)
    with open(vocabfile) as fd:
        words = [line.strip() for line in fd]
    num_to_word = dict(enumerate(words))
    word_to_num = invert_dict(num_to_word)
    return wv, word_to_num, num_to_word
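# The invert_dict helper used above is not shown in these snippets; a minimal
# sketch, assuming the id -> word mapping is one-to-one:
def invert_dict(d):
    # e.g. {0: 'the', 1: 'cat'} -> {'the': 0, 'cat': 1}
    return {v: k for k, v in d.items()}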
Example #3
#wv_dummy = random.randn(10,50)
#model = RNNLM(L0 = wv_dummy, U0 = wv_dummy,
#              alpha=0.005, rseed=10, bptt=4)
#model.grad_check(array([1,2,3]), array([2,3,4]))

from data_utils import utils as du
import pandas as pd

# Load the vocabulary
vocab = pd.read_table("data/lm/vocab.ptb.txt", header=None, sep="\s+",
                     index_col=0, names=['count', 'freq'], )

# Choose how many top words to keep
vocabsize = 2000
num_to_word = dict(enumerate(vocab.index[:vocabsize]))
word_to_num = du.invert_dict(num_to_word)
##
# Below needed for 'adj_loss': DO NOT CHANGE
fraction_lost = float(sum([vocab['count'][word] for word in vocab.index
                           if (not word in word_to_num)
                               and (not word == "UUUNKKK")]))
fraction_lost /= sum([vocab['count'][word] for word in vocab.index
                      if (not word == "UUUNKKK")])
print "Retained %d words from %d (%.02f%% of all tokens)" % (vocabsize, len(vocab),
                                                             100*(1-fraction_lost))

# Load the training set
docs = du.load_dataset('data/lm/ptb-train.txt')
S_train = du.docs_to_indices(docs, word_to_num)
X_train, Y_train = du.seqs_to_lmXY(S_train)
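# du.seqs_to_lmXY is external to this snippet; a rough sketch of what it
# presumably does (an assumption, not the actual data_utils code): split each
# index sequence into inputs (all but the last token) and next-word targets
# (all but the first), as a language model needs.
def seqs_to_lmXY_sketch(seqs):
    X = [s[:-1] for s in seqs]  # the model sees w_0 ... w_{n-1}
    Y = [s[1:] for s in seqs]   # and must predict w_1 ... w_n
    return X, Y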
Example #4
import data_utils.utils as du
import data_utils.ner as ner

# Load the starter word vectors
wv, word_to_num, num_to_word = ner.load_wv('data/ner/vocab.txt',
                                           'data/ner/wordVectors.txt')
tagnames = ["O", "LOC", "MISC", "ORG", "PER"]
num_to_tag = dict(enumerate(tagnames))
tag_to_num = du.invert_dict(num_to_tag)

# Set window size
windowsize = 3

# Load the training set
docs = du.load_dataset('data/ner/train')
X_train, y_train = du.docs_to_windows(docs,
                                      word_to_num,
                                      tag_to_num,
                                      wsize=windowsize)

# Load the dev set (for tuning hyperparameters)
docs = du.load_dataset('data/ner/dev')
X_dev, y_dev = du.docs_to_windows(docs,
                                  word_to_num,
                                  tag_to_num,
                                  wsize=windowsize)

# Load the test set (dummy labels only)
docs = du.load_dataset('data/ner/test.masked')
X_test, y_test = du.docs_to_windows(docs,
                                    word_to_num,
                                    tag_to_num,
                                    wsize=windowsize)
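# du.docs_to_windows is likewise external; an illustrative sketch (assumption)
# of the window extraction it performs for a single sentence: the indices of
# each token and its neighbours form the input, the token's tag is the label.
# The real helper additionally pads sentence boundaries with special tokens.
def windows_for_sentence(word_indices, tag_indices, wsize=3):
    half = wsize // 2
    X, y = [], []
    for i in range(half, len(word_indices) - half):
        X.append(word_indices[i - half:i + half + 1])
        y.append(tag_indices[i])
    return X, y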
Example #5

# Fragment of a larger training script: pandas (pd), numpy (np), random, time
# and the helpers called below are imported/defined elsewhere in the full file.
if __name__ == "__main__":
    # Load the vocabulary
    vocab = pd.read_table(
        "data/lm/vocab.ptb.txt",
        header=None,
        sep="\s+",
        index_col=0,
        names=['count', 'freq'],
    )

    vocabsize = 2000
    num_to_word = dict(enumerate(vocab.index[:vocabsize]))
    num_to_word_embedding = load_vocab_embeddings()
    word_to_num = du.invert_dict(num_to_word)

    # Load the training data
    _, S_train = load_data_as_sentences('data/lm/ptb-train.txt', word_to_num)
    in_word_index, out_word_index = convert_to_lm_dataset(S_train)
    assert len(in_word_index) == len(out_word_index)
    num_of_examples = len(in_word_index)

    random.seed(31415)
    np.random.seed(9265)
    in_word_index, out_word_index = shuffle_training_data(
        in_word_index, out_word_index)
    startTime = time.time()

    # Training should happen here
    # Initialize parameters randomly
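# shuffle_training_data is not included in this fragment; a standalone sketch
# of one plausible implementation (an assumption): apply the same random
# permutation to both index lists so every input stays paired with its target.
import random

def shuffle_training_data(in_word_index, out_word_index):
    paired = list(zip(in_word_index, out_word_index))
    random.shuffle(paired)
    ins, outs = zip(*paired)
    return list(ins), list(outs)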
Example #6
import data_utils.ner as ner
from data_utils import utils as du  # needed for du.invert_dict / du.load_dataset below
from softmax_example import SoftmaxRegression
from nerwindow import WindowMLP
import itertools
from numpy import *
from multiprocessing import Pool
import random as rdm

random.seed(10)

wv, word_to_num, num_to_word = ner.load_wv('data/ner/vocab.txt',
                                           'data/ner/wordVectors.txt')

tagnames = ["O", "LOC", "MISC", "ORG", "PER"]
num_to_tag = dict(enumerate(tagnames))
tag_to_num = du.invert_dict(num_to_tag)

windowsize = 3
docs = du.load_dataset('data/ner/train')
X_train, y_train = du.docs_to_windows(docs, word_to_num, tag_to_num, wsize=windowsize)

docs = du.load_dataset('data/ner/dev')
X_dev, y_dev = du.docs_to_windows(docs, word_to_num, tag_to_num, wsize=windowsize)

docs = du.load_dataset('data/ner/test.masked')
X_test, y_test = du.docs_to_windows(docs, word_to_num, tag_to_num, wsize=windowsize)


nepoch = 5
N = nepoch * len(y_train)
k = 5 # minibatch size
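# random_mini (passed to train_sgd in the next example) is not defined in this
# snippet; a minimal sketch, under the assumption that it yields N/k random
# minibatches of k row indices into the training set:
import numpy as np

def random_mini(k, N, train_size):
    for _ in range(N // k):
        yield np.random.randint(0, train_size, size=k)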
Example #7
def main():
    # Load the starter word vectors
    wv, word_to_num, num_to_word = ner.load_wv('data/ner/vocab.txt',
                                               'data/ner/wordVectors.txt')
    tagnames = ["O", "LOC", "MISC", "ORG", "PER"]
    num_to_tag = dict(enumerate(tagnames))
    tag_to_num = du.invert_dict(num_to_tag)

    # Set window size
    windowsize = 3

    # Load the training set
    docs = du.load_dataset('data/ner/train')
    X_train, y_train = du.docs_to_windows(docs,
                                          word_to_num,
                                          tag_to_num,
                                          wsize=windowsize)

    # Load the dev set (for tuning hyperparameters)
    docs = du.load_dataset('data/ner/dev')
    X_dev, y_dev = du.docs_to_windows(docs,
                                      word_to_num,
                                      tag_to_num,
                                      wsize=windowsize)

    # Load the test set (dummy labels only)
    docs = du.load_dataset('data/ner/test.masked')
    X_test, y_test = du.docs_to_windows(docs,
                                        word_to_num,
                                        tag_to_num,
                                        wsize=windowsize)
    clf = WindowMLP(wv,
                    windowsize=windowsize,
                    dims=[None, 100, 5],
                    reg=0.001,
                    alpha=0.01)
    train_size = X_train.shape[0]
    """
    costs = pickle.load(open("costs.dat", "rb"))
    clf = pickle.load(open("clf.dat", "rb"))
    """
    nepoch = 5
    N = nepoch * len(y_train)
    k = 5  # minibatch size
    costs = clf.train_sgd(X_train,
                          y_train,
                          idxiter=random_mini(k, N, train_size),
                          printevery=10000,
                          costevery=10000)

    pickle.dump(clf, open("clf.dat", "wb"))
    pickle.dump(costs, open("costs.dat", "wb"))
    plot_learning_curve(clf, costs)
    # Predict labels on the dev set
    yp = clf.predict(X_dev)
    # Save predictions to a file, one per line
    ner.save_predictions(yp, "dev.predicted")
    full_report(y_dev, yp, tagnames)  # full report, helpful diagnostics
    eval_performance(y_dev, yp, tagnames)  # performance: optimize this F1
    # L: V x 50
    # W[:,50:100]: 100 x 50
    responses = clf.sparams.L.dot(clf.params.W[:, 50:100].T)  # V x 100
    index = np.argsort(responses, axis=0)[::-1]

    neurons = [1, 3, 4, 6, 8]  # change this to your chosen neurons
    for i in neurons:
        print "Neuron %d" % i
        top_words = [num_to_word[k] for k in index[:10, i]]
        top_scores = [responses[k, i] for k in index[:10, i]]
        print_scores(top_scores, top_words)
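# print_scores is not defined above; a trivial standalone sketch of what it
# presumably does, printing each top word alongside its activation:
def print_scores(scores, words):
    for word, score in zip(words, scores):
        print("%-15s %.3f" % (word, score))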
Example #8
#from rnn_simple import RNN_SIMPLE
#from brnn import BRNN
#from brnn_weighted import BRNN_WEIGHTED
#from rnn_weighted import RNN_WEIGHTED
from data_utils import utils as du
import pandas as pd
from misc import *
N_ASPECTS = 5
SENT_DIM = 3

# Load the vocabulary

vocab = pd.read_table("worddic.txt",header=None,sep="\s+",index_col=0)

n2w = dict(enumerate(vocab.index))
w2n = du.invert_dict(n2w)

vocabsize = len(w2n)

num2word = dict(enumerate(w2n))
word2num = du.invert_dict(num2word)
print "Number of unique words:", len(num2word)

##############

filename_train = 'x_train.txt'  # 'reviews_plain.txt'
filename_dev = 'x_dev.txt'
X_train = read_data(filename_train, word2num)
X_dev = read_data(filename_dev, word2num)
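# read_data is not shown; a hypothetical sketch, assuming each line of the
# input file is one whitespace-tokenized review to be mapped to word indices,
# with unseen words falling back to an "UUUNKKK" entry when available:
def read_data_sketch(filename, word2num):
    reviews = []
    unk = word2num.get("UUUNKKK")
    with open(filename) as f:
        for line in f:
            tokens = line.strip().split()
            reviews.append([word2num.get(t, unk) for t in tokens])
    return reviews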

Example #9
# Load the vocabulary
#vocab = pd.read_table("data/lm/vocab.ptb.txt", header=None, sep="\s+",
#                     index_col=0, names=['count', 'freq'], )

vocab2 = pd.read_table("worddic.txt", header=None, sep="\s+", index_col=0)

# Choose how many top words to keep
#vocabsize = 2000
vocabsize2 = 58868  # remove for implementation
#num_to_word = dict(enumerate(vocab.index[:vocabsize]))

num_to_word2 = dict(enumerate(vocab2.index[:vocabsize2]))

#word_to_num = du.invert_dict(num_to_word)
word_to_num2 = du.invert_dict(num_to_word2)

#print word_to_num2

##############
filename = 'reviews_plain.txt'
print "Opening the file..."

X_train = []

f = open(filename,'r')
count = 0

for line in f.readlines():
    sentence = []
    line = line.strip()
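# The loop above is cut off; a standalone sketch (assumption) of the likely
# per-line processing, mapping one plain-text review to word indices with an
# unknown-word fallback:
def index_review(line, word_to_num):
    unk = word_to_num.get("UUUNKKK")
    return [word_to_num.get(word, unk) for word in line.strip().split()]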
Example #10
# Assumes word_vocab (token -> raw count), word_to_num (dict being built) and
# j (next unused index) were initialized earlier in the original script.
dropped = 0
for i in word_vocab.keys():
    if word_vocab[i] < 5:
        dropped += word_vocab[i]
    else:
        if i not in word_to_num:
            word_to_num[i] = j
            j = j + 1

word_to_num["<s>"] = j
word_to_num["</s>"] = j + 1
mx = j + 1

print "dropped: " +  str(float(dropped)/sum(word_vocab.values()))

num_to_word = du.invert_dict(word_to_num)
vocabsize = len(num_to_word) + 2 #One for line ending and one for unknown


################################
#Prepare data for training.
################################

X_train = []
Y_train = []

for i in alltxt:
    txt = ''.join(l for l in i if ord(l) < 128)
    temparr = [word_to_num["<s>"]] # Start the sentence
    for j in nltk.word_tokenize(txt):
        if j in word_to_num:
            temparr.append(word_to_num[j])
Example #11

# Fragment of a larger training script: pandas (pd), numpy (np), random, time
# and the helpers called below are imported/defined elsewhere in the full file.
if __name__ == "__main__":
    # Load the vocabulary
    vocab = pd.read_table(
        "data/lm/vocab.ptb.txt",
        header=None,
        sep="\s+",
        index_col=0,
        names=['count', 'freq'],
    )

    vocabsize = 2000
    num_to_word = dict(enumerate(vocab.index[:vocabsize]))
    num_to_word_embedding = load_vocab_embeddings()
    word_to_num = utils.invert_dict(num_to_word)

    # Load the training data
    _, S_train = load_data_as_sentences('data/lm/ptb-train.txt', word_to_num)
    in_word_index, out_word_index = convert_to_lm_dataset(S_train)
    assert len(in_word_index) == len(out_word_index)
    num_of_examples = len(in_word_index)

    random.seed(31415)
    np.random.seed(9265)
    in_word_index, out_word_index = shuffle_training_data(
        in_word_index, out_word_index)
    startTime = time.time()

    # Training should happen here
    # Initialize parameters randomly
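# convert_to_lm_dataset is not included in this fragment; a plausible sketch
# (assumption): flatten the indexed sentences into (current word, next word)
# pairs so each token serves as input for predicting its successor.
def convert_to_lm_dataset_sketch(S):
    in_word_index, out_word_index = [], []
    for sentence in S:
        for i in range(len(sentence) - 1):
            in_word_index.append(sentence[i])
            out_word_index.append(sentence[i + 1])
    return in_word_index, out_word_index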
Example #12
# Assumes word_vocab (token -> raw count), word_to_num (dict being built) and
# j (next unused index) were initialized earlier in the original script.
dropped = 0
for i in word_vocab.keys():
    if word_vocab[i] < 5:
        dropped += word_vocab[i]
    else:
        if i not in word_to_num:
            word_to_num[i] = j
            j = j + 1

word_to_num["<s>"] = j
word_to_num["</s>"] = j + 1
mx = j + 1

print "dropped: " + str(float(dropped) / sum(word_vocab.values()))

num_to_word = du.invert_dict(word_to_num)
vocabsize = len(num_to_word) + 2  #One for line ending and one for unknown

################################
#Prepare data for training.
################################

X_train = []
Y_train = []

for i in alltxt:
    txt = ''.join(l for l in i if ord(l) < 128)
    temparr = [word_to_num["<s>"]]  # Start the sentence
    for j in nltk.word_tokenize(txt):
        if j in word_to_num:
            temparr.append(word_to_num[j])
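# The snippet breaks off inside the tokenization loop; a standalone sketch
# (assumption) of how one sentence is likely finished: unknown words map to a
# shared index, the </s> marker is appended, and the language-model input and
# target are the sequence and its one-step shift.
import nltk

def encode_sentence(txt, word_to_num, unk_index):
    indices = [word_to_num["<s>"]]
    for token in nltk.word_tokenize(txt):
        indices.append(word_to_num.get(token, unk_index))
    indices.append(word_to_num["</s>"])
    return indices[:-1], indices[1:]  # X (input) and Y (next-word target)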