Example #1
    def load_data(self, debug=False):
        """Loads starter word-vectors and train/dev/test data."""
        # Load the starter word vectors
        self.wv, word_to_num, num_to_word = ner.load_wv(
            'data/ner/vocab.txt', 'data/ner/wordVectors.txt')
        tagnames = ['O', 'LOC', 'MISC', 'ORG', 'PER']
        self.num_to_tag = dict(enumerate(tagnames))
        tag_to_num = {v: k for k, v in self.num_to_tag.items()}

        # Load the training set
        docs = du.load_dataset('data/ner/train')
        self.X_train, self.y_train = du.docs_to_windows(
            docs, word_to_num, tag_to_num, wsize=self.config.window_size)
        if debug:
            self.X_train = self.X_train[:1024]
            self.y_train = self.y_train[:1024]

        # Load the dev set (for tuning hyperparameters)
        docs = du.load_dataset('data/ner/dev')
        self.X_dev, self.y_dev = du.docs_to_windows(
            docs, word_to_num, tag_to_num, wsize=self.config.window_size)
        if debug:
            self.X_dev = self.X_dev[:1024]
            self.y_dev = self.y_dev[:1024]

        # Load the test set (dummy labels only)
        docs = du.load_dataset('data/ner/test.masked')
        self.X_test, self.y_test = du.docs_to_windows(
            docs, word_to_num, tag_to_num, wsize=self.config.window_size)
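(For orientation, a minimal driver sketch. NERModel and Config are hypothetical names, not from the original code; the only assumption carried over is the self.config.window_size attribute used above.)

# Hypothetical usage sketch; the NERModel/Config names are illustrative only.
class Config:
    window_size = 3  # one word of context on each side of the center word

model = NERModel(Config())   # assumed: constructor stores config as self.config
model.load_data(debug=True)  # debug=True caps each split at 1024 windows
print(model.X_train.shape, model.y_train.shape)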
Example #2
  def load_data(self, debug=False):
    """Loads starter word-vectors and train/dev/test data."""
    # Load the starter word vectors
    self.wv, word_to_num, num_to_word = ner.load_wv(
      'data/ner/vocab.txt', 'data/ner/wordVectors.txt')
    tagnames = ['O', 'LOC', 'MISC', 'ORG', 'PER']
    self.num_to_tag = dict(enumerate(tagnames))
    tag_to_num = {v: k for k, v in self.num_to_tag.items()}

    # Load the training set
    docs = du.load_dataset('data/ner/train')
    self.X_train, self.y_train = du.docs_to_windows(
        docs, word_to_num, tag_to_num, wsize=self.config.window_size)
    if debug:
      self.X_train = self.X_train[:1024]
      self.y_train = self.y_train[:1024]

    # Load the dev set (for tuning hyperparameters)
    docs = du.load_dataset('data/ner/dev')
    self.X_dev, self.y_dev = du.docs_to_windows(
        docs, word_to_num, tag_to_num, wsize=self.config.window_size)
    if debug:
      self.X_dev = self.X_dev[:1024]
      self.y_dev = self.y_dev[:1024]

    # Load the test set (dummy labels only)
    docs = du.load_dataset('data/ner/test.masked')
    self.X_test, self.y_test = du.docs_to_windows(
        docs, word_to_num, tag_to_num, wsize=self.config.window_size)
Example #3
    def load_data(self, debug=False, search=False):
        """Loads starter word-vectors and train/dev/test data."""
        # Load the starter word vectors
        path_vocab = 'data/ner/vocab.txt'
        path_wordVectors = 'data/ner/wordVectors.txt'
        path_train = 'data/ner/train'
        path_dev = 'data/ner/dev'
        path_test = 'data/ner/test.masked'
        if search:
            currentdir = os.path.dirname(
                os.path.abspath(inspect.getfile(inspect.currentframe())))
            path_vocab = currentdir + "/" + path_vocab
            path_wordVectors = currentdir + "/" + path_wordVectors
            path_train = currentdir + "/" + path_train
            path_dev = currentdir + "/" + path_dev
            path_test = currentdir + "/" + path_test
        self.wv, word_to_num, num_to_word = ner.load_wv(
            path_vocab, path_wordVectors)
        tagnames = ['O', 'LOC', 'MISC', 'ORG', 'PER']
        self.num_to_tag = dict(enumerate(tagnames))
        tag_to_num = {v: k for k, v in self.num_to_tag.items()}

        # Load the training set
        docs = du.load_dataset(path_train)
        self.X_train, self.y_train = du.docs_to_windows(
            docs, word_to_num, tag_to_num, wsize=self.config.window_size)
        if debug:
            self.X_train = self.X_train[:1024]
            self.y_train = self.y_train[:1024]

        # Load the dev set (for tuning hyperparameters)
        docs = du.load_dataset(path_dev)
        self.X_dev, self.y_dev = du.docs_to_windows(
            docs, word_to_num, tag_to_num, wsize=self.config.window_size)
        if debug:
            self.X_dev = self.X_dev[:1024]
            self.y_dev = self.y_dev[:1024]

        # Load the test set (dummy labels only)
        docs = du.load_dataset(path_test)
        self.X_test, self.y_test = du.docs_to_windows(
            docs, word_to_num, tag_to_num, wsize=self.config.window_size)
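(Example #3 builds absolute paths by string concatenation; a sketch of the same idea with os.path.join, which handles separators portably. Behavior is otherwise assumed identical.)

import os
import inspect

# Directory containing this module, resolved at runtime.
currentdir = os.path.dirname(
    os.path.abspath(inspect.getfile(inspect.currentframe())))
path_vocab = os.path.join(currentdir, 'data/ner/vocab.txt')
path_train = os.path.join(currentdir, 'data/ner/train')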
Example #4
    def load_data(self, debug=False):
        self.wv, word_to_num, num_to_word = ner.load_wv(
            'data/ner/vocab.txt', 'data/ner/wordVectors.txt')
        tagnames = ['O', 'LOC', 'MISC', 'ORG', 'PER']
        self.num_to_tag = dict(enumerate(tagnames))
        tag_to_num = {v: k for k, v in self.num_to_tag.items()}

        # Load the training set
        docs = du.load_dataset('data/ner/train')
        self.X_train, self.y_train = du.docs_to_windows(
            docs, word_to_num, tag_to_num, wsize=self.config.window_size)
        if debug:
            self.X_train = self.X_train[:1024]
            self.y_train = self.y_train[:1024]
        # Load the dev set
        docs = du.load_dataset('data/ner/dev')
        self.X_dev, self.y_dev = du.docs_to_windows(
            docs, word_to_num, tag_to_num, wsize=self.config.window_size)
        if debug:
            self.X_dev = self.X_dev[:1024]
            self.y_dev = self.y_dev[:1024]
        # Load the test set (dummy labels only)
        docs = du.load_dataset('data/ner/test.masked')
        self.X_test, self.y_test = du.docs_to_windows(
            docs, word_to_num, tag_to_num, wsize=self.config.window_size)
Example #5
    def load_data(self, debug=False):
        self.wv, word2num, num2word = ner.load_wv('data/ner/vocab.txt',
                                                  'data/ner/wordVectors.txt')
        self.wv = self.wv.astype(np.float32)
        tags = ["O", "LOC", "MISC", "ORG", "PER"]
        self.num2tag = dict(enumerate(tags))
        tag2num = {v: k for k, v in self.num2tag.items()}
        # Training set
        docs = du.load_dataset('data/ner/train')
        self.X_train, self.y_train = du.docs_to_windows(
            docs, word2num, tag2num, wsize=self.config.window_size)
        if debug:
            self.X_train = self.X_train[:1024]
            self.y_train = self.y_train[:1024]

        # Dev set
        docs = du.load_dataset('data/ner/dev')
        self.X_dev, self.y_dev = du.docs_to_windows(
            docs, word2num, tag2num, wsize=self.config.window_size)
        if debug:
            self.X_dev = self.X_dev[:1024]
            self.y_dev = self.y_dev[:1024]

        # Test set (dummy labels only)
        docs = du.load_dataset('data/ner/test.masked')
        self.X_test, self.y_test = du.docs_to_windows(
            docs, word2num, tag2num, wsize=self.config.window_size)
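(Example #5 is the only variant that casts the embeddings to float32, halving memory against NumPy's float64 default. A quick check, using the 100232 x 50 shape noted in Example #9 below:)

import numpy as np

wv64 = np.zeros((100232, 50), dtype=np.float64)  # default dtype after loading
wv32 = wv64.astype(np.float32)
print(wv64.nbytes // 2**20, "MiB ->", wv32.nbytes // 2**20, "MiB")  # 38 MiB -> 19 MiB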
Example #6
import data_utils.ner as ner
import data_utils.utils as du  # assumed module path for the `du` helpers below

# Load the starter word vectors
wv, word_to_num, num_to_word = ner.load_wv('data/ner/vocab.txt',
                                           'data/ner/wordVectors.txt')
tagnames = ["O", "LOC", "MISC", "ORG", "PER"]
num_to_tag = dict(enumerate(tagnames))
tag_to_num = du.invert_dict(num_to_tag)

# Set window size
windowsize = 3

# Load the training set
docs = du.load_dataset('data/ner/train')
X_train, y_train = du.docs_to_windows(docs,
                                      word_to_num,
                                      tag_to_num,
                                      wsize=windowsize)

# Load the dev set (for tuning hyperparameters)
docs = du.load_dataset('data/ner/dev')
X_dev, y_dev = du.docs_to_windows(docs,
                                  word_to_num,
                                  tag_to_num,
                                  wsize=windowsize)

# Load the test set (dummy labels only)
docs = du.load_dataset('data/ner/test.masked')
X_test, y_test = du.docs_to_windows(docs,
                                    word_to_num,
                                    tag_to_num,
                                    wsize=windowsize)
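(Per the comments above, du.docs_to_windows emits one row of word indices per token, centered on the word being tagged, with y holding the center tag index. A toy reconstruction of that layout; the real padding and boundary handling in data_utils is assumed, not copied:)

# Toy illustration of the assumed [left, center, right] window layout.
word_to_num = {"<s>": 0, "EU": 1, "rejects": 2, "German": 3, "call": 4, "</s>": 5}
tag_to_num = {"O": 0, "LOC": 1, "MISC": 2, "ORG": 3, "PER": 4}

sentence = ["EU", "rejects", "German", "call"]
tags = ["ORG", "O", "MISC", "O"]
padded = ["<s>"] + sentence + ["</s>"]  # assumed sentence-boundary padding

X, y = [], []
for i, tag in enumerate(tags):
    X.append([word_to_num[w] for w in padded[i:i + 3]])  # window of size 3
    y.append(tag_to_num[tag])                            # tag of center word

print(X)  # [[0, 1, 2], [1, 2, 3], [2, 3, 4], [3, 4, 5]]
print(y)  # [3, 0, 2, 0]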
Example #7
import numpy as np
from multiprocessing import Pool
import random as rdm
import data_utils.ner as ner
import data_utils.utils as du  # assumed module path for the `du` helpers below

# The original `from numpy import *` bound the name `random` to numpy.random,
# so its `random.seed(10)` seeded NumPy's RNG; that is made explicit here.
np.random.seed(10)

wv, word_to_num, num_to_word = ner.load_wv('data/ner/vocab.txt',
                                           'data/ner/wordVectors.txt')

tagnames = ["O", "LOC", "MISC", "ORG", "PER"]
num_to_tag = dict(enumerate(tagnames))
tag_to_num = du.invert_dict(num_to_tag)

windowsize = 3
docs = du.load_dataset('data/ner/train')
X_train, y_train = du.docs_to_windows(docs, word_to_num, tag_to_num, wsize=windowsize)

docs = du.load_dataset('data/ner/dev')
X_dev, y_dev = du.docs_to_windows(docs, word_to_num, tag_to_num, wsize=windowsize)

docs = du.load_dataset('data/ner/test.masked')
X_test, y_test = du.docs_to_windows(docs, word_to_num, tag_to_num, wsize=windowsize)


nepoch = 5
N = nepoch * len(y_train)
k = 5 # minibatch size
schedules = ["epoch", "N", "mini_batch"]
sche_params = []
for sche_name in schedules:
    param = {"param": {"wv": wv, "windowsize": windowsize,
                       "dims": [None, 100, 5], "reg": 0.001, "alpha": 0.01},
             "setting_name": sche_name}
    sche_params.append(param)
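(Example #8 below passes idxiter=random_mini(k, N, train_size) to train_sgd, but random_mini is not shown in any of these excerpts. A plausible sketch, assuming train_sgd consumes an iterable of row-index arrays:)

import numpy as np

def random_mini(batch_size, total, data_size):
    """Hypothetical reconstruction: yield total // batch_size random
    minibatches of row indices; the original helper may differ."""
    for _ in range(total // batch_size):
        yield np.random.randint(0, data_size, size=batch_size)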
Example #8
def main():
    # Load the starter word vectors
    wv, word_to_num, num_to_word = ner.load_wv('data/ner/vocab.txt',
                                               'data/ner/wordVectors.txt')
    tagnames = ["O", "LOC", "MISC", "ORG", "PER"]
    num_to_tag = dict(enumerate(tagnames))
    tag_to_num = du.invert_dict(num_to_tag)

    # Set window size
    windowsize = 3

    # Load the training set
    docs = du.load_dataset('data/ner/train')
    X_train, y_train = du.docs_to_windows(docs,
                                          word_to_num,
                                          tag_to_num,
                                          wsize=windowsize)

    # Load the dev set (for tuning hyperparameters)
    docs = du.load_dataset('data/ner/dev')
    X_dev, y_dev = du.docs_to_windows(docs,
                                      word_to_num,
                                      tag_to_num,
                                      wsize=windowsize)

    # Load the test set (dummy labels only)
    docs = du.load_dataset('data/ner/test.masked')
    X_test, y_test = du.docs_to_windows(docs,
                                        word_to_num,
                                        tag_to_num,
                                        wsize=windowsize)
    clf = WindowMLP(wv,
                    windowsize=windowsize,
                    dims=[None, 100, 5],
                    reg=0.001,
                    alpha=0.01)
    train_size = X_train.shape[0]
    """
    costs = pickle.load(open("costs.dat", "rb"))
    clf = pickle.load(open("clf.dat", "rb"))
    """
    nepoch = 5
    N = nepoch * len(y_train)
    k = 5  # minibatch size
    costs = clf.train_sgd(X_train,
                          y_train,
                          idxiter=random_mini(k, N, train_size),
                          printevery=10000,
                          costevery=10000)

    pickle.dump(clf, open("clf.dat", "wb"))
    pickle.dump(costs, open("costs.dat", "wb"))
    plot_learning_curve(clf, costs)
    # Predict labels on the dev set
    yp = clf.predict(X_dev)
    # Save predictions to a file, one per line
    ner.save_predictions(yp, "dev.predicted")
    full_report(y_dev, yp, tagnames)  # full report, helpful diagnostics
    eval_performance(y_dev, yp, tagnames)  # performance: optimize this F1
    # L: V x 50
    # W[:,50:100]: 100 x 50
    responses = clf.sparams.L.dot(clf.params.W[:, 50:100].T)  # V x 100
    index = np.argsort(responses, axis=0)[::-1]

    neurons = [1, 3, 4, 6, 8]  # change this to your chosen neurons
    for i in neurons:
        print "Neuron %d" % i
        top_words = [num_to_word[k] for k in index[:10, i]]
        top_scores = [responses[k, i] for k in index[:10, i]]
        print_scores(top_scores, top_words)
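(print_scores is not defined in these excerpts either; a minimal stand-in that pairs each top word with its response score, purely illustrative:)

def print_scores(scores, words):
    # Hypothetical helper: print "[score] word" pairs, highest first.
    for score, word in zip(scores, words):
        print("[%.3f] %s" % (score, word))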
Example #9
# Load the starter word vectors
wv, word_to_num, num_to_word = ner.load_wv('data/ner/vocab.txt',
                                           'data/ner/wordVectors.txt')
# wv: array of shape (100232, 50)
# word_to_num: dict with 100232 entries
tagnames = ["O", "LOC", "MISC", "ORG", "PER"]
num_to_tag = dict(enumerate(tagnames))
tag_to_num = du.invert_dict(num_to_tag)

# Set window size
windowsize = 3

# Load the training set
docs = du.load_dataset('data/ner/train')
X_train, y_train = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                      wsize=windowsize)

# Load the dev set (for tuning hyperparameters)
docs = du.load_dataset('data/ner/dev')
X_dev, y_dev = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                  wsize=windowsize)

# Load the test set (dummy labels only)
docs = du.load_dataset('data/ner/test.masked')
X_test, y_test = du.docs_to_windows(docs, word_to_num, tag_to_num,
                                    wsize=windowsize)



print(num_to_tag)
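Since num_to_tag is built with dict(enumerate(tagnames)), this prints:

{0: 'O', 1: 'LOC', 2: 'MISC', 3: 'ORG', 4: 'PER'}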