import cPickle

import numpy as np

import languagemodel as lm


def train_nn(params):
    """Train one neural LM configuration (n, dim, hdim) and pickle its result."""
    print(params)
    np.random.seed(1)  # for reproducibility

    corpus_train = lm.readCorpus("data/train.txt")
    corpus_dev = lm.readCorpus("data/dev.txt")
    corpus_test = lm.readCorpus("data/test.txt")

    # Build a common index (words to integers), mapping rare words (fewer than 5 occurrences) to index 0.
    # nwords = vocabulary size for the models that only see the indexes.
    w2index, nwords = lm.buildIndex(corpus_train + corpus_dev + corpus_test)

    # Find words that appear in the training set so we can deal with new words separately.
    count_train = np.zeros((nwords,))
    for snt in corpus_train:
        for w in snt:
            count_train[w2index[w]] += 1

    # Network model
    # print("\nNetwork model training:")
    n = params[0]     # Length of n-gram
    dim = params[1]   # Word vector dimension
    hdim = params[2]  # Hidden units
    neurallm = lm.neuralLM(dim, n, hdim, nwords)  # The network model
    ngrams = lm.ngramGen(corpus_train, w2index, n)
    ngrams2 = lm.ngramGen(corpus_dev, w2index, n)
    lrate = 0.5  # Learning rate

    # Train until the average dev-set log-likelihood stops improving.
    best_LL = float('-inf')
    it = 0
    while True:
        it += 1
        LL, N = 0.0, 0  # Average log-likelihood, number of ngrams
        for ng in ngrams:
            pr = neurallm.update(ng, lrate)
            LL += np.log(pr)
            N += 1
        # print('Train:\t{0}\tLL = {1}'.format(it, LL / N))

        # Dev set
        LL, N = 0.0, 0  # Average log-likelihood, number of ngrams
        for ng in ngrams2:
            if count_train[ng[-1]] > 0:  # for now, skip target words not seen in training
                pr = neurallm.prob(ng)
                LL += np.log(pr)
                N += 1
        if LL / N > best_LL:
            best_LL = LL / N
        else:
            break

    # Save (iterations, best dev LL) for this hyperparameter setting.
    return_result = {(params[0], params[1], params[2]): (it, best_LL)}
    with open('data/{}_{}_{}.pkl'.format(params[0], params[1], params[2]), 'wb') as f:
        f.write(cPickle.dumps(return_result))
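# ----------------------------------------------------------------------------
# Illustrative usage sketch (an assumption, not part of the original script):
# one way to sweep a small hyperparameter grid with train_nn and read back the
# (iterations, best dev log-likelihood) results it pickles under data/.
# The grid values below are placeholders chosen only for illustration.
# ----------------------------------------------------------------------------
import itertools
import cPickle


def sweep_hyperparameters():
    grid = itertools.product([2, 3], [10, 14], [20, 38])  # candidate (n, dim, hdim)
    results = {}
    for params in grid:
        train_nn(params)  # writes data/{n}_{dim}_{hdim}.pkl
        with open('data/{}_{}_{}.pkl'.format(*params), 'rb') as f:
            results.update(cPickle.loads(f.read()))
    # Pick the setting with the highest best dev log-likelihood.
    best = max(results, key=lambda k: results[k][1])
    print('Best (n, dim, hdim): {0}, dev LL = {1}'.format(best, results[best][1]))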
def train_nn():
    """Train the neural LM with fixed hyperparameters and track two test sets each epoch."""
    np.random.seed(1)  # for reproducibility

    corpus_train = lm.readCorpus("data/train.txt")
    corpus_dev = lm.readCorpus("data/dev.txt")
    corpus_test = lm.readCorpus("data/test.txt")
    test_1 = lm.readCorpus("data/test_1.txt")
    test_2 = lm.readCorpus("data/test_2.txt")

    # Build a common index (words to integers), mapping rare words (fewer than 5 occurrences) to index 0.
    # nwords = vocabulary size for the models that only see the indexes.
    w2index, nwords = lm.buildIndex(corpus_train + corpus_dev + corpus_test + test_1 + test_2)

    # Find words that appear in the training set so we can deal with new words separately.
    count_train = np.zeros((nwords,))
    for snt in corpus_train:
        for w in snt:
            count_train[w2index[w]] += 1

    # Network model
    # print("\nNetwork model training:")
    n = 3      # Length of n-gram
    dim = 14   # Word vector dimension
    hdim = 38  # Hidden units
    neurallm = lm.neuralLM(dim, n, hdim, nwords)  # The network model
    ngrams = lm.ngramGen(corpus_train, w2index, n)
    ngrams_1 = lm.ngramGen(test_1, w2index, n)
    ngrams_2 = lm.ngramGen(test_2, w2index, n)
    lrate = 0.5  # Learning rate

    for it in xrange(8):  # passes through the training data
        LL, N = 0.0, 0  # Average log-likelihood, number of ngrams
        for ng in ngrams:
            pr = neurallm.update(ng, lrate)
            LL += np.log(pr)
            N += 1
        print('Train:\t{0}\tLL = {1}'.format(it, LL / N))

        # Test set 1
        LL, N = 0.0, 0  # Average log-likelihood, number of ngrams
        for ng in ngrams_1:
            if count_train[ng[-1]] > 0:  # for now, skip target words not seen in training
                pr = neurallm.prob(ng)
                LL += np.log(pr)
                N += 1
        print('Test_1:\t{0}\tLL = {1}'.format(it, LL / N))

        # Test set 2
        LL, N = 0.0, 0  # Average log-likelihood, number of ngrams
        for ng in ngrams_2:
            if count_train[ng[-1]] > 0:  # for now, skip target words not seen in training
                pr = neurallm.prob(ng)
                LL += np.log(pr)
                N += 1
        print('Test_2:\t{0}\tLL = {1}'.format(it, LL / N))
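# ----------------------------------------------------------------------------
# Aside (not in the original script): the values printed above are average
# per-ngram natural-log likelihoods; if a perplexity figure is wanted instead,
# it is exp(-average LL). A minimal helper:
# ----------------------------------------------------------------------------
def perplexity(avg_LL):
    """Convert an average per-ngram log-likelihood (natural log) to perplexity."""
    return np.exp(-avg_LL)

# Example: an average LL of -5.0 corresponds to a perplexity of about 148.4.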
from __future__ import print_function
from __future__ import division

import numpy as np

import languagemodel as lm

np.random.seed(1)  # for reproducibility

corpus_train = lm.readCorpus("data/train.txt")
corpus_dev = lm.readCorpus("data/dev.txt")
corpus_test = lm.readCorpus("data/test.txt")

# Build a common index (words to integers), mapping rare words (fewer than 5 occurrences) to index 0.
# nwords = vocabulary size for the models that only see the indexes.
w2index, nwords = lm.buildIndex(corpus_train + corpus_dev + corpus_test)

# Find words that appear in the training set so we can deal with new words separately.
count_train = np.zeros((nwords,))
for snt in corpus_train:
    for w in snt:
        count_train[w2index[w]] += 1

# Bi-gram model as a baseline
alpha = 0.1  # add-alpha smoothing
probB = lm.bigramLM(corpus_train, w2index, nwords, alpha)
LLB, N = 0.0, 0
bi = lm.ngramGen(corpus_dev, w2index, 2)
for w in bi:
    if count_train[w[1]] > 0:  # for now, skip target words not seen in training
        LLB += np.log(probB[w[0], w[1]])
        N += 1
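# ----------------------------------------------------------------------------
# Sketch (an assumption about behavior, not languagemodel's actual code): the
# table returned by lm.bigramLM is read above as probB[i, j] = P(w_j | w_i).
# With add-alpha smoothing that is (count(i, j) + alpha) / (count(i) + alpha * V),
# where V is the vocabulary size. A minimal NumPy version of that formula:
# ----------------------------------------------------------------------------
def bigram_add_alpha(corpus, w2index, nwords, alpha):
    counts = np.zeros((nwords, nwords))
    for snt in corpus:
        idx = [w2index[w] for w in snt]
        for prev, cur in zip(idx[:-1], idx[1:]):
            counts[prev, cur] += 1
    # Every bigram gets a pseudo-count of alpha; each row then normalizes to 1.
    return (counts + alpha) / (counts.sum(axis=1, keepdims=True) + alpha * nwords)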