Beispiel #1
0
def main():
    """Train a DRNNLM, pickle it, and print ten generated sequences.

    Steps, in order: build a ``GloveWrapper`` and use its ``L`` attribute as
    both ``L0`` and ``U0`` for the model, load the train and test splits via
    ``get_data``, draw a random initial ``D0`` matrix, train with
    ``custom_train_sgd``, dump the model to ``../data/drnnlm_model.pkl`` and
    finally print ten sequences produced by ``generate_sequence``.
    """
    data_root = '../data/wordinds/'
    print('starting L matrix construction')
    gw = GloveWrapper(verbose=True)
    L0 = gw.L

    train_files = os.listdir(data_root + '/train/')

    print('getting train and test data')
    train_x, train_y, train_D, _ = get_data(gw, train_files, 'train/')
    # NOTE(review): the test split is requested with the *train* file listing
    # and its results are never used below -- confirm this is intended.
    test_x, test_y, test_D, _ = get_data(gw, train_files, 'test/')
    # Rows cover every index up to the last entry of train_D; 300 columns.
    D0 = np.random.randn(train_D[-1] + 1, 300)
    print('got train and test data')

    # Presumably these are Python lists, so `*` repeats the data once per
    # epoch rather than scaling values -- TODO confirm against get_data.
    n_epochs = 25
    train_x = train_x * n_epochs
    train_y = train_y * n_epochs
    train_D = train_D * n_epochs

    model = DRNNLM(L0, D0, U0=L0, alpha=0.05, rseed=10, bptt=3)
    print('constructed model, training...')
    model.custom_train_sgd(train_x, train_y, train_D,
                           apply_to=['H', 'U', 'L', 'D'],
                           printevery=5, costevery=25)
    print('training done')

    print('saving model')
    # Pickle streams are bytes: open in binary mode so the dump is portable
    # across platforms and Python versions.
    with open('../data/drnnlm_model.pkl', 'wb') as model_file:
        pkl.dump(model, model_file)
    print('model saved')

    for _ in range(10):
        seq, _cost = model.generate_sequence(
            1, gw.get_index("SSTART"), gw.get_index("EEND"), maxlen=100)
        # str.join accepts exactly one iterable; the previous call passed two
        # arguments (gw and the word list) and raised TypeError.
        print(" ".join(seq_to_words(seq)))
Beispiel #2
0
def main():
    """Train a SimpleDRNNLM, pickle it, and print ten generated sequences.

    Steps, in order: build a ``GloveWrapper`` and use its ``L`` attribute as
    both ``L0`` and ``U0`` for the model, load the train and test splits via
    ``get_data``, draw a random initial ``D0`` matrix, train with
    ``custom_train_sgd``, dump the model to
    ``../data/simple_drnnlm_model.pkl`` and finally print ten sequences
    produced by ``generate_sequence``.
    """
    data_root = '../data/wordinds/'
    print('starting L matrix construction')
    gw = GloveWrapper(verbose=True)
    L0 = gw.L

    train_files = os.listdir(data_root + '/train/')

    print('getting train and test data')
    train_x, train_y, train_D, _ = get_data(gw, train_files, 'train/')
    # A leftover debugger breakpoint used to sit here; it stopped every run
    # at an interactive prompt before training could start.
    # NOTE(review): the test split is requested with the *train* file listing
    # and its results are never used below -- confirm this is intended.
    test_x, test_y, test_D, _ = get_data(gw, train_files, 'test/')
    # Rows cover every index up to the last entry of train_D; 300 columns.
    D0 = np.random.randn(train_D[-1] + 1, 300)
    print('got train and test data')

    # Presumably these are Python lists, so `*` repeats the data once per
    # epoch rather than scaling values -- TODO confirm against get_data.
    n_epochs = 25
    train_x = train_x * n_epochs
    train_y = train_y * n_epochs
    train_D = train_D * n_epochs

    model = SimpleDRNNLM(L0, D0, U0=L0, alpha=0.05, rseed=10, bptt=3)
    print('constructed model, training...')
    model.custom_train_sgd(train_x, train_y, train_D,
                           apply_to=['H', 'U', 'L', 'D'],
                           printevery=5, costevery=25)
    print('training done')

    print('saving model')
    # Pickle streams are bytes: open in binary mode so the dump is portable
    # across platforms and Python versions.
    with open('../data/simple_drnnlm_model.pkl', 'wb') as model_file:
        pkl.dump(model, model_file)
    print('model saved')

    for _ in range(10):
        seq, _cost = model.generate_sequence(
            1, gw.get_index("SSTART"), gw.get_index("EEND"), maxlen=100)
        print(" ".join(seq_to_words(seq)))
Beispiel #3
0
def main():
    """Train a SimpleDRNNLM, pickle it, and print ten generated sequences.

    Steps, in order: build a ``GloveWrapper`` and use its ``L`` attribute as
    both ``L0`` and ``U0`` for the model, load the train and test splits via
    ``get_data``, draw a random initial ``D0`` matrix, train with
    ``custom_train_sgd``, dump the model to
    ``../data/simple_drnnlm_model.pkl`` and finally print ten sequences
    produced by ``generate_sequence``.
    """
    data_root = "../data/wordinds/"
    print("starting L matrix construction")
    gw = GloveWrapper(verbose=True)
    L0 = gw.L

    train_files = os.listdir(data_root + "/train/")

    print("getting train and test data")
    train_x, train_y, train_D, _ = get_data(gw, train_files, "train/")
    # A leftover debugger breakpoint used to sit here; it stopped every run
    # at an interactive prompt before training could start.
    # NOTE(review): the test split is requested with the *train* file listing
    # and its results are never used below -- confirm this is intended.
    test_x, test_y, test_D, _ = get_data(gw, train_files, "test/")
    # Rows cover every index up to the last entry of train_D; 300 columns.
    D0 = np.random.randn(train_D[-1] + 1, 300)
    print("got train and test data")

    # Presumably these are Python lists, so `*` repeats the data once per
    # epoch rather than scaling values -- TODO confirm against get_data.
    n_epochs = 25
    train_x = train_x * n_epochs
    train_y = train_y * n_epochs
    train_D = train_D * n_epochs

    model = SimpleDRNNLM(L0, D0, U0=L0, alpha=0.05, rseed=10, bptt=3)
    print("constructed model, training...")
    model.custom_train_sgd(
        train_x,
        train_y,
        train_D,
        apply_to=["H", "U", "L", "D"],
        printevery=5,
        costevery=25,
    )
    print("training done")

    print("saving model")
    # Pickle streams are bytes: open in binary mode so the dump is portable
    # across platforms and Python versions.
    with open("../data/simple_drnnlm_model.pkl", "wb") as model_file:
        pkl.dump(model, model_file)
    print("model saved")

    for _ in range(10):
        seq, _cost = model.generate_sequence(
            1, gw.get_index("SSTART"), gw.get_index("EEND"), maxlen=100
        )
        print(" ".join(seq_to_words(seq)))
Beispiel #4
0
class TextPreprocessor():
    """Convert raw text into per-sentence lists of ``GloveWrapper`` indices."""

    def __init__(self):
        # All word -> index lookups go through this wrapper.
        self.glove_vecs = GloveWrapper(verbose=True)

    def doc_to_inds(self, doc):
        """Return one list of indices per sentence found in ``doc``.

        The text is lowercased and newlines are turned into spaces before it
        is split into sentences and then words with NLTK. Each sentence's
        indices are framed by the indices of the 'SSTART' and 'EEND' tokens.
        """
        lookup = self.glove_vecs.get_index
        normalized = doc.lower().replace('\n', ' ')

        def encode(sentence):
            # Tokenize first, then look up start marker, words, end marker.
            tokens = nltk.word_tokenize(sentence)
            return ([lookup('SSTART')]
                    + [lookup(token) for token in tokens]
                    + [lookup('EEND')])

        return [encode(sentence)
                for sentence in nltk.tokenize.sent_tokenize(normalized)]