import os
import re
import cPickle as pickle

# ProbArray and TrainModel are defined elsewhere in this project.
pa = ProbArray()
folder = "ie_data"
filepaths = map(lambda x: folder + "/" + x, os.listdir(folder))
rgx = re.compile(r"([\w][\w']*\w)")

# Build co-occurrence counts: each word is paired with its immediate neighbour.
for filepath in filepaths:
    alltext = open(filepath).read().lower()
    words = filter(lambda x: len(x) < 15, re.findall(rgx, alltext))
    for i in xrange(1, len(words)):
        pa.addcontext(words[i - 1], words[i])
        pa.addcontext(words[i], words[i - 1])

maxnum = 20000  # pa.wordnumrelation.maxnum
newdims = 100   # suppose 100 dimensions after all
tm = TrainModel(maxnum, newdims)
ntimes = 2
pa.freeze()

# Train the autoencoder for ntimes passes over the co-occurrence vectors.
for k in xrange(ntimes):
    for numwordvec in pa.getallwordvecs():
        tm.trainonone(numwordvec[1])

# Collect the learned embedding for every word and persist the dictionary.
wordembeddings = {}
for numwordvec in pa.getallwordvecs():
    (num, wordvec) = numwordvec
    word = pa.wordnumrelation.getWord(num)
    embedding = tm.getoutput(wordvec)
    wordembeddings[word] = embedding

outfile = open("embeddings.pickle", "w")
pickle.dump(wordembeddings, outfile)
outfile.close()
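# A minimal sketch of reading the result back (assuming the script above has
# already been run and embeddings.pickle sits in the working directory; the
# lookup word below is purely a hypothetical example, not from the source):
import cPickle as pickle

with open("embeddings.pickle") as infile:
    wordembeddings = pickle.load(infile)

print wordembeddings.get("information")  # hypothetical corpus word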
import os
import re
import time
import cPickle as pickle


def train(folder, contextSize=5, min_count=100, newdims=100, ntimes=2, maxnum=10000, lr=0.4):
    ''' Function to train autoencoder. '''
    t = time.time()
    lr_decay = 0.95
    pa = ProbArray()
    # Word frequencies, used to identify low-frequency ("garbage") words
    freq = {}
    filepaths = map(lambda x: folder + "/" + x, os.listdir(folder))
    rgx = re.compile(r"([\w][\w']*\w)")

    # First pass: count how often each token occurs
    print "Pre-processing (cleaning garbage words)"
    for filepath in filepaths:
        text = open(filepath).read().lower()
        tokens = re.findall(rgx, text)
        for token in tokens:
            freq[token] = freq.get(token, 0) + 1

    # Sort tokens by frequency and collect those occurring fewer than min_count times
    tokenFreq = sorted(freq.items(), key=lambda x: x[1])
    garbageWords = []
    for item in tokenFreq:
        if item[1] < min_count:
            garbageWords.append(item[0])

    print "Generating co-occurrence matrix"
    for filepath in filepaths:
        text = open(filepath).read().lower()
        words = re.findall(rgx, text)
        N = len(words)
        # Left-pad with blanks so every position has contextSize predecessors
        temp = [' '] * (N + contextSize)
        temp[contextSize:(contextSize + N)] = words
        words = temp
        for i in xrange(contextSize, (contextSize + N)):
            # Filter out garbage words
            # if words[i] not in garbageWords:
            # Include context size specified by user
            for j in xrange(i - contextSize, i):
                if words[i] != ' ' and words[j] != ' ':
                    pa.addcontext(words[j], words[i])
                    pa.addcontext(words[i], words[j])
    print "Co-occurrence matrix generated"

    print "Starting training"
    tm = TrainModel(maxnum, newdims)
    pa.freeze()
    for k in xrange(ntimes):
        for numwordvec in pa.getallwordvecs():
            tm.trainonone(numwordvec[1])
        # Decay the learning rate after each full pass over the data
        lr /= float(1 + k * lr_decay)

    wordembeddings = {}
    for numwordvec in pa.getallwordvecs():
        (num, wordvec) = numwordvec
        word = pa.wordnumrelation.getWord(num)
        embedding = tm.getoutput(wordvec)
        wordembeddings[word] = embedding

    print "Training process done, dumping embeddings into persistent storage!"
    outfile = open("./embeddings.pickle", "w")
    pickle.dump(wordembeddings, outfile)
    outfile.close()
    print "Training completed! Embedding done."
    print "time is %f" % (time.time() - t)
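# A minimal usage sketch (not part of the original source): the corpus folder
# "ie_data" is taken from the first script above, and the keyword arguments
# simply spell out the function's own defaults.
if __name__ == "__main__":
    train("ie_data", contextSize=5, min_count=100, newdims=100,
          ntimes=2, maxnum=10000, lr=0.4)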
import os
import re
import time
import pickle


def train(folder, contextSize=5, min_count=100, newdims=100, ntimes=2, maxnum=10000, lr=0.4):
    ''' Function to train autoencoder. '''
    t = time.time()
    lr_decay = 0.95
    pa = ProbArray()
    # Word frequencies, used to identify low-frequency ("garbage") words
    freq = {}
    filepaths = list(map(lambda x: folder + "/" + x, os.listdir(folder)))
    rgx = re.compile(r"([\w][\w']*\w)")

    # First pass: count how often each token occurs
    print("Pre-processing (cleaning garbage words)")
    for filepath in filepaths:
        text = open(filepath).read().lower()
        tokens = re.findall(rgx, text)
        for token in tokens:
            freq[token] = freq.get(token, 0) + 1

    # Sort tokens by frequency and collect those occurring fewer than min_count times
    tokenFreq = sorted(freq.items(), key=lambda x: x[1])
    garbageWords = []
    for item in tokenFreq:
        if item[1] < min_count:
            garbageWords.append(item[0])

    print("Generating co-occurrence matrix")
    for filepath in filepaths:
        text = open(filepath).read().lower()
        words = re.findall(rgx, text)
        N = len(words)
        # Left-pad with blanks so every position has contextSize predecessors
        temp = [' '] * (N + contextSize)
        temp[contextSize:(contextSize + N)] = words
        words = temp
        for i in range(contextSize, (contextSize + N)):
            # Filter out garbage words
            # if words[i] not in garbageWords:
            # Include context size specified by user
            for j in range(i - contextSize, i):
                if words[i] != ' ' and words[j] != ' ':
                    pa.addcontext(words[j], words[i])
                    pa.addcontext(words[i], words[j])
    print("Co-occurrence matrix generated")

    print("Starting training")
    tm = TrainModel(maxnum, newdims)
    pa.freeze()
    for k in range(ntimes):
        for numwordvec in pa.getallwordvecs():
            tm.trainonone(numwordvec[1])
        # Decay the learning rate after each full pass over the data
        lr /= float(1 + k * lr_decay)

    wordembeddings = {}
    for numwordvec in pa.getallwordvecs():
        (num, wordvec) = numwordvec
        word = pa.wordnumrelation.getWord(num)
        embedding = tm.getoutput(wordvec)
        wordembeddings[word] = embedding

    print("Training process done, dumping embeddings into persistent storage!")
    with open("./embeddings.pickle", "wb") as outfile:
        pickle.dump(wordembeddings, outfile)
    print("Training completed! Embedding done.")
    print("time is %f" % (time.time() - t))
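# A minimal usage sketch for the Python 3 version (assumptions: the corpus
# folder "ie_data" comes from the first script above, the stored embeddings
# behave like 1-D numpy arrays, and the two lookup words are hypothetical):
import pickle
import numpy as np

if __name__ == "__main__":
    train("ie_data")

    with open("./embeddings.pickle", "rb") as infile:
        wordembeddings = pickle.load(infile)

    a = np.asarray(wordembeddings["information"])  # hypothetical corpus word
    b = np.asarray(wordembeddings["extraction"])   # hypothetical corpus word
    # Cosine similarity between two learned embeddings
    print(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))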