Example #1
import os
import re
import cPickle as pickle

# ProbArray and TrainModel are assumed to be defined elsewhere in the project.

pa = ProbArray()
folder = "ie_data"
filepaths = [os.path.join(folder, name) for name in os.listdir(folder)]
# Match words of at least two characters, allowing internal apostrophes.
rgx = re.compile(r"([\w][\w']*\w)")
for filepath in filepaths:
    alltext = open(filepath).read().lower()
    # Drop unusually long tokens (15 characters or more).
    words = filter(lambda x: len(x) < 15, re.findall(rgx, alltext))
    # Record each adjacent word pair as context in both directions.
    for i in xrange(1, len(words)):
        pa.addcontext(words[i-1], words[i])
        pa.addcontext(words[i], words[i-1])

maxnum = 20000  # vocabulary size; could also use pa.wordnumrelation.maxnum
newdims = 100   # target embedding dimensionality
tm = TrainModel(maxnum, newdims)
ntimes = 2      # number of training passes over the vocabulary
pa.freeze()
for k in xrange(ntimes):
    for numwordvec in pa.getallwordvecs():
        tm.trainonone(numwordvec[1])
wordembeddings = {}
for numwordvec in pa.getallwordvecs():
    (num, wordvec) = numwordvec
    word = pa.wordnumrelation.getWord(num)
    embedding = tm.getoutput(wordvec)
    wordembeddings[word] = embedding
# Pickle needs a binary file mode.
outfile = open("embeddings.pickle", "wb")
pickle.dump(wordembeddings, outfile)
outfile.close()
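To sanity-check the result, the pickled embeddings can be loaded back with the same module. A minimal sketch, assuming the script above has already written embeddings.pickle:

import cPickle as pickle

with open("embeddings.pickle", "rb") as infile:
    wordembeddings = pickle.load(infile)

# Each key is a word; each value is the autoencoder's hidden representation.
for word in list(wordembeddings)[:5]:
    print word, wordembeddings[word]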
Example #2

import os
import re
import time
import cPickle as pickle

def train(folder, contextSize=5, min_count=100, newdims=100, ntimes=2,
          maxnum=10000, lr=0.4):
    '''
    Train the autoencoder on every file in `folder` and pickle the
    resulting word embeddings to ./embeddings.pickle.

    contextSize -- number of preceding words used as context
    min_count   -- words seen fewer than this many times are collected as garbage words
    newdims     -- embedding dimensionality
    ntimes      -- number of training passes over the vocabulary
    maxnum      -- maximum vocabulary size passed to TrainModel
    lr          -- initial learning rate (decayed after each pass)
    '''
    t = time.time()
    lr_decay = 0.95
    pa = ProbArray()
    # Frequency to filter out low freq words
    freq = {}
    filepaths = [os.path.join(folder, name) for name in os.listdir(folder)]
    rgx = re.compile(r"([\w][\w']*\w)")
    # First pass: count word frequencies so rare words can be pruned.
    print "Pre-processing (cleaning garbage words)"
    for filepath in filepaths:
        text = open(filepath).read().lower()
        tokens = re.findall(rgx, text)
        for token in tokens:
            freq[token] = freq.get(token, 0) + 1

    # Collect words seen fewer than min_count times as garbage.
    garbageWords = [word for word, count in freq.items() if count < min_count]

    print "Generating co-occurence matrix"
    doc_text = ""
    for filepath in filepaths:
        text = open(filepath).read().lower()
        words = re.findall(rgx, text)
        N = len(words)
        temp = [' '] * (N +  contextSize)
        temp[contextSize : (contextSize + N)] = words
        words = temp
        for i in xrange(contextSize, (contextSize + N)):
            # Filter out garbage words"
            #if words[i] not in garbageWords:
            # Include context size specified by user
            for j in xrange(i-contextSize, i):
                if words[i] != ' ' and words[j] != ' ':
                        pa.addcontext(words[j], words[i])
                        pa.addcontext(words[i], words[j])

    print "Co-occurence matrix generated"
    print "Starting training"
    tm = TrainModel(maxnum, newdims)
    pa.freeze()
    for k in xrange(ntimes):
        for numwordvec in pa.getallwordvecs():
            tm.trainonone(numwordvec[1])
        lr /=float(1+k*lr_decay)

    wordembeddings = {}
    for numwordvec in pa.getallwordvecs():
        (num, wordvec) = numwordvec
        word = pa.wordnumrelation.getWord(num)
        embedding = tm.getoutput(wordvec)
        wordembeddings[word] = embedding

    print "Training proces done, dumping embedding into persistant storage!"

    outfile = open("./embeddings.pickle", "w")
    pickle.dump(wordembeddings, outfile)
    outfile.close()
    print "Training completed! Embedding done."
    print "time is %f" % (time.time()-t)
Example #3

import os
import re
import time
import pickle

def train(folder,
          contextSize=5,
          min_count=100,
          newdims=100,
          ntimes=2,
          maxnum=10000,
          lr=0.4):
    '''
    Train the autoencoder on every file in `folder` and pickle the
    resulting word embeddings to ./embeddings.pickle.

    contextSize -- number of preceding words used as context
    min_count   -- words seen fewer than this many times are collected as garbage words
    newdims     -- embedding dimensionality
    ntimes      -- number of training passes over the vocabulary
    maxnum      -- maximum vocabulary size passed to TrainModel
    lr          -- initial learning rate (decayed after each pass)
    '''
    t = time.time()
    lr_decay = 0.95
    pa = ProbArray()
    # Frequency to filter out low freq words
    freq = {}
    filepaths = [os.path.join(folder, name) for name in os.listdir(folder)]
    rgx = re.compile(r"([\w][\w']*\w)")
    # First pass: count word frequencies so rare words can be pruned.
    print("Pre-processing (cleaning garbage words)")
    for filepath in filepaths:
        text = open(filepath).read().lower()
        tokens = re.findall(rgx, text)
        for token in tokens:
            freq[token] = freq.get(token, 0) + 1

    # Collect words seen fewer than min_count times as garbage.
    garbageWords = [word for word, count in freq.items() if count < min_count]

    print("Generating co-occurence matrix")
    doc_text = ""
    for filepath in filepaths:
        text = open(filepath).read().lower()
        words = re.findall(rgx, text)
        N = len(words)
        temp = [' '] * (N + contextSize)
        temp[contextSize:(contextSize + N)] = words
        words = temp
        for i in range(contextSize, (contextSize + N)):
            # Filter out garbage words"
            #if words[i] not in garbageWords:
            # Include context size specified by user
            for j in range(i - contextSize, i):
                if words[i] != ' ' and words[j] != ' ':
                    pa.addcontext(words[j], words[i])
                    pa.addcontext(words[i], words[j])

    print("Co-occurence matrix generated")
    print("Starting training")
    tm = TrainModel(maxnum, newdims)
    pa.freeze()
    for k in range(ntimes):
        for numwordvec in pa.getallwordvecs():
            tm.trainonone(numwordvec[1])
        lr /= float(1 + k * lr_decay)

    wordembeddings = {}
    for numwordvec in pa.getallwordvecs():
        (num, wordvec) = numwordvec
        word = pa.wordnumrelation.getWord(num)
        embedding = tm.getoutput(wordvec)
        wordembeddings[word] = embedding

    print("Training proces done, dumping embedding into persistant storage!")

    with open(r'./embeddings.pickle', "wb") as outfile:
        pickle.dump(wordembeddings, outfile)
    print("Training completed! Embedding done.")
    print("time is %f" % (time.time() - t))