def buildGloVe(author, vocabulary = 'vocabulary.txt', rebuild = False):
    '''
    Train a GloVe model for one author by running the external GloVe tools.

    The work is skipped when rebuild is false and the file
    util.modelDir + author + '.txt' already exists. Otherwise three shell
    commands are run in order (cooccur, shuffle, glove) and the intermediate
    files are deleted afterwards. The training parameters are hard-coded in
    the command strings below; tune the model by editing them.

    author     -- author name; used for gathering documents and for naming
                  the saved model
    vocabulary -- vocabulary file name, resolved relative to util.modelDir
    rebuild    -- when true, train even if the model file already exists

    Returns None. Exit statuses of the shell commands are not checked.
    '''
    # NOTE(review): the check below looks at util.modelDir + author + '.txt',
    # while glove is told to save under 'glove_ModelFiles/<author>'. Confirm
    # that these really refer to the same output file.
    model_exists_path = '%s%s.txt'%(util.modelDir, author)
    if not rebuild and os.path.exists(model_exists_path):
        return

    print('Building GloVe model for %s -----------' % author)

    # Collect the author's documents into a single temporary corpus file.
    corpus_file = 'corpus'
    util.gathterDocs([author], corpus_file)
    vocab_path = util.modelDir+vocabulary

    # Step 1: build the co-occurrence file, i.e.
    # ../glove/cooccur -memory 4.0 -vocab-file <vocab> -verbose 2 -window-size 15 < corpus > coocur.bin
    cooccur_args = '-memory 4.0 -vocab-file %s -verbose 2 -window-size 15 < %s > coocur.bin' % (vocab_path, corpus_file)
    os.system(' '.join(('../glove/cooccur', cooccur_args)))

    # Step 2: shuffle the co-occurrence file, i.e.
    # ../glove/shuffle -memory 4.0 -verbose 2 < coocur.bin > shuffle.bin
    shuffle_args = '-memory 4.0 -verbose 2 < coocur.bin > shuffle.bin'
    os.system(' '.join(('../glove/shuffle', shuffle_args)))

    # Step 3: train the vectors from the shuffled co-occurrences
    # (4 threads, x-max 10, 50 iterations, vector size 50).
    save_target = "ModelFiles/"+author
    glove_args = '-save-file glove_%s -threads 4 -input-file shuffle.bin -x-max 10 -iter 50 -vector-size 50 -binary 2 -vocab-file %s -verbose 2' % (save_target, vocab_path)
    os.system(' '.join(('../glove/glove', glove_args)))

    # Drop the intermediate files, in the same order as before.
    for leftover in (corpus_file, 'shuffle.bin', 'coocur.bin'):
        os.remove(leftover)
def buildVocabulary(rebuild = False):
    '''
    Create util.modelDir + 'vocabulary.txt' with the external vocab_count tool.

    Nothing happens when rebuild is false and the vocabulary file already
    exists. Otherwise the documents of all authors in util.authors are
    gathered into a temporary file, fed to ../glove/vocab_count, and the
    temporary file is removed.

    rebuild -- when true, regenerate the vocabulary even if it already exists

    Returns None. The exit status of the shell command is not checked.
    '''
    # An existing vocabulary is kept unless a rebuild was requested.
    if not rebuild and os.path.exists(util.modelDir+'vocabulary.txt'):
        return

    # Gather every author's documents into one temporary input file.
    raw_corpus = 'voc.txt'
    util.gathterDocs(util.authors, raw_corpus)
    print('voc.txt done')

    # Runs: ../glove/vocab_count -min-count 5 -verbose 2 < voc.txt > <modelDir>vocabulary.txt
    count_args = '-min-count 5 -verbose 2 < %s > %svocabulary.txt' % (raw_corpus, util.modelDir)
    os.system(' '.join(('../glove/vocab_count', count_args)))

    os.remove(raw_corpus)
def buildNgramModels():
    '''
    Build and pickle a 5-gram model for the first author in util.authors.

    The author's documents are gathered into the temporary file 'out.txt',
    read back as a single string, and passed to NM together with a Lidstone
    estimator (gamma 0.2). The resulting model is pickled to
    '<author>_ngram.model' in the current working directory.

    Only util.authors[0] is processed; widen the list in the for statement
    to build models for more authors.

    Returns None. Exceptions from gathering, model construction or pickling
    propagate to the caller; the temporary file is removed in either case
    once it has been gathered.
    '''
    outputFile = 'out.txt'

    # The estimator does not depend on the author, so create it once.
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

    for author in [util.authors[0]]:
        print(util.authors[:1])
        util.gathterDocs([author], outputFile)
        try:
            # The text is handed to NM as one unsplit string.
            with open(outputFile) as corpusFile:
                train = corpusFile.read()

            ngrammodel = NM(5, train, estimator = estimator)

            # pickle needs a binary file; the with block also guarantees the
            # handle is flushed and closed, which the inline open() did not.
            with open(author+'_ngram.model', 'wb') as modelFile:
                pickle.dump(ngrammodel, modelFile)
        finally:
            # Remove the temporary corpus even if building the model failed.
            os.remove(outputFile)