from __future__ import print_function

import collections
import logging
import pickle
import time

import gensim
import numpy
import pandas

import someBrandFiltering as bf
import gslib  # local text-cleanup/LDA helper module; import name assumed from usage below


def corrBrands(lda, brandListFileName=r".\wordlists\brands.txt"):
    '''
    Builds a brand-brand similarity matrix from the LDA topic-word weights:
    sims[i, j] = sum over topics t of p(brand_i | t) * p(brand_j | t).
    Multi-word brand names are skipped.
    '''
    brands = [b for b in bf.getMakes(brandListFileName) if b.find(' ') == -1]
    br_tokens = [gslib.wordCleanUp(gslib.textCleanUp(word)) for word in brands]
    # drop brands whose stemmed token is not in the LDA dictionary
    bad_brands = []
    for (i, brt) in enumerate(br_tokens):
        try:
            ID = lda.id2word.token2id[brt]
            print(brands[i], lda.id2word.dfs[ID])
        except KeyError:
            print('no ' + brt + ' in dict')
            bad_brands.append(brands[i])
    # rebuild the brand, token, and ID lists from the surviving brands
    brands = sorted(set(brands) - set(bad_brands))
    br_tokens = [gslib.wordCleanUp(gslib.textCleanUp(word)) for word in brands]
    br_ids = [lda.id2word.token2id[brt] for brt in br_tokens]
    # normalize each topic's word weights into a probability distribution
    topics = lda.state.get_lambda()
    topics = [topic / topic.sum() for topic in topics]
    l = len(brands)
    sims = numpy.zeros((l, l))
    for i in xrange(l):
        for j in xrange(l):
            sims[i, j] = sum(t[br_ids[i]] * t[br_ids[j]] for t in topics)
    return (sims, brands)
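# Usage sketch for corrBrands, assuming a model trained and saved by main()
# below; the model path argument and the example brand are hypothetical.
def demoCorrBrands(modelFname, brand):
    lda = gensim.models.ldamodel.LdaModel.load(modelFname)
    (sims, brands) = corrBrands(lda)
    # rank the other brands by their topic-space inner product with `brand`
    i = brands.index(brand)
    order = numpy.argsort(sims[i])[::-1]
    print([brands[j] for j in order[:6]])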
def pruneWordsList(words, lda):
    '''
    Goes through a list of words, stems each one, and checks whether it is
    in the LDA dictionary; if it is, returns it along with its count in the
    corpus and its ID in the dictionary.
    Possible problem: some models were calibrated without stemming, so this
    can reject a valid word because its stem is not in the dictionary.
    '''
    words_tokens = [gslib.wordCleanUp(gslib.textCleanUp(word)) for word in words]
    good_IDs = []
    good_words = []
    counts = []
    for (i, t) in enumerate(words_tokens):
        try:
            ID = lda.id2word.token2id[t]
            #print(words[i], lda.id2word.dfs[ID])
            counts.append(lda.id2word.dfs[ID])
            good_IDs.append(ID)
            good_words.append(words[i])
        except KeyError:
            print('no ' + t + ' in dict')
    df = pandas.DataFrame({'IDs': good_IDs, 'Counts': counts}, index=good_words)
    return df
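# Usage sketch for pruneWordsList; the word list is illustrative. Words whose
# stems the model does not know are reported and dropped from the frame.
def demoPruneWordsList(lda):
    words = ['engine', 'transmission', 'warranty', 'zzznotaword']
    df = pruneWordsList(words, lda)
    # keep only words that appear in at least 100 documents
    print(df[df['Counts'] >= 100])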
allthreads = bf.init()
somethreads = allthreads[0:100]

# count posts per author
authPostCount = collections.defaultdict(int)
for t in allthreads:
    for post in t.getPosts():
        author = post.msgAuthor
        authPostCount[author] += 1
len(authPostCount)  # 375,569

# stem the author names and count the distinct stems
authorStems = set()
for author in authPostCount.keys():
    authorStem = gslib.wordCleanUp(gslib.textCleanUp(author))
    authorStems.add(authorStem)
len(authorStems)  # 273,418

# show author names that contain a space
for author in sorted(authPostCount.keys()):
    if author.find(' ') != -1:
        print(author)

# dump the author names and their stems to text files
with open('authList.txt', 'w') as authListF:
    for author in sorted(authPostCount.keys()):
        authListF.write(author + '\n')
with open('authStemsList.txt', 'w') as authListF:
    for author in sorted(authorStems):
        authListF.write(author + '\n')
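# Quick follow-up sketch using the counts built above: the ten most active
# authors by post count (purely illustrative).
topAuthors = sorted(authPostCount.items(), key=lambda kv: kv[1], reverse=True)[:10]
for (author, n) in topAuthors:
    print(author, n)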
def main(outdir=r'Z:\ermunds\results\2005 20t unbranded',
         num_passes=2,
         n_repeat=10,
         num_topics=20,
         threadChoseStr='',
         modelTag='2005+',
         time_low_cutoff=time.strptime("1 Jan 2005", "%d %b %Y"),
         time_hi_cutoff=time.strptime("1 Jan 2006", "%d %b %Y"),
         ):
    '''
    time_low_cutoff, time_hi_cutoff - posts are chosen between these two dates
    threadChoseStr - filter thread names by this phrase
    '''
    dTr = bf.notMain(threadChoseStr)
    modelName = modelTag + str(num_topics) + 'topics'
    dirs = gslib.LDAdirs(modelName, outdir)
    with open(dirs.dataFileName, 'a') as file1:
        pickle.dump(dTr, file1)

    ## set up logging to file and console
    logger = logging.getLogger('')
    logger.setLevel(logging.DEBUG)
    fh = logging.FileHandler(dirs.logFileName)
    fh.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(name)-12s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    fh.setFormatter(formatter)
    logger.addHandler(ch)
    logger.addHandler(fh)

    ## get threads, extract post texts, and save them to a single file (~7 min per 1 GB)
    logging.info("building doc list")
    lineCounter = 0
    with open(dirs.allDocsFileName, 'a') as docDumpFile:
        for Trlist in dTr.values():
            for Tr in Trlist:
                for p in Tr.getPosts():
                    if (p.msgTime > time_low_cutoff) and (p.msgTime < time_hi_cutoff):
                        doc = gslib.textCleanUp(p.msgTitle) + gslib.textCleanUp(p.msgText)
                        lineCounter += 1
                        print(doc, file=docDumpFile)
    logging.info("total {} docs".format(lineCounter))

    # build dict (~1.5 h per GB)
    dict1 = gslib.build_dict(dirs)
    dict1.save(dirs.dictFileName)
    #dict1 = gensim.corpora.dictionary.Dictionary().load(dirs.dictFileName)

    # pipe the doc file into a gensim corpus
    # FIXME: corpusAdapter is missing a len() property
    corpus = gslib.corpusAdapter(dirs.allDocsFileName, id2word=dict1)
    gensim.corpora.MmCorpus.serialize(fname=dirs.corpusFname, corpus=corpus, id2word=dict1)
    mm = gensim.corpora.MmCorpus(dirs.corpusFname)

    ## run the LDA (~2 h per update on 2M posts):
    ## first run a small step, then update n_repeat-1 times, saving results to disk each time
    lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=dict1,
                                          num_topics=num_topics,
                                          update_every=0, passes=num_passes)
    lda.save(dirs.modelFname + "_0")
    for i in xrange(n_repeat - 1):
        lda.update(mm)
        # save intermediate result
        lda.save(dirs.modelFname + "_" + str(i + 1))
        for t in lda.show_topics(-1):
            logging.info('all topics here' + t)
    lda.save(dirs.modelFname)

    logger.removeHandler(ch)
    logger.removeHandler(fh)
    return modelName
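# Illustrative invocation, mirroring the 2005 defaults; the 2006 dates, output
# directory, and tag are assumptions, not values taken from the original runs.
if __name__ == '__main__':
    main(outdir=r'Z:\ermunds\results\2006 20t unbranded',
         modelTag='2006+',
         time_low_cutoff=time.strptime("1 Jan 2006", "%d %b %Y"),
         time_hi_cutoff=time.strptime("1 Jan 2007", "%d %b %Y"))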