import numpy

# someBrandFiltering (bf) and the gsLib text clean-up helpers are
# project-specific modules assumed to be imported in this session
def corrBrands(lda, brandListFileName=r".\wordlists\brands.txt"):
    '''
    Computes pairwise brand "correlations" in topic space: for each pair
    of brands, sums p(w1|topic)*p(w2|topic) over all topics of the LDA
    model. Brands whose cleaned token is missing from the model dictionary
    are reported and dropped. Returns (sims, brands), where sims is a
    symmetric len(brands) x len(brands) matrix aligned with the sorted
    brands list.
    '''
    # multi-word brand names are skipped
    brands = [b for b in bf.getMakes(brandListFileName) if b.find(' ') == -1]
    br_tokens = [gsLib.wordCleanUp(gsLib.textCleanUp(word)) for word in brands]
    # check which brands are not in the dictionary
    bad_brands = []
    for (i, brt) in enumerate(br_tokens):
        try:
            ID = lda.id2word.token2id[brt]
            print brands[i], lda.id2word.dfs[ID]
        except KeyError:
            print 'no ' + brt + ' in dict'
            # use the enumerate index; list.index() would return the first
            # match and misreport duplicates
            bad_brands.append(brands[i])
    # keep only brands whose token is in the dictionary
    brands = sorted(set(brands) - set(bad_brands))
    br_tokens = [gsLib.wordCleanUp(gsLib.textCleanUp(word)) for word in brands]
    br_ids = [lda.id2word.token2id[brt] for brt in br_tokens]
    # normalize each topic's lambda row into a probability distribution
    topics = lda.state.get_lambda()
    topics = [topic / topic.sum() for topic in topics]
    l = len(brands)
    sims = numpy.zeros((l, l))
    for i in xrange(l):
        for j in xrange(l):
            # co-occurrence score: sum over topics of p(w_i|t) * p(w_j|t)
            sims[i, j] = sum([t[br_ids[i]] * t[br_ids[j]] for t in topics])
    return (sims, brands)
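# A minimal usage sketch, not part of the original code: it assumes a
# trained gensim LdaModel `lda` and the project's bf/gsLib helpers are
# already in scope, and ranks the most similar brand pairs off the
# diagonal of the returned matrix.
sims, brands = corrBrands(lda)
pairs = []
for i in xrange(len(brands)):
    for j in xrange(i + 1, len(brands)):
        pairs.append((sims[i, j], brands[i], brands[j]))
for (score, b1, b2) in sorted(pairs, reverse=True)[:10]:
    print b1, b2, score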
def corrWords(lda, w1, w2):
    '''
    Topic-space score for a single pair of words: sums
    p(w1|topic)*p(w2|topic) over all topics of the LDA model.
    '''
    id1 = lda.id2word.token2id[gsLib.wordCleanUp(w1)]
    id2 = lda.id2word.token2id[gsLib.wordCleanUp(w2)]
    lam = lda.state.get_lambda()  # fetch once instead of once per topic
    p = []
    for i in xrange(lda.num_topics):
        # normalize topic i's lambda row into a probability distribution
        topic = lam[i] / lam[i].sum()
        p.append(topic[id1] * topic[id2])
        #print i, w1, w2, '{:5.5f}'.format(p[-1]*1e6)
    return sum(p)
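# A hedged example call with a hypothetical word pair (assumes the same
# `lda` model and gsLib helpers). corrWords raises KeyError when a cleaned
# token is missing from the dictionary, so the call is guarded the same
# way pruneWordsList below handles missing words.
try:
    score = corrWords(lda, 'engine', 'transmission')
    print 'engine~transmission:', score
except KeyError:
    print 'one of the words is not in the LDA dictionary'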
def pruneWordsList(words, lda):
    '''
    goes through a list of words, stems them, and checks whether each word
    is in the dict of the LDA model; if it is, returns it along with its
    count in the corpus and its ID in the dict.
    possible problem - some models were calibrated without stemming, and
    this can reject a valid word because its stem is not in the dict
    '''
    words_tokens = [gslib.wordCleanUp(gslib.textCleanUp(word)) for word in words]
    good_IDs = []
    good_words = []
    counts = []
    for (i, t) in enumerate(words_tokens):
        try:
            ID = lda.id2word.token2id[t]
            #print words[i], lda.id2word.dfs[ID]
            counts.append(lda.id2word.dfs[ID])
            good_IDs.append(ID)
            good_words.append(words[i])
        except KeyError:
            print 'no ' + t + ' in dict'
    df = pandas.DataFrame({'IDs': good_IDs, 'Counts': counts}, index=good_words)
    return df
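# A small usage sketch with a hypothetical word list (assumes `lda` and a
# pandas import; sort_values needs pandas >= 0.17). Words whose stem is
# missing are printed and dropped from the returned frame.
df = pruneWordsList(['engine', 'warranty', 'notaword123'], lda)
print df.sort_values('Counts', ascending=False)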
import collections

# gslib (text clean-up helpers) is a project module assumed importable
import someBrandFiltering as bf

allthreads = bf.init()
somethreads = allthreads[0:100]

# count posts per author across all threads
authPostCount = collections.defaultdict(int)
for t in allthreads:
    for post in t.getPosts():
        author = post.msgAuthor
        authPostCount[author] += 1
len(authPostCount)  #375,569 (interactive check)

# stem author names the same way corpus words are stemmed
authorStems = set()
for author in authPostCount.keys():
    authorStem = gslib.wordCleanUp(gslib.textCleanUp(author))
    authorStems.add(authorStem)
len(authorStems)  #273,418 (interactive check)

# show author names that contain a space
for author in sorted(authPostCount.keys()):
    if author.find(' ') != -1:
        print author

# dump raw and stemmed author lists to text files
with open('authList.txt', 'w') as authListF:
    for author in sorted(authPostCount.keys()):
        authListF.write(author + '\n')
with open('authStemsList.txt', 'w') as authListF:
    for author in sorted(authorStems):
        authListF.write(author + '\n')
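# A follow-up sketch, assuming the session above has run: inspect the ten
# most active authors by post count.
for (author, n) in sorted(authPostCount.items(),
                          key=lambda kv: kv[1], reverse=True)[:10]:
    print author, n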