Ejemplo n.º 1
0
def corrBrands(lda,brandListFileName=r".\wordlists\brands.txt"):
    """Build a brand-by-brand similarity matrix from an LDA model's topics.

    Single-word brand names are read through ``bf.getMakes``, cleaned with
    ``gsLib.textCleanUp`` followed by ``gsLib.wordCleanUp`` and looked up in
    ``lda.id2word``.  A brand whose cleaned token is missing from
    ``token2id`` (or whose id is missing from ``dfs``) is reported on stdout
    and left out of the result.

    Args:
        lda: model object exposing ``id2word`` (with ``token2id`` and ``dfs``
            mappings) and ``state.get_lambda()``.
        brandListFileName: path of the brand list handed to ``bf.getMakes``.

    Returns:
        Tuple ``(sims, brands)``.  ``brands`` is the sorted, de-duplicated
        list of brands found in the dictionary.  ``sims`` is a
        ``len(brands) x len(brands)`` float array where ``sims[i, j]`` is the
        sum over topics of the product of the row-normalised topic weights
        of brand ``i``'s token and brand ``j``'s token.
    """
    # Multi-word makes are skipped: only names without a space are kept.
    candidates = [b for b in bf.getMakes(brandListFileName) if ' ' not in b]

    token2id = lda.id2word.token2id
    dfs = lda.id2word.dfs

    # Map every usable brand to its token id.  Recording the brand that is
    # actually being checked (rather than searching the token list for the
    # first match) keeps brands that share a cleaned token from being
    # mis-attributed, which previously left a missing brand in the list and
    # made the id lookup below fail with KeyError.
    ids_by_brand = {}
    for brand in candidates:
        token = gsLib.wordCleanUp(gsLib.textCleanUp(brand))
        try:
            token_id = token2id[token]
            doc_freq = dfs[token_id]
        except KeyError:
            print('no ' + token + ' in dict')
            continue
        print('%s %s' % (brand, doc_freq))
        ids_by_brand[brand] = token_id

    brands = sorted(ids_by_brand)
    br_ids = numpy.asarray([ids_by_brand[b] for b in brands], dtype=int)

    # assumes get_lambda() yields a 2-D (topics x terms) array, as implied
    # by the per-row normalisation -- TODO confirm for other model types.
    lam = numpy.asarray(lda.state.get_lambda())

    # Only the brand columns are needed; dividing them by the full row sums
    # gives the same values as normalising every topic row first.
    weights = lam[:, br_ids] / lam.sum(axis=1)[:, numpy.newaxis]

    # sims[i, j] = sum over topics of weights[t, i] * weights[t, j].
    sims = numpy.asarray(numpy.dot(weights.T, weights), dtype=float)

    return (sims, brands)
# Count the entries of authPostCount whose value is greater than 5.
# Original annotation: 53,068 -- presumably the result on the author's data.
sum(1 for c in authPostCount.values() if c > 5)

s = r"Z:\ermunds\results\1 prices paid\5-6-2013\PricesStemmed20passes_20topics.dict"
dict1 = gensim.corpora.dictionary.Dictionary().load(s)

tupl = []
for ID in dict1.keys():
    tupl.append((dict1.dfs[ID], dict1[ID]))
tupl = sorted(tupl, reverse=True)

with open('wordCounts', 'w') as f:
    for t in tupl:
        f.write(str(t) + '\n')

lst = []
for b in bf.getMakes():
    token = gslib.wordCleanUp(gslib.textCleanUp(b))
    try:
        ID = dict1.token2id[token]
        fr = dict1.dfs[ID]
        print b, fr, token
        lst.append((fr, b))
    except KeyError:
        print b, 'fail', token

lst = sorted(lst)
fname = 'brand_mentions_count.txt'
with open(fname, 'w') as outfile:
    for t in lst:
        outfile.write(t[1] + ":" + str(t[0]) + '\n')

s = r"Z:\ermunds\results\1 prices paid\5-6-2013\PricesStemmed20passes_20topics.dict"
dict1 = gensim.corpora.dictionary.Dictionary().load(s)

tupl = []
for ID in dict1.keys():
     tupl.append((dict1.dfs[ID],dict1[ID]))   
tupl=sorted(tupl,reverse=True)

with open('wordCounts','w') as f:
    for t in tupl:
        f.write(str(t)+'\n')
        
lst= []
for b in bf.getMakes():
    token = gslib.wordCleanUp(gslib.textCleanUp(b))
    try:    
        ID=dict1.token2id[token]
        fr=dict1.dfs[ID]
        print b,fr,token
        lst.append((fr,b))
    except KeyError:
        print b,'fail',token
        
lst = sorted(lst)
fname = 'brand_mentions_count.txt'
with open(fname,'w') as outfile:
    for t in lst:
        outfile.write(t[1]+":"+str(t[0])+'\n')