# Point at the stored artefacts of the "201220topics" LDA run.
dirs = gslib.LDAdirs(
    indir=r"Z:\ermunds\results\2012 20topics",
    modelName="201220topics",
)
docsfilename = dirs.allDocsFileName

# loadStuff hands back three objects, bound here as dict1, mm and lda.
dict1, mm, lda = gslib.loadStuff(dirs)

brands = sims.BrandsClustered_1

# Decompose a post into topics and print them.
# NOTE(review): the leading 1 presumably selects which post to show -
# confirm against gslib.make_sense.
gslib.make_sense(1, lda, mm, docsfilename)

# Guess the topic of a concept list.
#
# Concept lists tried earlier, with the original author's result notes.
# Each of them used to be assigned to `consepts` and then immediately
# overwritten, so only the last list was ever used:
#   ['cheap', 'ugly', 'unrelaible']
#   ['young', 'trendy', 'fast', 'macho']                 # fail
#   ['green', 'environment', 'sustainable', 'hybrid']    # n75
#   ['reliable', 'safe']                                 # n8
# Active list (author's note: n5):
consepts = 'air hot heat cool exhaust system fan coolant temp blow'.split()

# Map the concept words to dictionary IDs.
ws, IDl, ID2index = gslib.world_list2IDs(dict1, consepts,
                                         tokenizef=gslib.wordCleanUp)

# Feed the concepts to the model as a tiny bag-of-words document in which
# every word ID is counted once, then report each (topic, probability) pair.
# NOTE(review): unpacking `_, w` assumes show_topic yields (weight, word)
# pairs - confirm against the installed gensim version.
for t, p in lda[[(wid, 1) for wid in IDl]]:
    top_words = ' '.join([w for _, w in lda.show_topic(t, 10)])
    # A parenthesised single-argument print behaves the same as the
    # Python 2 statement and is also valid on Python 3.
    print('__with prob:{}% is N{}: {}'.format(int(p * 100), t, top_words))
        

# Rank brands within a topic.
brands = (sims.BrandsClustered_1) + ['leaf', 'prius', 'volt', 'camaro']
brands, IDl, ID2index = gslib.world_list2IDs(dict1, brands,
                                             tokenizef=gslib.wordCleanUp)
topicN = 5
# Row `topicN` of the model's lambda matrix; it is indexed by word ID below.
topic = lda.state.get_lambda()[topicN]
# (weight, brand) pairs, largest weight first.
probss = sorted([(topic[ID], brands[ID2index[ID]]) for ID in IDl],
                reverse=True)
for p in probss:
    # print(p) prints the same tuple on Python 2 and is valid on Python 3.
    print(p)

# Rank topics by brand/word: first resolve the probe word to its dictionary
# ID list. Only the middle element of the returned triple is kept.
word = 'toyota'
_, IDl, _ = gslib.world_list2IDs(
    dict1,
    [word],
    tokenizef=gslib.wordCleanUp,
)
#import someBrandFiltering as bf
import ldaModel2WD as wd
import sims_csv_plotter as draw
import mess_with_sims as sims



# Alternative run, kept for reference:
#dirs = gslib.LDAdirs(indir=r"Z:\ermunds\results\all unbranded threads",    modelName="unbranded2passes_20topics")
dirs = gslib.LDAdirs(indir=r"Z:\ermunds\results\sink",
                     modelName="unbranded220topics")

# load() is a classmethod, so call it on the class instead of building an
# empty Dictionary instance only to discard it.
dict1 = gensim.corpora.dictionary.Dictionary.load(dirs.dictFileName)
mm = gensim.corpora.MmCorpus(dirs.corpusFname)

# Brand list source; the bf.getMakes() alternative is left disabled.
#raw_brands = bf.getMakes();
raw_brands = sims.BrandsClustered_1
brandsl, IDl, ID2index = gslib.world_list2IDs(dict1, raw_brands,
                                              tokenizef=gslib.wordCleanUp)

# Both axes currently use the same ID list, so every *_1 / *_2 pair below
# starts out with identical contents. The two sets are separate objects,
# while ID2index_1 and ID2index_2 are aliases of the same mapping.
l_1 = len(IDl)
l_2 = len(IDl)
IDset_1 = set(IDl)
IDset_2 = set(IDl)
ID2index_1 = ID2index
ID2index_2 = ID2index

# Zero-initialised accumulators: one 1 x N row vector per axis and an
# l_1 x l_2 matrix (presumably the co-occurrence counts, going by the name).
counter_1 = numpy.zeros((1, l_1))
counter_2 = numpy.zeros((1, l_2))
coocM = numpy.zeros((l_1, l_2))


# Corpus loop header left disabled by the original author:
#for bow in mm:
# A parenthesised single-argument print works on Python 2 and Python 3.
print('tic')
# Example #3
# 0
import numpy

# Point at the stored artefacts of the "unbranded220topics" LDA run.
dirs = gslib.LDAdirs(
    indir=r"Z:\ermunds\results\sink",
    modelName="unbranded220topics",
)
docsfilename = dirs.allDocsFileName

# loadStuff hands back three objects, bound here as dict1, mm and lda.
dict1, mm, lda = gslib.loadStuff(dirs)

brands = sims.BrandsClustered_1

# Decompose a post into topics and print them.
# NOTE(review): the leading 1 presumably selects which post to show -
# confirm against gslib.make_sense.
gslib.make_sense(1, lda, mm, docsfilename)

# Rank brands within a topic.
brands = (sims.BrandsClustered_1) + ['leaf', 'prius', 'volt', 'camaro']
brands, IDl, ID2index = gslib.world_list2IDs(dict1,
                                             brands,
                                             tokenizef=gslib.wordCleanUp)
topicN = 7
# Row `topicN` of the model's lambda matrix; it is indexed by word ID below.
topic = lda.state.get_lambda()[topicN]
# (weight, brand) pairs, largest weight first.
probss = sorted([(topic[ID], brands[ID2index[ID]]) for ID in IDl],
                reverse=True)
for p in probss:
    # print(p) prints the same tuple on Python 2 and is valid on Python 3.
    print(p)
# --- Disabled section -------------------------------------------------------
# The original wrapped the code below in a triple-quoted string that is never
# closed, which turns everything after it into an unterminated string literal.
# It is kept here as ordinary comments so it stays inert without breaking the
# rest of the file.
#
# ## rank topics by brand/word
# word = 'toyota'
# _,IDl,_ = gslib.world_list2IDs(dict1,[word],tokenizef=gslib.wordCleanUp)
# topics = lda.state.get_lambda()
# probs = numpy.zeros(lda.num_topics)
# for i,t in enumerate(topics): probs[i] =topics[i][IDl[0]]/sum(t)
# idx= numpy.argsort(-probs)
# Example #4
# 0
# loadStuff hands back three objects, bound here as dict1, mm and lda.
# `dirs` and `docsfilename` are expected to be set before this point.
dict1, mm, lda = gslib.loadStuff(dirs)

brands = sims.BrandsClustered_1

# Decompose a post into topics and print them.
# NOTE(review): the leading 1 presumably selects which post to show -
# confirm against gslib.make_sense.
gslib.make_sense(
    1,
    lda,
    mm,
    docsfilename,
)

# Guess the topic of a concept list.
#
# Concept lists tried earlier, with the original author's result notes.
# Each of them used to be assigned to `consepts` and then immediately
# overwritten, so only the last list was ever used:
#   ['cheap', 'ugly', 'unrelaible']
#   ['young', 'trendy', 'fast', 'macho']                 # fail
#   ['green', 'environment', 'sustainable', 'hybrid']    # n75
#   ['reliable', 'safe']                                 # n8
# Active list (author's note: n5):
consepts = 'air hot heat cool exhaust system fan coolant temp blow'.split()

# Map the concept words to dictionary IDs.
ws, IDl, ID2index = gslib.world_list2IDs(dict1,
                                         consepts,
                                         tokenizef=gslib.wordCleanUp)

# Feed the concepts to the model as a tiny bag-of-words document in which
# every word ID is counted once, then report each (topic, probability) pair.
# NOTE(review): unpacking `_, w` assumes show_topic yields (weight, word)
# pairs - confirm against the installed gensim version.
for t, p in lda[[(wid, 1) for wid in IDl]]:
    top_words = ' '.join([w for _, w in lda.show_topic(t, 10)])
    # A parenthesised single-argument print behaves the same as the
    # Python 2 statement and is also valid on Python 3.
    print('__with prob:{}% is N{}: {}'.format(int(p * 100), t, top_words))

# Rank brands within a topic.
brands = (sims.BrandsClustered_1) + ['leaf', 'prius', 'volt', 'camaro']
brands, IDl, ID2index = gslib.world_list2IDs(
    dict1,
    brands,
    tokenizef=gslib.wordCleanUp,
)

topicN = 5
# Row `topicN` of the model's lambda matrix; it is indexed by word ID below.
topic = lda.state.get_lambda()[topicN]

# Build the (weight, brand) pairs, then order them largest weight first.
probss = [(topic[ID], brands[ID2index[ID]]) for ID in IDl]
probss.sort(reverse=True)
for p in probss: