コード例 #1
0
def corrBrands(lda, brandListFileName=r".\wordlists\brands.txt"):
    """
    Build a topic-space similarity matrix between single-word brand names.

    Each brand name is cleaned/stemmed to a token; brands whose token is
    missing from the LDA dictionary are reported and dropped.  Similarity
    between two brands is the sum over topics of the joint (normalized)
    probability of their tokens.

    Parameters:
        lda               -- trained gensim-style LDA model (uses .id2word,
                             .state.get_lambda())
        brandListFileName -- path of the brand list read via bf.getMakes()

    Returns:
        (sims, brands) -- sims is an NxN numpy array, brands the sorted
        list of surviving brand names (row/column order of sims).
    """
    # keep single-word brands only: a multi-word name cannot map to one token
    brands = [b for b in bf.getMakes(brandListFileName) if b.find(' ') == -1]
    br_tokens = [gsLib.wordCleanUp(gsLib.textCleanUp(word)) for word in brands]

    # probe the dictionary; collect brands whose token is not in it
    bad_brands = []
    for (i, brt) in enumerate(br_tokens):
        try:
            ID = lda.id2word.token2id[brt]
            print('%s %s' % (brands[i], lda.id2word.dfs[ID]))
        except KeyError:
            print('no ' + brt + ' in dict')
            # BUGFIX: use the loop index, not br_tokens.index(brt) --
            # .index() returns the FIRST occurrence, so with duplicate
            # tokens the wrong brand was flagged as bad
            bad_brands.append(brands[i])

    # keep only brands that survived the dictionary check, then re-derive
    # their tokens and dictionary ids (all lookups are now guaranteed to hit)
    brands = sorted(list(set(brands) - set(bad_brands)))
    br_tokens = [gsLib.wordCleanUp(gsLib.textCleanUp(word)) for word in brands]
    br_ids = [lda.id2word.token2id[brt] for brt in br_tokens]

    # normalize each topic's word weights into a probability distribution
    topics = lda.state.get_lambda()
    topics = [topic / topic.sum() for topic in topics]

    n = len(brands)
    sims = numpy.zeros((n, n))
    for i in range(n):
        for j in range(n):
            # joint probability of both brand tokens, summed over topics
            sims[i, j] = sum(t[br_ids[i]] * t[br_ids[j]] for t in topics)

    return (sims, brands)
コード例 #2
0
def corrWords(lda, w1, w2):
    """
    Topic-space correlation of two words: the sum over all topics of the
    joint (normalized) probability of both words' cleaned tokens.

    Parameters:
        lda    -- trained gensim-style LDA model
        w1, w2 -- raw words; each is cleaned via gsLib.wordCleanUp before
                  dictionary lookup

    Returns:
        float -- sum over topics of p(w1|topic) * p(w2|topic)

    Raises:
        KeyError -- if either cleaned word is absent from the dictionary.
    """
    id1 = lda.id2word.token2id[gsLib.wordCleanUp(w1)]
    id2 = lda.id2word.token2id[gsLib.wordCleanUp(w2)]
    # hoist the loop-invariant call: get_lambda() returns the full
    # topic-word matrix, so fetching it once per topic was wasted work
    lam = lda.state.get_lambda()
    p = []
    for i in range(lda.num_topics):
        topic = lam[i] / lam[i].sum()   # normalize row to probabilities
        p.append(topic[id1] * topic[id2])
    return sum(p)
コード例 #3
0
def pruneWordsList(words, lda):
    '''
    Stem each word and keep only those whose stem is present in the LDA
    dictionary; report the rest.  Returns a DataFrame indexed by the
    surviving (original) words, with their dictionary IDs and corpus
    document-frequency counts.
    possible problem - some models were calibrated without stemming and this
    can reject a valid word because its stem is not in dict
    '''
    good_words = []
    good_IDs = []
    counts = []
    for word in words:
        token = gslib.wordCleanUp(gslib.textCleanUp(word))
        try:
            token_id = lda.id2word.token2id[token]
            df_count = lda.id2word.dfs[token_id]
        except KeyError:
            print('no ' + token + ' in dict')
            continue
        good_words.append(word)
        good_IDs.append(token_id)
        counts.append(df_count)

    return pandas.DataFrame({'IDs': good_IDs, 'Counts': counts},
                            index=good_words)
コード例 #4
0
def pruneWordsList(words, lda):
    '''
    goes through list of words, stems them and checks if the word is in dict of LDA
    if it is, appends returns it along with count in corpus and ID in dict
    possible problem - some models were calibrated without stemming and this
    can reject a valid word because its stem is not in dict
    '''
    # accumulate (word, dict-id, corpus-count) triples for the survivors
    kept = []
    for word in words:
        stem = gslib.wordCleanUp(gslib.textCleanUp(word))
        try:
            wid = lda.id2word.token2id[stem]
            kept.append((word, wid, lda.id2word.dfs[wid]))
        except KeyError:
            print('no ' + stem + ' in dict')

    good_words = [w for (w, wid, c) in kept]
    good_IDs = [wid for (w, wid, c) in kept]
    counts = [c for (w, wid, c) in kept]
    return pandas.DataFrame({'IDs': good_IDs, 'Counts': counts},
                            index=good_words)
コード例 #5
0
import someBrandFiltering as bf

# Load every forum thread through the brand-filtering helper.
allthreads = bf.init()

# Small slice kept around for quick interactive experiments.
somethreads = allthreads[0:100]

# Count how many posts each author wrote across all threads.
authPostCount = collections.defaultdict(int)
for thread in allthreads:
    for post in thread.getPosts():
        authPostCount[post.msgAuthor] += 1
len(authPostCount)  #375,569

# Collect the cleaned/stemmed form of every author name.
authorStems = set()
for author in authPostCount:
    authorStems.add(gslib.wordCleanUp(gslib.textCleanUp(author)))
len(authorStems)  #27,3418

# Show the author names that contain a space.
for author in sorted(authPostCount.keys()):
    if author.find(' ') != -1:
        print(author)

# Dump raw and stemmed author lists, one name per line.
with open('authList.txt', 'w') as authListF:
    for name in sorted(authPostCount.keys()):
        authListF.write(name + '\n')

with open('authStemsList.txt', 'w') as authListF:
    for stem in sorted(authorStems):
        authListF.write(stem + '\n')
コード例 #6
0

# Pull in all threads via the brand-filtering module.
allthreads = bf.init()

# A 100-thread sample for ad-hoc testing.
somethreads = allthreads[0:100]

# Tally per-author post counts over the full thread set.
authPostCount = collections.defaultdict(int)
for t in allthreads:
    for msg in t.getPosts():
        authPostCount[msg.msgAuthor] += 1
len(authPostCount)  #375,569

# Stem every distinct author name and keep the unique stems.
authorStems = set(gslib.wordCleanUp(gslib.textCleanUp(a))
                  for a in authPostCount.keys())
len(authorStems)  #27,3418

# Print authors whose names contain a space.
for a in sorted(authPostCount.keys()):
    if a.find(' ') != -1:
        print(a)


# Write both author lists (raw and stemmed) to disk, one per line.
with open('authList.txt', 'w') as authListF:
    for a in sorted(authPostCount.keys()):
        authListF.write(a + '\n')

with open('authStemsList.txt', 'w') as authListF:
    for s in sorted(authorStems):
        authListF.write(s + '\n')