import numpy

# someBrandFiltering (bf) and the gsLib text clean-up helpers are
# project-specific modules assumed to be imported in this session
def corrBrands(lda, brandListFileName=r".\wordlists\brands.txt"):
    '''
    Computes pairwise brand "correlations" in topic space: for each pair
    of brands, sums p(w1|topic)*p(w2|topic) over all topics of the LDA
    model. Brands whose cleaned token is missing from the model dictionary
    are reported and dropped. Returns (sims, brands), where sims is a
    symmetric len(brands) x len(brands) matrix aligned with the sorted
    brands list.
    '''
    # multi-word brand names are skipped
    brands = [b for b in bf.getMakes(brandListFileName) if b.find(' ') == -1]
    br_tokens = [gsLib.wordCleanUp(gsLib.textCleanUp(word)) for word in brands]
    # check which brands are not in the dictionary
    bad_brands = []
    for (i, brt) in enumerate(br_tokens):
        try:
            ID = lda.id2word.token2id[brt]
            print brands[i], lda.id2word.dfs[ID]
        except KeyError:
            print 'no ' + brt + ' in dict'
            # use the enumerate index; list.index() would return the first
            # match and misreport duplicates
            bad_brands.append(brands[i])
    # keep only brands whose token is in the dictionary
    brands = sorted(set(brands) - set(bad_brands))
    br_tokens = [gsLib.wordCleanUp(gsLib.textCleanUp(word)) for word in brands]
    br_ids = [lda.id2word.token2id[brt] for brt in br_tokens]
    # normalize each topic's lambda row into a probability distribution
    topics = lda.state.get_lambda()
    topics = [topic / topic.sum() for topic in topics]
    l = len(brands)
    sims = numpy.zeros((l, l))
    for i in xrange(l):
        for j in xrange(l):
            # co-occurrence score: sum over topics of p(w_i|t) * p(w_j|t)
            sims[i, j] = sum([t[br_ids[i]] * t[br_ids[j]] for t in topics])
    return (sims, brands)
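# A minimal usage sketch, not part of the original code: it assumes a
# trained gensim LdaModel `lda` and the project's bf/gsLib helpers are
# already in scope, and ranks the most similar brand pairs off the
# diagonal of the returned matrix.
sims, brands = corrBrands(lda)
pairs = []
for i in xrange(len(brands)):
    for j in xrange(i + 1, len(brands)):
        pairs.append((sims[i, j], brands[i], brands[j]))
for (score, b1, b2) in sorted(pairs, reverse=True)[:10]:
    print b1, b2, score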
def corrWords(lda, w1, w2):
    '''
    Topic-space score for a single pair of words: sums
    p(w1|topic)*p(w2|topic) over all topics of the LDA model.
    '''
    id1 = lda.id2word.token2id[gsLib.wordCleanUp(w1)]
    id2 = lda.id2word.token2id[gsLib.wordCleanUp(w2)]
    lam = lda.state.get_lambda()  # fetch once instead of once per topic
    p = []
    for i in xrange(lda.num_topics):
        # normalize topic i's lambda row into a probability distribution
        topic = lam[i] / lam[i].sum()
        p.append(topic[id1] * topic[id2])
        #print i, w1, w2, '{:5.5f}'.format(p[-1]*1e6)
    return sum(p)
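# A hedged example call with a hypothetical word pair (assumes the same
# `lda` model and gsLib helpers). corrWords raises KeyError when a cleaned
# token is missing from the dictionary, so the call is guarded the same
# way pruneWordsList below handles missing words.
try:
    score = corrWords(lda, 'engine', 'transmission')
    print 'engine~transmission:', score
except KeyError:
    print 'one of the words is not in the LDA dictionary'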
def pruneWordsList(words, lda):
    '''
    goes through a list of words, stems them, and checks whether each word
    is in the dict of the LDA model; if it is, returns it along with its
    count in the corpus and its ID in the dict.
    possible problem - some models were calibrated without stemming, and
    this can reject a valid word because its stem is not in the dict
    '''
    words_tokens = [gslib.wordCleanUp(gslib.textCleanUp(word)) for word in words]
    good_IDs = []
    good_words = []
    counts = []
    for (i, t) in enumerate(words_tokens):
        try:
            ID = lda.id2word.token2id[t]
            #print words[i], lda.id2word.dfs[ID]
            counts.append(lda.id2word.dfs[ID])
            good_IDs.append(ID)
            good_words.append(words[i])
        except KeyError:
            print 'no ' + t + ' in dict'
    df = pandas.DataFrame({'IDs': good_IDs, 'Counts': counts}, index=good_words)
    return df
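# A small usage sketch with a hypothetical word list (assumes `lda` and a
# pandas import; sort_values needs pandas >= 0.17). Words whose stem is
# missing are printed and dropped from the returned frame.
df = pruneWordsList(['engine', 'warranty', 'notaword123'], lda)
print df.sort_values('Counts', ascending=False)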
import collections

# gslib (text clean-up helpers) is a project module assumed importable
import someBrandFiltering as bf

allthreads = bf.init()
somethreads = allthreads[0:100]

# count posts per author across all threads
authPostCount = collections.defaultdict(int)
for t in allthreads:
    for post in t.getPosts():
        author = post.msgAuthor
        authPostCount[author] += 1
len(authPostCount)  #375,569 (interactive check)

# stem author names the same way corpus words are stemmed
authorStems = set()
for author in authPostCount.keys():
    authorStem = gslib.wordCleanUp(gslib.textCleanUp(author))
    authorStems.add(authorStem)
len(authorStems)  #273,418 (interactive check)

# show author names that contain a space
for author in sorted(authPostCount.keys()):
    if author.find(' ') != -1:
        print author

# dump raw and stemmed author lists to text files
with open('authList.txt', 'w') as authListF:
    for author in sorted(authPostCount.keys()):
        authListF.write(author + '\n')
with open('authStemsList.txt', 'w') as authListF:
    for author in sorted(authorStems):
        authListF.write(author + '\n')
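# A follow-up sketch, assuming the session above has run: inspect the ten
# most active authors by post count.
for (author, n) in sorted(authPostCount.items(),
                          key=lambda kv: kv[1], reverse=True)[:10]:
    print author, n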