Code Example #1
    def process(self):
        categories = self.get_all_categories()
        for i, category in enumerate(categories):
            reviews = self.review_supplier.get_reviews(category)
            # Optionally cap each category at 500 randomly chosen reviews:
            # if len(reviews) > 500:
            #     random.shuffle(reviews)
            #     reviews = reviews[:500]
            corpus = vocabulary.load_sentences(reviews)

            # Build the vocabulary and map each document to a list of word ids.
            voca = vocabulary.Vocabulary(True)
            docs = [voca.doc_to_ids(doc) for doc in corpus]
            try:
                # 10 topics, alpha = beta = 0.5, 20 learning iterations.
                model = LDA(10, 0.5, 0.5, docs, voca.size(), False)
                lda_learning(model, 20, voca)
            except Exception as e:
                print("Error generating data for %d: %s" % (category, e))
                continue  # skip this category; no model was trained
            d = self.get_word_topic_dist(model, voca)
            self.insert_term_topic_frequencies(category, d)
            if i % 10 == 0:
                print("%d finished" % i)
Code Example #2
File: task2.py  Project: jiangruoyi/exercises
{<NBAR><IN><NBAR>} # Above, connected with in/of/etc...
"""
chunker = nltk.RegexpParser(grammar)
lineNum = 0
corpus = []
with open('../data/deals.txt', 'r') as fp:
    for line in fp:
        lineNum += 1
        if lineNum % 1000 == 0:
            print("processed %d lines!" % lineNum)
        line = line.strip()
        if len(line) == 0:
            continue
        final_words = []
        toks = nltk.regexp_tokenize(line, sentence_re)
        postoks = nltk.tag.pos_tag(toks)
        tree = chunker.parse(postoks)
        # we only care about noun phrases
        terms = get_terms(tree)
#        terms = [normalise(w) for w in toks if acceptable_word(w)]
        for term in terms:
            for word in term:
                final_words.append(word)
        corpus += [final_words]
# the with-block closes the file automatically

voca = vocabulary.Vocabulary(nltk.corpus.stopwords.words('english'))
docs = [voca.doc_to_ids(doc) for doc in corpus]
# train LDA: 20 topics, alpha = beta = 0.5, 50 iterations
ldaTest = lda.LDA(20, 0.5, 0.5, docs, voca.size())
lda.lda_learning(ldaTest, 50, voca)
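The snippet above begins partway through its chunk grammar and relies on helpers (sentence_re, get_terms) defined earlier in task2.py but not shown here. Below is a minimal sketch of what that missing preamble could look like, using a common NLTK noun-phrase grammar whose final rule matches the fragment shown above; the token pattern and helper body are assumptions, not the project's actual code.

import nltk

# Assumed token pattern; the original regex is not shown in the snippet.
sentence_re = r"[A-Za-z0-9$%']+"

# A common noun-phrase chunk grammar; its final NP rule matches the fragment above.
grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}   # nouns and adjectives, terminated with nouns
    NP:
        {<NBAR>}             # a bare NBAR
        {<NBAR><IN><NBAR>}   # above, connected with in/of/etc...
"""

def get_terms(tree):
    # Assumed helper: yield the lower-cased words of every NP subtree.
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
        yield [word.lower() for word, tag in subtree.leaves()]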
Code Example #3
File: run_lda.py  Project: wangshaonan/LDA
for corpusfile in corpuslist:
    print(corpusfile)
    corpus = open(sys.argv[2] + '/' + corpusfile, 'r').readlines()
    corpuswrite1 = open(sys.argv[3] + '/' + corpusfile + '.phi', 'w')
    corpuswrite2 = open(sys.argv[3] + '/' + corpusfile + '.theta', 'w')

    # each line of the corpus file is one document: space-separated word ids
    docs = []
    for line in corpus:
        words = line.strip().split()
        tmp_docs = []
        for tmp in words:
            tmp_docs.append(int(tmp))
        # an empty line becomes an empty document (appended exactly once)
        docs.append(tmp_docs)

    # the .vocab file lists one entry per line; the word is the third column
    vv = open(sys.argv[2] + '/' + corpusfile + '.vocab', 'r').readlines()
    voca = []
    for line in vv:
        words = line.strip().split()
        voca.append(words[2])

    K = 2
    alpha = 0.05   # bigger alpha = smoother theta
    beta = 0.05
    iteration = 2
    print('alpha: ' + str(alpha) + ' beta: ' + str(beta) + ' iteration: ' + str(iteration))
    lda0 = lda.LDA(K, alpha, beta, docs, len(voca))
    lda_learning(lda0, iteration, voca, corpuswrite1, corpuswrite2)
    corpuswrite1.close()
    corpuswrite2.close()
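The layouts of the corpus files and the matching .vocab files are only implied by the parsing above. Here is a tiny, hypothetical pair of inputs consistent with that parsing; the data below is invented for illustration and is not taken from the project.

# Hypothetical corpus file: one document per line, space-separated integer word ids.
example_corpus_lines = [
    "0 2 2 1\n",
    "3 0\n",
]
# Hypothetical .vocab file: the word sits in the third whitespace-separated column.
example_vocab_lines = [
    "0 12 battery\n",
    "1 7 life\n",
    "2 5 screen\n",
    "3 3 dim\n",
]

docs = [[int(tok) for tok in line.split()] for line in example_corpus_lines]  # [[0, 2, 2, 1], [3, 0]]
voca = [line.split()[2] for line in example_vocab_lines]                      # ['battery', 'life', 'screen', 'dim']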