def process(self):
    categories = self.get_all_categories()
    for i, category in enumerate(categories):
        reviews = self.review_supplier.get_reviews(category)
        # Optionally cap each category at 500 randomly sampled reviews:
        # if len(reviews) > 500:
        #     random.shuffle(reviews)
        #     reviews = reviews[:500]
        corpus = vocabulary.load_sentences(reviews)
        voca = vocabulary.Vocabulary(True)
        docs = [voca.doc_to_ids(doc) for doc in corpus]
        try:
            l = LDA(10, 0.5, 0.5, docs, voca.size(), False)
            lda_learning(l, 20, voca)
        except Exception, e:
            print "Error generating data for %d" % category
            continue  # l is undefined if training failed; skip this category
        d = self.get_word_topic_dist(l, voca)
        self.insert_term_topic_frequencies(category, d)
        if i % 10 == 0:
            print "%d finished" % i
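# get_word_topic_dist is called above but not defined in this snippet. A
# minimal sketch of what it could look like, assuming the LDA class follows
# shuyo's lda.py, which exposes worddist() (a K x V array of per-topic word
# probabilities) and a Vocabulary that maps ids back to words via voca[w].
# The body below is an assumption, not the original implementation.
def get_word_topic_dist(self, model, voca):
    phi = model.worddist()  # phi[k][w] = P(word w | topic k)
    dist = []
    for k in range(len(phi)):
        # pair every vocabulary word with its probability under topic k
        dist.append([(voca[w], phi[k][w]) for w in range(voca.size())])
    return dist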
        {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
"""
chunker = nltk.RegexpParser(grammar)

lineNum = 0
corpus = []
with open('../data/deals.txt', 'r') as fp:
    for line in fp:
        lineNum += 1
        if (lineNum % 1000) == 0:
            print "processed " + repr(lineNum) + " lines!"
        line = line.strip()
        if len(line) == 0:  # skip blank lines (checked after stripping)
            continue
        final_words = []
        toks = nltk.regexp_tokenize(line, sentence_re)
        postoks = nltk.tag.pos_tag(toks)
        tree = chunker.parse(postoks)
        # we only care about noun phrases
        terms = get_terms(tree)
        # terms = [normalise(w) for w in toks if acceptable_word(w)]
        for term in terms:
            for word in term:
                final_words.append(word)
        corpus += [final_words]

voca = vocabulary.Vocabulary(nltk.corpus.stopwords.words('english'))
docs = [voca.doc_to_ids(doc) for doc in corpus]
# call LDA method
ldaTest = lda.LDA(20, 0.5, 0.5, docs, voca.size())
lda.lda_learning(ldaTest, 50, voca)
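# sentence_re, get_terms, normalise, and acceptable_word are used above but
# not defined in this snippet. A plausible sketch, following the widely used
# NLTK noun-phrase extraction recipe this code appears to be based on; the
# exact originals may differ.
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
stop = set(stopwords.words('english'))

sentence_re = r'''(?x)          # verbose regexp
      (?:[A-Z]\.)+              # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*              # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?        # currency/percentages, e.g. $12.40, 82%
    | \.\.\.                    # ellipsis
'''

def leaves(tree):
    """Yield the leaf lists of every NP chunk in the parse tree."""
    # NLTK 2.x used t.node instead of t.label()
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
        yield subtree.leaves()

def normalise(word):
    """Lower-case and lemmatise a word."""
    return lemmatizer.lemmatize(word.lower())

def acceptable_word(word):
    """Keep words of reasonable length that are not stopwords."""
    return 2 <= len(word) <= 40 and word.lower() not in stop

def get_terms(tree):
    """Yield each noun phrase as a list of normalised words."""
    for leaf in leaves(tree):
        yield [normalise(w) for w, t in leaf if acceptable_word(w)]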
for corpusfile in corpuslist:
    print corpusfile
    corpus = open(sys.argv[2] + '/' + corpusfile, 'r').readlines()
    corpuswrite1 = open(sys.argv[3] + '/' + corpusfile + '.phi', 'w')
    corpuswrite2 = open(sys.argv[3] + '/' + corpusfile + '.theta', 'w')

    # each corpus line is a document: whitespace-separated word ids
    docs = []
    for line in corpus:
        words = line.strip().split()
        if len(words) == 0:
            docs.append([])  # preserve empty documents
            continue
        docs.append([int(tmp) for tmp in words])

    # vocab file: one entry per line, surface form in the third column
    vv = open(sys.argv[2] + '/' + corpusfile + '.vocab', 'r').readlines()
    voca = []
    for line in vv:
        words = line.strip().split()
        voca.append(words[2])

    K = 2
    alpha = 0.05  # bigger alpha = smoother theta
    beta = 0.05
    iteration = 2
    print 'alpha: ' + str(alpha) + ' beta: ' + str(beta) + ' iteration: ' + str(iteration)

    lda0 = lda.LDA(K, alpha, beta, docs, len(voca))
    lda_learning(lda0, iteration, voca, corpuswrite1, corpuswrite2)
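# corpuslist is consumed by the loop above but built elsewhere in the script.
# A minimal sketch of that setup, assuming sys.argv[2] is the input directory
# and every file without a .vocab suffix is a corpus file (an assumption; the
# actual selection rule is not shown here):
import os
import sys

corpuslist = [f for f in os.listdir(sys.argv[2]) if not f.endswith('.vocab')]
corpuslist.sort()  # process corpora in a stable order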