def topic_mod5(category='Shoes'):
    """Re-fit a labeled-LDA model on the stored per-topic word lists for
    *category* and refresh the topic_words / topic_detail /
    topic_date_detail tables.

    Relies on module-level globals: ``cur`` / ``conn`` (MySQL cursor and
    connection), ``LLDA``, ``reuters``, plus the option defaults below.

    Parameters
    ----------
    category : str
        Product category to rebuild (default ``'Shoes'``).  NOTE(review):
        the original deleted rows for an undefined global ``cats`` while
        inserting rows for the hard-coded ``'Shoes'``; both now use
        *category* consistently.
    """
    parser = OptionParser()
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.001)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.001)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=50)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    parser.add_option("-s", dest="seed", type="int", help="random seed", default=None)
    parser.add_option("-n", dest="samplesize", type="int", help="dataset sample size", default=100)
    (options, args) = parser.parse_args()

    random.seed(options.seed)
    numpy.random.seed(options.seed)
    # idlist is never read below, but the sample consumes the RNG stream;
    # kept so seeded runs reproduce the original behavior exactly.
    idlist = random.sample(reuters.fileids(), options.samplesize)

    labels = []
    corpus = []
    labelset = []

    # One pseudo-document per stored topic: its words ordered by weight.
    # Parameterized queries replace the original string concatenation
    # (injection-prone and broken for topics containing quotes).
    cur.execute("SELECT distinct topic from topic_words where category = %s",
                (category,))
    for row in cur.fetchall():
        topic = row[0]
        labels.append([topic])
        labelset.append(topic)
        cur1 = conn.cursor()
        cur1.execute(
            "SELECT word from topic_words where category = %s and topic = %s "
            "order by weight desc",
            (category, topic))
        wordlist = [word[0] for word in cur1]
        print(wordlist)
        corpus.append([x.lower() for x in wordlist
                       if x != '' and x[0] in string.ascii_letters])

    llda = LLDA(options.K, options.alpha, options.beta)
    llda.set_corpus(labelset, corpus, labels)

    for i in range(options.iteration):
        sys.stderr.write("-- %d : %.4f\n" % (i, llda.perplexity()))
        llda.inference()
    print("perplexity : %.4f" % llda.perplexity())

    phi = llda.phi()
    cur.execute("delete from topic_words where category = %s", (category,))
    for k, label in enumerate(labelset):
        print("\n-- label %d : %s" % (k, label))
        if label != "common":
            for w in numpy.argsort(-phi[k])[:20]:
                print("%s: %.4f" % (llda.vocas[w], phi[k, w]))
                # BUG FIX: the original embedded the literal text
                # "phi[k,w]" inside the SQL string, so the computed weight
                # was never actually written to the table.
                cur.execute(
                    "INSERT INTO topic_words (category, topic, word, weight) "
                    "VALUES (%s, %s, %s, %s) "
                    "ON DUPLICATE KEY UPDATE weight=VALUES(weight)",
                    (category, label, llda.vocas[w], float(phi[k, w])))

    # Rebuild the aggregate sentiment tables from the fresh topic words.
    cur.execute("delete from topic_detail where category = %s", (category,))
    cur.execute("delete from topic_date_detail where category = %s", (category,))
    cur.execute(
        "Insert into topic_detail select t1.topic, t1.pos_count, t2.neg_Count, "
        "(t1.pos_count+t2.neg_Count) total_count, %s from "
        "(select a.topic, count(b.word) pos_count from topic_words a, comment_words b "
        "where a.word = b.word and b.score >3 group by a.topic)t1,"
        "(select a.topic, count(b.word) neg_count from topic_words a, comment_words b "
        "where a.word = b.word and b.score < 3 group by a.topic) t2 "
        "where t1.topic = t2.topic",
        (category,))
    cur.execute(
        "insert into topic_date_detail select t1.topic, t1.c_date, t1.pos_count, "
        "t1.pos_words, t2.neg_count, t2.neg_words, %s from "
        "(select a.topic, b.c_date, count(a.word) pos_count, "
        "GROUP_CONCAT(distinct(b.word)) pos_words from topic_words a, comment_words b "
        "where a.word = b.word and b.score >3 group by a.topic,b.c_date )t1, "
        "(select a.topic, b.c_date, count(a.word) neg_count, "
        "GROUP_CONCAT(distinct(b.word)) neg_words from topic_words a, comment_words b "
        "where a.word = b.word and b.score < 3 group by a.topic,b.c_date )t2 "
        "where t1.topic = t2.topic and t1.c_date= t2.c_date",
        (category,))
def BIC_2(set_tags, posts, tags):
    """Fit a labeled-LDA model on *posts* labeled with *tags* and return the
    top words per tag.

    Relies on module-level globals ``options`` (K, alpha, beta, iteration)
    and ``LLDA``.

    Parameters
    ----------
    set_tags : iterable
        The label set; each element is a sequence of words (a multi-word tag).
    posts : iterable of iterables
        Tokenized documents.
    tags : iterable of iterables
        Per-document label lists, parallel to *posts*.

    Returns
    -------
    dict
        Maps each space-joined tag to a ``{word: phi_weight}`` dict of its
        top-20 words.
    """
    # Shallow-copy the inputs so the model works on independent lists
    # (the original copied element-by-element in explicit loops).
    corpus = [list(post) for post in posts]
    labels = [list(tag) for tag in tags]
    labelset = list(set_tags)

    llda = LLDA(options.K, options.alpha, options.beta)
    llda.set_corpus(labelset, corpus, labels)
    for i in range(options.iteration):
        sys.stderr.write("-- %d : %.4f\n" % (i, llda.perplexity()))
        llda.inference()
    print("perplexity : %.4f" % llda.perplexity())

    phi = llda.phi()
    tag_dict = {}
    for k, label in enumerate(set_tags):
        print("\n-- label %d : %s" % (k, label))
        vocab_dict = {}
        for w in numpy.argsort(-phi[k])[:20]:
            print("%s: %.4f" % (llda.vocas[w], phi[k, w]))
            vocab_dict[llda.vocas[w]] = phi[k, w]
        # Space-join the multi-word tag to form the key.  (The original
        # built this with a manual accumulator that raised NameError for
        # an empty label; join yields "" instead.)
        tag_dict[" ".join(label)] = vocab_dict
    return tag_dict
elif line[0] in mapping_2: tim.append("Artificial-Intelligence") elif line[0] in mapping_3: tim.append("Information-Retreival") elif line[0] in mapping_4: tim.append("Computer-Vision") else: tim.append("Other") #print (labels) #labels.append(tim) labelset = [ "Databases", "Artificial-Intelligence", "Information-Retreival", "Computer-Vision", "Other" ] llda = LLDA(options.K, options.alpha, options.beta) llda.set_corpus(labelset, corpus, labels) print("M=%d, V=%d, L=%d, K=%d" % (len(corpus), len(llda.vocas), len(labelset), options.K)) for i in range(options.iteration): sys.stderr.write("-- %d : %.4f\n" % (i, llda.perplexity())) llda.inference() print("perplexity : %.4f" % (llda.perplexity())) phi = llda.phi() for k, label in enumerate(labelset): print("\n-- label %d : %s" % (k, label)) for w in numpy.argsort(-phi[k])[:30]: print("%s " % (llda.vocas[w]), end="")
corpus[index].append(word) voc.add(word) filenames, docs = jieba_util.docdir_handler_tfidf('sspider/data', f) # filenames, docs = jieba_util.docdir_handler('sspider/data-160', f) labels = [['健康', '长寿', '锻炼', '生活', '心理', '饮食']] * len(docs) # labels = [[name.decode('GB2312').rstrip('.txt') for name in filenames]] * len(docs) # labels = [name.decode('GB2312').rstrip('.txt').split(' ') for name in filenames] # print corpus # print len(corpus) # print ', '.join([''.join(i) for i in labels]) labelset = list(set(reduce(list.__add__, labels))) llda = LLDA(K=50, alpha=1.0 / len(labels), beta=1.0 / len(voc)) llda.set_corpus(labelset, corpus, labels) vocab = llda.vocas iter_count = 50 print "M=%d, V=%d, L=%d, K=%d, W=%d" % (len(corpus), len( llda.vocas), len(labelset), iter_count, len(vocab)) for i in range(iter_count): sys.stderr.write("-- %d : %.4f\n" % (i, llda.perplexity())) llda.inference() print "perplexity : %.4f" % llda.perplexity() topic_word = llda.phi()
(options, args) = parser.parse_args()

# Seed both RNGs so a given seed reproduces the same sample and fit.
random.seed(options.seed)
numpy.random.seed(options.seed)

# Build the corpus: a random sample of Reuters documents, keeping only
# alphabetic tokens, with each document's categories as its labels.
labels = []
corpus = []
for doc_id in random.sample(reuters.fileids(), options.samplesize):
    labels.append(reuters.categories(doc_id))
    corpus.append([w.lower() for w in reuters.words(doc_id)
                   if w[0] in string.ascii_letters])
    reuters.words(doc_id).close()

# The label set is the union of every document's category list.
labelset = list(set(functools.reduce(list.__add__, labels)))

llda = LLDA(options.K, options.alpha, options.beta)
llda.set_corpus(labelset, corpus, labels)
print("M=%d, V=%d, L=%d, K=%d" % (len(corpus), len(llda.vocas), len(labelset), options.K))

# Gibbs sampling; per-iteration perplexity goes to stderr so stdout
# stays clean for the final report.
for i in range(options.iteration):
    sys.stderr.write("-- %d : %.4f\n" % (i, llda.perplexity()))
    llda.inference()
print("perplexity : %.4f" % llda.perplexity())

# Report the 20 highest-phi words for each label.
phi = llda.phi()
for k, label in enumerate(labelset):
    print("\n-- label %d : %s" % (k, label))
    for w in numpy.argsort(-phi[k])[:20]:
        print("%s: %.4f" % (llda.vocas[w], phi[k, w]))
# sys.stderr.write("-- %d : %.4f\n" % (i, llda.perplexity())) # llda.inference() # print "perplexity : %.4f" % llda.perplexity() # # phi = llda.phi() # for k, label in enumerate(labelset): # print "\n-- label %d : %s" % (k, label) # for w in numpy.argsort(-phi[k])[:20]: # print "%s: %.4f" % (llda.vocas[w], phi[k,w]) if __name__ == "__main__": # 获取文章的类别 db = MySQLdb.connect(host="localhost", user="******", passwd="123456", db="test", charset='utf8') cursor = db.cursor() sql = "select id, category from article_cat" cursor.execute(sql) results = cursor.fetchall() # 构建输入数据 labels = [unicode(row[1]) for row in results] labelset = range(26) corpus = [] with open("article_cat/seg_join/corpus.txt", "r") as corpus_file: corpus = [doc.decode("utf-8") for doc in corpus_file.readlines()] llda = LLDA(K=50, alpha=0.001, beta=0.001)
default=100) (options, args) = parser.parse_args() random.seed(options.seed) numpy.random.seed(options.seed) idlist = random.sample(reuters.fileids(), options.samplesize) labels = [] corpus = [] for id in idlist: labels.append(reuters.categories(id)) corpus.append( [x.lower() for x in reuters.words(id) if x[0] in string.ascii_letters]) reuters.words(id).close() labelset = list(set(reduce(list.__add__, labels))) options.alpha = 50 / (len(labelset) + 1) llda = LLDA(options.alpha, options.beta, options.K) llda.set_corpus(corpus, labels) print("M=%d, V=%d, L=%d, K=%d" % (len(corpus), len(llda.vocas), len(labelset), options.K)) llda.inference(options.iteration) phi = llda.phi() for k, label in enumerate(labelset): print("\n-- label %d : %s" % (k + 1, label)) for w in numpy.argsort(-phi[k + 1])[:10]: print("%s: %.4f" % (llda.vocas[w], phi[k + 1, w]))