def topic_mod5():
	# Fetch the words for each topic from the database
	parser = OptionParser()
	parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.001)
	parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.001)
	parser.add_option("-k", dest="K", type="int", help="number of topics", default=50)
	parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
	parser.add_option("-s", dest="seed", type="int", help="random seed", default=None)
	parser.add_option("-n", dest="samplesize", type="int", help="dataset sample size", default=100)
	(options, args) = parser.parse_args()
	random.seed(options.seed)
	numpy.random.seed(options.seed)
	
	idlist = random.sample(reuters.fileids(), options.samplesize)

	labels = []
	corpus = []
	labelset = []

	result_set = []
	cur.execute("SELECT distinct topic from topic_words where category = 'Shoes'")
	for row in cur:
		topicset = [row[0]]
		labels.append(topicset)
		labelset.append(row[0])
		wordlist = []
		cur1 = conn.cursor()
		# Parameterized query avoids breaking on quotes in topic names
		cur1.execute("SELECT word FROM topic_words WHERE category = 'Shoes' AND topic = %s ORDER BY weight DESC", (row[0],))
		for word in cur1:
			wordlist.append(word[0])
		print wordlist
		corpus.append([x.lower() for x in wordlist if x != '' and x[0] in string.ascii_letters])

	llda = LLDA(options.K, options.alpha, options.beta)
	llda.set_corpus(labelset, corpus, labels)

	# print "M=%d, V=%d, L=%d, K=%d" % (len(corpus), len(llda.vocas), len(labelset), options.K)

	for i in range(options.iteration):
		sys.stderr.write("-- %d : %.4f\n" % (i, llda.perplexity()))
		llda.inference()
	print "perplexity : %.4f" % llda.perplexity()

	phi = llda.phi()
	cur.execute("delete from topic_words where category = '"+cats+"'")
	
	for k, label in enumerate(labelset):
		print "\n-- label %d : %s" % (k, label)
		if label != "common":
			for w in numpy.argsort(-phi[k])[:20]:
				print "%s: %.4f" % (llda.vocas[w], phi[k, w])
				# The weight must be interpolated as a parameter; the original
				# embedded the literal text "phi[k,w]" inside the SQL string.
				cur.execute("INSERT INTO topic_words (category, topic, word, weight) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE weight = VALUES(weight)", ('Shoes', label, llda.vocas[w], float(phi[k, w])))
	cur.execute("delete from topic_detail where category = '"+cats+"'")
	cur.execute("delete from topic_date_detail where category = '"+cats+"'")
	cur.execute("Insert into topic_detail select t1.topic, t1.pos_count, t2.neg_Count, (t1.pos_count+t2.neg_Count) total_count,'Shoes' from  (select a.topic, count(b.word) pos_count from topic_words a, comment_words b where a.word = b.word    and  b.score >3 group by a.topic)t1,(select a.topic, count(b.word) neg_count from topic_words a, comment_words b where a.word = b.word    and  b.score < 3 group by a.topic) t2 where t1.topic = t2.topic") 
	cur.execute("insert into topic_date_detail select t1.topic, t1.c_date, t1.pos_count, t1.pos_words, t2.neg_count, t2.neg_words,'Shoes' from (select a.topic, b.c_date, count(a.word) pos_count, GROUP_CONCAT(distinct(b.word)) pos_words from topic_words a, comment_words b  where a.word = b.word    and  b.score >3  group by a.topic,b.c_date )t1, (select a.topic, b.c_date, count(a.word) neg_count, GROUP_CONCAT(distinct(b.word)) neg_words from topic_words a, comment_words b  where a.word = b.word    and  b.score < 3  group by a.topic,b.c_date )t2 where t1.topic = t2.topic and t1.c_date= t2.c_date") 
Example #2
def BIC_2(set_tags, posts, tags):
    # Train L-LDA on the given posts and tags, then return a dict mapping each
    # tag combination to its top-20 words and their phi weights.
    tag_dict = {}
    corpus = []
    labels = []
    labelset = []
    for post in posts:
        row_p = []
        for p in post:
            # p_u = unicode(p, "utf-8")
            row_p.append(p)
        corpus.append(row_p)
    for tag in tags:
        row_t = []
        for t in tag:
            # t_u = unicode(t, "utf-8")
            row_t.append(t)
        labels.append(row_t)
    for st in set_tags:
        # st_u = unicode(st, "utf-8")
        labelset.append(st)
    llda = LLDA(options.K, options.alpha, options.beta)
    llda.set_corpus(labelset, corpus, labels)

    for i in range(options.iteration):
        sys.stderr.write("-- %d : %.4f\n" % (i, llda.perplexity()))
        llda.inference()
    print "perplexity : %.4f" % llda.perplexity()

    phi = llda.phi()
    for k, label in enumerate(set_tags):
        print "\n-- label %d : %s" % (k, label)
        vocab_dict = {}
        for w in numpy.argsort(-phi[k])[:20]:
            print "%s: %.4f" % (llda.vocas[w], phi[k,w])

            vocab_dict[llda.vocas[w]] = phi[k,w]
        st = " ".join(label)
        tag_dict[st] = vocab_dict
    return tag_dict
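
# A hedged usage sketch (not in the original): BIC_2 reads a module-level
# `options` object for K, alpha, beta, and iteration, so this assumes those
# have already been parsed. The data values are invented for illustration.
posts = [["great", "fit", "comfortable"], ["sole", "wore", "out", "quickly"]]
tags = [["quality"], ["durability"]]
set_tags = ["quality", "durability"]
top_words_by_tag = BIC_2(set_tags, posts, tags)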
Example #3
            elif line[0] in mapping_2:
                tim.append("Artificial-Intelligence")
            elif line[0] in mapping_3:
                tim.append("Information-Retreival")
            elif line[0] in mapping_4:
                tim.append("Computer-Vision")
            else:
                tim.append("Other")
#print (labels)
#labels.append(tim)
labelset = [
    "Databases", "Artificial-Intelligence", "Information-Retreival",
    "Computer-Vision", "Other"
]

llda = LLDA(options.K, options.alpha, options.beta)
llda.set_corpus(labelset, corpus, labels)

print("M=%d, V=%d, L=%d, K=%d" %
      (len(corpus), len(llda.vocas), len(labelset), options.K))

for i in range(options.iteration):
    sys.stderr.write("-- %d : %.4f\n" % (i, llda.perplexity()))
    llda.inference()
print("perplexity : %.4f" % (llda.perplexity()))

phi = llda.phi()
for k, label in enumerate(labelset):
    print("\n-- label %d : %s" % (k, label))
    for w in numpy.argsort(-phi[k])[:30]:
        print("%s " % (llda.vocas[w]), end="")
Example #4
    corpus[index].append(word)
    voc.add(word)


filenames, docs = jieba_util.docdir_handler_tfidf('sspider/data', f)
# filenames, docs = jieba_util.docdir_handler('sspider/data-160', f)
labels = [['健康', '长寿', '锻炼', '生活', '心理', '饮食']] * len(docs)  # health, longevity, exercise, lifestyle, psychology, diet
# labels = [[name.decode('GB2312').rstrip('.txt') for name in filenames]] * len(docs)
# labels = [name.decode('GB2312').rstrip('.txt').split(' ') for name in filenames]
# print corpus
# print len(corpus)
# print ', '.join([''.join(i) for i in labels])

labelset = list(set(reduce(list.__add__, labels)))

llda = LLDA(K=50, alpha=1.0 / len(labels), beta=1.0 / len(voc))
llda.set_corpus(labelset, corpus, labels)

vocab = llda.vocas
iter_count = 50

print "M=%d, V=%d, L=%d, K=%d, W=%d" % (len(corpus), len(
    llda.vocas), len(labelset), iter_count, len(vocab))

for i in range(iter_count):
    sys.stderr.write("-- %d : %.4f\n" % (i, llda.perplexity()))
    llda.inference()
print "perplexity : %.4f" % llda.perplexity()

topic_word = llda.phi()
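
# A hedged continuation sketch (not in the original): mirroring the other
# examples in this listing, print the top-20 words per label from topic_word.
# Assumes numpy is imported, as in the surrounding snippets.
for k, label in enumerate(labelset):
    print "\n-- label %d : %s" % (k, label)
    for w in numpy.argsort(-topic_word[k])[:20]:
        print "%s: %.4f" % (vocab[w], topic_word[k, w])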
Example #5
(options, args) = parser.parse_args()
random.seed(options.seed)
numpy.random.seed(options.seed)

idlist = random.sample(reuters.fileids(), options.samplesize)

labels = []
corpus = []
for id in idlist:
    labels.append(reuters.categories(id))
    corpus.append([x.lower() for x in reuters.words(id) if x[0] in string.ascii_letters])
    reuters.words(id).close()
labelset = list(set(functools.reduce(list.__add__, labels)))


llda = LLDA(options.K, options.alpha, options.beta)
llda.set_corpus(labelset, corpus, labels)

print("M=%d, V=%d, L=%d, K=%d" % (len(corpus), len(llda.vocas), len(labelset), options.K))

for i in range(options.iteration):
    sys.stderr.write("-- %d : %.4f\n" % (i, llda.perplexity()))
    llda.inference()
print("perplexity : %.4f" % llda.perplexity())

phi = llda.phi()
for k, label in enumerate(labelset):
    print("\n-- label %d : %s" % (k, label))
    for w in numpy.argsort(-phi[k])[:20]:
        print("%s: %.4f" % (llda.vocas[w], phi[k,w]))
Example #6
#     sys.stderr.write("-- %d : %.4f\n" % (i, llda.perplexity()))
#     llda.inference()
# print "perplexity : %.4f" % llda.perplexity()
#
# phi = llda.phi()
# for k, label in enumerate(labelset):
#     print "\n-- label %d : %s" % (k, label)
#     for w in numpy.argsort(-phi[k])[:20]:
#         print "%s: %.4f" % (llda.vocas[w], phi[k,w])

if __name__ == "__main__":

    # Fetch the article categories
    db = MySQLdb.connect(host="localhost",
                         user="******",
                         passwd="123456",
                         db="test",
                         charset='utf8')
    cursor = db.cursor()
    sql = "select id, category from article_cat"
    cursor.execute(sql)
    results = cursor.fetchall()

    # Build the input data
    labels = [unicode(row[1]) for row in results]
    labelset = range(26)
    corpus = []
    with open("article_cat/seg_join/corpus.txt", "r") as corpus_file:
        corpus = [doc.decode("utf-8") for doc in corpus_file.readlines()]
    llda = LLDA(K=50, alpha=0.001, beta=0.001)
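
    # A hedged continuation sketch (not in the original): the snippet stops at
    # model construction, so this simply mirrors the commented-out training
    # loop above. It assumes sys is imported, an iteration count of 100, and
    # that labelset, corpus, and labels are shaped as LLDA.set_corpus expects.
    llda.set_corpus(labelset, corpus, labels)
    for i in range(100):
        sys.stderr.write("-- %d : %.4f\n" % (i, llda.perplexity()))
        llda.inference()
    print "perplexity : %.4f" % llda.perplexity()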
Example #7
                  default=100)
(options, args) = parser.parse_args()
random.seed(options.seed)
numpy.random.seed(options.seed)

idlist = random.sample(reuters.fileids(), options.samplesize)

labels = []
corpus = []
for id in idlist:
    labels.append(reuters.categories(id))
    corpus.append(
        [x.lower() for x in reuters.words(id) if x[0] in string.ascii_letters])
    reuters.words(id).close()
labelset = list(set(reduce(list.__add__, labels)))

options.alpha = 50.0 / (len(labelset) + 1)  # float division; Python 2's integer division could make alpha 0
llda = LLDA(options.alpha, options.beta, options.K)
llda.set_corpus(corpus, labels)

print("M=%d, V=%d, L=%d, K=%d" %
      (len(corpus), len(llda.vocas), len(labelset), options.K))

llda.inference(options.iteration)

phi = llda.phi()
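# Note (an assumption, not from the source): this variant indexes topics from 1
# (hence k + 1 below), which suggests the modified LLDA class reserves index 0,
# perhaps for a common/background topic; that class definition is not shown.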
for k, label in enumerate(labelset):
    print("\n-- label %d : %s" % (k + 1, label))
    for w in numpy.argsort(-phi[k + 1])[:10]:
        print("%s: %.4f" % (llda.vocas[w], phi[k + 1, w]))