def main(): import preproc_qqtopic import extract_keyword2 #import worddf dbfile = '../data/noise_test.db' lognoisefile = '../result/noise_test.log' noisefile = open(lognoisefile,'w') noisefile.write(out_result_header()) rang = xrange(0,250, 60) for num in rang: dbfile = '../data/noise%d_test.db' % (num,) files = sample_docs(num) if os.path.exists(dbfile): os.remove(dbfile) dbcon = preproc_qqtopic.init_db(dbfile) preproc_qqtopic.load_topiclist(dbcon,'../data/topicgj') cnt = preproc_qqtopic.load_topic(dbcon,'noise_data',files) dbcon.close() print 'add number of noise document: %d' % cnt eva = WordWeightEvaluation(30, '../data/worddf') ke = extract_keyword2.DBKeywordExtractor(dbfile, eva) ke.init_db() ke.content_keyword() ke.title_keyword() ke.topic_keyword() ke.close_db()
def __init__(self, db_path, weight_evaluator=None):
    """Bind the extractor to a database and a word-weight evaluator.

    :param db_path: path to the SQLite database holding the documents.
    :param weight_evaluator: optional pre-built WordWeightEvaluation;
        when omitted, a default one is created and populated from the
        database at ``db_path``.
    """
    self.dbpath = db_path
    self.dbcon = None  # opened lazily (see init_db elsewhere in the class)
    if weight_evaluator:
        self.evaluator = weight_evaluator
    else:
        self.evaluator = WordWeightEvaluation(30, '../data')
        # Fix: original passed the undefined name `dbpath` (NameError);
        # the parameter is `db_path`.  NOTE(review): assumed this call
        # belongs to the else-branch only — a caller-supplied evaluator
        # is presumably already loaded; confirm against call sites.
        self.evaluator.add_docs_from_db(db_path)
def content_keyword(dbcon): print 'extracting keyword from content...' doc_num = dbutils.countOfRecs(dbcon, 'document') wordset = load_wordset(dbcon) cnt = 0 eluate = WordWeightEvaluation(30) for r in dbutils.iterRec(dbcon, 'document', 'docid title content'): word_weight_list = eluate.extract_kw(r[1], r[2]) wordwstr = ' '.join(['%s/%.7f' % idw for idw in word_weight_list]) dbutils.updateByPK(dbcon, 'document', {'kw_content': wordwstr}, {'docid': r[0]}) cnt += 1 if cnt % 20 == 0: utils.updateProgress(cnt, doc_num) print '' eluate.close() dbcon.commit()
def main(): import pretext import extract_keyword2 import worddf dbfile = '../data/sougou.db' logsteadyfile = '../result/sougou.log' steadyfile = open(logsteadyfile, 'w') steadyfile.write(out_result_header()) if not os.path.exists(dbfile): pretext.load_topiclist(dbfile, '/home/cs/download/cluster_data/sougou') eva = WordWeightEvaluation(30, '../data/worddf') ke = extract_keyword2.DBKeywordExtractor(dbfile, eva) ke.init_db() ke.content_keyword() ke.title_keyword() ke.topic_keyword() ke.close_db() cb = CommunityBuilder(dbfile) metrics = list() c = 1 real = cmpcluster.load_doc_labels(dbfile) print 'fudan' for i in range(c): print 'Time %d' % (i + 1) predicted = cb.build(max_depth=5, min_doc_num=20) metrics.append(cmp_cluster(predicted, real)) mean, std = mean_std(metrics) meanstr = '%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n' % tuple(mean) stdstr = '%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n' % tuple(std) steadyfile.write(meanstr) steadyfile.write(stdstr) steadyfile.close() os.system('emacs ' + logsteadyfile)
def topics_to_db(topics, dbfile):
    """Load topic directories into a fresh SQLite DB and extract keywords.

    :param topics: iterable of directory paths, one per topic; the last
        path component (trailing slash tolerated) becomes the topic name.
    :param dbfile: destination database path; recreated from scratch.
    :return: ``dbfile``, for chaining.
    """
    # Fix: dropped the unused `import sqlite3` — the connection comes
    # from preproc_qqtopic.init_db, never from sqlite3 directly.
    if os.path.exists(dbfile):
        os.remove(dbfile)
    dbcon = preproc_qqtopic.init_db(dbfile)
    for t in topics:
        # Topic name = last path component; step back once if the path
        # ends with a slash.
        idx = t.rindex('/')
        if idx == len(t) - 1:
            idx = t.rindex('/', 0, idx)
        tname = t[idx + 1:]
        filelist = [os.path.join(t, f) for f in os.listdir(t)]
        preproc_qqtopic.load_topic(dbcon, tname, filelist)
    dbcon.close()
    # Annotate content, titles and topics with weighted keywords.
    evaluator = WordWeightEvaluation(30)
    ke = extract_keyword2.DBKeywordExtractor(dbfile, evaluator)
    ke.init_db()
    ke.content_keyword()
    ke.title_keyword()
    ke.topic_keyword()
    ke.close_db()
    return dbfile