def classify(dbfile, run_num, log_info=None):
    """Run the community builder *run_num* times against *dbfile*.

    Each run's clustering is scored against the ground-truth labels;
    returns a (mean, std) pair aggregated over all runs via mean_std().
    """
    real = cmpcluster.load_doc_labels(dbfile)
    print('sample_test %d' % run_num)
    builder = CommunityBuilder(dbfile, log_info)
    # One metric tuple per build; cmp_cluster compares predicted vs. real labels.
    metrics = [cmp_cluster(builder.build(), real) for _ in range(run_num)]
    mean, std = mean_std(metrics)
    return (mean, std)
def classify(dbfile, run_num, log_info=None):
    """Score repeated clustering runs; return (mean, std) of the metrics."""
    real = cmpcluster.load_doc_labels(dbfile)
    print('sample_test %d' % run_num)
    cb = CommunityBuilder(dbfile, log_info)
    metrics = []
    completed = 0
    # Re-cluster until the requested number of runs has been scored.
    while completed < run_num:
        predicted = cb.build()
        metrics.append(cmp_cluster(predicted, real))
        completed += 1
    mean, std = mean_std(metrics)
    return (mean, std)
def test_title_cluster(dbfile='../data/sample_test.db'):
    """Detect communities on the title word graph and print cluster purity."""
    from CommunityBuilder import CommunityBuilder
    import comdect

    builder = CommunityBuilder(dbfile)
    graph = builder.load_title_wordnet(2)
    detector = comdect.WalkCommunityDetection(min_nodes=15)
    communities = detector.detect(graph)
    print(len(communities))
    # Map vertex ids back to word names before comparing with the reference.
    communities = comuid2name(graph, communities)
    reference_clusters = load_topic_keyword(dbfile)
    purity = compare_word_cluster(communities, reference_clusters)
    print(purity)
class CommunityBuilderTest(unittest.TestCase):
    """Unit tests for CommunityBuilder's title word-graph loading."""

    def setUp(self):
        # Build against the checked-in topic sample database.
        self.cb = CommunityBuilder('../data/cn-topic.db')

    def test_load_title_wordnet(self):
        g = self.cb.load_title_wordnet()
        # Fixed typos in the original message ("vertex and eges" -> "vertices and edges").
        print('count of vertices and edges: %d, %d' % (g.vcount(), g.ecount()))
def main(): import preproc_qqtopic import extract_keyword import worddf dbfile = "../data/time_test.db" logtimefile = "../result/time_test.log" timefile = open(logtimefile, "w") timefile.write("ndocs\ttime(sec)\n") rang = xrange(500, 7500, 500) for num in rang: files = sample_docs(num, "../data/newsgn") if os.path.exists(dbfile): os.remove(dbfile) preproc_qqtopic.prepro_topic(dbfile, "time_test", files) wdf = worddf.WordDF("c") wdf.add_docs_from_db(dbfile) wdf.close() dbcon = extract_keyword.init_db(dbfile) extract_keyword.word_preproc(dbcon) extract_keyword.title_keyword(dbcon) extract_keyword.title_df(dbcon) extract_keyword.content_keyword(dbcon) extract_keyword.topic_keyword(dbcon) dbcon.close() cb = CommunityBuilder(dbfile) time, c = 0.0, 6 print "time_test %d" % num for i in range(c): cb.build() time += cb._run_time timefile.write("%d\t%.3f\n" % (num, time / c)) timefile.close()
def main():
    """End-to-end clustering run on the Sougou corpus; write metrics and open the log."""
    import pretext
    import extract_keyword2
    import worddf

    dbfile = '../data/sougou.db'
    logsteadyfile = '../result/sougou.log'
    # 'with' guarantees the log is closed even if the pipeline raises
    # (the original leaked the handle on error).
    with open(logsteadyfile, 'w') as steadyfile:
        steadyfile.write(out_result_header())
        # Only (re)build the database when it does not exist yet.
        if not os.path.exists(dbfile):
            pretext.load_topiclist(dbfile, '/home/cs/download/cluster_data/sougou')
        eva = WordWeightEvaluation(30, '../data/worddf')
        ke = extract_keyword2.DBKeywordExtractor(dbfile, eva)
        ke.init_db()
        ke.content_keyword()
        ke.title_keyword()
        ke.topic_keyword()
        ke.close_db()
        cb = CommunityBuilder(dbfile)
        real = cmpcluster.load_doc_labels(dbfile)
        metrics = []
        runs = 1
        print('fudan')
        for i in range(runs):
            print('Time %d' % (i + 1))
            predicted = cb.build(max_depth=5, min_doc_num=20)
            metrics.append(cmp_cluster(predicted, real))
        mean, std = mean_std(metrics)
        steadyfile.write('%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n' % tuple(mean))
        steadyfile.write('%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n' % tuple(std))
    os.system('emacs ' + logsteadyfile)
def main():
    """Stability test: 50 clustering runs on the steady_test corpus; log mean/std metrics."""
    import preproc_qqtopic
    import extract_keyword2
    import worddf

    dbfile = '../data/steady_test.db'
    logsteadyfile = '../result/steady_test.log'
    # 'with' guarantees the log is closed even if the pipeline raises
    # (the original leaked the handle on error).
    with open(logsteadyfile, 'w') as steadyfile:
        steadyfile.write(out_result_header())
        # Only (re)build the database when it does not exist yet.
        if not os.path.exists(dbfile):
            preproc_qqtopic.load_topiclist(dbfile, '../data/topicgj')
            ke = extract_keyword2.DBKeywordExtractor(dbfile)
            ke.init_db()
            ke.content_keyword()
            ke.title_keyword()
            ke.topic_keyword()
            ke.close_db()
        cb = CommunityBuilder(dbfile)
        real = cmpcluster.load_doc_labels(dbfile)
        metrics = []
        runs = 50
        print('steady_test')
        for i in range(runs):
            print('Time %d' % (i + 1))
            predicted = cb.build()
            metrics.append(cmp_cluster(predicted, real))
        mean, std = mean_std(metrics)
        steadyfile.write('%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n' % tuple(mean))
        steadyfile.write('%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n' % tuple(std))
    os.system('emacs ' + logsteadyfile)
def main():
    """Benchmark clustering time against corpus size; log mean seconds per build."""
    import preproc_qqtopic
    import extract_keyword
    import worddf

    dbfile = '../data/time_test.db'
    logtimefile = '../result/time_test.log'
    builds_per_size = 6  # average several builds to smooth timing noise
    # Context manager closes the log even when a stage raises
    # (the original leaked the handle on error).
    with open(logtimefile, 'w') as timefile:
        timefile.write('ndocs\ttime(sec)\n')
        for num in xrange(500, 7500, 500):
            files = sample_docs(num, '../data/newsgn')
            # Start from a fresh database for every corpus size.
            if os.path.exists(dbfile):
                os.remove(dbfile)
            preproc_qqtopic.prepro_topic(dbfile, 'time_test', files)
            wdf = worddf.WordDF('c')
            wdf.add_docs_from_db(dbfile)
            wdf.close()
            dbcon = extract_keyword.init_db(dbfile)
            extract_keyword.word_preproc(dbcon)
            extract_keyword.title_keyword(dbcon)
            extract_keyword.title_df(dbcon)
            extract_keyword.content_keyword(dbcon)
            extract_keyword.topic_keyword(dbcon)
            dbcon.close()
            cb = CommunityBuilder(dbfile)
            elapsed = 0.0  # renamed from 'time' to avoid shadowing the stdlib module
            print('time_test %d' % num)
            for _ in range(builds_per_size):
                cb.build()
                elapsed += cb._run_time
            timefile.write('%d\t%.3f\n' % (num, elapsed / builds_per_size))
def main():
    """Single evaluation run on the Sougou corpus; write metric mean/std to the log."""
    import pretext
    import extract_keyword2
    import worddf

    dbfile = '../data/sougou.db'
    logsteadyfile = '../result/sougou.log'
    # Context manager closes the log even when the pipeline raises
    # (the original leaked the handle on error).
    with open(logsteadyfile, 'w') as steadyfile:
        steadyfile.write(out_result_header())
        # Skip corpus import if the database already exists.
        if not os.path.exists(dbfile):
            pretext.load_topiclist(dbfile, '/home/cs/download/cluster_data/sougou')
        eva = WordWeightEvaluation(30, '../data/worddf')
        ke = extract_keyword2.DBKeywordExtractor(dbfile, eva)
        ke.init_db()
        ke.content_keyword()
        ke.title_keyword()
        ke.topic_keyword()
        ke.close_db()
        cb = CommunityBuilder(dbfile)
        real = cmpcluster.load_doc_labels(dbfile)
        metrics = []
        run_count = 1
        print('fudan')
        for i in range(run_count):
            print('Time %d' % (i + 1))
            metrics.append(cmp_cluster(cb.build(max_depth=5, min_doc_num=20), real))
        mean, std = mean_std(metrics)
        steadyfile.write('%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n' % tuple(mean))
        steadyfile.write('%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n' % tuple(std))
    os.system('emacs ' + logsteadyfile)
def main():
    """Run the clustering 50 times on steady_test and log metric mean/std."""
    import preproc_qqtopic
    import extract_keyword2
    import worddf

    dbfile = '../data/steady_test.db'
    logsteadyfile = '../result/steady_test.log'
    # Context manager closes the log even when the pipeline raises
    # (the original leaked the handle on error).
    with open(logsteadyfile, 'w') as steadyfile:
        steadyfile.write(out_result_header())
        # Skip corpus import if the database already exists.
        if not os.path.exists(dbfile):
            preproc_qqtopic.load_topiclist(dbfile, '../data/topicgj')
            ke = extract_keyword2.DBKeywordExtractor(dbfile)
            ke.init_db()
            ke.content_keyword()
            ke.title_keyword()
            ke.topic_keyword()
            ke.close_db()
        cb = CommunityBuilder(dbfile)
        real = cmpcluster.load_doc_labels(dbfile)
        metrics = []
        run_count = 50
        print('steady_test')
        for i in range(run_count):
            print('Time %d' % (i + 1))
            metrics.append(cmp_cluster(cb.build(), real))
        mean, std = mean_std(metrics)
        steadyfile.write('%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n' % tuple(mean))
        steadyfile.write('%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n' % tuple(std))
    os.system('emacs ' + logsteadyfile)
def get_graph(filename='/home/cs/src/semantic_community/data/sample_test.db'):
    """Return the title word graph built from the given database file."""
    from CommunityBuilder import CommunityBuilder
    return CommunityBuilder(filename).load_title_wordnet()
def setUp(self):
    """Bind a CommunityBuilder to the checked-in topic database for each test."""
    db_path = '../data/cn-topic.db'
    self.cb = CommunityBuilder(db_path)