Example #1
def classify(dbfile, run_num, log_info=None):
    real = cmpcluster.load_doc_labels(dbfile)
    print 'sample_test %d' % run_num
    metrics = list()
    cb = CommunityBuilder(dbfile,log_info)
    for i in range(run_num):
        predicted = cb.build()
        metrics.append(cmp_cluster(predicted,real))
    mean,std = mean_std(metrics)
    return (mean,std)
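The classify() helper above relies on project utilities that are not shown in this listing, notably cmp_cluster and mean_std. As a hedged illustration only, a minimal mean_std could compute per-column statistics over the collected metric tuples; the sketch below is hypothetical and not the project's actual implementation.

import math

def mean_std(metrics):
    # Hypothetical sketch: per-column mean and population standard
    # deviation over a list of equal-length metric tuples (one per run).
    cols = list(zip(*metrics))
    means = [sum(c) / float(len(c)) for c in cols]
    stds = [math.sqrt(sum((x - m) ** 2 for x in c) / len(c))
            for c, m in zip(cols, means)]
    return means, stds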
Example #2
def classify(dbfile, run_num, log_info=None):
    real = cmpcluster.load_doc_labels(dbfile)
    print 'sample_test %d' % run_num
    metrics = list()
    cb = CommunityBuilder(dbfile, log_info)
    for i in range(run_num):
        predicted = cb.build()
        metrics.append(cmp_cluster(predicted, real))
    mean, std = mean_std(metrics)
    return (mean, std)
Example #3
def test_title_cluster(dbfile = '../data/sample_test.db'):
    from CommunityBuilder import CommunityBuilder
    import comdect
    cb = CommunityBuilder(dbfile)
    g = cb.load_title_wordnet(2)

    detect = comdect.WalkCommunityDetection(min_nodes=15)
    coms = detect.detect(g)
    print len(coms)
    coms = comuid2name(g,coms)
    rwclusters = load_topic_keyword(dbfile)

    purity = compare_word_cluster(coms, rwclusters)
    print purity
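Example #3 calls comuid2name() to turn detected communities back into word lists. That helper is not shown here; assuming an igraph-style graph whose vertices carry a 'name' attribute and communities given as lists of vertex ids, a hypothetical sketch could look like this (not the project's actual code):

def comuid2name(g, coms):
    # Hypothetical sketch: map each community's vertex ids to the
    # corresponding vertex names stored on the graph.
    return [[g.vs[v]['name'] for v in com] for com in coms]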
Example #4
class CommunityBuilderTest(unittest.TestCase):
    
    def setUp(self):
        self.cb = CommunityBuilder('../data/cn-topic.db')

    def test_load_title_wordnet(self):
        g = self.cb.load_title_wordnet()
        print 'count of vertices and edges: %d, %d' % (g.vcount(), g.ecount())
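To run the test case above on its own, assuming the snippet lives in a module with CommunityBuilder importable and the database path present, the standard unittest entry point applies:

import unittest

if __name__ == '__main__':
    unittest.main()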
Example #5
def main():
    import preproc_qqtopic
    import extract_keyword
    import worddf

    dbfile = "../data/time_test.db"
    logtimefile = "../result/time_test.log"

    timefile = open(logtimefile, "w")
    timefile.write("ndocs\ttime(sec)\n")
    rang = xrange(500, 7500, 500)
    for num in rang:
        files = sample_docs(num, "../data/newsgn")
        if os.path.exists(dbfile):
            os.remove(dbfile)

        preproc_qqtopic.prepro_topic(dbfile, "time_test", files)

        wdf = worddf.WordDF("c")
        wdf.add_docs_from_db(dbfile)
        wdf.close()

        dbcon = extract_keyword.init_db(dbfile)
        extract_keyword.word_preproc(dbcon)
        extract_keyword.title_keyword(dbcon)
        extract_keyword.title_df(dbcon)
        extract_keyword.content_keyword(dbcon)
        extract_keyword.topic_keyword(dbcon)
        dbcon.close()

        cb = CommunityBuilder(dbfile)
        time, c = 0.0, 6
        print "time_test %d" % num
        for i in range(c):
            cb.build()
            time += cb._run_time
        timefile.write("%d\t%.3f\n" % (num, time / c))

    timefile.close()
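Example #5 depends on a sample_docs() helper that is not shown in this listing. As an assumption-laden sketch only, it might randomly draw the requested number of document paths from a directory:

import os
import random

def sample_docs(num, docdir):
    # Hypothetical sketch (not the project's actual helper): pick `num`
    # document file paths at random from `docdir`.
    paths = [os.path.join(docdir, name) for name in os.listdir(docdir)]
    return random.sample(paths, min(num, len(paths)))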
Example #6
def main():
    import pretext
    import extract_keyword2
    import worddf
    dbfile = '../data/sougou.db'
    logsteadyfile = '../result/sougou.log'
    
    steadyfile = open(logsteadyfile,'w')
    steadyfile.write(out_result_header())
    
    if not os.path.exists(dbfile):
        pretext.load_topiclist(dbfile,'/home/cs/download/cluster_data/sougou')
        eva = WordWeightEvaluation(30,'../data/worddf')
        ke = extract_keyword2.DBKeywordExtractor(dbfile,eva)
        ke.init_db()
        ke.content_keyword()
        ke.title_keyword()
        ke.topic_keyword()
        ke.close_db()


    cb = CommunityBuilder(dbfile)
    
    metrics = list()
    c = 1
    real = cmpcluster.load_doc_labels(dbfile)
    print 'fudan'
    for i in range(c):
        print 'Time %d' % (i+1)
        predicted = cb.build(max_depth=5, min_doc_num=20)
        metrics.append(cmp_cluster(predicted,real))
    
    mean,std = mean_std(metrics)
    meanstr = '%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n' % tuple(mean)
    stdstr =  '%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n' % tuple(std)
    steadyfile.write(meanstr)
    steadyfile.write(stdstr)
    steadyfile.close()
    os.system('emacs '+logsteadyfile)
Example #7
def main():
    import preproc_qqtopic
    import extract_keyword2
    import worddf
    dbfile = '../data/steady_test.db'
    logsteadyfile = '../result/steady_test.log'
    
    steadyfile = open(logsteadyfile,'w')
    steadyfile.write(out_result_header())
     
    if not os.path.exists(dbfile):
        preproc_qqtopic.load_topiclist(dbfile,'../data/topicgj')

        ke = extract_keyword2.DBKeywordExtractor(dbfile)
        ke.init_db()
        ke.content_keyword()
        ke.title_keyword()
        ke.topic_keyword()
        ke.close_db()


    cb = CommunityBuilder(dbfile)
    
    metrics = list()
    c = 50
    real = cmpcluster.load_doc_labels(dbfile)
    print 'steady_test'
    for i in range(c):
        print 'Time %d' % (i+1)
        predicted = cb.build()
        metrics.append(cmp_cluster(predicted,real))
    
    mean,std = mean_std(metrics)
    meanstr = '%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n' % tuple(mean)
    stdstr =  '%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n' % tuple(std)
    steadyfile.write(meanstr)
    steadyfile.write(stdstr)
    steadyfile.close()
    os.system('emacs '+logsteadyfile)
Example #8
def main():
    import preproc_qqtopic
    import extract_keyword
    import worddf
    dbfile = '../data/time_test.db'
    logtimefile = '../result/time_test.log'

    timefile = open(logtimefile, 'w')
    timefile.write('ndocs\ttime(sec)\n')
    rang = xrange(500, 7500, 500)
    for num in rang:
        files = sample_docs(num, '../data/newsgn')
        if os.path.exists(dbfile):
            os.remove(dbfile)

        preproc_qqtopic.prepro_topic(dbfile, 'time_test', files)

        wdf = worddf.WordDF('c')
        wdf.add_docs_from_db(dbfile)
        wdf.close()

        dbcon = extract_keyword.init_db(dbfile)
        extract_keyword.word_preproc(dbcon)
        extract_keyword.title_keyword(dbcon)
        extract_keyword.title_df(dbcon)
        extract_keyword.content_keyword(dbcon)
        extract_keyword.topic_keyword(dbcon)
        dbcon.close()

        cb = CommunityBuilder(dbfile)
        time, c = 0.0, 6
        print 'time_test %d' % num
        for i in range(c):
            cb.build()
            time += cb._run_time
        timefile.write('%d\t%.3f\n' % (num, time / c))

    timefile.close()
Example #9
def main():
    import pretext
    import extract_keyword2
    import worddf
    dbfile = '../data/sougou.db'
    logsteadyfile = '../result/sougou.log'

    steadyfile = open(logsteadyfile, 'w')
    steadyfile.write(out_result_header())

    if not os.path.exists(dbfile):
        pretext.load_topiclist(dbfile, '/home/cs/download/cluster_data/sougou')
        eva = WordWeightEvaluation(30, '../data/worddf')
        ke = extract_keyword2.DBKeywordExtractor(dbfile, eva)
        ke.init_db()
        ke.content_keyword()
        ke.title_keyword()
        ke.topic_keyword()
        ke.close_db()

    cb = CommunityBuilder(dbfile)

    metrics = list()
    c = 1
    real = cmpcluster.load_doc_labels(dbfile)
    print 'fudan'
    for i in range(c):
        print 'Time %d' % (i + 1)
        predicted = cb.build(max_depth=5, min_doc_num=20)
        metrics.append(cmp_cluster(predicted, real))

    mean, std = mean_std(metrics)
    meanstr = '%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n' % tuple(mean)
    stdstr = '%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n' % tuple(std)
    steadyfile.write(meanstr)
    steadyfile.write(stdstr)
    steadyfile.close()
    os.system('emacs ' + logsteadyfile)
Example #10
def main():
    import preproc_qqtopic
    import extract_keyword2
    import worddf
    dbfile = '../data/steady_test.db'
    logsteadyfile = '../result/steady_test.log'

    steadyfile = open(logsteadyfile, 'w')
    steadyfile.write(out_result_header())

    if not os.path.exists(dbfile):
        preproc_qqtopic.load_topiclist(dbfile, '../data/topicgj')

        ke = extract_keyword2.DBKeywordExtractor(dbfile)
        ke.init_db()
        ke.content_keyword()
        ke.title_keyword()
        ke.topic_keyword()
        ke.close_db()

    cb = CommunityBuilder(dbfile)

    metrics = list()
    c = 50
    real = cmpcluster.load_doc_labels(dbfile)
    print 'steady_test'
    for i in range(c):
        print 'Time %d' % (i + 1)
        predicted = cb.build()
        metrics.append(cmp_cluster(predicted, real))

    mean, std = mean_std(metrics)
    meanstr = '%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n' % tuple(mean)
    stdstr = '%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n' % tuple(std)
    steadyfile.write(meanstr)
    steadyfile.write(stdstr)
    steadyfile.close()
    os.system('emacs ' + logsteadyfile)
Example #11
def get_graph(filename='/home/cs/src/semantic_community/data/sample_test.db'):
    from CommunityBuilder import CommunityBuilder
    cb = CommunityBuilder(filename)
    graph = cb.load_title_wordnet()
    return graph
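For interactive use, and assuming get_graph() returns an igraph-style graph as the test class earlier in this listing suggests, a quick size check might look like the following (Python 2 print style, matching the rest of the listing):

g = get_graph()
print 'vertices: %d, edges: %d' % (g.vcount(), g.ecount())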
Example #12
def get_graph(filename='/home/cs/src/semantic_community/data/sample_test.db'):
    from CommunityBuilder import CommunityBuilder
    cb = CommunityBuilder(filename)
    graph = cb.load_title_wordnet()
    return graph