Ejemplo n.º 1
0
def main():
    """Run one clustering test selected on the command line.

    Usage: ``<script> <dbfile> <algor>`` where ``algor`` is one of
    ``kmeans``, ``tree``, ``som`` or ``dbscan``.

    The database is converted to ``../result/<name>_orange_fmt.tab`` via
    ``transfer_data_file`` unless that file already exists, the data is
    loaded with ``load_data_to_array``, and the selected ``*_cluster_test``
    routine is called with the data, the labels returned by
    ``cmpcluster.load_doc_labels`` and the path
    ``../result/<algor>_cluster_result.txt``.  Finally emacs is launched on
    that path.

    Raises:
        NotImplementedError: if ``algor`` is not a supported algorithm.
    """
    import sys
    dbfile = sys.argv[1]
    algor = sys.argv[2]

    # Choose the routine before touching any data so that an unsupported
    # algorithm fails immediately rather than after converting and loading.
    if algor == 'kmeans':
        cluster_test = kmeans_cluster_test
    elif algor == 'tree':
        cluster_test = tree_cluster_test
    elif algor == 'som':
        cluster_test = som_cluster_test
    elif algor == 'dbscan':
        cluster_test = dbscan_cluster_test
    else:
        raise NotImplementedError(
            'unsupported clustering algorithm: %r' % algor)

    # basename/splitext cope with a path that has no directory part or no
    # extension; slicing between the last '/' and the last '.' does not.
    name = os.path.splitext(os.path.basename(dbfile))[0]
    outfile = '../result/%s_orange_fmt.tab' % name
    real_labels = cmpcluster.load_doc_labels(dbfile)
    if not os.path.exists(outfile):
        transfer_data_file(dbfile, outfile)

    res_outfile = '../result/%s_cluster_result.txt' % algor
    data = load_data_to_array(outfile)
    cluster_test(data, real_labels, res_outfile)

    # algor was matched against the fixed names above, so the command
    # string holds no free-form text from the command line.
    os.system('emacs ' + res_outfile)
Ejemplo n.º 2
0
def main():
    """Run one clustering test selected on the command line.

    Usage: ``<script> <dbfile> <algor>`` where ``algor`` is one of
    ``kmeans``, ``tree``, ``som`` or ``dbscan``.

    The database is converted to ``../result/<name>_orange_fmt.tab`` via
    ``transfer_data_file`` unless that file already exists, the data is
    loaded with ``load_data_to_array``, and the selected ``*_cluster_test``
    routine is called with the data, the labels returned by
    ``cmpcluster.load_doc_labels`` and the path
    ``../result/<algor>_cluster_result.txt``.  Finally emacs is launched on
    that path.

    Raises:
        NotImplementedError: if ``algor`` is not a supported algorithm.
    """
    import sys
    dbfile = sys.argv[1]
    algor = sys.argv[2]

    # Resolve the routine first: an unknown algorithm should be rejected
    # before any conversion or loading work is started.
    if algor == 'kmeans':
        cluster_test = kmeans_cluster_test
    elif algor == 'tree':
        cluster_test = tree_cluster_test
    elif algor == 'som':
        cluster_test = som_cluster_test
    elif algor == 'dbscan':
        cluster_test = dbscan_cluster_test
    else:
        raise NotImplementedError(
            'unsupported clustering algorithm: %r' % algor)

    # Derive the stem with os.path so that a bare file name (no '/') or a
    # name without an extension still gives a usable output file name.
    stem, _ext = os.path.splitext(os.path.basename(dbfile))
    outfile = '../result/%s_orange_fmt.tab' % stem
    real_labels = cmpcluster.load_doc_labels(dbfile)
    if not os.path.exists(outfile):
        transfer_data_file(dbfile, outfile)

    res_outfile = '../result/%s_cluster_result.txt' % algor
    data = load_data_to_array(outfile)
    cluster_test(data, real_labels, res_outfile)

    # Only one of the fixed algorithm names can reach this point, so the
    # shell command contains no free-form command-line text.
    os.system('emacs ' + res_outfile)
Ejemplo n.º 3
0
def classify(dbfile, run_num, log_info=None):
    """Build communities ``run_num`` times and summarise the comparisons.

    Args:
        dbfile: passed to ``cmpcluster.load_doc_labels`` and to
            ``CommunityBuilder``.
        run_num: how many times ``build()`` is called and compared.
        log_info: forwarded unchanged to ``CommunityBuilder``.

    Returns:
        The ``(mean, std)`` pair unpacked from ``mean_std`` applied to the
        list of per-run ``cmp_cluster`` results.
    """
    expected = cmpcluster.load_doc_labels(dbfile)
    print('sample_test %d' % run_num)
    builder = CommunityBuilder(dbfile, log_info)
    # One comparison against the reference labels per build.
    scores = [cmp_cluster(builder.build(), expected) for _ in range(run_num)]
    average, deviation = mean_std(scores)
    return average, deviation
Ejemplo n.º 4
0
def classify(dbfile, run_num, log_info=None):
    """Repeat community building and aggregate the comparison results.

    Args:
        dbfile: handed to ``cmpcluster.load_doc_labels`` and to
            ``CommunityBuilder``.
        run_num: number of ``build()`` / ``cmp_cluster`` rounds.
        log_info: passed through to ``CommunityBuilder``.

    Returns:
        A ``(mean, std)`` tuple taken from ``mean_std`` over all rounds.
    """
    reference = cmpcluster.load_doc_labels(dbfile)
    print('sample_test %d' % run_num)
    results = []
    builder = CommunityBuilder(dbfile, log_info)
    remaining = run_num
    while remaining > 0:
        # Each round builds afresh and compares with the reference labels.
        outcome = builder.build()
        results.append(cmp_cluster(outcome, reference))
        remaining -= 1
    summary_mean, summary_std = mean_std(results)
    return (summary_mean, summary_std)
Ejemplo n.º 5
0
def main():
    """Evaluate community clustering on ``../data/sougou.db``.

    If the database file is missing it is created first with
    ``pretext.load_topiclist`` and the ``DBKeywordExtractor`` keyword
    passes.  ``CommunityBuilder.build(max_depth=5, min_doc_num=20)`` is then
    run ``c`` times, each result is compared with the stored labels via
    ``cmp_cluster``, and the mean and standard-deviation rows from
    ``mean_std`` are written after the header to ``../result/sougou.log``.
    Finally emacs is launched on the log.
    """
    import pretext
    import extract_keyword2
    # NOTE(review): worddf is not referenced by name below; kept in case the
    # import itself is required -- confirm.
    import worddf
    dbfile = '../data/sougou.db'
    logsteadyfile = '../result/sougou.log'
    row_format = '%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n'

    # The context manager closes the log even if a step below raises; a
    # manually opened handle would stay open in that case.
    with open(logsteadyfile, 'w') as steadyfile:
        steadyfile.write(out_result_header())

        if not os.path.exists(dbfile):
            pretext.load_topiclist(
                dbfile, '/home/cs/download/cluster_data/sougou')
            eva = WordWeightEvaluation(30, '../data/worddf')
            ke = extract_keyword2.DBKeywordExtractor(dbfile, eva)
            ke.init_db()
            ke.content_keyword()
            ke.title_keyword()
            ke.topic_keyword()
            ke.close_db()

        cb = CommunityBuilder(dbfile)

        metrics = list()
        c = 1
        real = cmpcluster.load_doc_labels(dbfile)
        # NOTE(review): label reads 'fudan' while the data files are named
        # sougou -- confirm which is intended.
        print('fudan')
        for i in range(c):
            print('Time %d' % (i + 1))
            predicted = cb.build(max_depth=5, min_doc_num=20)
            metrics.append(cmp_cluster(predicted, real))

        mean, std = mean_std(metrics)
        # Format both rows before writing so a formatting error cannot
        # leave only one of them in the log.
        meanstr = row_format % tuple(mean)
        stdstr = row_format % tuple(std)
        steadyfile.write(meanstr)
        steadyfile.write(stdstr)

    # The log is closed (and flushed) here, before the editor reads it.
    os.system('emacs ' + logsteadyfile)
Ejemplo n.º 6
0
def main():
    """Evaluate community clustering on ``../data/steady_test.db``.

    If the database file is missing it is created first with
    ``preproc_qqtopic.load_topiclist`` and the ``DBKeywordExtractor``
    keyword passes.  ``CommunityBuilder.build()`` is then run ``c`` times,
    each result is compared with the stored labels via ``cmp_cluster``, and
    the mean and standard-deviation rows from ``mean_std`` are written after
    the header to ``../result/steady_test.log``.  Finally emacs is launched
    on the log.
    """
    import preproc_qqtopic
    import extract_keyword2
    # NOTE(review): worddf is not referenced by name below; kept in case the
    # import itself is required -- confirm.
    import worddf
    dbfile = '../data/steady_test.db'
    logsteadyfile = '../result/steady_test.log'
    row_format = '%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n'

    # The context manager closes the log even if a step below raises; a
    # manually opened handle would stay open in that case.
    with open(logsteadyfile, 'w') as steadyfile:
        steadyfile.write(out_result_header())

        if not os.path.exists(dbfile):
            preproc_qqtopic.load_topiclist(dbfile, '../data/topicgj')

            ke = extract_keyword2.DBKeywordExtractor(dbfile)
            ke.init_db()
            ke.content_keyword()
            ke.title_keyword()
            ke.topic_keyword()
            ke.close_db()

        cb = CommunityBuilder(dbfile)

        metrics = list()
        c = 50
        real = cmpcluster.load_doc_labels(dbfile)
        print('steady_test')
        for i in range(c):
            print('Time %d' % (i + 1))
            predicted = cb.build()
            metrics.append(cmp_cluster(predicted, real))

        mean, std = mean_std(metrics)
        # Format both rows before writing so a formatting error cannot
        # leave only one of them in the log.
        meanstr = row_format % tuple(mean)
        stdstr = row_format % tuple(std)
        steadyfile.write(meanstr)
        steadyfile.write(stdstr)

    # The log is closed (and flushed) here, before the editor reads it.
    os.system('emacs ' + logsteadyfile)
Ejemplo n.º 7
0
def main():
    """Evaluate community clustering on ``../data/sougou.db``.

    If the database file is missing it is created first with
    ``pretext.load_topiclist`` and the ``DBKeywordExtractor`` keyword
    passes.  ``CommunityBuilder.build(max_depth=5, min_doc_num=20)`` is then
    run ``c`` times, each result is compared with the stored labels via
    ``cmp_cluster``, and the mean and standard-deviation rows from
    ``mean_std`` are written after the header to ``../result/sougou.log``.
    Finally emacs is launched on the log.
    """
    import pretext
    import extract_keyword2
    # NOTE(review): worddf is not referenced by name below; kept in case the
    # import itself is required -- confirm.
    import worddf
    dbfile = '../data/sougou.db'
    logsteadyfile = '../result/sougou.log'
    row_format = '%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n'

    # Using ``with`` guarantees the log handle is released when any of the
    # steps below raises, which a bare open()/close() pair does not.
    with open(logsteadyfile, 'w') as steadyfile:
        steadyfile.write(out_result_header())

        if not os.path.exists(dbfile):
            pretext.load_topiclist(
                dbfile, '/home/cs/download/cluster_data/sougou')
            eva = WordWeightEvaluation(30, '../data/worddf')
            ke = extract_keyword2.DBKeywordExtractor(dbfile, eva)
            ke.init_db()
            ke.content_keyword()
            ke.title_keyword()
            ke.topic_keyword()
            ke.close_db()

        cb = CommunityBuilder(dbfile)

        metrics = list()
        c = 1
        real = cmpcluster.load_doc_labels(dbfile)
        # NOTE(review): label reads 'fudan' while the data files are named
        # sougou -- confirm which is intended.
        print('fudan')
        for i in range(c):
            print('Time %d' % (i + 1))
            predicted = cb.build(max_depth=5, min_doc_num=20)
            metrics.append(cmp_cluster(predicted, real))

        mean, std = mean_std(metrics)
        # Both rows are formatted before either is written.
        meanstr = row_format % tuple(mean)
        stdstr = row_format % tuple(std)
        steadyfile.write(meanstr)
        steadyfile.write(stdstr)

    # The handle is closed at this point, so the editor sees the full log.
    os.system('emacs ' + logsteadyfile)
Ejemplo n.º 8
0
def main():
    """Evaluate community clustering on ``../data/steady_test.db``.

    If the database file is missing it is created first with
    ``preproc_qqtopic.load_topiclist`` and the ``DBKeywordExtractor``
    keyword passes.  ``CommunityBuilder.build()`` is then run ``c`` times,
    each result is compared with the stored labels via ``cmp_cluster``, and
    the mean and standard-deviation rows from ``mean_std`` are written after
    the header to ``../result/steady_test.log``.  Finally emacs is launched
    on the log.
    """
    import preproc_qqtopic
    import extract_keyword2
    # NOTE(review): worddf is not referenced by name below; kept in case the
    # import itself is required -- confirm.
    import worddf
    dbfile = '../data/steady_test.db'
    logsteadyfile = '../result/steady_test.log'
    row_format = '%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n'

    # Using ``with`` guarantees the log handle is released when any of the
    # steps below raises, which a bare open()/close() pair does not.
    with open(logsteadyfile, 'w') as steadyfile:
        steadyfile.write(out_result_header())

        if not os.path.exists(dbfile):
            preproc_qqtopic.load_topiclist(dbfile, '../data/topicgj')

            ke = extract_keyword2.DBKeywordExtractor(dbfile)
            ke.init_db()
            ke.content_keyword()
            ke.title_keyword()
            ke.topic_keyword()
            ke.close_db()

        cb = CommunityBuilder(dbfile)

        metrics = list()
        c = 50
        real = cmpcluster.load_doc_labels(dbfile)
        print('steady_test')
        for i in range(c):
            print('Time %d' % (i + 1))
            predicted = cb.build()
            metrics.append(cmp_cluster(predicted, real))

        mean, std = mean_std(metrics)
        # Both rows are formatted before either is written.
        meanstr = row_format % tuple(mean)
        stdstr = row_format % tuple(std)
        steadyfile.write(meanstr)
        steadyfile.write(stdstr)

    # The handle is closed at this point, so the editor sees the full log.
    os.system('emacs ' + logsteadyfile)