def main():
    import preproc_qqtopic
    import extract_keyword2
    #import worddf
    dbfile = '../data/noise_test.db'
    lognoisefile = '../result/noise_test.log'
    
    noisefile = open(lognoisefile,'w')
    noisefile.write(out_result_header())
    rang = xrange(0,250, 60)
    for num in rang:
        dbfile = '../data/noise%d_test.db' % (num,)
        files = sample_docs(num)
        if os.path.exists(dbfile):
            os.remove(dbfile)

        dbcon = preproc_qqtopic.init_db(dbfile)
        preproc_qqtopic.load_topiclist(dbcon,'../data/topicgj')
        cnt = preproc_qqtopic.load_topic(dbcon,'noise_data',files)
        dbcon.close()
        print 'add number of noise document: %d' % cnt

        eva = WordWeightEvaluation(30, '../data/worddf')
        ke = extract_keyword2.DBKeywordExtractor(dbfile, eva)
        ke.init_db()
        ke.content_keyword()
        ke.title_keyword()
        ke.topic_keyword()
        ke.close_db()
 def __init__(self, db_path, weight_evaluator=None):
     """Remember the database path and choose a word-weight evaluator.

     db_path: path to the sqlite database of documents.
     weight_evaluator: optional evaluator instance; when omitted a default
         WordWeightEvaluation is built and primed with the documents found
         in db_path.
     """
     self.dbpath = db_path
     self.dbcon = None
     if weight_evaluator:
         self.evaluator = weight_evaluator
     else:
         self.evaluator = WordWeightEvaluation(30, '../data')
         # BUG FIX: original passed `dbpath`, an undefined name (NameError);
         # the parameter is `db_path`.
         self.evaluator.add_docs_from_db(db_path)
Example #3
0
def content_keyword(dbcon):
    print 'extracting keyword from content...'
    doc_num = dbutils.countOfRecs(dbcon, 'document')
    wordset = load_wordset(dbcon)
    cnt = 0
    eluate = WordWeightEvaluation(30)
    for r in dbutils.iterRec(dbcon, 'document', 'docid title content'):
        word_weight_list = eluate.extract_kw(r[1], r[2])
        wordwstr = ' '.join(['%s/%.7f' % idw for idw in word_weight_list])
        dbutils.updateByPK(dbcon, 'document', {'kw_content': wordwstr},
                           {'docid': r[0]})

        cnt += 1
        if cnt % 20 == 0:
            utils.updateProgress(cnt, doc_num)

    print ''
    eluate.close()
    dbcon.commit()
Example #4
0
def main():
    import pretext
    import extract_keyword2
    import worddf
    dbfile = '../data/sougou.db'
    logsteadyfile = '../result/sougou.log'

    steadyfile = open(logsteadyfile, 'w')
    steadyfile.write(out_result_header())

    if not os.path.exists(dbfile):
        pretext.load_topiclist(dbfile, '/home/cs/download/cluster_data/sougou')
        eva = WordWeightEvaluation(30, '../data/worddf')
        ke = extract_keyword2.DBKeywordExtractor(dbfile, eva)
        ke.init_db()
        ke.content_keyword()
        ke.title_keyword()
        ke.topic_keyword()
        ke.close_db()

    cb = CommunityBuilder(dbfile)

    metrics = list()
    c = 1
    real = cmpcluster.load_doc_labels(dbfile)
    print 'fudan'
    for i in range(c):
        print 'Time %d' % (i + 1)
        predicted = cb.build(max_depth=5, min_doc_num=20)
        metrics.append(cmp_cluster(predicted, real))

    mean, std = mean_std(metrics)
    meanstr = '%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n' % tuple(mean)
    stdstr = '%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n' % tuple(std)
    steadyfile.write(meanstr)
    steadyfile.write(stdstr)
    steadyfile.close()
    os.system('emacs ' + logsteadyfile)
Example #5
0
def topics_to_db(topics, dbfile):
    """Load topic directories into a freshly created database, extract keywords.

    topics: iterable of directory paths; each directory's last path component
        becomes the topic name and every file inside it is loaded as a
        document of that topic.
    dbfile: path of the sqlite database to (re)create.
    Returns dbfile.
    """
    # Rebuild from scratch.  (Removed an unused local `import sqlite3`.)
    if os.path.exists(dbfile):
        os.remove(dbfile)
    dbcon = preproc_qqtopic.init_db(dbfile)
    for t in topics:
        # Topic name is the last path component.  basename + rstrip handles
        # trailing slashes (even repeated ones) and slash-free paths, where
        # the original rindex('/') logic raised ValueError.
        tname = os.path.basename(t.rstrip('/'))
        filelist = [os.path.join(t, f) for f in os.listdir(t)]
        preproc_qqtopic.load_topic(dbcon, tname, filelist)
    dbcon.close()

    evaluator = WordWeightEvaluation(30)
    ke = extract_keyword2.DBKeywordExtractor(dbfile, evaluator)
    ke.init_db()
    ke.content_keyword()
    ke.title_keyword()
    ke.topic_keyword()
    ke.close_db()

    return dbfile