# Code example #1
def cluster_message(stop_words, user_dict, msg_fname, cluster_file,
                    summary_file):
    # Init tokenizer
    jt = JiebaTokenizer(stop_words, user_dict, 'c')
    token_lines = token_message(jt, msg_fname)
    wdb = WordDictBuilder()
    wdb.add_tokens_list(token_lines)
    wdb.save('../data/word_dict.txt')
    keyword_dict = get_user_keywords(user_dict)

    cluser = Cluster(gl.gl_FUNCNUM)
    # Init feature_builder and simhash_builder
    fc = FeatureContainer(wdb.word_dict, keyword_dict)
    with open(msg_fname, 'r') as ins:
        for lineidx, line in enumerate(ins.readlines()):
            if (lineidx % 100 == 0):
                print lineidx
            (time, number, sender, message) = line.strip().split('|')[0:4]
            if (number == '10658368'):
                continue
            #替换数字、字母,截取第一句
            short_msg = re.split(u'。'.encode('utf8'), message)[0]
            new_msg = re.sub(r'[0-9a-zA-Z+=\./:\"<>|_&#\s\*\-]', '', short_msg)
            #new_msg = re.split(u'。'.encode('utf8'), re.sub(r'[0-9a-zA-Z+=\./:\"<>|_&#\s\*\-]', '', message))[0]

            # Tokenize
            tokens = jt.tokens(new_msg.strip().decode('utf8'))
            feature_vec, sim_hash, min_hash = fc.compute_feature(tokens)
            cluser.add_one(min_hash, sim_hash, short_msg)

    cluser.save_cluster(cluster_file, summary_file)
    print "cluser finish"