import re

def cluster_message(stop_words, user_dict, msg_fname, cluster_file, summary_file):
    """Tokenize the messages in msg_fname, compute minhash/simhash features,
    cluster the messages, and write the clusters and summaries to disk.
    Relies on the project's JiebaTokenizer, WordDictBuilder, FeatureContainer,
    Cluster, token_message, get_user_keywords and gl helpers."""
    # Init tokenizer
    jt = JiebaTokenizer(stop_words, user_dict, 'c')
    # Build and persist the word dictionary from the tokenized corpus
    token_lines = token_message(jt, msg_fname)
    wdb = WordDictBuilder()
    wdb.add_tokens_list(token_lines)
    wdb.save('../data/word_dict.txt')
    keyword_dict = get_user_keywords(user_dict)
    cluster = Cluster(gl.gl_FUNCNUM)
    # Init feature_builder and simhash_builder
    fc = FeatureContainer(wdb.word_dict, keyword_dict)
    with open(msg_fname, 'r') as ins:
        for lineidx, line in enumerate(ins):
            if lineidx % 100 == 0:
                print lineidx  # progress indicator
            msg_time, number, sender, message = line.strip().split('|')[0:4]
            if number == '10658368':  # skip messages from this service number
                continue
            # Keep only the first sentence (split on the Chinese full stop),
            # then strip digits, Latin letters, whitespace and punctuation
            short_msg = re.split(u'。'.encode('utf8'), message)[0]
            new_msg = re.sub(r'[0-9a-zA-Z+=\./:\"<>|_&#\s\*\-]', '', short_msg)
            # Tokenize and compute the feature vector plus both hashes
            tokens = jt.tokens(new_msg.strip().decode('utf8'))
            feature_vec, sim_hash, min_hash = fc.compute_feature(tokens)
            cluster.add_one(min_hash, sim_hash, short_msg)
    cluster.save_cluster(cluster_file, summary_file)
    print "cluster finished"
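
# A minimal usage sketch for cluster_message. All file paths below are
# hypothetical placeholders (assumptions, not taken from the original code);
# substitute the real stop-word list, user dictionary and message dump. The
# message file is expected to hold '|'-delimited lines of the form
# time|number|sender|message, as parsed in the loop above.
if __name__ == '__main__':
    cluster_message(stop_words='../data/stop_words.txt',       # hypothetical path
                    user_dict='../data/user_dict.txt',         # hypothetical path
                    msg_fname='../data/messages.txt',          # hypothetical path
                    cluster_file='../data/cluster_result.txt',
                    summary_file='../data/cluster_summary.txt')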