Code example #1
def cluster_message(stop_words, user_dict, msg_fname, cluster_file,
                    summary_file):
    # Init tokenizer
    jt = JiebaTokenizer(stop_words, user_dict, 'c')
    token_lines = token_message(jt, msg_fname)
    wdb = WordDictBuilder()
    wdb.add_tokens_list(token_lines)
    wdb.save('../data/word_dict.txt')
    keyword_dict = get_user_keywords(user_dict)

    cluster = Cluster(gl.gl_FUNCNUM)
    # Init feature_builder and simhash_builder
    fc = FeatureContainer(wdb.word_dict, keyword_dict)
    with open(msg_fname, 'r') as ins:
        for lineidx, line in enumerate(ins.readlines()):
            if (lineidx % 100 == 0):
                print lineidx
            (time, number, sender, message) = line.strip().split('|')[0:4]
            if (number == '10658368'):
                continue
            # Replace digits/letters and keep only the first sentence
            short_msg = re.split(u'。'.encode('utf8'), message)[0]
            new_msg = re.sub(r'[0-9a-zA-Z+=\./:\"<>|_&#\s\*\-]', '', short_msg)
            #new_msg = re.split(u'。'.encode('utf8'), re.sub(r'[0-9a-zA-Z+=\./:\"<>|_&#\s\*\-]', '', message))[0]

            # Tokenize
            tokens = jt.tokens(new_msg.strip().decode('utf8'))
            feature_vec, sim_hash, min_hash = fc.compute_feature(tokens)
            cluster.add_one(min_hash, sim_hash, short_msg)

    cluster.save_cluster(cluster_file, summary_file)
    print "cluster finished"
Code example #2
"""
import os
import sys
import time
from tokens import JiebaTokenizer
from DictBuilder import WordDictBuilder

if __name__ == "__main__":
    if len(sys.argv) < 4:
        print "Usage:\tpreprocess.py <docpath> <stopword_path> <worddict_path>"
        exit(-1)
    doc_path, stopword_path, worddict_path = sys.argv[1:]
    print "Arguments:", sys.argv[1:]

    # Init tokenizer
    jt = JiebaTokenizer(stopword_path, "c")
    # Load doc data
    with open(doc_path) as ins:
        doc_data = ins.read().decode("utf8")
    # Tokenization
    doc_tokens = jt.tokens(doc_data)
    # Write to token file
    with open(doc_path[: doc_path.rfind(".")] + ".token", "w") as outs:
        outs.write("/".join([token.encode("utf8") for token in doc_tokens]))

    # Load original word dict, update and save
    wdb = WordDictBuilder(worddict_path, tokenlist=doc_tokens)
    wdb.run()
    wdb.save(worddict_path)
    print "Totally", len(wdb.word_dict), "words"
Code example #3
                                                                                    '../lsh_data/stopwords.txt',\
                                                                                    '../lsh_data/word.dict',\
                                                                                    '-s',15)
    print 'Arguments parsed successfully:', sys.argv[1:]
    # Original (noisy) query document
    with open(doc_1_noise) as noise_file:
        doc_noise_file = noise_file.read().decode('utf8')
    # Query document after noise removal
    with open(doc_path_1) as ins:
        doc_data_1 = ins.read().decode('utf8')
    print 'Loaded', doc_path_1
    # Initialize the tokenizer (mainly loads the stop words)
    jt = JiebaTokenizer(stopword_path, 'c')

    # Tokenize; tokens() returns the list of tokens
    doc_token_1 = jt.tokens(doc_data_1)
    print 'Loading word dict...'
    # Load the dictionary file and build the word dict
    word_list = []
    with open(word_dict, 'r') as ins:
        for line in ins.readlines():
            word_list.append(line.split()[1])
    word_dict = {}
    for idx, ascword in enumerate(word_list):
        word_dict[ascword.decode('utf8')] = idx
    # Build the non-zero feature vector
    fb = FeatureBuilder(word_dict)
    # compute() returns feature_nonzero: a sparse vector of (idx, value) pairs with value > 0
    doc_feat_1 = fb.compute(doc_token_1)
    # so that every entry in the dict gets a hash value,
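The snippet breaks off exactly where the hashing side would be initialized. The other examples call a builder whose sim_hash method turns the sparse (idx, value) feature into a 64-bit fingerprint. The class below is only a sketch of that idea (standard SimHash); the hash function and class name are assumptions, not the project's code.

# Illustrative 64-bit SimHash over a sparse feature [(idx, weight), ...].
# The md5-based index hash and the class name are assumptions.
import hashlib

class SimpleSimhashBuilder(object):
    def __init__(self, bits=64):
        self.bits = bits

    def _index_hash(self, idx):
        # Hash the feature index to a stable 64-bit integer
        return int(hashlib.md5(str(idx)).hexdigest()[:16], 16)

    def sim_hash(self, feature):
        counts = [0] * self.bits
        for idx, weight in feature:
            h = self._index_hash(idx)
            for bit in range(self.bits):
                # Weighted vote per bit: +weight if the bit is set, -weight otherwise
                if (h >> bit) & 1:
                    counts[bit] += weight
                else:
                    counts[bit] -= weight
        fingerprint = 0
        for bit in range(self.bits):
            if counts[bit] > 0:
                fingerprint |= (1 << bit)
        return fingerprint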
Code example #4
File: launch.py  Project: RianaChen/text-similarity
     for line in ins.readlines():
         fingerprint_list.append(int(line))
 # For exp: load document content
 doc_list = []
 with open(sys.argv[4], 'r') as ins:
     for line in ins.readlines():
         doc_list.append(line.strip())
 # Detection process begins
 min_sim = 64
 min_docid = 0
 with open(sys.argv[5], 'r') as ins:
     for lineidx, line in enumerate(ins.readlines()):
         # Only process line 642 (experiment filter left in by the author)
         if lineidx != 642:
             continue
         # Tokenize
         tokens = jt.tokens(line.strip().decode('utf8'))
         # Compute text feature
         feature = fb.compute(tokens)
         # Compute simhash
         fingerprint = smb.sim_hash(feature)
         result_list = []
         for idx, fp in enumerate(fingerprint_list):
             sim = hamming_distance(fingerprint, fp, 64)
             result_list.append((sim, idx))
         result_list = sorted(result_list, key=lambda x: x[0])
         if result_list[0][0] < min_sim:
             min_sim, min_docid = result_list[0][0], lineidx
         #'''
         with open(sys.argv[6], 'w') as outs:
             outs.write(line.strip()+os.linesep)
             for sim, idx in result_list:
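hamming_distance is not included in the pasted code. A minimal version matching the call hamming_distance(fingerprint, fp, 64) could look like the following; this body is a sketch, not the project's implementation.

# Sketch: count differing bits between two fingerprints of `bits` width
def hamming_distance(fp_a, fp_b, bits=64):
    diff = (fp_a ^ fp_b) & ((1 << bits) - 1)   # keep only the lowest `bits` bits
    count = 0
    while diff:
        diff &= diff - 1                        # clear the lowest set bit
        count += 1
    return count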
Code example #5
File: launch.py  Project: TPLink32/nlp
     for line in ins.readlines():
         fingerprint_list.append(int(line))
 # For exp: load document content
 doc_list = []
 with open(sys.argv[4], 'r') as ins:
     for line in ins.readlines():
         doc_list.append(line.strip())
 # Detection process begins
 min_sim = 64
 min_docid = 0
 with open(sys.argv[5], 'r') as ins:
     for lineidx, line in enumerate(ins.readlines()):
         # Only process line 642 (experiment filter left in by the author)
         if lineidx != 642:
             continue
         # Tokenize
         tokens = jt.tokens(line.strip().decode('utf8'))
         # Compute text feature
         feature = fb.compute(tokens)
         # Compute simhash
         fingerprint = smb.sim_hash(feature)
         result_list = []
         for idx, fp in enumerate(fingerprint_list):
             sim = hamming_distance(fingerprint, fp, 64)
             result_list.append((sim, idx))
         result_list = sorted(result_list, key=lambda x: x[0])
         if result_list[0][0] < min_sim:
             min_sim, min_docid = result_list[0][0], lineidx
         #'''
         with open(sys.argv[6], 'w') as outs:
             outs.write(line.strip() + os.linesep)
             for sim, idx in result_list:
Code example #6
import os
import sys
import time
from tokens import JiebaTokenizer
from DictBuilder import WordDictBuilder

if __name__ == "__main__":
    if len(sys.argv) < 4:
        print "Usage:\tpreprocess.py <docpath> <stopword_path> <worddict_path>"
        exit(-1)
    doc_path, stopword_path, worddict_path = sys.argv[1:]
    print 'Arguments:', sys.argv[1:]
    
    # Init tokenizer
    jt = JiebaTokenizer(stopword_path, 'c')
    # Load doc data
    with open(doc_path) as ins:
        doc_data = ins.read().decode('utf8')
    # Tokenization
    doc_tokens = jt.tokens(doc_data)
    # Write to token file
    with open(doc_path[:doc_path.rfind('.')]+'.token', 'w') as outs:
        outs.write('/'.join([token.encode('utf8') for token in doc_tokens]))
    
    # Load original word dict, update and save
    wdb = WordDictBuilder(worddict_path, tokenlist=doc_tokens)
    wdb.run()
    wdb.save(worddict_path)
    print 'Totally', len(wdb.word_dict), 'words'
    
Code example #7
 fingerprint_list = []
 with open(sys.argv[3], "r") as ins:
     for line in ins.readlines():
         fingerprint_list.append(int(line))
 # For exp: load document content
 doc_list = []
 with open(sys.argv[4], "r") as ins:
     for line in ins.readlines():
         doc_list.append(line.strip())
 # Detection process begins
 min_sim = 64
 min_docid = 0
 with open(sys.argv[5], "r") as ins:
     for lineidx, line in enumerate(ins.readlines()):
         # Tokenize
         tokens = jt.tokens(line.strip().decode("utf8"))
         feature, fingerprint = fc.compute_feature(tokens)
         result_list = []
         for idx, fp in enumerate(fingerprint_list):
             sim = hamming_distance(fingerprint, fp, 64)
             result_list.append((sim, idx))
         result_list = sorted(result_list, key=lambda x: x[0])
         if result_list[0][0] < min_sim:
             min_sim, min_docid = result_list[0][0], lineidx
         #'''
         with open(sys.argv[6], "w") as outs:
             outs.write(line.strip() + os.linesep)
             for sim, idx in result_list:
                 outs.write("%s\t%s%s" % (sim, doc_list[idx], os.linesep))
         #'''
         # if lineidx == 2:
Code example #8
        print "Usage:\tisSimilar.py <doc1> <doc2> <stopword_path> <word_dict> <-c/-s> <threshold>"
        exit(-1)
    doc_path_1, doc_path_2, stopword_path, word_dict, mode, threshold = sys.argv[1:]
    print 'Arguments:', sys.argv[1:]
    with open(doc_path_1) as ins:
        doc_data_1 = ins.read().decode('utf8')
        print 'Loaded', doc_path_1
    with open(doc_path_2) as ins:
        doc_data_2 = ins.read().decode('utf8')
        print 'Loaded', doc_path_2

    # Init tokenizer
    jt = JiebaTokenizer(stopword_path, 'c')

    # Tokenization
    doc_token_1 = jt.tokens(doc_data_1)
    doc_token_2 = jt.tokens(doc_data_2)

    print 'Loading word dict...'
    # Load word list from word_dict
    word_list = []
    with open(word_dict, 'r') as ins:
        for line in ins.readlines():
            word_list.append(line.split()[1])

    # Build unicode string word dict
    word_dict = {}
    for idx, ascword in enumerate(word_list):
        word_dict[ascword.decode('utf8')] = idx
    # Build nonzero-feature
    fb = FeatureBuilder(word_dict)
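FeatureBuilder is not included in any of the snippets either. Going by the comment in code example #3 (compute() returns the non-zero (idx, value) pairs), it is presumably a bag-of-words counter over the word dict. A minimal sketch under that assumption:

# Assumed reconstruction of a FeatureBuilder-like class, not the project's actual code.
# compute() counts word-dict hits and returns the non-zero (idx, count) pairs.
class SimpleFeatureBuilder(object):
    def __init__(self, word_dict):
        self.word_dict = word_dict      # unicode token -> integer index

    def compute(self, tokens):
        counts = {}
        for token in tokens:
            idx = self.word_dict.get(token)
            if idx is not None:
                counts[idx] = counts.get(idx, 0) + 1
        # Sparse feature: only indices with value > 0, ordered by index
        return sorted(counts.items(), key=lambda x: x[0])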