def cluster_message(stop_words, user_dict, msg_fname, cluster_file, summary_file):
    # Init tokenizer
    jt = JiebaTokenizer(stop_words, user_dict, 'c')
    token_lines = token_message(jt, msg_fname)
    wdb = WordDictBuilder()
    wdb.add_tokens_list(token_lines)
    wdb.save('../data/word_dict.txt')
    keyword_dict = get_user_keywords(user_dict)
    cluster = Cluster(gl.gl_FUNCNUM)
    # Init feature_builder and simhash_builder
    fc = FeatureContainer(wdb.word_dict, keyword_dict)
    with open(msg_fname, 'r') as ins:
        for lineidx, line in enumerate(ins.readlines()):
            if lineidx % 100 == 0:
                print lineidx
            (time, number, sender, message) = line.strip().split('|')[0:4]
            if number == '10658368':
                continue
            # Strip digits, letters and punctuation noise; keep only the first sentence
            short_msg = re.split(u'。'.encode('utf8'), message)[0]
            new_msg = re.sub(r'[0-9a-zA-Z+=\./:\"<>|_&#\s\*\-]', '', short_msg)
            # Tokenize
            tokens = jt.tokens(new_msg.strip().decode('utf8'))
            feature_vec, sim_hash, min_hash = fc.compute_feature(tokens)
            cluster.add_one(min_hash, sim_hash, short_msg)
    cluster.save_cluster(cluster_file, summary_file)
    print "cluster finish"
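# Note: FeatureContainer.compute_feature is defined elsewhere in this project.
# The sketch below only illustrates the general simhash idea it relies on:
# fold each token's hash into a 64-bit weighted bit vector and keep the sign bits.
# The md5-based hash_fn and uniform weights are assumptions, not the repo's code.
import hashlib

def simhash_sketch(token_weights, f=64):
    # token_weights: dict mapping token -> weight (e.g. term frequency)
    v = [0] * f
    for token, weight in token_weights.items():
        h = int(hashlib.md5(token.encode('utf8')).hexdigest(), 16)
        for i in range(f):
            if (h >> i) & 1:
                v[i] += weight
            else:
                v[i] -= weight
    fingerprint = 0
    for i in range(f):
        if v[i] > 0:
            fingerprint |= (1 << i)
    return fingerprint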
def preProcessingData(filename):
    loadData(filename)
    jt_time = time.time()
    global jt
    jt = JiebaTokenizer(stopwords_path, 'c')
    end_jt_time = time.time()
    print('JiebaTokenizer time: %s' % str(end_jt_time - jt_time))
    # Build the word list and word dict from all labeled data
    wordList, wordDict = buildWords(jt, labelContents)
    end_build_time = time.time()
    print('buildWords time: %s' % str(end_build_time - end_jt_time))
    # Build the feature-vector builder
    global fb
    fb = FeatureBuilder(wordDict)
    end_fb_build_time = time.time()
    print('FeatureBuilder time: %s' % str(end_fb_build_time - end_build_time))
    # Build the fingerprint (simhash) builder
    global smb
    smb = SimhashBuilder(wordList)
    end_smb_build_time = time.time()
    print('SimhashBuilder time: %s' % str(end_smb_build_time - end_fb_build_time))
    # Compute feature vectors for all labeled data
    for flowId, processLabelDataMap in processFlowMap.items():
        processFlowMap[flowId] = generateDocFeatureVector(processLabelDataMap, jt, fb, smb)
    end_docFV_time = time.time()
    print('generateDocFeatureVector time: %s' % str(end_docFV_time - end_smb_build_time))
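# Note: buildWords is defined elsewhere in this project. The sketch below only
# illustrates its assumed contract: tokenize every labeled text and return a
# word list plus a word -> index dict. The name build_words_sketch is hypothetical.
def build_words_sketch(tokenizer, contents):
    word_list = []
    word_dict = {}
    for text in contents:
        for token in tokenizer.tokens(text):
            if token not in word_dict:
                word_dict[token] = len(word_list)
                word_list.append(token)
    return word_list, word_dict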
import sys
from tokens import JiebaTokenizer
from features import FeatureBuilder

doc_1_noise, doc_path_1, doc_path_2, stopword_path, word_dict, mode, threshold = (
    '../lsh_data/doc_1.data',
    '../lsh_data/doc_1.clear',
    '../lsh_data/doc_2.data',
    '../lsh_data/stopwords.txt',
    '../lsh_data/word.dict',
    '-s', 15)
print 'Arguments loaded:', sys.argv[1:]
# The original (noisy) query document
with open(doc_1_noise) as noise_file:
    doc_noise_file = noise_file.read().decode('utf8')
# The query document after noise removal
with open(doc_path_1) as ins:
    doc_data_1 = ins.read().decode('utf8')
print 'Loaded', doc_path_1
# Init the tokenizer (mainly loads the stop-word list)
jt = JiebaTokenizer(stopword_path, 'c')
# Tokenize; tokens() returns the list of segmented words
doc_token_1 = jt.tokens(doc_data_1)
print 'Loading word dict...'
# Load the word list and build the word-to-index dict
word_list = []
with open(word_dict, 'r') as ins:
    for line in ins.readlines():
        word_list.append(line.split()[1])
word_dict = {}
for idx, ascword in enumerate(word_list):
    word_dict[ascword.decode('utf8')] = idx
# Build the non-zero feature vector
fb = FeatureBuilder(word_dict)
doc_feat_1 = fb.compute(doc_token_1)
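# Note: FeatureBuilder.compute is defined elsewhere in this project. The sketch
# below only shows the assumed behaviour: a term-frequency vector indexed by
# word_dict, counting only in-vocabulary tokens. The function name is hypothetical.
def compute_feature_sketch(word_dict, tokens):
    feature = [0] * len(word_dict)
    for token in tokens:
        idx = word_dict.get(token)
        if idx is not None:
            feature[idx] += 1
    return feature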
import sys
from tokens import JiebaTokenizer
from simhash_imp import SimhashBuilder, hamming_distance
from features import FeatureBuilder

if __name__ == "__main__":
    if len(sys.argv) < 7:
        print "Usage:\tlaunch.py word_dict_path stop_words_path fingerprint_path documents_path test_path result_path"
        exit(-1)
    # Load word list
    word_list = []
    with open(sys.argv[1], 'r') as ins:
        for line in ins.readlines():
            word_list.append(line.split()[1])
    # Init tokenizer
    jt = JiebaTokenizer(sys.argv[2], 'c')
    # Init feature_builder
    word_dict = {}
    for idx, ascword in enumerate(word_list):
        word_dict[ascword.decode('utf8')] = idx
    fb = FeatureBuilder(word_dict)
    # Init simhash_builder
    smb = SimhashBuilder(word_list)
    # Load fingerprint list
    fingerprint_list = []
    with open(sys.argv[3], 'r') as ins:
        for line in ins.readlines():
            fingerprint_list.append(int(line))
    # For exp: load document content
    doc_list = []
    with open(sys.argv[4], 'r') as ins:
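# Note: hamming_distance is imported from simhash_imp above; the sketch below is
# only an illustration of the standard XOR-and-popcount approach and of how a
# query fingerprint could be matched against fingerprint_list. The threshold value
# and the helper names are assumptions, not the repo's exact implementation.
def hamming_distance_sketch(fp_a, fp_b):
    x = fp_a ^ fp_b
    dist = 0
    while x:
        dist += 1
        x &= x - 1   # clear the lowest set bit
    return dist

def near_duplicates_sketch(query_fp, fingerprint_list, threshold=3):
    # Return indices of stored fingerprints within the Hamming-distance threshold
    return [i for i, fp in enumerate(fingerprint_list)
            if hamming_distance_sketch(query_fp, fp) <= threshold]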