Example #1
        def worker(item):
            size = 0
            ids, masked_ids = [], []
            index = item[1]
            item = item[0]

            # original index of the first word in the encoded segment
            min_index = item.offsets[1][0]
            # original index of the last word in the encoded segment
            max_index = max(item.offsets)[1]
            words = list(jieba.tokenize(data[index][min_index:max_index]))
            arr = np.array(item.ids, dtype=np.int32)
            if (np.count_nonzero(arr) > 10):
                masked_id = mask_ids(item, words)
                if masked_id is not None:
                    ids.append(arr)
                    masked_ids.append(np.array(masked_id, dtype=np.int32))
                size += 1

            for overflowing in item.overflowing:
                min_index = overflowing.offsets[1][0]
                max_index = max(overflowing.offsets)[1]
                words = list(jieba.tokenize(data[index][min_index:max_index]))
                arr = np.array(overflowing.ids, dtype=np.int32)
                if (np.count_nonzero(arr) > 10):
                    masked_id = mask_ids(overflowing, words)
                    if masked_id is not None:
                        ids.append(arr)
                        masked_ids.append(np.array(masked_id, dtype=np.int32))
                    size += 1

            return ids, masked_ids, size
Example #2
def word_process(text):
    """

    :param text:
    :return:
    """
    result = []
    print(jieba.tokenize(text))
    for (word, start, end) in jieba.tokenize(text):
        pseg_data = [(w, f) for (w, f) in pseg.cut(word)]
        result.append((pseg_data, start, end))

    # result = word_process('我明天去吃饭')
    print(result)
    raw_entities = []
    for (item_posseg, start, end) in result:
        part_of_speech = ["nr", "ns", "nt", "t"]
        for (word_posseg, flag_posseg) in item_posseg:
            print(word_posseg)
            print(flag_posseg)
            if flag_posseg in part_of_speech:
                raw_entities.append({
                    'start': start,
                    'end': end,
                    'value': word_posseg,
                    'entity': flag_posseg
                })
    print(raw_entities)
Example #3
def getTest_feature(test_data):
    paragraphs = []
    questions = []
    test_ids = []
    # temporary storage for one data row
    tmp_x_row = []

    #get data position
    subjects = test_data['data']

    for subject in subjects:
        # subject contains title and *paragraphs*

        for paragraph in subject['paragraphs']:
            # paragraphs contains *context* and *qas*
            context = list(
                jieba.tokenize(paragraph['context'].replace("\n", "")))

            for qa in paragraph['qas']:

                ######################################
                paragraphs.append(context)
                questions.append(list(jieba.tokenize(qa['question'])))
                #######################################
                test_ids.append(qa['id'])  # record the question id

                # check that every question has a unique answer
    return paragraphs, questions, test_ids
def analyse_fenci():
    # 2.4 Segmentation analysis: beyond plain segmentation we often need further analysis of the text,
    #     such as returning the position of each word or extracting keywords.
    # 2.4.1 Return the position of each word
    import jieba.analyse
    print("1.采取精准模式结果:")
    # print([item for item in jieba.tokenize(u"数据分析与数据挖掘的应用")])
    for item in jieba.tokenize(u"数据分析与数据挖掘的应用"):
        print item[0], item[1], item[2]
    print("-------------------")
    print("2.采取搜索模式结果:")
    # print([item for item in jieba.tokenize("数据分析与数据挖掘的应用", mode="search")])
    for item in jieba.tokenize(u"数据分析与数据挖掘的应用", mode="search"):
        print item[0], item[1], item[2]

    # 2.4.2 Extract keywords from the text
    print '提取文本中的关键词:'  # ranked by combining in-text term frequency with dictionary frequency
    import jieba.analyse
    # print(jieba.analyse.extract_tags("我喜欢广州小蛮腰", 3))
    # print(jieba.analyse.extract_tags("我喜欢广州广州小蛮腰", 3))
    # print(jieba.analyse.extract_tags("我喜欢广州广州广州小蛮腰", 3))
    for item in jieba.analyse.extract_tags("我喜欢广州小蛮腰", 3):
        print item + ' ',
    print ''
    for item in jieba.analyse.extract_tags("我喜欢广州广州小蛮腰", 3):
        print item + ' ',
    print ''
    for item in jieba.analyse.extract_tags("我喜欢广州广州广州小蛮腰", 3):
        print item + ' ',
    print ''
Example #5
def fun5():
    print("默认的tokenize")
    result = jieba.tokenize(u"自然语言处理非常有用")
    for tk in result:
        print('%s\t\t start: %d \t\t end:%d' % (tk[0], tk[1], tk[2]))

    print("\n------------分割线-------------\n")
    print("搜索模式的tokenize")
    result = jieba.tokenize(u"自然语言处理非常有用", mode='search')
    for tk in result:
        print('%s\t\t start: %d \t\t end:%d' % (tk[0], tk[1], tk[2]))
Example #6
 def _get_same_words_with_cut(self, source: str, target: str):
     """
     使用结巴分词来抽取相同词
     """
     res_words: [Word] = []
     source_cut = [Word(*word) for word in jieba.tokenize(source)]
     target_cut = [Word(*word) for word in jieba.tokenize(target)]
     for word in source_cut:
         if word in target_cut and word.text not in STOPWORDS and len(word.text) >= self.least_word_len:
             res_words.append(word)
     return res_words
    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        import jieba

        text = message.get(attribute)
        # lowercase the text unless the tokenizer is configured to be case sensitive
        if not self.component_config.get("case_sensitive", True):
            text = text.lower()
        tokenized = jieba.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]

        return self._apply_token_pattern(tokens)
Example #8
def tokenize():
    """
    分词
    :return:
    """
    s = "周大福是创新办主任也是云计算方面的专家"
    result = jieba.tokenize(s)
    logger.info("普通模式")
    for tk in result: logger.info("word: {0} \t\t start: {1} \t\t end: {2}".format(tk[0], tk[1], tk[2]))

    logger.info("\n搜索模式")
    result = jieba.tokenize(s, mode='search')
    for tk in result: logger.info("word: {0} \t\t start: {1} \t\t end: {2}".format(tk[0], tk[1], tk[2]))
Example #9
def jieba_fenci_for_crawl_doc(doc):
    """结巴分词
    """
    for lib in JIEBA_CUSTOM_LIBS:
        prodict = os.path.join(settings.STATICFILES_DIRS[0], 'jiebadic', lib[0])
        try:
            jieba.load_userdict(prodict)
        except IOError:
            continue

    rs = ['\xa0',
          '一、',
          '二、',
          '三、',
          '四、',
          '五、',
          '六、',
          '七、',
          '八、',
          '九、',
          '十、']
    for r in rs:
        doc = doc.replace(r, '')
    regex = re.compile(r'[\n\r\t,.:\-";()。、:,的<>》《()]')  # strip newlines, carriage returns, tabs and Chinese punctuation
    t = regex.sub("", doc)
    fenci_data = jieba.tokenize(t)  # jieba segmentation
    return fenci_data
def getChList(docStrByte):
    ## takes the raw bytes of a document and returns the result of Chinese word segmentation

    inputStr = str(docStrByte, encoding = 'gbk', errors = 'ignore').lower()  # decode the bytes and lowercase English letters
    strList = ''.join(inputStr.split('\n'))  # drop newlines and join the lines into a single paragraph
    rawTokens = list(jieba.tokenize(strList))  # Chinese word segmentation

    # stopWord is a dict: every key is a stopword and every value is None
    fSW = open('stopwords.txt', 'r', encoding = 'utf-8', errors = 'ignore').read()
    stopWord = {}.fromkeys(fSW.split('\n'))
    stopWord[''] = None

    final = []
    s = nltk.stem.SnowballStemmer('english')
    for seg in rawTokens:
        # print(seg[0].strip())
        rawWord = seg[0].strip()  # strip leading and trailing whitespace
        if (rawWord.isalpha()):  # if the token is an English word, reduce it to its stem
            word = s.stem(rawWord)
        else:
            word = rawWord

        if word not in stopWord:  # drop stopwords
            final.append(word)  # collect the kept tokens into a list
    return final
Example #11
def high_freq_words():
    sentence = '我喜欢苏州的苏州中心,上海,上海的东方明珠'
    words = jieba.analyse.extract_tags(sentence, topK=3)
    print(f'top 3 的词语 {words}')
    # return the position of each word
    words_loc = jieba.tokenize(sentence)
    print(f'各个词语的位置{list(words_loc)}')
Example #12
    def tokenize(self, text):
        # type: (Text) -> List[Token]
        import jieba

        tokenized = jieba.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]
        return tokens
Example #13
    def __call__(self, text, **kargs):
        token = Token()

        seen = set()
        words_list = []  # (word, start_pos, stop_pos) of each distinct word, in order of first appearance

        for (w, start_pos, stop_pos) in jieba.tokenize(text, mode='search'):
            w = w.strip()
            if not w:
                continue
            if w in seen:
                continue
            if w in punct:
                continue
            seen.add(w)
            words_list.append((w, start_pos, stop_pos))

        # emit one whoosh token per distinct word, carrying its own offsets
        for (w, start_pos, stop_pos) in words_list:
            if not accepted_chars.match(w):
                if len(w) <= 1:
                    continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token
Example #14
def entity_rec(request):
    req = request.body
    print(req)
    entity_d = {
        'person': [],
        'fund': [],
        'company': [],
        'industry': [],
        'stock': []
    }
    index_l = [0 for i in range(len(news))]
    result = jieba.tokenize(news)
    start = time.time()
    for k in result:
        if k[0] in person_list:
            entity_d['person'] = entity_d['person'] + [k[0]]
            index_l[k[1]:k[2]] = [1 for k in range(k[2] - k[1])]
        if k[0] in fund_list:
            entity_d['fund'] = entity_d['fund'] + [k[0]]
            index_l[k[1]:k[2]] = [2 for k in range(k[2] - k[1])]
        if k[0] in company_list:
            entity_d['company'] = entity_d['company'] + [k[0]]
            index_l[k[1]:k[2]] = [3 for k in range(k[2] - k[1])]
        if k[0] in industry_list:
            entity_d['industry'] = entity_d['industry'] + [k[0]]
            index_l[k[1]:k[2]] = [4 for k in range(k[2] - k[1])]
        if k[0] in stock_list:
            entity_d['stock'] = entity_d['stock'] + [k[0]]
            index_l[k[1]:k[2]] = [5 for k in range(k[2] - k[1])]
    print("--- %s seconds ---" % (time.time() - start))
    print(json.dumps({'entity_d': entity_d, 'index_l': index_l}))
    return json.dumps({'entity_d': entity_d, 'index_l': index_l})
 def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
             document_id: str, user_id: str):
     result = jieba.tokenize(cas.sofa_string)
     for tk in result:
         prediction = self.create_prediction(cas, layer, feature, tk[1],
                                             tk[2], tk[0])
         cas.add_annotation(prediction)
def summarize(text, cut_search, window=100):
    content = get_content(doc.get('path'))
    tokres = jieba.tokenize(content, mode='search')
    search_words = {}
    for i in range(len(cut_search)):
        search_words[cut_search[i]] = i
    kaps = []
    for x in tokres:
        if x[0] in search_words.keys():
            kaps.append((x[1], x[2], search_words[x[0]]))
    kaps.sort(key=(lambda x: x[0]))
    nextitem = 0
    maxv, s, e = 0, 0, 0
    for i in range(len(kaps)):
        end = kaps[i][0] + window
        while nextitem < len(kaps) and kaps[nextitem][1] <= end:
            nextitem += 1
        exc, rni = nextitem - i, nextitem
        while rni < len(kaps) and kaps[rni][0] < end:
            if kaps[rni][1] <= end:
                exc += 1
            rni += 1
        if exc > maxv:
            maxv, s, e = exc, i, rni
    lens = kaps[s][0]
    kaps = kaps[s:e]
    maxk = max((x[1] for x in kaps if x[1] <= lens + window))
    lens -= (lens + window - maxk) / 2
    if lens + window > len(content):
        lens = len(content) - window
    if lens < 0:
        lens = 0
    return maxv, lens, len(content), content[lens:lens + window], kaps
Example #17
def _load_cedict(filename, hsk=None):
    cedict = defaultdict(list)
    with open(filename, 'r', encoding="utf-8") as f:
        for line in f:
            if line.startswith('#'):
                continue
            tr, sm, py, transl = re.match(r"(\S*) (\S*) \[(.*)\] \/(.*)\/",
                                          line).groups()
            transl = transl.split('/')
            transl = '/'.join([
                t for t in transl if not t.startswith('see also ')
                and not t.startswith('variant of')
            ])
            cedict[sm].append((tr, py, transl))

    # Find compounds with jieba
    num = 0
    compound_parts = {}
    for sm, entries in cedict.items():
        # search mode will produce compounds and their parts
        tokens = list(jieba.tokenize(sm, mode='search'))
        parts = [t for t in tokens if t[2] - t[1] < len(sm)]
        compound_parts[sm] = parts

    # Join multiple sound characters (多音字)
    cedict = {
        sm: (sm, entries, compound_parts[sm])
        for sm, entries in cedict.items()
    }
    return cedict
def build_word_level_vocabulary_all(train_file,
                                    valid_file=None,
                                    test_file=None):
    sentences = list()

    with codecs.open(train_file, encoding='utf-8') as f_train:
        for line in f_train:
            x = json.loads(line)
            sentences.extend([x['A'].strip(), x['B'].strip(), x['C'].strip()])
    if valid_file:
        with codecs.open(valid_file, encoding='utf-8') as f_valid:
            for line in f_valid:
                x = json.loads(line)
                sentences.extend(
                    [x['A'].strip(), x['B'].strip(), x['C'].strip()])
    if test_file:
        with codecs.open(test_file, encoding='utf-8') as f_test:
            for line in f_test:
                x = json.loads(line)
                sentences.extend(
                    [x['A'].strip(), x['B'].strip(), x['C'].strip()])
    corpus = u''.join(sentences)
    word_list = list(set([tk[0] for tk in jieba.tokenize(corpus)]))

    return dict((word, idx + 1) for idx, word in enumerate(word_list))
Example #19
def tokenize(sentence, mode='default'):
    """
    Segment a sentence and return the position of each word
    :param sentence:
    :param mode: 'default' or 'search'
    :return: list of (word, start_index, end_index)
    """
    return list(jieba.tokenize(sentence, mode=mode))
Example #20
    def tokenize(self, text: Text, attribute=MESSAGE_TEXT_ATTRIBUTE) -> List[Token]:
        import jieba

        text = self.preprocess_text(text, attribute)
        tokenized = jieba.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]
        return tokens
Example #21
def tokenize(sentence):
    """
    Segment a sentence in search mode and return the position of each word
    :param sentence:
    :return: list of (word, start_index, end_index)
    """
    return list(jieba.tokenize(sentence, mode='search'))
Example #22
 def split(self, input_s):
     self.s = input_s
     self.token = jieba.tokenize(self.s)
     num_en = 0
     num_zh = 0
     for t in self.token:
         if not t[0].isspace():
             if t[0] in ',,"\'‘’“”#@%<>《》{}【】[]。,!!??':
                 self.symbol.append(t)
             else:
                 lang = langid.classify(t[0])[0]
                 if lang == "en":
                     self.english.append(t)
                     num_en += 1
                 elif lang == "zh":
                     self.chinese.append(t)
                     num_zh += 1
                 else:
                     self.other.append(t)
     if num_en == 1 and num_zh == 1:
         code_mix = 1
     if num_en == 0 and num_zh == 0:
         self.note = "other"
     elif num_en > num_zh:
         self.note = "en"
         self.translate_en_zh()
     else:
         self.note = "zh"
         self.translate_zh_en()
Example #23
def correct(ss):
    '''
    Correct sentence ss
    '''
    # jieba.tokenize returns tuples of (word, start, end)
    tokens = list(jieba.tokenize(ss))
    print('Segmented sentence is {}'.format(''.join(
        [str(token) for token in tokens])))

    segranges = [[token[1], token[2]] for token in tokens]
    _, _, outranges = score_sentence(ss)
    if outranges:
        cranges = merge_ranges(get_overlap_ranges(outranges, segranges))
        for crange in cranges:
            print('Correct range is {}'.format(crange))
            st, en = crange
            print('Possible wrong segment is {}'.format(ss[st:en]))
            pwrong = ss[st:en]
            # seg_list = jieba.cut(pwrong)
            # error_string = ", ".join(seg_list)
            # errors = error_string.split(", ")
            # cgram = ""
            # for error in errors:
            cgram = auto_correct(pwrong, cn_dict, word_freq)
            ss = ss[:st] + cgram + ss[en:]
            print('Corrected pinyin is {}'.format(cgram))

            cgram2 = correct_ngram_2(ss, st, en)
            print('Corrected ngram is {}'.format(cgram2))
            ss = ss[:st] + cgram2 + ss[en:]
    else:
        cranges = []
        print('No segment to correct.')
    return ss, cranges
Example #24
def cuttest(test_sent):
    global g_mode
    for n in re_num.finditer(test_sent):
        print(n.start(), n.end(), n.group())
    result = jieba.tokenize(test_sent, mode=g_mode)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
def get_words_jd(input_file):
    max_length_words = 100
    length = 0
    jd_str =""
    jd_position = {}
    jd_words =[]
    with open(input_file) as f:
        lines = f.readlines()
        for line in lines:
            if not line:continue
            line = line.strip()
            line = re.sub(r"\s+","",line)
            line = line.decode('utf-8')
            for word in line:
                jd_str += word.encode('utf-8')
                length += 1
                if length >= max_length_words:break

    result = jieba.tokenize(jd_str.decode('utf-8'))
    for tk in result:
        if tk[0].encode('utf-8') not in stop_words and tk[0].encode('utf-8') not in stop:
            jd_words.append(tk[0].encode('utf-8'))
            if not jd_position.has_key(tk[0].encode('utf-8')):
               jd_position[tk[0].encode('utf_8')] = {"start_pos" : tk[1], "end_pos" : tk[2]}
    return jd_str, jd_words, jd_position
Example #26
def test_tokenize():
    """
    测试token。
    :return:
    """
    # 生成token串
    result = jieba.tokenize("永和服装饰品有限公司")
    for tk in result:
        common_logger.info("word {0}\t\t start:{1}\t\t end:{2}".format(
            tk[0], tk[1], tk[2]))

    # token stream in search mode
    result = jieba.tokenize("永和服装饰品有限公司", mode="search")
    for tk in result:
        common_logger.info("word {0}\t\t start:{1}\t\t end:{2}".format(
            tk[0], tk[1], tk[2]))
Example #27
 def jieba_split(
         self, i: int,
         normalized_string: NormalizedString) -> List[NormalizedString]:
     splits = []
     for token, start, stop in jieba.tokenize(str(normalized_string)):
         splits.append(normalized_string[start:stop])
     return splits
Example #28
def test_tfidf():
    lines = open('D:\\Python\\Data\\NBA.txt', encoding='utf-8').read()
    print(type(lines))
    # keyword extraction based on TF-IDF
    words = analyse.extract_tags(lines, topK=20, withWeight=True, allowPOS=())
    print(words)

    # keyword extraction based on TextRank
    words = analyse.textrank(lines,
                             topK=20,
                             withWeight=False,
                             allowPOS=('ns', 'n', 'vn', 'v'))
    print(words)
    words = analyse.textrank(lines,
                             topK=20,
                             withWeight=False,
                             allowPOS=('ns', 'n'))
    print(words)

    # part-of-speech tagging
    words = pseg.cut('我爱自然语言处理')
    # print(list(words))
    for word, flag in words:
        print(word, flag)

    # tokenize: return each word's start and end position in the original text
    result = jieba.tokenize('我爱自然语言处理')
    print(list(result))
Example #29
def ann_rebuild(filename):
    jieba.load_userdict("jiebadic.txt")
    rf = codecs.open(filename, encoding='utf-8')
    annotation = {}
    for line in rf:
        if line.startswith("T"):
            word = line.strip().split('\t')[-1]
            type_offset = line.strip().split('\t')[1].split(' ')
            type = type_offset[0]
            start = int(type_offset[1])
            final_end = int(type_offset[-1])
            result = jieba.tokenize(word)
            for i, tk in enumerate(result):
                end = start + tk[2] - tk[1]
                if i == 0:
                    type0 = "B-" + type
                    annotation[(start, end)] = type0
                elif end == final_end:
                    type1 = "E-" + type
                    annotation[(start, end)] = type1
                else:
                    type2 = "I-" + type
                    annotation[(start, end)] = type2
                start = end
    rf.close()
    return annotation
Example #30
def tokenizer(filename):
    word_counter = collections.defaultdict(int)
    with open(filename) as f:
        for line in f:
            for word in jieba.tokenize(line.decode('utf-8')):
                word_counter[word[0]] += 1
    return word_counter
def feature_embeding(comment):
    size = 15
    par = 1
    data = pd.read_excel('lstm_data/feature_word.xlsx', index=None)
    definite_words = list(data['肯定词'])
    positive_words = list(data['正向'])
    negative_words = list(data['负向'])
    imagine_words = list(data['假想词'])
    deny_words = list(data['否定词'])
    inter_words = list(data['疑问词'])
    assume_words = list(data['假定词'])
    feature_embed = np.zeros((len(comment), maxlen_context, 1 * size))
    for i, t in enumerate(comment):
        token = jieba.tokenize(t[:maxlen_context])
        for tk in token:
            if tk[0] in deny_words:
                feature_embed[i, tk[1]:tk[2], 0:size] = par
            if tk[0] in inter_words:
                feature_embed[i, tk[1]:tk[2], size:2 * size] = par
            if tk[0] in assume_words:
                feature_embed[i, tk[1]:tk[2], 2 * size:3 * size] = par
            if tk[0] in definite_words:
                feature_embed[i, tk[1]:tk[2], 3 * size:4 * size] = par
            if tk[0] in positive_words:
                feature_embed[i, tk[1]:tk[2], 4 * size:5 * size] = par
            if tk[0] in negative_words:
                feature_embed[i, tk[1]:tk[2], 5 * size:6 * size] = par
            if tk[0] in imagine_words:
                feature_embed[i, tk[1]:tk[2], 6 * size:7 * size] = par
    return feature_embed
Example #32
def getChList(docStrByte):
    inputStr = str(docStrByte, encoding='gbk', errors='ignore')
    ## skip the leading header lines (those starting with 【)
    strList = list(i for i in inputStr.split('\n'))
    # print(strList)
    startLine = 0
    for i in range(len(strList)):
        if (strList[i].startswith('【')):
            startLine += 1
        else:
            break
    # print(strList[startLine:])

    rawTokens = list(jieba.tokenize(''.join(strList[startLine:])))
    # stopWord = {}.fromkeys([line for line in open('stopwords.txt','r',encoding = 'gbk', errors = 'ignore')])
    fSW = open('stopwords.txt', 'r', encoding='utf-8', errors='ignore').read()
    # print(fSW.split('\n')[:99])
    stopWord = {}.fromkeys(fSW.split('\n'))

    # print(stopWord)
    stopWord[''] = None
    # for (k,v) in stopWord.items():
    #     print(k, ',', v)
    final = ''
    for seg in rawTokens:
        # print(seg)
        # seg.encode('gbk')
        word = seg[0].strip()
        if word not in stopWord:
            final += (' ' + word)  # join the kept words with spaces
        # else:
        # print(seg)
    # print(type(final))
    return final
def tokenize(sentence, addwords=None):
    if addwords is not None:
        for word in addwords:
            jieba.add_word(word)
    tokens = []
    for term in jieba.tokenize(sentence):
        tokens.append(term[0])
    return tokens
Example #34
 def testTokenize_NOHMM(self):
     for content in test_contents:
         result = jieba.tokenize(content,HMM=False)
         assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
         result = list(result)
         assert isinstance(result, list), "Test Tokenize error on content: %s" % content
         for tk in result:
             print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
     print("testTokenize_NOHMM", file=sys.stderr)
Example #35
def segment(raw_text):

    tokens = jieba.tokenize(raw_text)
    seg_list = [w for (w, start_pos, stop_pos) in tokens if token_condition(w)]

    seg_freq_counter = Counter(seg_list)
    seg_freq = dict(seg_freq_counter)

    return json.dumps(seg_freq)
Example #36
 def testTokenize(self):
     for content in test_contents:
         result = jieba.tokenize(content.decode('utf-8'))
         assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
         result = list(result)
         assert isinstance(result, list), "Test Tokenize error on content: %s" % content
         for tk in result:
             print >>sys.stderr, "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
     print  >> sys.stderr, "testTokenize"
Example #37
def tokenize(sentence, mode='default'):
    """
    Segment a sentence and return the position of each word
    :param sentence:
    :param mode: 'default' or 'search'
    :return: list of (word, start_index, end_index)
    """
    import logging
    jieba.default_logger.setLevel(logging.ERROR)
    return list(jieba.tokenize(sentence, mode=mode))
Example #38
    def tokenize(self, text):
        # type: (Text) -> List[Token]
        import jieba

        if self.dictionary_path is not None:
            self.load_custom_dictionary(self.dictionary_path)

        tokenized = jieba.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]
        return tokens
Example #39
def how_to_use():
    """待分词的字符串可以是 unicode 或 UTF-8 字符串、GBK 字符串。
    注意:不建议直接输入 GBK 字符串,可能无法预料地错误解码成 UTF-8

jieba.cut 以及 jieba.cut_for_search 返回的结构都是一个可迭代的 generator,可以使用 for 循环来获得分词后得到的每一个词语(unicode),
或者用jieba.lcut 以及 jieba.lcut_for_search 直接返回 list"""
    dict_path = 'user_dict/user_dict.txt'

    seg_list = jieba.cut("我换不行北京清华大学", cut_all=False)
    print("Default Mode: " + "/ ".join(seg_list))  # 精确模式

    jieba.load_userdict(dict_path)

    seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
    print("Full Mode: " + "/ ".join(seg_list))  # 全模式

    seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
    print("Default Mode: " + "/ ".join(seg_list))  # 精确模式

    seg_list = jieba.lcut("他来到了网易杭研大厦")  # accurate mode is the default
    print(", ".join(seg_list))
    print(type(seg_list))
    print(seg_list)

    seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")  # search-engine mode
    print(", ".join(seg_list))

    seg_list = jieba.cut("我换不行北京清华大学", cut_all=False)
    print("Default Mode: " + "/ ".join(seg_list))  # 精确模式

    words = pseg.cut("我爱北京天安门")
    print(words)
    for word, flag in words:
        print('%s %s' % (word, flag))

    print('分词:默认模式')
    result = jieba.tokenize(u'永和服装饰品有限公司')
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
    print('分词:搜索模式')
    result = jieba.tokenize(u'永和服装饰品有限公司', mode='search')
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
Example #40
def test5():
    # default mode
    result = jieba.tokenize(u'永和服装饰品有限公司')
    for tk in result:
        print('word %s\t\t start:%d \t\t end:%d' %(tk[0], tk[1], tk[2]))
    print
    print
    # search mode
    result = jieba.tokenize(u'永和服装饰品有限公司', mode='search')
    for tk in result:
        print('word %s\t\t start:%d \t\t end:%d' %(tk[0], tk[1], tk[2]))

#Feature 7: ChineseAnalyzer for the whoosh search engine
# https://github.com/fxsjy/jieba/blob/master/test/test_whoosh.py

# Other dictionaries
# A smaller dictionary file with a lower memory footprint: https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.small
# A dictionary file with better support for traditional Chinese: https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big
# jieba.set_dictionary('data/dict.txt.big')
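A minimal sketch of the whoosh integration referenced above, assuming the whoosh package is installed (the schema and field names are illustrative only):

from whoosh.fields import Schema, TEXT, ID
from jieba.analyse import ChineseAnalyzer

analyzer = ChineseAnalyzer()                      # jieba-backed analyzer shipped with jieba
schema = Schema(title=ID(stored=True),
                content=TEXT(stored=True, analyzer=analyzer))
# documents indexed with this schema are segmented by jieba in search mode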
def main(argv):
    rawTextInput = 'rawText.txt'
    argc = len(argv)
    for i in xrange(argc):
        if argv[i] == "-i" and i + 1 < argc:
            rawTextInput = argv[i + 1]
        elif argv[i] == "-o" and i + 1 < argc:
            tokenizedFile = argv[i + 1]
        elif argv[i] == "-map" and i + 1 < argc:
            mappingFile = argv[i + 1]
        elif argv[i] == "-offset" and i + 1 < argc:
            offsetFile = argv[i + 1]
    with codecs.open(mappingFile, encoding='utf-8', mode='r') as input:
        for line in input:
            elements = line.strip().split(',')
            mapping[elements[1]] = elements[0]  # each mapping line is assumed to be "<token>,<word>"; index by the word

    outputA = codecs.open(offsetFile, encoding='utf-8', mode='w')
    outputB = codecs.open(tokenizedFile, encoding='utf-8', mode='w')
    for line in codecs.open(rawTextInput, encoding='utf-8', mode='r'):
        result = jieba.tokenize(line.strip())
        offsets = []
        newline = []
        for tk, begin, end in result:
            if tk == ' ':
                newline.append(' ')
                continue
            if tk in punctuations:
                newline.append(tk)
                continue
            tk = ''.join([i for i in tk if not i.isdigit()]).lower()
            if len(tk) == 0:
                newline.append(' ')
                continue
            if tk not in mapping:
                newline.append('zzzzzzzzzzz')
            else:
                newline.append(mapping[tk])
                offsets.append((tk, begin, end))
            newline.append(' ')
        outputA.write(u''.join(newline))
        outputA.write('\n')

        for (string, begin, end) in offsets:
            outputB.write(string)
            outputB.write(',')
            outputB.write(str(begin))
            outputB.write(',')
            outputB.write(str(end))
            outputB.write('\t')
        outputB.write('\n')
Example #42
 def __call__(self, text, **kargs):
     words = jieba.tokenize(text, mode="search")
     token = Token()
     for (w,start_pos,stop_pos) in words:
         if not accepted_chars.match(w) and len(w)<=1:
             continue
         token.original = token.text = w
         token.pos = start_pos
         token.startchar = start_pos
         token.endchar = stop_pos
         yield token
Example #43
def segment_text_desc(word):
    """
    Tokenize a text description
    :param word:
    :return:
    """
    key_words = list(jieba.tokenize(word))  # materialise the generator so it can be both printed and returned

    for item in key_words:
        print item
    return key_words
def dealOnePage(control_obj,dataObj):
    title = dataObj[1].lower()
    cur_page_dic = PageDic()
        
    seg_list = jieba.tokenize(title,mode = "search")  #get segment list

    for tk in seg_list:
        ############################################### deal for a word in a page
        control_obj.addWordIdDic(tk[0]) #add to word to wrodId dictionary
        cur_page_dic.addPageItem(tk[0],tk[1],tk[2])
        
    #add current word-hits dictionary to an object
    control_obj.addPageDic(dataObj[0],cur_page_dic.page_dic)   
 def segment_hanzi(txt):
     """
     Tokenizes Chinese text
     
     Args:
         txt -- Chinese text with Chinese characters in it (unicode)
         
     Returns:
         list of unicode, in which each element is a token of txt
     """
     tokens = jieba.tokenize(txt)
     tokens_hanzi = [tkn[0] for tkn in tokens]
     return tokens_hanzi
Example #46
def tokenize(name, stopwords):
    # this function tokenizes Chinese sentences and removes the stopwords
    try:
        original_tokens = jieba.tokenize(name)
    except ValueError:
        print(name, 'is not unicode')
        return
    tokens = []
    for term in original_tokens:
        if term[0] not in stopwords:
            tokens.append(term[0])
    return tokens
Example #47
    def mapper_getterm(self,key,comment_list):
        for comment in comment_list:
            try:
                status_text = comment['status']['text']

                filtered_status_text = re.sub(r'[^A-Za-z\s]+','',status_text)\
                                       .lower()
                comment_text = comment['text']
                if re.match('michael kors',filtered_status_text):
                    for word in jb.tokenize(unicode(comment_text)):
                            # filter Chinese terms and remove the stopwords
                            cond1 = re.match(ur'[\u4e00-\u9fff]+',word[0])
                            cond2 = word[0] not in stop_words
                            if cond1 and cond2:
                                yield ('michael',word[0]),1
                if re.match('kate spade',filtered_status_text):
                    for word in jb.tokenize(unicode(comment_text)):
                            # filter Chinese terms and remove the stopwords
                            cond1 = re.match(ur'[\u4e00-\u9fff]+',word[0])
                            cond2 = word[0] not in stop_words
                            if cond1 and cond2:
                                yield ('kate',word[0]),1
            except:
                pass
def mixed_lang_word_count(string):
    """
    Returns the word count of a string containing English and Chinese words. The string is split into an English and
    a Chinese substring, and the word counts of the two substrings (from NLTK and jieba respectively) are summed.
    E.g. '你好 Andrew' returns 2, as '你好' is one word and 'Andrew' is another.

    :param string: a string containing english and chinese
    :returns: the word count
    """
    english_only = re.sub(r'\W+', '', string)
    num_eng_words = len(nltk.word_tokenize(english_only))

    non_english_only = re.sub(r'\w+', '', string)
    num_non_eng_words = len(list(jieba.tokenize(non_english_only.decode('utf-8'))))

    return num_eng_words + num_non_eng_words
Example #49
def handle(data):
    oper = json.loads(data)
    if oper[0] == 'cut':
        return json.dumps(tuple(jieba.cut(*oper[1], **oper[2]))).encode('utf-8')
    elif oper[0] == 'cut_for_search':
        return json.dumps(tuple(jieba.cut_for_search(*oper[1], **oper[2]))).encode('utf-8')
    elif oper[0] == 'tokenize':
        return json.dumps(tuple(jieba.tokenize(*oper[1], **oper[2]))).encode('utf-8')
    elif oper[0] == 'add_word':
        jieba.add_word(*oper[1], **oper[2])
    elif oper[0] == 'load_userdict':
        jieba.load_userdict(*oper[1])
    elif oper[0] == 'set_dictionary':
        jieba.set_dictionary(*oper[1])
    elif oper[0] == 'stopserver':
        return b'stop'
    elif oper[0] == 'ping':
        return b'pong'
def question3(weiboPostList, chineseFashionTerms):
    # Tokenize and count terms included in posts about each brand.

    ignoreMoreTerms = True # Set to 'true' to exclude more common chinese terms.

    # Dump commonly-occurring tokens and symbols.
    ignoreTerms = chineseFashionTerms + [" ", "#", "@", ".", "。", "&", "spade", "回复", "【", "的", ",", "/", "]",
                                         "[", "!", ":", ":", "�", "~", "~", "`", "、", "】", "a", "t", "c", "h",
                                         "!", "cn", "http", ",", "哦", "了", "”", "“", ">", "$"]
    if ignoreMoreTerms:
        # Exclude these common Chinese pronouns, particles, and verbs. (and, is, he, she, no, also, etc.)
        ignoreTerms.extend(["你", "我", "他", "她", "它", "都", "有", "是", "和", "在", "没", "不", "也", "日", "就",
                            "你们", "2015", "会", "为"])

    # Exclude other variations on the brand names.
    for term in ["Michael", "Kate", "Kors", "MK"]:
        ignoreTerms.append(term)
        ignoreTerms.append(term.upper())
        ignoreTerms.append(term.lower())

    for brand in ["Michael Kors", "Kate Spade"]:
        tokenFrequencies = {}  # Create a new dict to store the amount of occurrences of each token.
        if brand == "Michael Kors":
            currPostList = filter(lambda x: x.hasKors, weiboPostList)
        else:
            currPostList = filter(lambda x: x.hasSpade, weiboPostList)

        for post in currPostList:
            postText = unicode(post.text)  # encode to unicode (so as to be parseable by jieba)
            postTokens = jieba.tokenize(postText)  # collect tokenization results for that post in postTokens.
            for token in postTokens:
                if token[0] in ignoreTerms:
                    continue  # Drop ignored terms.
                if token[0] in tokenFrequencies: # If it already exists in dict, +1
                    tokenFrequencies[token[0]] += 1
                else: # Else, create an entry
                    tokenFrequencies[token[0]] = 1
        print
        print "Token frequency data for", brand+":"
        for data in sorted(tokenFrequencies.iteritems(), key=lambda tup: tup[1], reverse=True)[:12]:
            # Printed top 12 here to quickly exclude values like "<?>" ("can't display UTF" character)
            print "Word:", data[0], "\t", "Frequency:", data[1]
def main(argv):
    rawTextInput = 'rawText.txt'
    argc = len(argv)
    for i in xrange(argc):
        if argv[i] == "-i" and i + 1 < argc:
          rawTextInput = argv[i + 1]
        elif argv[i] == "-o" and i + 1 < argc:
          tokenizedFile = argv[i + 1]
        elif argv[i] == "-map" and i + 1 < argc:
          mappingFile = argv[i + 1]
    with codecs.open(tokenizedFile, encoding='utf-8', mode='w') as output:
        id = 0
        for line in codecs.open(rawTextInput, encoding='utf-8', mode='r'):
            result = jieba.tokenize(line.strip())
            newline = []
            for tk in result:
                tk = tk[0]
                if tk == ' ':
                    newline.append(' ')
                    continue
                if tk in punctuations:
                    newline.append(tk)
                    continue
                tk = ''.join([i for i in tk if not i.isdigit()]).lower()
                if len(tk) == 0:
                    newline.append(' ')
                    continue
                if tk not in mapping:
                    mapping[tk] = id2alpha(id)
                    id += 1
                newline.append(mapping[tk])
                newline.append(' ')
            output.write(u''.join(newline))
            output.write('\n')
    with codecs.open(mappingFile, encoding='utf-8', mode='w') as output:
        for (string, token) in mapping.iteritems():
            output.write(token)
            output.write(',')
            output.write(string)
            output.write('\n')
Example #52
def handlemsg(data):
    oper = loadsjson(data)
    if oper[0] == 'c2m':
        return dumpsjson(mc.c2m.translate(*oper[1:]))
    elif oper[0] == 'm2c':
        return dumpsjson(mc.m2c.translate(*oper[1:]))
    elif oper[0] == 'c2m.raw':
        return dumpsjson(mc.c2m.rawtranslate(oper[1]))
    elif oper[0] == 'm2c.raw':
        return dumpsjson(mc.m2c.rawtranslate(oper[1]))
    elif oper[0] == 'modelname':
        return dumpsjson(mc.name())
    elif oper[0] == 'cut':
        return dumpsjson(tuple(jieba.cut(*oper[1], **oper[2])))
    elif oper[0] == 'cut_for_search':
        return dumpsjson(tuple(jieba.cut_for_search(*oper[1], **oper[2])))
    elif oper[0] == 'tokenize':
        return dumpsjson(tuple(jieba.tokenize(*oper[1], **oper[2])))
    elif oper[0] == 'jiebazhc.cut':
        return dumpsjson(tuple(jiebazhc.cut(*oper[1], **oper[2])))
    elif oper[0] == 'jiebazhc.cut_for_search':
        return dumpsjson(
            tuple(jiebazhc.cut_for_search(*oper[1], **oper[2])))
    elif oper[0] == 'jiebazhc.tokenize':
        return dumpsjson(tuple(jiebazhc.tokenize(*oper[1], **oper[2])))
    elif oper[0] == 'add_word':
        jieba.add_word(*oper[1], **oper[2])
    elif oper[0] == 'load_userdict':
        jieba.load_userdict(*oper[1])
    elif oper[0] == 'set_dictionary':
        jieba.set_dictionary(*oper[1])
    elif oper[0] == 'stopserver':
        return b'stop'
    elif oper[0] == 'ping':
        return b'pong'
    else:
        return dumpsjson('Command not found')
Example #53
虽说房价居高不下让许多受访者观望,但是打算今年尽快出手的受访者也不少。“最近去售楼处咨询发现年前的优惠减少了不少,按照这趋势,开发商很有可能涨价,再等估计就更买不起。”黄小姐告诉记者,厦门气候宜居、房源供应不足又深受异地购房者的青睐,房价下跌的可能性非常小。加上今年是落户厦门的最后机会,为了赶上落户的“末班车”,还是尽早稳妥。
"""

# jieba.analyse.set_stop_words("stop_words.txt")
#
# seg_res = jieba.cut(raw_text)  # 默认是精确模式
#
# seg_list = list(seg_res)
#
# print(seg_list)
#
# seg_freq_counter = Counter(seg_list)
#
#
# print(seg_freq_counter)

accepted_chars = re.compile(ur"[\u4E00-\u9FA5]+")
words = jieba.tokenize(raw_text)


seg_list = []
for (w, start_pos, stop_pos) in words:
    if not accepted_chars.match(w) and len(w) <= 1:
        continue

    # @todo could use isdigit to check whether the token is a number

    seg_list.append(w)

seg_freq_counter = Counter(seg_list)
print(seg_freq_counter)
Example #54
def cuttest(test_sent):
    global g_mode
    test_sent = test_sent.decode('utf-8')
    result = jieba.tokenize(test_sent,mode=g_mode)
    for tk in result:
        print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
Example #55
fileList = []
i = 0

for filename in filenameList:
    basename = os.path.basename(filename)
    reader = open(filename)
    xmldict = xmltodict.parse(reader.read())
    reader.close()
    if xmldict['document']['header']['resultId'] == 'R0000':
        try:
            seqid = xmldict['document']['header']['sequenceId']
            filedate = xmldict['document']['header']['timestamp'][0:8]
            article = xmldict['document']['articles']['article']
            titledata = xmldict['document']['basicinfo']['title']
            postdata = article['topicinfo']['postdata']
            dictCutTitle = jiebaToList(jieba.tokenize(titledata,mode='default'))
            dictCutData = jiebaToList(jieba.tokenize(postdata,mode='default'))
            posttime = xmldict['document']['articles']['article']['topicinfo']['posttime']
            tmpDir = XML_DEST_DIR + filedate
            cutResult = {'result':'1','posttime':posttime,'cuttitle':dictCutTitle,'cutdata':dictCutData}
            dictConceptStat = {}
            listConceptWord = []
            for word in dictCutTitle:
                if dictConceptStat.has_key(word['word']):
                    dictConceptStat[word['word']] = dictConceptStat[word['word']] + 2
                else:
                    if dictConceptList.has_key(word['word']):
                        dictConceptStat[word['word']] = 2

            for word in dictCutData:
                if dictConceptStat.has_key(word['word']):
#remove whitespace, convert to lowercase and drop non-alphabetic characters
comments = []
for comment in array:
    lower = comment.lower()
    comments.append(''.join([i for i in lower if i.isalpha()]))

#retrieve mentioned Chinese terms associated with each brand from all texts
mk = ['michaelkors', 'mk']
ks = ['katespade', 'ks']
mk_dict = {}
ks_dict = {}
#tokenize text in each weibo post
for sentence in comments:
    #create a counter object to count the occurrence of each term in texts
    c = Counter()
    result = jieba.tokenize(sentence)
    #create a list to store tokenized terms and their frequencies
    word = []
    for tk in result:
        #print "word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2])
        word.append(tk[0])
    #update the counter object with new terms
    c.update(word)

    #find the number of co-occurrences with mk or ks for every token in all texts
    for key in c.keys():
        for word, count in c.most_common(10):
            if any(brand in key for brand in mk):
                mk_dict[word] =  count
            elif any(brand in key for brand in ks):
                ks_dict[word] = count
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent,mode=g_mode,HMM=False)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
Example #58
#jieba.enable_parallel(4) 

seg_list = jieba.cut("我来到北京清华大学",cut_all=True)
print "Full Mode:", "/ ".join(seg_list) #全模式

seg_list = jieba.cut("我来到北京清华大学",cut_all=False)
print "Default Mode:", "/ ".join(seg_list) #精确模式

seg_list = jieba.cut("他来到了网易杭研大厦") #默认是精确模式
print ", ".join(seg_list)

seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #search-engine mode
print ", ".join(seg_list)

seg_list = jieba.cut_for_search("李小福是创新办主任也是云计算方面的专家")
print ", ".join(seg_list)

# load a custom dictionary file
jieba.load_userdict(r"G:\GitHub\MyRepository\Python\TestFiles\user_dict.txt")
seg_list = jieba.cut_for_search("李小福是创新办主任也是云计算方面的专家")
print ", ".join(seg_list)

import jieba.posseg as pseg
words =pseg.cut("我爱北京天安门")
for w in words:
    print w.word,w.flag

result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result:
    print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
Example #59
 def tokenize(self, sentence, mode='default', HMM=True):
     sentence = to_text(sentence)
     tokens = jieba.tokenize(sentence, mode=mode, HMM=HMM)
     return list(tokens)
Example #60
def tokenize(l):
    words=[]
    for s in l:
        words.append(jieba.tokenize(s))
    return words