Example #1
        def worker(item):
            size = 0
            ids, masked_ids = [], []
            index = item[1]
            item = item[0]

            # original index of the first word in the encoded segment
            min_index = item.offsets[1][0]
            # original index of the last word in the encoded segment
            max_index = max(item.offsets)[1]
            words = list(jieba.tokenize(data[index][min_index:max_index]))
            arr = np.array(item.ids, dtype=np.int32)
            if (np.count_nonzero(arr) > 10):
                masked_id = mask_ids(item, words)
                if masked_id is not None:
                    ids.append(arr)
                    masked_ids.append(np.array(masked_id, dtype=np.int32))
                size += 1

            for overflowing in item.overflowing:
                min_index = overflowing.offsets[1][0]
                max_index = max(overflowing.offsets)[1]
                words = list(jieba.tokenize(data[index][min_index:max_index]))
                arr = np.array(overflowing.ids, dtype=np.int32)
                if (np.count_nonzero(arr) > 10):
                    masked_id = mask_ids(overflowing, words)
                    if masked_id is not None:
                        ids.append(arr)
                        masked_ids.append(np.array(masked_id, dtype=np.int32))
                    size += 1

            return ids, masked_ids, size
Example #2
def word_process(text):
    """

    :param text:
    :return:
    """
    result = []
    print(jieba.tokenize(text))
    for (word, start, end) in jieba.tokenize(text):
        pseg_data = [(w, f) for (w, f) in pseg.cut(word)]
        result.append((pseg_data, start, end))

    # result = word_process('我明天去吃饭')
    print(result)
    raw_entities = []
    for (item_posseg, start, end) in result:
        part_of_speech = ["nr", "ns", "nt", "t"]
        for (word_posseg, flag_posseg) in item_posseg:
            print(word_posseg)
            print(flag_posseg)
            if flag_posseg in part_of_speech:
                raw_entities.append({
                    'start': start,
                    'end': end,
                    'value': word_posseg,
                    'entity': flag_posseg
                })
    print(raw_entities)
Example #3
def getTest_feature(test_data):
    paragraphs = []
    questions = []
    test_ids = []
    # temporary storage for one data row
    tmp_x_row = []

    #get data position
    subjects = test_data['data']

    for subject in subjects:
        # subject contains title and *paragraphs*

        for paragraph in subject['paragraphs']:
            # paragraphs contains *context* and *qas*
            context = list(
                jieba.tokenize(paragraph['context'].replace("\n", "")))

            for qa in paragraph['qas']:

                ######################################
                paragraphs.append(context)
                questions.append(list(jieba.tokenize(qa['question'])))
                #######################################
                test_ids.append(qa['id'])  # record the question id

                # check that every question has a unique answer
    return paragraphs, questions, test_ids
def analyse_fenci():
    # 2.4 Segmentation analysis: beyond plain segmentation we often need further analysis of the text,
    #     such as returning the position of each word or extracting keywords.
    # 2.4.1 Return the position of each word
    import jieba.analyse
    print("1.采取精准模式结果:")
    # print([item for item in jieba.tokenize(u"数据分析与数据挖掘的应用")])
    for item in jieba.tokenize(u"数据分析与数据挖掘的应用"):
        print item[0], item[1], item[2]
    print("-------------------")
    print("2.采取搜索模式结果:")
    # print([item for item in jieba.tokenize("数据分析与数据挖掘的应用", mode="search")])
    for item in jieba.tokenize(u"数据分析与数据挖掘的应用", mode="search"):
        print item[0], item[1], item[2]

    # 2.4.2 Extract keywords from the text
    print '提取文本中的关键词:'  # ranked by combining in-text term frequency with dictionary frequency
    import jieba.analyse
    # print(jieba.analyse.extract_tags("我喜欢广州小蛮腰", 3))
    # print(jieba.analyse.extract_tags("我喜欢广州广州小蛮腰", 3))
    # print(jieba.analyse.extract_tags("我喜欢广州广州广州小蛮腰", 3))
    for item in jieba.analyse.extract_tags("我喜欢广州小蛮腰", 3):
        print item + ' ',
    print ''
    for item in jieba.analyse.extract_tags("我喜欢广州广州小蛮腰", 3):
        print item + ' ',
    print ''
    for item in jieba.analyse.extract_tags("我喜欢广州广州广州小蛮腰", 3):
        print item + ' ',
    print ''
Example #5
def fun5():
    print("默认的tokenize")
    result = jieba.tokenize(u"自然语言处理非常有用")
    for tk in result:
        print('%s\t\t start: %d \t\t end:%d' % (tk[0], tk[1], tk[2]))

    print("\n------------分割线-------------\n")
    print("搜索模式的tokenize")
    result = jieba.tokenize(u"自然语言处理非常有用", mode='search')
    for tk in result:
        print('%s\t\t start: %d \t\t end:%d' % (tk[0], tk[1], tk[2]))
Example #6
 def _get_same_words_with_cut(self, source: str, target: str):
     """
     使用结巴分词来抽取相同词
     """
     res_words: [Word] = []
     source_cut = [Word(*word) for word in jieba.tokenize(source)]
     target_cut = [Word(*word) for word in jieba.tokenize(target)]
     for word in source_cut:
         if word in target_cut and word.text not in STOPWORDS and len(word.text) >= self.least_word_len:
             res_words.append(word)
     return res_words
    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        import jieba

        text = message.get(attribute)
        # lowercase the text unless the tokenizer is configured to be case sensitive
        if not self.component_config.get("case_sensitive", True):
            text = text.lower()
        tokenized = jieba.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]

        return self._apply_token_pattern(tokens)
Example #8
def tokenize():
    """
    分词
    :return:
    """
    s = "周大福是创新办主任也是云计算方面的专家"
    result = jieba.tokenize(s)
    logger.info("普通模式")
    for tk in result: logger.info("word: {0} \t\t start: {1} \t\t end: {2}".format(tk[0], tk[1], tk[2]))

    logger.info("\n搜索模式")
    result = jieba.tokenize(s, mode='search')
    for tk in result: logger.info("word: {0} \t\t start: {1} \t\t end: {2}".format(tk[0], tk[1], tk[2]))
Example #9
def jieba_fenci_for_crawl_doc(doc):
    """结巴分词
    """
    for lib in JIEBA_CUSTOM_LIBS:
        prodict = os.path.join(settings.STATICFILES_DIRS[0], 'jiebadic', lib[0])
        try:
            jieba.load_userdict(prodict)
        except IOError:
            continue

    rs = ['\xa0',
          '一、',
          '二、',
          '三、',
          '四、',
          '五、',
          '六、',
          '七、',
          '八、',
          '九、',
          '十、']
    for r in rs:
        doc = doc.replace(r, '')
    regex = re.compile(r'[\n\r\t,.:\-";()。、:,的<>》《()]')  # strip newlines, carriage returns, tabs and Chinese punctuation
    t = regex.sub("", doc)
    fenci_data = jieba.tokenize(t)  # jieba segmentation
    return fenci_data
def getChList(docStrByte):
    ## takes the raw bytes of a document and returns the result of Chinese word segmentation

    inputStr = str(docStrByte, encoding = 'gbk', errors = 'ignore').lower()  # decode the bytes and lowercase English letters
    strList = ''.join(inputStr.split('\n'))  # drop newlines and join the lines into a single paragraph
    rawTokens = list(jieba.tokenize(strList))  # Chinese word segmentation

    # stopWord is a dict: every key is a stopword and every value is None
    fSW = open('stopwords.txt', 'r', encoding = 'utf-8', errors = 'ignore').read()
    stopWord = {}.fromkeys(fSW.split('\n'))
    stopWord[''] = None

    final = []
    s = nltk.stem.SnowballStemmer('english')
    for seg in rawTokens:
        # print(seg[0].strip())
        rawWord = seg[0].strip()  # strip leading and trailing whitespace
        if (rawWord.isalpha()):  # if the token is an English word, reduce it to its stem
            word = s.stem(rawWord)
        else:
            word = rawWord

        if word not in stopWord:  # drop stopwords
            final.append(word)  # collect the kept tokens into a list
    return final
Example #11
def high_freq_words():
    sentence = '我喜欢苏州的苏州中心,上海,上海的东方明珠'
    words = jieba.analyse.extract_tags(sentence, topK=3)
    print(f'top 3 的词语 {words}')
    # return the position of each word
    words_loc = jieba.tokenize(sentence)
    print(f'各个词语的位置{list(words_loc)}')
Example #12
    def tokenize(self, text):
        # type: (Text) -> List[Token]
        import jieba

        tokenized = jieba.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]
        return tokens
Example #13
    def __call__(self, text, **kargs):
        token = Token()

        seen = set()
        words_list = []  # (word, start_pos, stop_pos) of each distinct word, in order of first appearance

        for (w, start_pos, stop_pos) in jieba.tokenize(text, mode='search'):
            w = w.strip()
            if not w:
                continue
            if w in seen:
                continue
            if w in punct:
                continue
            seen.add(w)
            words_list.append((w, start_pos, stop_pos))

        # emit one whoosh token per distinct word, carrying its own offsets
        for (w, start_pos, stop_pos) in words_list:
            if not accepted_chars.match(w):
                if len(w) <= 1:
                    continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token
Example #14
def entity_rec(request):
    req = request.body
    print(req)
    entity_d = {
        'person': [],
        'fund': [],
        'company': [],
        'industry': [],
        'stock': []
    }
    index_l = [0 for i in range(len(news))]
    result = jieba.tokenize(news)
    start = time.time()
    for k in result:
        if k[0] in person_list:
            entity_d['person'] = entity_d['person'] + [k[0]]
            index_l[k[1]:k[2]] = [1 for k in range(k[2] - k[1])]
        if k[0] in fund_list:
            entity_d['fund'] = entity_d['fund'] + [k[0]]
            index_l[k[1]:k[2]] = [2 for k in range(k[2] - k[1])]
        if k[0] in company_list:
            entity_d['company'] = entity_d['company'] + [k[0]]
            index_l[k[1]:k[2]] = [3 for k in range(k[2] - k[1])]
        if k[0] in industry_list:
            entity_d['industry'] = entity_d['industry'] + [k[0]]
            index_l[k[1]:k[2]] = [4 for k in range(k[2] - k[1])]
        if k[0] in stock_list:
            entity_d['stock'] = entity_d['stock'] + [k[0]]
            index_l[k[1]:k[2]] = [5 for k in range(k[2] - k[1])]
    print("--- %s seconds ---" % (time.time() - start))
    print(json.dumps({'entity_d': entity_d, 'index_l': index_l}))
    return json.dumps({'entity_d': entity_d, 'index_l': index_l})
 def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
             document_id: str, user_id: str):
     result = jieba.tokenize(cas.sofa_string)
     for tk in result:
         prediction = self.create_prediction(cas, layer, feature, tk[1],
                                             tk[2], tk[0])
         cas.add_annotation(prediction)
def summarize(text, cut_search, window=100):
    content = get_content(doc.get('path'))
    tokres = jieba.tokenize(content, mode='search')
    search_words = {}
    for i in range(len(cut_search)):
        search_words[cut_search[i]] = i
    kaps = []
    for x in tokres:
        if x[0] in search_words.keys():
            kaps.append((x[1], x[2], search_words[x[0]]))
    kaps.sort(key=(lambda x: x[0]))
    nextitem = 0
    maxv, s, e = 0, 0, 0
    for i in range(len(kaps)):
        end = kaps[i][0] + window
        while nextitem < len(kaps) and kaps[nextitem][1] <= end:
            nextitem += 1
        exc, rni = nextitem - i, nextitem
        while rni < len(kaps) and kaps[rni][0] < end:
            if kaps[rni][1] <= end:
                exc += 1
            rni += 1
        if exc > maxv:
            maxv, s, e = exc, i, rni
    lens = kaps[s][0]
    kaps = kaps[s:e]
    maxk = max((x[1] for x in kaps if x[1] <= lens + window))
    lens -= (lens + window - maxk) / 2
    if lens + window > len(content):
        lens = len(content) - window
    if lens < 0:
        lens = 0
    return maxv, lens, len(content), content[lens:lens + window], kaps
Example #17
def _load_cedict(filename, hsk=None):
    cedict = defaultdict(list)
    with open(filename, 'r', encoding="utf-8") as f:
        for line in f:
            if line.startswith('#'):
                continue
            tr, sm, py, transl = re.match(r"(\S*) (\S*) \[(.*)\] \/(.*)\/",
                                          line).groups()
            transl = transl.split('/')
            transl = '/'.join([
                t for t in transl if not t.startswith('see also ')
                and not t.startswith('variant of')
            ])
            cedict[sm].append((tr, py, transl))

    # Find compounds with jieba
    num = 0
    compound_parts = {}
    for sm, entries in cedict.items():
        # search mode will produce compounds and their parts
        tokens = list(jieba.tokenize(sm, mode='search'))
        parts = [t for t in tokens if t[2] - t[1] < len(sm)]
        compound_parts[sm] = parts

    # Join multiple sound characters (多音字)
    cedict = {
        sm: (sm, entries, compound_parts[sm])
        for sm, entries in cedict.items()
    }
    return cedict
def build_word_level_vocabulary_all(train_file,
                                    valid_file=None,
                                    test_file=None):
    sentences = list()

    with codecs.open(train_file, encoding='utf-8') as f_train:
        for line in f_train:
            x = json.loads(line)
            sentences.extend([x['A'].strip(), x['B'].strip(), x['C'].strip()])
    if valid_file:
        with codecs.open(valid_file, encoding='utf-8') as f_valid:
            for line in f_valid:
                x = json.loads(line)
                sentences.extend(
                    [x['A'].strip(), x['B'].strip(), x['C'].strip()])
    if test_file:
        with codecs.open(test_file, encoding='utf-8') as f_test:
            for line in f_test:
                x = json.loads(line)
                sentences.extend(
                    [x['A'].strip(), x['B'].strip(), x['C'].strip()])
    corpus = u''.join(sentences)
    word_list = list(set([tk[0] for tk in jieba.tokenize(corpus)]))

    return dict((word, idx + 1) for idx, word in enumerate(word_list))
Example #19
def tokenize(sentence, mode='default'):
    """
    Segment a sentence and return the position of each word
    :param sentence:
    :param mode: 'default' or 'search'
    :return: list of (word, start_index, end_index)
    """
    return list(jieba.tokenize(sentence, mode=mode))
Example #20
    def tokenize(self, text: Text, attribute=MESSAGE_TEXT_ATTRIBUTE) -> List[Token]:
        import jieba

        text = self.preprocess_text(text, attribute)
        tokenized = jieba.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]
        return tokens
Example #21
def tokenize(sentence):
    """
    Segment a sentence in search mode and return the position of each word
    :param sentence:
    :return: list of (word, start_index, end_index)
    """
    return list(jieba.tokenize(sentence, mode='search'))
Example #22
 def split(self, input_s):
     self.s = input_s
     self.token = jieba.tokenize(self.s)
     num_en = 0
     num_zh = 0
     for t in self.token:
         if not t[0].isspace():
             if t[0] in ',,"\'‘’“”#@%<>《》{}【】[]。,!!??':
                 self.symbol.append(t)
             else:
                 lang = langid.classify(t[0])[0]
                 if lang == "en":
                     self.english.append(t)
                     num_en += 1
                 elif lang == "zh":
                     self.chinese.append(t)
                     num_zh += 1
                 else:
                     self.other.append(t)
     if num_en == 1 and num_zh == 1:
         code_mix = 1
     if num_en == 0 and num_zh == 0:
         self.note = "other"
     elif num_en > num_zh:
         self.note = "en"
         self.translate_en_zh()
     else:
         self.note = "zh"
         self.translate_zh_en()
Example #23
def correct(ss):
    '''
    Correct sentence ss
    '''
    # jieba.tokenize returns tuples of (word, start, end)
    tokens = list(jieba.tokenize(ss))
    print('Segmented sentence is {}'.format(''.join(
        [str(token) for token in tokens])))

    segranges = [[token[1], token[2]] for token in tokens]
    _, _, outranges = score_sentence(ss)
    if outranges:
        cranges = merge_ranges(get_overlap_ranges(outranges, segranges))
        for crange in cranges:
            print('Correct range is {}'.format(crange))
            st, en = crange
            print('Possible wrong segment is {}'.format(ss[st:en]))
            pwrong = ss[st:en]
            # seg_list = jieba.cut(pwrong)
            # error_string = ", ".join(seg_list)
            # errors = error_string.split(", ")
            # cgram = ""
            # for error in errors:
            cgram = auto_correct(pwrong, cn_dict, word_freq)
            ss = ss[:st] + cgram + ss[en:]
            print('Corrected pinyin is {}'.format(cgram))

            cgram2 = correct_ngram_2(ss, st, en)
            print('Corrected ngram is {}'.format(cgram2))
            ss = ss[:st] + cgram2 + ss[en:]
    else:
        cranges = []
        print('No segment to correct.')
    return ss, cranges
Example #24
def cuttest(test_sent):
    global g_mode
    for n in re_num.finditer(test_sent):
        print(n.start(), n.end(), n.group())
    result = jieba.tokenize(test_sent, mode=g_mode)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
def get_words_jd(input_file):
    max_length_words = 100
    length = 0
    jd_str =""
    jd_position = {}
    jd_words =[]
    with open(input_file) as f:
        lines = f.readlines()
        for line in lines:
            if not line:continue
            line = line.strip()
            line = re.sub(r"\s+","",line)
            line = line.decode('utf-8')
            for word in line:
                jd_str += word.encode('utf-8')
                length += 1
                if length >= max_length_words:break

    result = jieba.tokenize(jd_str.decode('utf-8'))
    for tk in result:
        if tk[0].encode('utf-8') not in stop_words and tk[0].encode('utf-8') not in stop:
            jd_words.append(tk[0].encode('utf-8'))
            if not jd_position.has_key(tk[0].encode('utf-8')):
               jd_position[tk[0].encode('utf_8')] = {"start_pos" : tk[1], "end_pos" : tk[2]}
    return jd_str, jd_words, jd_position
Example #26
def test_tokenize():
    """
    测试token。
    :return:
    """
    # 生成token串
    result = jieba.tokenize("永和服装饰品有限公司")
    for tk in result:
        common_logger.info("word {0}\t\t start:{1}\t\t end:{2}".format(
            tk[0], tk[1], tk[2]))

    # token stream in search mode
    result = jieba.tokenize("永和服装饰品有限公司", mode="search")
    for tk in result:
        common_logger.info("word {0}\t\t start:{1}\t\t end:{2}".format(
            tk[0], tk[1], tk[2]))
Example #27
 def jieba_split(
         self, i: int,
         normalized_string: NormalizedString) -> List[NormalizedString]:
     splits = []
     for token, start, stop in jieba.tokenize(str(normalized_string)):
         splits.append(normalized_string[start:stop])
     return splits
Example #28
def test_tfidf():
    lines = open('D:\\Python\\Data\\NBA.txt', encoding='utf-8').read()
    print(type(lines))
    # keyword extraction based on TF-IDF
    words = analyse.extract_tags(lines, topK=20, withWeight=True, allowPOS=())
    print(words)

    # keyword extraction based on TextRank
    words = analyse.textrank(lines,
                             topK=20,
                             withWeight=False,
                             allowPOS=('ns', 'n', 'vn', 'v'))
    print(words)
    words = analyse.textrank(lines,
                             topK=20,
                             withWeight=False,
                             allowPOS=('ns', 'n'))
    print(words)

    # part-of-speech tagging
    words = pseg.cut('我爱自然语言处理')
    # print(list(words))
    for word, flag in words:
        print(word, flag)

    # tokenize: return each word's start and end position in the original text
    result = jieba.tokenize('我爱自然语言处理')
    print(list(result))
Example #29
def ann_rebuild(filename):
    jieba.load_userdict("jiebadic.txt")
    rf = codecs.open(filename, encoding='utf-8')
    annotation = {}
    for line in rf:
        if line.startswith("T"):
            word = line.strip().split('\t')[-1]
            type_offset = line.strip().split('\t')[1].split(' ')
            type = type_offset[0]
            start = int(type_offset[1])
            final_end = int(type_offset[-1])
            result = jieba.tokenize(word)
            for i, tk in enumerate(result):
                end = start + tk[2] - tk[1]
                if i == 0:
                    type0 = "B-" + type
                    annotation[(start, end)] = type0
                elif end == final_end:
                    type1 = "E-" + type
                    annotation[(start, end)] = type1
                else:
                    type2 = "I-" + type
                    annotation[(start, end)] = type2
                start = end
    rf.close()
    return annotation
Example #30
def tokenizer(filename):
    word_counter = collections.defaultdict(int)
    with open(filename) as f:
        for line in f:
            for word in jieba.tokenize(line.decode('utf-8')):
                word_counter[word[0]] += 1
    return word_counter
def feature_embeding(comment):
    size = 15
    par = 1
    data = pd.read_excel('lstm_data/feature_word.xlsx', index=None)
    definite_words = list(data['肯定词'])
    positive_words = list(data['正向'])
    negative_words = list(data['负向'])
    imagine_words = list(data['假想词'])
    deny_words = list(data['否定词'])
    inter_words = list(data['疑问词'])
    assume_words = list(data['假定词'])
    feature_embed = np.zeros((len(comment), maxlen_context, 1 * size))
    for i, t in enumerate(comment):
        token = jieba.tokenize(t[:maxlen_context])
        for tk in token:
            if tk[0] in deny_words:
                feature_embed[i, tk[1]:tk[2], 0:size] = par
            if tk[0] in inter_words:
                feature_embed[i, tk[1]:tk[2], size:2 * size] = par
            if tk[0] in assume_words:
                feature_embed[i, tk[1]:tk[2], 2 * size:3 * size] = par
            if tk[0] in definite_words:
                feature_embed[i, tk[1]:tk[2], 3 * size:4 * size] = par
            if tk[0] in positive_words:
                feature_embed[i, tk[1]:tk[2], 4 * size:5 * size] = par
            if tk[0] in negative_words:
                feature_embed[i, tk[1]:tk[2], 5 * size:6 * size] = par
            if tk[0] in imagine_words:
                feature_embed[i, tk[1]:tk[2], 6 * size:7 * size] = par
    return feature_embed
Example #32
def getChList(docStrByte):
    inputStr = str(docStrByte, encoding='gbk', errors='ignore')
    ## skip the leading header lines (those starting with 【)
    strList = list(i for i in inputStr.split('\n'))
    # print(strList)
    startLine = 0
    for i in range(len(strList)):
        if (strList[i].startswith('【')):
            startLine += 1
        else:
            break
    # print(strList[startLine:])

    rawTokens = list(jieba.tokenize(''.join(strList[startLine:])))
    # stopWord = {}.fromkeys([line for line in open('stopwords.txt','r',encoding = 'gbk', errors = 'ignore')])
    fSW = open('stopwords.txt', 'r', encoding='utf-8', errors='ignore').read()
    # print(fSW.split('\n')[:99])
    stopWord = {}.fromkeys(fSW.split('\n'))

    # print(stopWord)
    stopWord[''] = None
    # for (k,v) in stopWord.items():
    #     print(k, ',', v)
    final = ''
    for seg in rawTokens:
        # print(seg)
        # seg.encode('gbk')
        word = seg[0].strip()
        if word not in stopWord:
            final += (' ' + word)  # join the kept words with spaces
        # else:
        # print(seg)
    # print(type(final))
    return final
def tokenize(sentence, addwords=None):
    if addwords is not None:
        for word in addwords:
            jieba.add_word(word)
    tokens = []
    for term in jieba.tokenize(sentence):
        tokens.append(term[0])
    return tokens
Example #34
 def testTokenize_NOHMM(self):
     for content in test_contents:
         result = jieba.tokenize(content,HMM=False)
         assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
         result = list(result)
         assert isinstance(result, list), "Test Tokenize error on content: %s" % content
         for tk in result:
             print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
     print("testTokenize_NOHMM", file=sys.stderr)
Example #35
def segment(raw_text):

    tokens = jieba.tokenize(raw_text)
    seg_list = [w for (w, start_pos, stop_pos) in tokens if token_condition(w)]

    seg_freq_counter = Counter(seg_list)
    seg_freq = dict(seg_freq_counter)

    return json.dumps(seg_freq)
Example #36
 def testTokenize(self):
     for content in test_contents:
         result = jieba.tokenize(content.decode('utf-8'))
         assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
         result = list(result)
         assert isinstance(result, list), "Test Tokenize error on content: %s" % content
         for tk in result:
             print >>sys.stderr, "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
     print  >> sys.stderr, "testTokenize"
Example #37
def tokenize(sentence, mode='default'):
    """
    Segment a sentence and return the position of each word
    :param sentence:
    :param mode: 'default' or 'search'
    :return: list of (word, start_index, end_index)
    """
    import logging
    jieba.default_logger.setLevel(logging.ERROR)
    return list(jieba.tokenize(sentence, mode=mode))
Example #38
    def tokenize(self, text):
        # type: (Text) -> List[Token]
        import jieba

        if self.dictionary_path is not None:
            self.load_custom_dictionary(self.dictionary_path)

        tokenized = jieba.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]
        return tokens
Example #39
def how_to_use():
    """待分词的字符串可以是 unicode 或 UTF-8 字符串、GBK 字符串。
    注意:不建议直接输入 GBK 字符串,可能无法预料地错误解码成 UTF-8

jieba.cut 以及 jieba.cut_for_search 返回的结构都是一个可迭代的 generator,可以使用 for 循环来获得分词后得到的每一个词语(unicode),
或者用jieba.lcut 以及 jieba.lcut_for_search 直接返回 list"""
    dict_path = 'user_dict/user_dict.txt'

    seg_list = jieba.cut("我换不行北京清华大学", cut_all=False)
    print("Default Mode: " + "/ ".join(seg_list))  # 精确模式

    jieba.load_userdict(dict_path)

    seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
    print("Full Mode: " + "/ ".join(seg_list))  # 全模式

    seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
    print("Default Mode: " + "/ ".join(seg_list))  # 精确模式

    seg_list = jieba.lcut("他来到了网易杭研大厦")  # accurate mode is the default
    print(", ".join(seg_list))
    print(type(seg_list))
    print(seg_list)

    seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")  # search-engine mode
    print(", ".join(seg_list))

    seg_list = jieba.cut("我换不行北京清华大学", cut_all=False)
    print("Default Mode: " + "/ ".join(seg_list))  # 精确模式

    words = pseg.cut("我爱北京天安门")
    print(words)
    for word, flag in words:
        print('%s %s' % (word, flag))

    print('分词:默认模式')
    result = jieba.tokenize(u'永和服装饰品有限公司')
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
    print('分词:搜索模式')
    result = jieba.tokenize(u'永和服装饰品有限公司', mode='search')
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
Example #40
def test5():
    # default mode
    result = jieba.tokenize(u'永和服装饰品有限公司')
    for tk in result:
        print('word %s\t\t start:%d \t\t end:%d' %(tk[0], tk[1], tk[2]))
    print
    print
    # search mode
    result = jieba.tokenize(u'永和服装饰品有限公司', mode='search')
    for tk in result:
        print('word %s\t\t start:%d \t\t end:%d' %(tk[0], tk[1], tk[2]))

#Feature 7: ChineseAnalyzer for the whoosh search engine
# https://github.com/fxsjy/jieba/blob/master/test/test_whoosh.py

# Other dictionaries
# A smaller dictionary file with a lower memory footprint: https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.small
# A dictionary file with better support for traditional Chinese: https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big
# jieba.set_dictionary('data/dict.txt.big')
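A minimal sketch of the whoosh integration referenced above, assuming the whoosh package is installed (the schema and field names are illustrative only):

from whoosh.fields import Schema, TEXT, ID
from jieba.analyse import ChineseAnalyzer

analyzer = ChineseAnalyzer()                      # jieba-backed analyzer shipped with jieba
schema = Schema(title=ID(stored=True),
                content=TEXT(stored=True, analyzer=analyzer))
# documents indexed with this schema are segmented by jieba in search mode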
def main(argv):
    rawTextInput = 'rawText.txt'
    argc = len(argv)
    for i in xrange(argc):
        if argv[i] == "-i" and i + 1 < argc:
            rawTextInput = argv[i + 1]
        elif argv[i] == "-o" and i + 1 < argc:
            tokenizedFile = argv[i + 1]
        elif argv[i] == "-map" and i + 1 < argc:
            mappingFile = argv[i + 1]
        elif argv[i] == "-offset" and i + 1 < argc:
            offsetFile = argv[i + 1]
    with codecs.open(mappingFile, encoding='utf-8', mode='r') as input:
        for line in input:
            elements = line.strip().split(',')
            mapping[elements[1]] = elements[0]  # each mapping line is assumed to be "<token>,<word>"; index by the word

    outputA = codecs.open(offsetFile, encoding='utf-8', mode='w')
    outputB = codecs.open(tokenizedFile, encoding='utf-8', mode='w')
    for line in codecs.open(rawTextInput, encoding='utf-8', mode='r'):
        result = jieba.tokenize(line.strip())
        offsets = []
        newline = []
        for tk, begin, end in result:
            if tk == ' ':
                newline.append(' ')
                continue
            if tk in punctuations:
                newline.append(tk)
                continue
            tk = ''.join([i for i in tk if not i.isdigit()]).lower()
            if len(tk) == 0:
                newline.append(' ')
                continue
            if tk not in mapping:
                newline.append('zzzzzzzzzzz')
            else:
                newline.append(mapping[tk])
                offsets.append((tk, begin, end))
            newline.append(' ')
        outputA.write(u''.join(newline))
        outputA.write('\n')

        for (string, begin, end) in offsets:
            outputB.write(string)
            outputB.write(',')
            outputB.write(str(begin))
            outputB.write(',')
            outputB.write(str(end))
            outputB.write('\t')
        outputB.write('\n')
Example #42
 def __call__(self, text, **kargs):
     words = jieba.tokenize(text, mode="search")
     token = Token()
     for (w,start_pos,stop_pos) in words:
         if not accepted_chars.match(w) and len(w)<=1:
             continue
         token.original = token.text = w
         token.pos = start_pos
         token.startchar = start_pos
         token.endchar = stop_pos
         yield token
Example #43
def segment_text_desc(word):
    """
    Tokenize a text description
    :param word:
    :return:
    """
    key_words = list(jieba.tokenize(word))  # materialise the generator so it can be both printed and returned

    for item in key_words:
        print item
    return key_words
def dealOnePage(control_obj,dataObj):
    title = dataObj[1].lower()
    cur_page_dic = PageDic()
        
    seg_list = jieba.tokenize(title,mode = "search")  #get segment list

    for tk in seg_list:
        ############################################### deal for a word in a page
        control_obj.addWordIdDic(tk[0]) #add to word to wrodId dictionary
        cur_page_dic.addPageItem(tk[0],tk[1],tk[2])
        
    #add current word-hits dictionary to an object
    control_obj.addPageDic(dataObj[0],cur_page_dic.page_dic)   
 def segment_hanzi(txt):
     """
     Tokenizes Chinese text
     
     Args:
         txt -- Chinese text with Chinese characters in it (unicode)
         
     Returns:
         list of unicode, in which each element is a token of txt
     """
     tokens = jieba.tokenize(txt)
     tokens_hanzi = [tkn[0] for tkn in tokens]
     return tokens_hanzi
Example #46
def tokenize(name, stopwords):
    # this function tokenizes Chinese sentences and removes the stopwords
    try:
        original_tokens = jieba.tokenize(name)
    except ValueError:
        print(name, 'is not unicode')
        return
    tokens = []
    for term in original_tokens:
        if term[0] not in stopwords:
            tokens.append(term[0])
    return tokens
Example #47
    def mapper_getterm(self,key,comment_list):
        for comment in comment_list:
            try:
                status_text = comment['status']['text']

                filtered_status_text = re.sub(r'[^A-Za-z\s]+','',status_text)\
                                       .lower()
                comment_text = comment['text']
                if re.match('michael kors',filtered_status_text):
                    for word in jb.tokenize(unicode(comment_text)):
                            # filter Chinese terms and remove the stopwords
                            cond1 = re.match(ur'[\u4e00-\u9fff]+',word[0])
                            cond2 = word[0] not in stop_words
                            if cond1 and cond2:
                                yield ('michael',word[0]),1
                if re.match('kate spade',filtered_status_text):
                    for word in jb.tokenize(unicode(comment_text)):
                            # filter Chinese terms and remove the stopwords
                            cond1 = re.match(ur'[\u4e00-\u9fff]+',word[0])
                            cond2 = word[0] not in stop_words
                            if cond1 and cond2:
                                yield ('kate',word[0]),1
            except:
                pass
def mixed_lang_word_count(string):
    """
    Returns the word count of a string containing English and Chinese words. The string is split into an English and
    a Chinese substring, and the word counts of the two substrings (from NLTK and jieba respectively) are summed.
    E.g. '你好 Andrew' returns 2, as '你好' is one word and 'Andrew' is another.

    :param string: a string containing english and chinese
    :returns: the word count
    """
    english_only = re.sub(r'\W+', '', string)
    num_eng_words = len(nltk.word_tokenize(english_only))

    non_english_only = re.sub(r'\w+', '', string)
    num_non_eng_words = len(list(jieba.tokenize(non_english_only.decode('utf-8'))))

    return num_eng_words + num_non_eng_words
Example #49
def handle(data):
    oper = json.loads(data)
    if oper[0] == 'cut':
        return json.dumps(tuple(jieba.cut(*oper[1], **oper[2]))).encode('utf-8')
    elif oper[0] == 'cut_for_search':
        return json.dumps(tuple(jieba.cut_for_search(*oper[1], **oper[2]))).encode('utf-8')
    elif oper[0] == 'tokenize':
        return json.dumps(tuple(jieba.tokenize(*oper[1], **oper[2]))).encode('utf-8')
    elif oper[0] == 'add_word':
        jieba.add_word(*oper[1], **oper[2])
    elif oper[0] == 'load_userdict':
        jieba.load_userdict(*oper[1])
    elif oper[0] == 'set_dictionary':
        jieba.set_dictionary(*oper[1])
    elif oper[0] == 'stopserver':
        return b'stop'
    elif oper[0] == 'ping':
        return b'pong'
def question3(weiboPostList, chineseFashionTerms):
    # Tokenize and count terms included in posts about each brand.

    ignoreMoreTerms = True # Set to 'true' to exclude more common chinese terms.

    # Dump commonly-occurring tokens and symbols.
    ignoreTerms = chineseFashionTerms + [" ", "#", "@", ".", "。", "&", "spade", "回复", "【", "的", ",", "/", "]",
                                         "[", "!", ":", ":", "�", "~", "~", "`", "、", "】", "a", "t", "c", "h",
                                         "!", "cn", "http", ",", "哦", "了", "”", "“", ">", "$"]
    if ignoreMoreTerms:
        # Exclude these common Chinese pronouns, particles, and verbs. (and, is, he, she, no, also, etc.)
        ignoreTerms.extend(["你", "我", "他", "她", "它", "都", "有", "是", "和", "在", "没", "不", "也", "日", "就",
                            "你们", "2015", "会", "为"])

    # Exclude other variations on the brand names.
    for term in ["Michael", "Kate", "Kors", "MK"]:
        ignoreTerms.append(term)
        ignoreTerms.append(term.upper())
        ignoreTerms.append(term.lower())

    for brand in ["Michael Kors", "Kate Spade"]:
        tokenFrequencies = {}  # Create a new dict to store the amount of occurrences of each token.
        if brand == "Michael Kors":
            currPostList = filter(lambda x: x.hasKors, weiboPostList)
        else:
            currPostList = filter(lambda x: x.hasSpade, weiboPostList)

        for post in currPostList:
            postText = unicode(post.text)  # encode to unicode (so as to be parseable by jieba)
            postTokens = jieba.tokenize(postText)  # collect tokenization results for that post in postTokens.
            for token in postTokens:
                if token[0] in ignoreTerms:
                    continue  # Drop ignored terms.
                if token[0] in tokenFrequencies: # If it already exists in dict, +1
                    tokenFrequencies[token[0]] += 1
                else: # Else, create an entry
                    tokenFrequencies[token[0]] = 1
        print
        print "Token frequency data for", brand+":"
        for data in sorted(tokenFrequencies.iteritems(), key=lambda tup: tup[1], reverse=True)[:12]:
            # Printed top 12 here to quickly exclude values like "<?>" ("can't display UTF" character)
            print "Word:", data[0], "\t", "Frequency:", data[1]
def main(argv):
    rawTextInput = 'rawText.txt'
    argc = len(argv)
    for i in xrange(argc):
        if argv[i] == "-i" and i + 1 < argc:
          rawTextInput = argv[i + 1]
        elif argv[i] == "-o" and i + 1 < argc:
          tokenizedFile = argv[i + 1]
        elif argv[i] == "-map" and i + 1 < argc:
          mappingFile = argv[i + 1]
    with codecs.open(tokenizedFile, encoding='utf-8', mode='w') as output:
        id = 0
        for line in codecs.open(rawTextInput, encoding='utf-8', mode='r'):
            result = jieba.tokenize(line.strip())
            newline = []
            for tk in result:
                tk = tk[0]
                if tk == ' ':
                    newline.append(' ')
                    continue
                if tk in punctuations:
                    newline.append(tk)
                    continue
                tk = ''.join([i for i in tk if not i.isdigit()]).lower()
                if len(tk) == 0:
                    newline.append(' ')
                    continue
                if tk not in mapping:
                    mapping[tk] = id2alpha(id)
                    id += 1
                newline.append(mapping[tk])
                newline.append(' ')
            output.write(u''.join(newline))
            output.write('\n')
    with codecs.open(mappingFile, encoding='utf-8', mode='w') as output:
        for (string, token) in mapping.iteritems():
            output.write(token)
            output.write(',')
            output.write(string)
            output.write('\n')
Example #52
def handlemsg(data):
    oper = loadsjson(data)
    if oper[0] == 'c2m':
        return dumpsjson(mc.c2m.translate(*oper[1:]))
    elif oper[0] == 'm2c':
        return dumpsjson(mc.m2c.translate(*oper[1:]))
    elif oper[0] == 'c2m.raw':
        return dumpsjson(mc.c2m.rawtranslate(oper[1]))
    elif oper[0] == 'm2c.raw':
        return dumpsjson(mc.m2c.rawtranslate(oper[1]))
    elif oper[0] == 'modelname':
        return dumpsjson(mc.name())
    elif oper[0] == 'cut':
        return dumpsjson(tuple(jieba.cut(*oper[1], **oper[2])))
    elif oper[0] == 'cut_for_search':
        return dumpsjson(tuple(jieba.cut_for_search(*oper[1], **oper[2])))
    elif oper[0] == 'tokenize':
        return dumpsjson(tuple(jieba.tokenize(*oper[1], **oper[2])))
    elif oper[0] == 'jiebazhc.cut':
        return dumpsjson(tuple(jiebazhc.cut(*oper[1], **oper[2])))
    elif oper[0] == 'jiebazhc.cut_for_search':
        return dumpsjson(
            tuple(jiebazhc.cut_for_search(*oper[1], **oper[2])))
    elif oper[0] == 'jiebazhc.tokenize':
        return dumpsjson(tuple(jiebazhc.tokenize(*oper[1], **oper[2])))
    elif oper[0] == 'add_word':
        jieba.add_word(*oper[1], **oper[2])
    elif oper[0] == 'load_userdict':
        jieba.load_userdict(*oper[1])
    elif oper[0] == 'set_dictionary':
        jieba.set_dictionary(*oper[1])
    elif oper[0] == 'stopserver':
        return b'stop'
    elif oper[0] == 'ping':
        return b'pong'
    else:
        return dumpsjson('Command not found')
Example #53
虽说房价居高不下让许多受访者观望,但是打算今年尽快出手的受访者也不少。“最近去售楼处咨询发现年前的优惠减少了不少,按照这趋势,开发商很有可能涨价,再等估计就更买不起。”黄小姐告诉记者,厦门气候宜居、房源供应不足又深受异地购房者的青睐,房价下跌的可能性非常小。加上今年是落户厦门的最后机会,为了赶上落户的“末班车”,还是尽早稳妥。
"""

# jieba.analyse.set_stop_words("stop_words.txt")
#
# seg_res = jieba.cut(raw_text)  # 默认是精确模式
#
# seg_list = list(seg_res)
#
# print(seg_list)
#
# seg_freq_counter = Counter(seg_list)
#
#
# print(seg_freq_counter)

accepted_chars = re.compile(ur"[\u4E00-\u9FA5]+")
words = jieba.tokenize(raw_text)


seg_list = []
for (w, start_pos, stop_pos) in words:
    if not accepted_chars.match(w) and len(w) <= 1:
        continue

    # @todo could use isdigit to check whether the token is a number

    seg_list.append(w)

seg_freq_counter = Counter(seg_list)
print(seg_freq_counter)
Example #54
def cuttest(test_sent):
    global g_mode
    test_sent = test_sent.decode('utf-8')
    result = jieba.tokenize(test_sent,mode=g_mode)
    for tk in result:
        print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
Example #55
fileList = []
i = 0

for filename in filenameList:
    basename = os.path.basename(filename)
    reader = open(filename)
    xmldict = xmltodict.parse(reader.read())
    reader.close()
    if xmldict['document']['header']['resultId'] == 'R0000':
        try:
            seqid = xmldict['document']['header']['sequenceId']
            filedate = xmldict['document']['header']['timestamp'][0:8]
            article = xmldict['document']['articles']['article']
            titledata = xmldict['document']['basicinfo']['title']
            postdata = article['topicinfo']['postdata']
            dictCutTitle = jiebaToList(jieba.tokenize(titledata,mode='default'))
            dictCutData = jiebaToList(jieba.tokenize(postdata,mode='default'))
            posttime = xmldict['document']['articles']['article']['topicinfo']['posttime']
            tmpDir = XML_DEST_DIR + filedate
            cutResult = {'result':'1','posttime':posttime,'cuttitle':dictCutTitle,'cutdata':dictCutData}
            dictConceptStat = {}
            listConceptWord = []
            for word in dictCutTitle:
                if dictConceptStat.has_key(word['word']):
                    dictConceptStat[word['word']] = dictConceptStat[word['word']] + 2
                else:
                    if dictConceptList.has_key(word['word']):
                        dictConceptStat[word['word']] = 2

            for word in dictCutData:
                if dictConceptStat.has_key(word['word']):
#remove whitespace, convert to lowercase and drop non-alphabetic characters
comments = []
for comment in array:
    lower = comment.lower()
    comments.append(''.join([i for i in lower if i.isalpha()]))

#retrieve mentioned Chinese terms associated with each brand from all texts
mk = ['michaelkors', 'mk']
ks = ['katespade', 'ks']
mk_dict = {}
ks_dict = {}
#tokenize text in each weibo post
for sentence in comments:
    #create a counter object to count the occurrence of each term in texts
    c = Counter()
    result = jieba.tokenize(sentence)
    #create a list to store tokenized terms and their frequencies
    word = []
    for tk in result:
        #print "word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2])
        word.append(tk[0])
    #update the counter object with new terms
    c.update(word)

    #find the number of co-occurrences with mk or ks for every token in all texts
    for key in c.keys():
        for word, count in c.most_common(10):
            if any(brand in key for brand in mk):
                mk_dict[word] =  count
            elif any(brand in key for brand in ks):
                ks_dict[word] = count
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent,mode=g_mode,HMM=False)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
Example #58
#jieba.enable_parallel(4) 

seg_list = jieba.cut("我来到北京清华大学",cut_all=True)
print "Full Mode:", "/ ".join(seg_list) #全模式

seg_list = jieba.cut("我来到北京清华大学",cut_all=False)
print "Default Mode:", "/ ".join(seg_list) #精确模式

seg_list = jieba.cut("他来到了网易杭研大厦") #默认是精确模式
print ", ".join(seg_list)

seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #search-engine mode
print ", ".join(seg_list)

seg_list = jieba.cut_for_search("李小福是创新办主任也是云计算方面的专家")
print ", ".join(seg_list)

# load a custom dictionary file
jieba.load_userdict(r"G:\GitHub\MyRepository\Python\TestFiles\user_dict.txt")
seg_list = jieba.cut_for_search("李小福是创新办主任也是云计算方面的专家")
print ", ".join(seg_list)

import jieba.posseg as pseg
words =pseg.cut("我爱北京天安门")
for w in words:
    print w.word,w.flag

result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result:
    print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
Example #59
 def tokenize(self, sentence, mode='default', HMM=True):
     sentence = to_text(sentence)
     tokens = jieba.tokenize(sentence, mode=mode, HMM=HMM)
     return list(tokens)
Example #60
def tokenize(l):
    words=[]
    for s in l:
        words.append(jieba.tokenize(s))
    return words