Example #1
0
 def segment(self, cleaned_sentances: list, use_hmm: bool = False) -> list:
     jieba.enable_parallel(self.num_worker)
     cleaned_sentances = [
         ' '.join(jieba.lcut(i, HMM=use_hmm)) for i in cleaned_sentances
     ]
     jieba.disable_parallel()
     return cleaned_sentances
Example #2
0
def main(data_dir, file_dict, surfix, dry_run_dict):
    encoder_path = '{}/{}_encoder_source.txt'.format(data_dir, surfix)
    decoder_path = '{}/{}_decoder_source.txt'.format(data_dir, surfix)

    source_sentences, target_sentences = merge_blanks(
        os.path.join(data_dir, file_dict['source']),
        os.path.join(data_dir, file_dict['target']))

    print('String Preprocessing')
    source_sentences = str_utils_en.text_cleaning(source_sentences)
    target_sentences = str_utils_ch.text_cleaning(target_sentences)
    print('Double check source={}, target={}'.format(len(source_sentences),
                                                     len(target_sentences)))

    print('Word segmentation')
    jieba.initialize()
    jieba.disable_parallel()
    with ProcessingPool(nodes=min(os.cpu_count(), 5)) as pool:
        source_sentences = pool.map(
            lambda x:
            [i.strip() for i in x.strip().lower().split(' ') if len(i) >= 1],
            source_sentences)
    with ProcessingPool(nodes=min(os.cpu_count(), 5)) as pool:
        target_sentences = pool.map(
            lambda x: [
                i.strip() for i in jieba.cut(x.strip(), cut_all=False)
                if len(i) >= 1
            ], target_sentences)
    print('Triple check source={}, target={}'.format(len(source_sentences),
                                                     len(target_sentences)))

    source_sentences, target_sentences = filter_sample(source_sentences,
                                                       target_sentences)
    print('Triple check source={}, target={}'.format(len(source_sentences),
                                                     len(target_sentences)))
    print(
        'Writing pair into encoder and decoder source at {}'.format(data_dir))
    with open(encoder_path, 'w',
              encoding='utf-8') as fe, open(decoder_path,
                                            'w',
                                            encoding='utf-8') as fd:
        for encoder_source, decoder_source in zip(source_sentences,
                                                  target_sentences):
            fe.write(' '.join(encoder_source).lower())
            fe.write('\n')
            fd.write(' '.join(decoder_source).lower())
            fd.write('\n')

    # better sub tokenizer can be used to generate dictionary
    dump_dictionary(data_dir,
                    source_sentences,
                    prefix='source',
                    debug=True,
                    dry_run=dry_run_dict)
    dump_dictionary(data_dir,
                    target_sentences,
                    prefix='target',
                    debug=True,
                    dry_run=dry_run_dict)
Example #3
0
def cut_word(sentence, parallel=False, processnum=2):
    if parallel:
        # Enable parallel segmentation; the argument is the number of processes (not supported on Windows)
        jieba.enable_parallel(processnum=processnum)
        word_list = jieba.lcut(sentence, cut_all=False, HMM=True)
        # Disable parallel segmentation
        jieba.disable_parallel()
    else:
        word_list = jieba.lcut(sentence, cut_all=False, HMM=True)
    return word_list
Example #4
0
def text_processing(folder_path, test_size=0.2):
    folder_list = os.listdir(folder_path)
    data_list = []
    class_list = []

    # Walk through the folders
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)
        files = os.listdir(new_folder_path)
        # Read the files
        j = 1
        for file in files:
            if j > 100:  # to avoid running out of memory, only take 100 sample files; comment this out to take them all
                break
            with open(os.path.join(new_folder_path, file), 'r') as fp:
                raw = fp.read()
            ## yes, the ubiquitous jieba Chinese word segmentation
            jieba.enable_parallel(4)  # enable parallel mode; the argument is the number of processes (not supported on Windows)
            word_cut = jieba.cut(raw,
                                 cut_all=False)  # accurate mode, returns an iterable generator
            word_list = list(word_cut)  # convert the generator to a list; each word is unicode
            jieba.disable_parallel()  # disable parallel mode

            data_list.append(word_list)  # training set list
            class_list.append(folder.decode('utf-8'))  # class label
            j += 1

    ## Crude split into training and test sets
    data_class_list = zip(data_list, class_list)
    random.shuffle(data_class_list)
    index = int(len(data_class_list) * test_size) + 1
    train_list = data_class_list[index:]
    test_list = data_class_list[:index]
    train_data_list, train_class_list = zip(*train_list)
    test_data_list, test_class_list = zip(*test_list)

    # this could also be done with sklearn's built-in helper:
    #train_data_list, test_data_list, train_class_list, test_class_list = sklearn.cross_validation.train_test_split(data_list, class_list, test_size=test_size)

    # count word frequencies into all_words_dict
    all_words_dict = {}
    for word_list in train_data_list:
        for word in word_list:
            if all_words_dict.has_key(word):
                all_words_dict[word] += 1
            else:
                all_words_dict[word] = 1

    # the key function sorts by word frequency in descending order
    all_words_tuple_list = sorted(all_words_dict.items(),
                                  key=lambda f: f[1],
                                  reverse=True)  # the built-in sorted expects a list
    all_words_list = list(zip(*all_words_tuple_list)[0])

    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list
Example #5
0
    def MakeSentences(self):

        sentences = []
        files = self.getFilePathList()
        jieba.enable_parallel(8)
        for file in files:
            print(file)
            sentences += self.readFile(file)
        jieba.disable_parallel()

        return sentences
Example #6
0
def DT(DT):
    results = session.query.filter(
        or_(session.documenttype == DT, session.region.like('%' + DT + '%'),
            session.court.like('%' + DT + '%'))).all()
    if not results:
        return render_template('worldcloudStatistics.html')
    # Define a string to collect the keywords
    keyword = ""
    for res in results:
        keyword += str(res.keyword)
    # Word segmentation
    fe = '|'.join(jieba.cut(keyword))
    santi_words = [x for x in jieba.cut(fe) if len(x) >= 0]
    jieba.disable_parallel()
    # Extract keywords
    c = Counter(santi_words).most_common(1000)
    keys = ""
    for word in c:
        if word[0].isdigit():
            del word
        else:
            keys += str(word)
    # Build the word cloud
    font = 'app/static/HYQiHei-55J.ttf'  # font path (HeiTi font is used here)
    color_mask = plt.imread(
        "app/static/keyword/china.jpg")  # read the mask/template image
    cloud = WordCloud(font_path=font,
                      background_color='white',
                      mask=color_mask,
                      max_words=200,
                      max_font_size=200,
                      width=3000,
                      height=3000,
                      random_state=42)  # word cloud settings: font, mask, white background, at most 200 words, max font size 200
    # word_cloud = cloud.generate(fe)
    cloud.generate(fe)
    # Generate colors from the color image
    image_colors = ImageColorGenerator(color_mask)
    plt.imshow(cloud)
    # Hide the axes
    plt.axis('off')
    # Draw the word cloud
    plt.figure()
    plt.imshow(cloud.recolor(color_func=image_colors))
    plt.axis('off')
    # Save the image
    word_cloud2 = cloud.generate(str(keys))  # generate the word cloud data
    # wcould="分词词云_"
    wcould2 = "cloud"
    img = wcould2 + ".jpg"
    l = 'app/static/keyword/'
    word_cloud2.to_file(l + '/' + img)
    cloud.to_file(l + '/' + 'cloudword.png')
    return render_template('worldcloudStatistics.html', val1=time.time())
Example #7
0
def get_cut_word_cixing(arg, parallel=False, num=1):
    if not parallel:
        s = arg
        res = psg.cut(s)
        return {x.word:x.flag for x in res}
    else:
        filename = arg
        s = open(filename).read()
        jieba.enable_parallel(parallel)
        res = psg.cut(s)
        jieba.disable_parallel()
        return {x.word:x.flag for x in res}
Example #8
0
 def _tokenize(self, sentence, cut_all, cut_for_search, HMM, enable):
     if enable[0]:
         print('use multiprocessing')
         jieba.enable_parallel(enable[1])
     else:
         jieba.disable_parallel()
     if not cut_for_search:
         sentence_temp = ' '.join(jieba.cut(sentence, cut_all, HMM))
         return sentence_temp
     else:
         sentence_temp = ' '.join(jieba.cut_for_search(sentence, HMM))
         return sentence_temp
Example #9
0
def text_processing(folder_path, test_size=0.2):  # test_size to divide set
    folder_list = os.listdir(folder_path)
    data_list = []
    class_list = []

    # traverse all folders
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)
        files = os.listdir(new_folder_path)
        # read files
        files_number = 1
        for file in files:
            if files_number > 100:  # avoid break memory, only sample 100 files
                break
            with open(os.path.join(new_folder_path, file), 'r') as fp:
                raw = fp.read()
            jieba.enable_parallel(4)  # Parallel processing is 4
            word_cut = jieba.cut(raw, cut_all=False)  # exact mode
            word_list = list(
                word_cut
            )  # generator turn to list, every word's format is unicode
            jieba.disable_parallel()  # close the parallel mode
            data_list.append(word_list)  # train set list
            class_list.append(folder.decode('utf-8'))  # genre
            files_number += 1

    # divide train set and test set (also could use sklearn to divide)
    data_class_list = zip(data_list, class_list)
    random.shuffle(data_class_list)
    index = int(len(data_class_list) * test_size) + 1
    train_list = data_class_list[index:]
    test_list = data_class_list[:index]
    train_data_list, train_class_list = zip(*train_list)
    test_data_list, test_class_list = zip(*test_list)

    # statistic words frequency in all_words_list
    all_words_dict = {}
    for word_list in train_data_list:
        for word in word_list:
            if all_words_dict.has_key(word):
                all_words_dict[word] += 1
            else:
                all_words_dict[word] = 1

    # key function use words frequency by descending order
    # internal function sorted has to be list
    all_words_tuple_list = sorted(all_words_dict.items(),
                                  key=lambda f: f[1],
                                  reverse=True)
    all_words_list = list(zip(*all_words_tuple_list)[0])

    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list
Example #10
0
def DT(DT):
      Court = session.query.with_entities(session.court).distinct().all()
      Region = session.query.with_entities(session.region).distinct().all()
      document_Type = session.query.with_entities(session.document_Type).distinct().all()
      results = session.query.filter(DT==session.document_Type).all()
      if not results:
            return render_template('词云统计.html',court=Court,region=Region,document_Type=document_Type)
      num=random.randint(0,10000)
      H='关键字/'
      char_txt=H+str(num)+".txt"

      for row in results[0].keyword:
            fname1=row[0]
            #print(fname1)    
            m=open(char_txt,'a')
            m.write(str(fname1))
      santi_text = open(char_txt,'rb').read()
      fe='|'.join(jieba.cut(santi_text))
      santi_words = [x for x in jieba.cut(fe) if len(x) >= 2]
      jieba.disable_parallel()
      c = Counter(santi_words).most_common(1000)
      f=open(char_txt,"w")
      word=['']
      for word in c:
            if word[0].isdigit():
                  del word
            else:
                  f.write(str(word))
      f.close()
      

      m=open(char_txt,'r').read()
      # Word cloud generation

      font='HYQiHei-55J.ttf'                  # font path (HeiTi font is used here)
      color_mask = imread("photo.jpg")        # read the mask/template image (a five-pointed-star picture here)
      cloud = WordCloud(font_path=font,background_color='white',mask=color_mask,max_words=100,max_font_size=50,width=5000,height=5000)  # word cloud settings: font, mask, white background, at most 100 words, max font size 50
      #word_cloud = cloud.generate(fe)
      word_cloud2=cloud.generate(str(m))              # generate the word cloud data
      # wcould="分词词云_"
      # cy=wcould+str(num)+".jpg"
      # word_cloud.to_file(cy)                                           # save the word cloud as an image
      #print ("词云成功...")
      wcould2="词云"
      img=wcould2+".jpg"
      l='static/keyword'
      word_cloud2.to_file(l+'/'+img)
      return render_template('词云统计.html',court=Court,region=Region,document_Type=document_Type,val1=time.time())
Example #11
0
    def getOneSong(self, lyric, id):

        try:
            # First set the status to "generating song", effectively locking the row
            self.dbManager.execute(
                "update rap_music163 set status = 2 where status = 1 and id = '"
                + str(id) + "'")

            # jieba word segmentation
            print len(lyric)

            # enable parallel mode
            # jieba.enable_parallel(4)
            # disable parallel mode
            jieba.disable_parallel()

            words = [x for x in jieba.cut(lyric) if len(x) >= 2]
            jieba.disable_parallel()
            from collections import Counter
            count = Counter(words).most_common(20)
            print count

            for vo in count:
                word = vo[0]
                number = vo[1]
                # Increment the score of this member in the sorted set
                self.r.zincrby(self.sortedSetKey, word,
                               number)  # increment the member's score in the sorted set by `number`

            print self.r.zcard(self.sortedSetKey)

            # # Get keywords
            # tags = jieba.analyse.extract_tags(lyric, topK=3)
            # print u"关键词:"
            # print " ".join(tags)

            # For each word, decide in the database whether to insert or update (Redis works better here)

            self.dbManager.execute(
                "update rap_music163 set status = 3 where status = 2 and id = '"
                + str(id) + "'")

        except Exception as err:
            # Print the exception traceback
            exstr = traceback.format_exc()
            print exstr
            c.Log('{} : {}'.format("Error 901", err))
Example #12
0
def main():
    xgkData = [
        w.strip()
        for w in codecs.open('xgk_seg.txt', 'r', encoding='utf-8').readlines()
    ]
    model = gensim.models.KeyedVectors.load_word2vec_format('wiki_text.vector',
                                                            binary=False)
    # print(xgkData[0])
    jieba.enable_parallel(2)
    for index in range(len(xgkData)):
        words = xgkData[index]
        words = jieba.cut(words)
        words = list(set(words))
        wordvecs = getWordVecs(words, model)
        data_vecs = pd.DataFrame(wordvecs)
        data_vecs.to_csv('vec/xgk_vec_' + str(index) + '.csv', index=False)
    jieba.disable_parallel()
Example #13
0
	def _cut_words(self, fromCache=True):
		if fromCache:
			wordFrags = pkl_load("wordFrags.pkl")
		else:
			wordFragsList = list()
			with DataBase() as db:
				newsID, newsData = db.get_news()
			jieba.enable_parallel(4)
			for news in show_status(newsData,"cut words"):
				frags = jieba.cut(news, cut_all=False)
				words = [frag for frag in frags if (frag not in self.stopWords) \
							and (not frag.isspace() and (not frag.isdigit()))]
				wordFragsList.append(words)
			jieba.disable_parallel()
			wordFrags = dict(zip(newsID, wordFragsList))
			pkl_dump("wordFrags.pkl")
		return wordFrags
Example #14
0
def text_processing(folder_path,test_size=0.2):
    folder_list = os.listdir(folder_path)
    data_list = []
    class_list = []

    # Walk through the folders
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path,folder)
        files = os.listdir(new_folder_path)
        # Read the files
        for file in files:
            with open(os.path.join(new_folder_path,file),'r') as fp:
                raw = fp.read()
            raw = raw.strip()
            # jieba parallel mode
            jieba.enable_parallel(4)
            word_list = jieba.lcut(raw)
            jieba.disable_parallel()

            data_list.append(word_list)
            class_list.append(folder)

    # Simple split into training and test sets
    data_class_list = list(zip(data_list,class_list))
    # Randomly shuffle the list
    random.shuffle(data_class_list)
    index = int(len(data_class_list)*test_size)+1
    train_list = data_class_list[index:]
    test_list = data_class_list[:index]
    train_data_list,train_class_list = list(zip(*train_list))
    test_data_list,test_class_list = list(zip(*test_list))

    # Count the frequency of every word
    all_words_dict = {}
    for word_list in train_data_list:
        for word in word_list:
            all_words_dict.setdefault(word,0)
            all_words_dict[word] += 1

    # Sort the words by frequency in descending order
    all_words_tuple_list = sorted(all_words_dict.items(),key=lambda f:f[1],reverse=True)
    all_words_list = list(list(zip(*all_words_tuple_list))[0])

    return all_words_list,train_data_list,test_data_list,train_class_list,test_class_list
Example #15
0
def text_processing(folder_path, test_size=0.2):
    """
    :param folder_path:
    :param test_size:
    :return: text processing
    """
    folder_list = os.listdir(folder_path)
    data_list = []
    class_list = []
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)
        files = os.listdir(new_folder_path)
        for file in files:
            with open(os.path.join(new_folder_path, file), 'r') as fp:
                raw = fp.read()
            # try:
            #     with codecs.open(os.path.join(new_folder_path,file), 'r', 'GB18030') as fp:
            #         raw = fp.read()
            # except UnicodeDecodeError:
            #     pass
            jieba.enable_parallel(2)
            word_cut = jieba.cut(raw, cut_all=False)
            word_list = list(word_cut)
            #print(word_list)
            jieba.disable_parallel()
            data_list.append(word_list)
            class_list.append(folder)

    train_data_list, test_data_list, train_class_list, test_class_list = train_test_split(
        data_list, class_list, test_size=test_size)
    all_words_dict = {}
    for word_list in train_data_list:
        for word in word_list:
            if word in all_words_dict:
                all_words_dict[word] += 1
            else:
                all_words_dict[word] = 1
    all_words_tuple_list = sorted(all_words_dict.items(),
                                  key=lambda f: f[1],
                                  reverse=True)
    all_words_list = list(zip(*all_words_tuple_list))[0]
    #return all_words_list
    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list
Example #16
0
 def count_words(self, lwords):
     """
     :param lwords:
     :return: dwords type is dict ; key :word#flag value :freq
     """
     jieba.enable_parallel(10)  # start many processes
     word_flags = []  # {word#flag : freq},{word:sex}
     for context in lwords:
         for sub in norm_seg(context):
             w = sub.word
             if self.oGWF.isGeneralWord(w.encode('utf-8')) or w.strip() == '':
                 continue
             if len(w) >= int(self.word_length):
                 key = '%s#%s' % (w, sub.flag)
                 word_flags.append(key)
     logger.info('count is starting')
     jieba.disable_parallel()
     dwords = Counter(word_flags)
     return dwords
Example #17
0
def test_key_words_with_jieba(type='jieba'):
    with open(
            '/Users/zhaowei/Desktop/八爪鱼/yeyonglong_enterprise_name/names2.txt'
    ) as f:
        a = f.readline()
    print(a)

    key_words = []
    jieba.enable_parallel(2)
    if type == 'jieba':
        key_words = [x for x in jieba.cut(a) if len(x) > 1]
    elif type == 'jieba_fast':
        key_words = [x for x in jieba_fast.cut(a) if len(x) > 1]
    print(key_words)
    jieba.disable_parallel()

    # Get the most frequent words
    num = 20
    most_words = Counter(key_words).most_common(num)
    print('高频词汇{}:{}'.format(str(num), most_words))
Example #18
0
def cut_word(sentence, parallel=False, processnum=2):
    if parallel:
        # Enable parallel segmentation; the argument is the number of processes (not supported on Windows)
        jieba.enable_parallel(processnum=processnum)
        word_list = jieba.lcut(sentence, cut_all=False, HMM=True)
        # Disable parallel segmentation
        jieba.disable_parallel()
    else:
        word_list = jieba.lcut(sentence, cut_all=False, HMM=True)
    # Remove stop words
    stopwords = [
        line.strip() for line in open(file='../resource/ChineseStopwords.txt',
                                      mode='r',
                                      encoding='UTF-8').readlines()
    ]
    new_word_list = []
    for word in word_list:
        if word not in stopwords:
            new_word_list.append(word)
    return new_word_list
Example #19
0
def text_processing(folder_path, test_size=0.2):  # test_size=0.2 means 80% training set, 20% test set
    folder_list = os.listdir(folder_path)  # all files and folders under the path
    data_list = []  # all words
    class_list = []  # class labels; each text file belongs to one folder, so the folder name is used directly as the class

    # walk through the folders
    for folder in folder_list:
        new_folder_path = os.path.join(
            folder_path, folder
        )  # join concatenates the two, e.g. folder_path "." and folder "hello" become "./hello"
        files = os.listdir(new_folder_path)
        # read the files
        j = 1
        for file in files:
            if j > 100:  # to avoid running out of memory, only take 100 sample files; comment this out to take them all
                break
            with open(os.path.join(new_folder_path, file), 'r') as f:
                raw = f.read()

            # segment the content with jieba
            jieba.enable_parallel(4)  # enable parallel mode; the argument is the number of processes (not supported on Windows)
            word_cut = jieba.cut(
                raw, cut_all=False
            )  # cut_all=True is full mode, False is accurate mode (the default); full mode yields e.g. "清华大学 华大", accurate mode only "清华大学"
            # cut returns an iterable generator
            word_list = list(word_cut)  # convert the generator to a list; each word is unicode
            jieba.disable_parallel()  # disable parallel mode

            data_list.append(word_list)
            class_list.append(folder.decode('utf-8'))  # class label
            j += 1

    # Crude split into training and test sets
    data_class_list = zip(data_list, class_list)
    random.shuffle(data_class_list)
    index = int(len(data_class_list) * test_size) + 1
    train_list = data_class_list[index:]
    test_list = data_class_list[:index]
    train_data_list, train_class_list = zip(*train_list)
    test_data_list, test_class_list = zip(*test_list)
Example #20
0
def jieba_config(userdict=None,
                 config=None,
                 wordlist=None,
                 parallel=False,
                 p=0):
    """
    Use load_userdict() to load your dict or use add_word add a word list or delete a word
    :param userdict: A list contains filename
    :param config: 'A' or 'D',add_word or del_word
    :param wordlist: A list contains specify word
    :param parallel: Configurate to enable multiprocessing
    :param p: Process number
    :return: A string
    """
    if userdict:
        for file in userdict:
            load_userdict(file)
    if config == 'A':
        if wordlist:
            for word in wordlist:
                add_word(word)
        else:
            return 'Wordlist required'
    elif config == 'D':
        if wordlist:
            for word in wordlist:
                del_word(word)
        else:
            return 'Wordlist required'
    elif config is not None:
        return 'Invalid config content'
    if parallel:
        if p > 0:
            enable_parallel(p)
        else:
            return 'Invalid p content'
    else:
        disable_parallel()
Example #21
0
    def process_text(self, test_size: float = 0.2):
        """处理语料库中的文本信息

        Args:
            test_size (float): 测试集占比

        Returns:
            sorted_words (List): 按照词频从大到小排列的单词列表
            train_words_list (List): 训练文本列表
            test_words_list (List): 测试文本列表
            train_class_list (List): 训练类别列表
            test_class_list (List): 测试类别列表
        """
        words_list, class_list = [], []
        for folder in os.listdir(self.folder_path):
            for text_file in os.listdir(self.folder_path / folder):
                file_path = self.folder_path / folder / text_file
                with open(file_path) as f:
                    content = f.read()

                jieba.enable_parallel(4)  # enable parallel segmentation
                segs = jieba.lcut(content, cut_all=False)  # accurate-mode segmentation
                jieba.disable_parallel()

                words_list.append(segs)
                class_list.append(folder)

        # Split into training and test sets
        train_words_list, test_words_list, train_class_list, test_class_list = train_test_split(
            words_list, class_list, test_size=test_size, random_state=0)
        # Count word frequencies
        word_count = Counter()
        for words in train_words_list:
            word_count.update(words)
        # Sort the words by frequency in descending order
        sorted_words = sorted(word_count.keys(),
                              key=lambda x: word_count[x],
                              reverse=True)
        return sorted_words, train_words_list, test_words_list, train_class_list, test_class_list
Example #22
0
    def parse_article(self, response):
        item = ArticleItem()
        for key in list(parser_config['all_spider'].keys()):
            try:
                item[key] = response.xpath(parser_config['all_spider'][key]).extract()[0].encode('utf-8') if len(
                    response.xpath(parser_config['all_spider'][key]).extract()) > 0 else ''
            except:
                traceback.print_exc()
        for key in list(parser_config[self.name].keys()):
            try:
                item[key] = response.xpath(parser_config[self.name][key]).extract()[0].encode('utf-8') if len(
                    response.xpath(parser_config[self.name][key]).extract()) > 0 else ''
            except:
                traceback.print_exc()
        # used when the URL alone cannot tell whether this is an article page
        if item['content_original'] == '':
            self.log('*** not article url for %s' % response._url.encode('utf-8'))
            return
        item['fromURL'] = response._url.encode('utf-8')
        item['creat_date'] = time.strftime("%Y/%m/%d %H:%M:%S")
        item['content_clear'] = del_html_attr(item['content_original']).encode('utf-8')
        item['lenth'] = len(item['content_clear'].replace(' ', ''))

        jieba.enable_parallel(20)
        cn_str = get_CN_str(item['content_clear'])
        words = [x.encode('utf-8') for x in jieba.cut_for_search(cn_str)]
        article_keywords = [x for x in words if len(x) >= len('标签')]
        article_descr = [x for x in words if len(x) >= len('分词短语')]
        article_note = [x for x in words if len(x) >= len('分词文章摘要')]
        jieba.disable_parallel()

        article_keywords = Counter(article_keywords).most_common(20)
        article_descr = Counter(article_descr).most_common(10)
        article_note = Counter(article_note).most_common(5)
        item['keywords_by_app'] = ','.join([c[0] for c in article_keywords])
        item['descr_by_app'] = ','.join([c[0] for c in article_descr])
        item['note_by_app'] = ','.join([c[0] for c in article_note])
        return item
Example #23
0
import jieba
# import jieba.posseg as pseg
import re
from operator import itemgetter, attrgetter, methodcaller
import time

jieba.enable_parallel(4)  # enable parallel segmentation; the argument is the number of concurrent processes

content = open(
    'iter.txt',
    'rb').read()  # GuiZhou reports as the input. both finding and diagnosis.

start_time = time.time()
jieba.load_userdict("coronary_dict.txt")
words = jieba.lcut(content)  # words = pseg.lcut(content)  ## accurate mode by default

elapsed_time = time.time() - start_time
time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
print(elapsed_time)
jieba.disable_parallel()  # disable parallel segmentation

# words_sort = sorted(words, key=attrgetter('flag'))
# words_set = sorted(set(words), key=attrgetter('flag'))
words_set = set(words)
for word in words_set:  # in words_all:
    m_number = re.search(r"(\d*\.\d+|\d+)+", word)
    if m_number is None:
        print(word)
        # print (word.word, word.flag)
Example #24
0
def testcase():
    cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
    cuttest("我不喜欢日本和服。")
    cuttest("雷猴回归人间。")
    cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
    cuttest("我需要廉租房")
    cuttest("永和服装饰品有限公司")
    cuttest("我爱北京天安门")
    cuttest("abc")
    cuttest("隐马尔可夫")
    cuttest("雷猴是个好网站")

if __name__ == "__main__":
    testcase()
    jieba.set_dictionary("foobar.txt")
    print "================================"
    testcase()


jieba word segmentation
1. Segmentation

jieba.cut takes three input parameters: the string to segment; cut_all, which controls whether full mode is used; and HMM, which controls whether the HMM model is used.
jieba.cut_for_search takes two parameters: the string to segment and whether to use the HMM model. It is suited to building inverted indexes for search engines and produces finer-grained tokens.
The string to segment may be unicode, UTF-8, or GBK. Note: passing GBK strings directly is discouraged, since they may be unpredictably mis-decoded as UTF-8.
jieba.cut and jieba.cut_for_search both return an iterable generator; you can iterate over it with a for loop to get each token (unicode), or use
jieba.lcut and jieba.lcut_for_search to get a list directly.
jieba.Tokenizer(dictionary=DEFAULT_DICT) creates a custom tokenizer, which allows different dictionaries to be used at the same time. jieba.dt is the default tokenizer, and all module-level segmentation functions delegate to it.
# encoding=utf-8
import jieba

seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print("Full Mode: " + "/ ".join(seg_list))  # 全模式

seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print("Default Mode: " + "/ ".join(seg_list))  # 精确模式

seg_list = jieba.cut("他来到了网易杭研大厦")  # 默认是精确模式
print(", ".join(seg_list))

seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")  # 搜索引擎模式
print(", ".join(seg_list))
 

2. Adding a custom dictionary

Loading a dictionary

Developers can supply their own dictionary to cover words that are not in the built-in jieba vocabulary. jieba can recognize new words on its own, but adding them explicitly gives a higher accuracy.
Usage: jieba.load_userdict(file_name) # file_name is a file-like object or the path to the custom dictionary
The dictionary format is the same as dict.txt: one word per line, and each line has three parts separated by spaces, in this fixed order: word, frequency (optional), part of speech (optional). If file_name is a path or a file opened in binary mode, the file must be UTF-8 encoded.
When the frequency is omitted, an automatically computed frequency that guarantees the word can be segmented out is used.
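
A minimal sketch of that dictionary format (userdict.txt is a hypothetical file; the entries are only illustrative):

# userdict.txt, UTF-8, one entry per line: word [frequency] [part of speech]
#   云计算 5
#   创新办 3 i
#   台中
import jieba

jieba.load_userdict('userdict.txt')
print('/'.join(jieba.cut('创新办主任也是云计算方面的专家')))
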
Adjusting the dictionary

Use add_word(word, freq=None, tag=None) and del_word(word) to modify the dictionary dynamically at run time.
Use suggest_freq(segment, tune=True) to tune the frequency of a single word so that it can (or cannot) be segmented out.

Note: the automatically computed frequencies may be ineffective when the HMM new-word discovery feature is used.

>>> print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False)))
如果/放到/post/中将/出错/。
>>> jieba.suggest_freq(('中', '将'), True)
494
>>> print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False)))
如果/放到/post/中/将/出错/。
>>> print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False)))
「/台/中/」/正确/应该/不会/被/切开
>>> jieba.suggest_freq('台中', True)
69
>>> print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False)))
「/台中/」/正确/应该/不会/被/切开
 

3. Keyword extraction

Keyword extraction based on the TF-IDF algorithm
import jieba.analyse

jieba.analyse.extract_tags(sentence, topK=20, withWeight=False, allowPOS=())
sentence is the text to extract keywords from
topK is how many of the highest-weighted TF-IDF keywords to return; default 20
withWeight controls whether the keyword weights are returned as well; default False
allowPOS keeps only words with the given parts of speech; default empty, i.e. no filtering
jieba.analyse.TFIDF(idf_path=None) creates a new TFIDF instance; idf_path is an IDF frequency file
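
For instance, a minimal sketch of this interface (the sample text is arbitrary):

import jieba.analyse

text = '我来到北京清华大学,清华大学是著名学府'  # arbitrary sample text
for word, weight in jieba.analyse.extract_tags(text, topK=5, withWeight=True):
    print(word, weight)
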
Keyword extraction based on the TextRank algorithm
jieba.analyse.textrank(sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v')) is used directly; the interface is the same as above, but note that it filters by part of speech by default.
jieba.analyse.TextRank() creates a custom TextRank instance
Algorithm paper: TextRank: Bringing Order into Texts

Basic idea:
Segment the text from which keywords are to be extracted
Build a graph from the co-occurrence relations between words within a fixed window (default 5, adjustable via the span attribute)
Compute PageRank over the graph nodes; note that the graph is undirected and weighted
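
A comparable sketch of the TextRank call (again with an arbitrary sample sentence):

import jieba.analyse

text = '线程是程序执行时的最小单位,它是进程的一个执行流'  # arbitrary sample text
print(jieba.analyse.textrank(text, topK=5, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v')))
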
4. Part-of-speech tagging
jieba.posseg.POSTokenizer(tokenizer=None) creates a custom POS tokenizer; the tokenizer parameter specifies the internal jieba.Tokenizer to use. jieba.posseg.dt is the default POS tokenizer.
It tags each word of the segmented sentence with its part of speech, using a tag set compatible with ictclas.
Usage example
>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> for word, flag in words:
...    print('%s %s' % (word, flag))
5. Parallel segmentation
Principle: split the target text by line, hand the lines to multiple Python processes that segment in parallel, then merge the results, which gives a considerable speedup.
Based on Python's built-in multiprocessing module; Windows is currently not supported.
Usage:

jieba.enable_parallel(4) # enable parallel segmentation; the argument is the number of processes
jieba.disable_parallel() # disable parallel segmentation
6. Tokenize: returns each word's start and end position in the original text
Note that the input only accepts unicode
Default mode
result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result:
    print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
word 永和                start: 0                end:2
word 服装                start: 2                end:4
word 饰品                start: 4                end:6
word 有限公司            start: 6                end:10

Search mode
result = jieba.tokenize(u'永和服装饰品有限公司', mode='search')
for tk in result:
    print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
word 永和                start: 0                end:2
word 服装                start: 2                end:4
word 饰品                start: 4                end:6
word 有限                start: 6                end:8
word 公司                start: 8                end:10
word 有限公司            start: 6                end:10
 
Example #25
0
 def disable_parallel():
     jieba.disable_parallel()
Example #26
0
print '获取词性----------------------------'
import jieba.posseg as psg
# print [(x.word,x.flag) for x in psg.cut(s)]
for x in psg.cut(s):
    print x.word + " " + x.flag + ",",
print '\n只获取名词--------------------------'
# print [(x.word,x.flag) for x in psg.cut(s) if x.flag.startswith('n')]
for x in psg.cut(s):
    if x.flag.startswith('n'): print x.word + " " + x.flag + ",",

print ''
# Parallel segmentation
# enable parallel mode; the argument is the number of concurrent processes
jieba.enable_parallel(5)

santi_text = open('./santi.txt').read()
print len(santi_text)
santi_words = [x for x in jieba.cut(santi_text) if len(x) >= 2]

# disable parallel mode
jieba.disable_parallel()

# Get the top-n most frequent words: taking the full Santi (Three-Body) text above as an example, the 20 most frequent words in the segmentation result can be obtained like this:
from collections import Counter
c = Counter(santi_words).most_common(20)
print type(c)
for each in c:
    print each[0] + u'' + str(each[1]) + ',',
print ''
print c
Example #27
0
def fun4():
    # disable parallel mode
    jieba.disable_parallel()
    # enable parallel mode
    jieba.enable_parallel(4)
Example #28
0
#==========================================================
# jieba word segmentation
# parallel mode
jieba.enable_parallel(1)  # enable parallel mode; the argument is the number of processes (1 here)
jieba.set_dictionary("dict_for_jieba.txt")  # set the dictionary path

trainData = []
for s in train_file["String"]:
    trainData.append("/".join(jieba.cut(s)))

print len(trainData)
print trainData[0]


jieba.disable_parallel() #turn off processes

#============================================================
#TF-IDF :Extract features.



#train_data = np.array(trainData["String"],dtype=np.float64)
train_target = np.array(train_file["Value"],dtype=np.float64)
#print train_target
# print train_data[1]
# print train_target[1]
#
# test_data = np.array(test_file["String"],dtype=np.float64)
# # print test_data[0]
# #=======================================================
Example #29
0
>>> words = pseg.cut("我爱北京天安门")
>>> for w in words:
...    print w.word, w.flag
...
我 r
爱 v
北京 ns
天安门 ns
Feature 5): parallel segmentation

Principle: split the target text by line, hand the lines to multiple Python processes that segment in parallel, then merge the results, which gives a considerable speedup.
Based on Python's built-in multiprocessing module; Windows is currently not supported.
Usage:

jieba.enable_parallel(4) # enable parallel segmentation; the argument is the number of processes
jieba.disable_parallel() # disable parallel segmentation
Example:
import urllib2
import sys,time
import sys
sys.path.append("../../")
import jieba
jieba.enable_parallel(4)

url = sys.argv[1]
content = open(url,"rb").read()
t1 = time.time()
words = list(jieba.cut(content))

t2 = time.time()
tm_cost = t2-t1
Example #30
0
def text_processing(file_path):
    mentality_list_num = {"happy_love": 0, "sad_guilty": 0, "angry_hatred": 0, "surprise_afraid": 0, "other_emotion":0}
    positive_num = 0
    negative_num = 0
    neutral_num = 0
    # Counts of the feature words (keywords) across the archive, used in the IDF calculation
    words_dict_in_total = {}
    # Total number of comments in the archive, used later in the IDF calculation
    comments_in_total = get_num_of_comments_in_total(file_path)
    # All comments from the archive, used in the IDF calculation
    comments = get_total_comments_list(file_path)

    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.read()
    # Normalize the data and split it into individual news items
    lines = re.sub(r"\d+-\d+-\d+", "", lines)
    lines = lines.replace(' ', "")
    lines = lines.split("******")
    for line in lines:
        # Split off the comment section; everything below works on the comment section only
        line = line.split("**评论区**")
        try:
            # Strip the comment markers of the form "*评论*\d+:"
            line[1] = re.sub("\*\w+\*\d+:", "", line[1])
            # Segment the text with jieba:
            # enable parallel mode (the argument is the number of processes),
            # accurate mode returns an iterable generator,
            # convert the generator to a list (each word is unicode),
            # then disable parallel mode
            jieba.enable_parallel(4)
            word_cut = jieba.cut(line[1], cut_all=False)
            word_list = list(word_cut)
            jieba.disable_parallel()

            all_words_dict = {}
            mentality_words_dict = {}
            emotion_words_dict = {}

            # Filter the jieba tokens once, then count them into all_words_dict
            for word in word_list:
                if not word.isdigit() and word not in stopwords_set and 1 < len(word) < 5:
                    if word in all_words_dict:
                        all_words_dict[word] += 1
                    else:
                        all_words_dict[word] = 1

            # Total number of tokens, used to compute the TF term frequency
            word_num = len(word_list)
            for word in all_words_dict:
                if word not in words_dict_in_total:
                    count = 0
                    for comment in comments:
                        if word in comment:
                            count += 1
                    words_dict_in_total[word] = count
                # Sentiment weight: TF * IDF, where
                # TF  = (occurrences of the keyword in this news item) / (total words in this news item)
                # IDF = log(total comments in the archive / (comments in the archive containing the keyword + 1))
                mentality_words_dict[word] = all_words_dict[word]/word_num * math.log(comments_in_total/(words_dict_in_total[word]+1))

            # Sort from high to low to get the per-news sentiment word list mentality_words_list
            mentality_words_list = sorted_words_dict(mentality_words_dict)

            # Keep the top 20 words to form the feature-word list
            mentality_feature_words = words_dict(mentality_words_list)

            # Classify the news item from the sentiment feature words, using score to quantify the audience sentiment:
            # if a word is in the positive sentiment set, score += its TF-IDF value; if in the negative set, score -= its TF-IDF value
            score = 0
            mentality_score_list = {"happy_love":0, "sad_guilty":0, "angry_hatred":0, "surprise_afraid":0}

            for unit in mentality_feature_words:
                if unit in test_positive_set:
                    score += mentality_words_dict[unit]
                if unit in test_negative_set:
                    score -= mentality_words_dict[unit]

            # Based on score, count the totals of positive / negative / neutral news items
            if score > 0:
                positive_num += 1
            elif score == 0:
                neutral_num += 1
            else:
                negative_num += 1

            # Classify the news item from the sentiment feature words, using the entries of mentality_score_list to quantify the audience sentiment:
            # if a word belongs to one of the sentiment sets, add its weight to that set's entry
            for unit in mentality_feature_words:
                if unit in happy_love_set:
                    mentality_score_list["happy_love"] += mentality_words_dict[unit]
                if unit in sad_guilty_set:
                    mentality_score_list["sad_guilty"] += mentality_words_dict[unit]
                if unit in angry_hatred_set:
                    mentality_score_list["angry_hatred"] += mentality_words_dict[unit]
                if unit in surprise_afraid_set:
                    mentality_score_list["surprise_afraid"] += mentality_words_dict[unit]

            # If every entry of mentality_score_list is 0, count it under the fifth category, other_emotion
            if (mentality_score_list["happy_love"] == mentality_score_list["sad_guilty"] == mentality_score_list["angry_hatred"] == mentality_score_list["surprise_afraid"] == 0):
                mentality_list_num["other_emotion"] += 1
            # Otherwise compare the entries of mentality_score_list: the largest one gets +1 in mentality_list_num; if two are tied each gets +1/2, and so on
            else:
                number = 0
                top_item_list = []
                for item in mentality_score_list:
                    if mentality_score_list[item] == mentality_score_list[max(mentality_score_list, key=mentality_score_list.get)]:
                        number += 1
                        top_item_list.append(item)
                for item in mentality_score_list:
                    if item in top_item_list:
                        mentality_list_num[item] += 1 / number
        except:
            pass
    return positive_num, neutral_num, negative_num, mentality_list_num
Example #31
0
def main():
    # Basic segmentation functions
    segs = jieba.cut('我在学习自然语言处理')  # accurate mode
    print(list(segs))
    segs = jieba.cut('我在学习自然语言处理', cut_all=True)  # full mode
    print(list(segs))
    segs = jieba.cut_for_search(  # search-engine mode
        '小明硕士毕业于中国科学院计算所,后在哈佛大学深造')
    print(list(segs))
    segs = jieba.lcut('小明硕士毕业于中国科学院计算所,后在哈佛大学深造')  # lcut returns a list
    print(segs)
    print(jieba.lcut('如果放到旧字典中将出错。'))
    jieba.suggest_freq(('中', '将'), True)  # tune the frequency so the words can be split apart
    print(jieba.lcut('如果放到旧字典中将出错。'))
    print('-' * 100)

    # TF-IDF keyword extraction
    root_path = Path('/media/bnu/data/nlp-practice/jieba-tutorials')
    with open(root_path / 'NBA.txt') as f:
        lines = f.read()
        tags = jieba.analyse.extract_tags(lines,
                                          topK=20,
                                          withWeight=False,
                                          allowPOS=())
        print(tags)
    with open(root_path / '西游记.txt') as f:
        lines = f.read()
        tags = jieba.analyse.extract_tags(lines,
                                          topK=20,
                                          withWeight=False,
                                          allowPOS=())
        print(tags)
    print('-' * 100)

    # TextRank keyword extraction
    with open(root_path / 'NBA.txt') as f:
        lines = f.read()
        tags = jieba.analyse.textrank(lines,
                                      topK=20,
                                      withWeight=False,
                                      allowPOS=('ns', 'n'))
        print(tags)
    with open(root_path / '西游记.txt') as f:
        lines = f.read()
        tags = jieba.analyse.textrank(lines,
                                      topK=20,
                                      withWeight=False,
                                      allowPOS=('ns', 'n', 'vn', 'v'))
        print(tags)
    print('-' * 100)

    # Part-of-speech tagging
    pseg = jieba.posseg.cut('我爱自然语言处理')
    for word, pos in pseg:
        print(word, pos)
    print('-' * 100)

    # Parallel segmentation
    jieba.enable_parallel(4)
    with open(root_path / '西游记.txt') as f:
        lines = f.read()
        t1 = time.time()
        seg = list(jieba.cut(lines))
        t2 = time.time()
        print('Parallel Speed {} bytes/sec'.format(len(lines) / (t2 - t1)))

    jieba.disable_parallel()
    with open(root_path / '西游记.txt') as f:
        lines = f.read()
        t1 = time.time()
        segs = list(jieba.cut(lines))
        t2 = time.time()
        print('Non-Parallel Speed {} bytes/sec'.format(len(lines) / (t2 - t1)))
    print('-' * 100)

    # Start and end positions of each word in the original text
    tokens = jieba.tokenize('自然语言处理非常有用')  # default mode
    for token in tokens:
        print('{}\t\t start: {} \t\t end: {}'.format(token[0], token[1],
                                                     token[2]))
    tokens = jieba.tokenize('自然语言处理非常有用', mode='search')  # search mode
    print('-' * 100)
    for token in tokens:
        print('{}\t\t start: {} \t\t end: {}'.format(token[0], token[1],
                                                     token[2]))
Example #32
0
def get_file_cut_word_parallel(filename, parallel=2):
    file_text = open(filename).read()
    jieba.enable_parallel(parallel)
    file_words = [x for x in jieba.cut(file_text) if len(x) >= 2]
    jieba.disable_parallel()
    return file_words
def text_processing(folder_path, test_size=0.2):
    folder_list = os.listdir(folder_path)
    data_list = []
    class_list = []

    # Walk through the folders
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)
        files = os.listdir(new_folder_path)
        # Read the files
        j = 1
        for file in files:
            if j > 100:  # to avoid running out of memory, only take 100 sample files; comment this out to take them all
                break
            with open(os.path.join(new_folder_path, file), 'r', encoding='utf-8') as fp:
                raw = fp.read()
            ## yes, the ubiquitous jieba Chinese word segmentation
            # jieba.enable_parallel(4)  # enable parallel mode; the argument is the number of processes (not supported on Windows)
            '''
            NotImplementedError: jieba: parallel mode only supports posix system
            '''
            word_cut = jieba.cut(raw, cut_all=False)  # accurate mode, returns an iterable generator
            word_list = list(word_cut)  # convert the generator to a list; each word is unicode
            jieba.disable_parallel()  # disable parallel mode

            data_list.append(word_list)  # training set list
            '''
                class_list.append(folder.decode('utf-8'))  # class label
                AttributeError: 'str' object has no attribute 'decode'
            '''
            class_list.append(folder)  # class label
            j += 1

    ## Crude split into training and test sets
    data_class_list = zip(data_list, class_list)
    # print(data_class_list)
    '''
    TypeError: object of type 'zip' has no len()
    needs to be converted to a list
    '''
    data_class_list = list(data_class_list)
    random.shuffle(data_class_list)
    index = int(len(data_class_list) * test_size) + 1
    train_list = data_class_list[index:]
    test_list = data_class_list[:index]
    train_data_list, train_class_list = zip(*train_list)
    test_data_list, test_class_list = zip(*test_list)

    # this could also be done with sklearn's built-in helper:
    # train_data_list, test_data_list, train_class_list, test_class_list = sklearn.cross_validation.train_test_split(data_list, class_list, test_size=test_size)


    # count word frequencies into all_words_dict
    all_words_dict = {}
    for word_list in train_data_list:
        for word in word_list:
            '''
            if all_words_dict.has_key(word):
                raises: 'dict' object has no attribute 'has_key'
                has_key worked in Python 2 but was removed in Python 3.
                For example, change: if dict.has_key(word):
                to:                  if word in dict:
            '''
            if word in all_words_dict:
                all_words_dict[word] += 1
            else:
                all_words_dict[word] = 1

    # the key function sorts by word frequency in descending order
    all_words_tuple_list = sorted(all_words_dict.items(), key=lambda f: f[1], reverse=True)  # the built-in sorted expects a list
    '''
    all_words_list = zip(*all_words_tuple_list)[0]
    raises: TypeError: 'zip' object is not subscriptable
    because in Python 3.x the zip object must be wrapped in list() before indexing;
    change it to: all_words_list = list(zip(*all_words_tuple_list))[0]
    '''
    all_words_list = list(zip(*all_words_tuple_list))[0]

    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list