Example #1
    def __init__(self, dict_paths):
        # TODO: load the external user dictionaries
        for p in dict_paths:
            jieba.load_userdict(p)

        # TODO: manually tune the frequency of words that jieba does not segment correctly.
        jieba.suggest_freq(('特征', '症状', '症候'), True)
Example #2
 def init_jieba(self, seg_dic, userdic):
     jieba.load_userdict(userdic)
     jieba.set_dictionary(seg_dic)
     with open(userdic, 'r', encoding='utf-8') as input:
         for word in input:
             word = word.strip('\n')
             jieba.suggest_freq(word, True)
Example #3
def data_process(data_paths='./data/cnews/news_{}.txt'):
    stop_words = get_stopword()

    with open(data_paths.format(1),'r',encoding='utf8') as f:
        text_1 = f.read()
        text_1 = re.sub(r'\n','',text_1)
        text_1 = re.sub(r'\d+','',text_1)
    with open(data_paths.format(2),'r',encoding='utf8') as f:
        text_2 = f.read()
        text_2 = re.sub(r'\n','',text_2)
        text_2 = re.sub(r'\d+','',text_2)
    with open(data_paths.format(3),'r',encoding='utf8') as f:
        text_3 = f.read()
        text_3 = re.sub(r'\n','',text_3)
        text_3 = re.sub(r'\d+','',text_3)

    text = text_1 + text_2 + text_3

    jieba.suggest_freq('易会满', tune=True)  # keep the name 易会满 as a single token
    processed_text = ' '.join([word for word in jieba.cut(text) if word not in stop_words])
    with open('./data/processed_txt','w',encoding='utf8') as f:
        f.write(processed_text)

    processed_text = word2vec.LineSentence('./data/processed_txt')

    return processed_text
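A possible follow-up, not part of the example above: the LineSentence stream returned by data_process is typically fed to gensim's Word2Vec (parameter names below follow gensim 4.x and the values are illustrative).

from gensim.models import word2vec

sentences = data_process()
model = word2vec.Word2Vec(sentences, vector_size=100, window=5, min_count=2)
# model.wv.most_similar('易会满')  # inspect neighbours of a tuned word, if it survived min_count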
def gen_wordcloud(text, filename):

    # 1). Tune words that jieba segments incorrectly;
    jieba.suggest_freq(('微博'), True)
    jieba.suggest_freq(('热搜'), True)

    #  2). Key difficulty: how to segment Chinese text (jieba, lcut)
    result = jieba.lcut(text)
    print(result)

    # Draw the word cloud
    # 3). Open the mask image and get its pixel data;
    imgObj = Image.open('./doc/wordcloud.jpg')
    img_mask = np.array(imgObj)
    # print(img_mask)
    # 4). Create the word cloud object and set its attributes
    wcObj = wordcloud.WordCloud(
        mask=img_mask,  # shape that the words are filled into
        background_color="snow",  # background color
        font_path="/usr/share/fonts/wqy-zenhei/wqy-zenhei.ttc",  # for Chinese text, use a CJK font (fc-list :lang=zh)
        min_font_size=5,  # minimum font size in the image
        max_font_size=50,  # maximum font size in the image
        width=1000,  # image width
        height=1000,  # image height
    )
    # 5). Generate the image;
    # WordCloud.generate() only handles a plain string, so join the tokens with a comma separator
    wcObj.generate(",".join(result))
    wcObj.to_file(filename)
def outofdate_func():
    """
    Set suggested words in jieba & write the segmented sentences to data/text_segment.txt
    :return:
    """
    list_suggest_word = []
    with open('data/suggest_frep.txt') as fr:
        line = fr.readline()
        while True:
            if not line:
                break
            lists = line.strip().split()
            list_suggest_word = list_suggest_word + lists
            line = fr.readline()
    for suggest_word in list_suggest_word:
        jieba.suggest_freq(suggest_word, True)

    with open('data/text.txt', 'r') as f:
        fw = open('data/text_segment.txt', 'w')
        while True:
            line = f.readline()
            if not line:
                break

            seg_list = jieba.cut(line, cut_all=False)

            new_line = " ".join(seg_list)

            # print(new_line)
            fw.write(new_line)
        fw.close()
    print('the cut processor done!')
Example #6
def word_seg(src_path='MRP_Analysis_2.txt', self_dict_path='self_dict.txt'):

    # Open the requirements document read-only
    txt = ''
    with open(src_path, 'r', encoding='utf-8') as f:
        txt = f.read()
        f.close()
    logging.debug('---------File read done---------!')
    logging.debug(txt)

    # Segment with jieba
    wordstr = ''
    try:

        try:
            jieba.load_userdict(self_dict_path)
        except FileNotFoundError:
            pass
        except Exception as e:
            raise e

        jieba.add_word('拜欧海多汀', freq=20000, tag=None)
        jieba.suggest_freq(('护', '将'), tune=True)

        # cut() returns a generator, lcut() returns a list. Use accurate mode with HMM disabled to get a list.
        words = jieba.lcut(txt, cut_all=False, HMM=False)
        logging.debug(words)
        wordstr = ''.join(words)
        logging.debug(wordstr)

    except Exception as e:
        raise e

    return wordstr
def set_dict():
    """
    Custom user dictionary.
    One word per line; each line is: word  frequency (optional)  POS tag (optional), space separated.
    **The column order must not be changed and the file must be UTF-8 encoded.**
    ```
    华能 3  nz
    云泥 ns
    河势 n
    庙沟 ns
    ```
    Use add_word(word, freq=None, tag=None) and del_word(word) to **modify the dictionary dynamically** at runtime.
    Use suggest_freq(segment, tune=True) to tune the frequency of a single word so that it can (or cannot) be segmented out.
    Note: the automatically computed frequency may not take effect when HMM-based new-word discovery is enabled.
    :return:
    """
    jieba.load_userdict('../Data/dict/30wChinsesSeqDic_clean.txt')
    jieba.add_word('加入自定义词')
    test_sent1 = jieba.cut('在南京市长江大桥研究生命的起源和加入自定义词')
    print('test_sent1', "/".join(test_sent1))
    jieba.del_word('加入自定义词')
    test_sent2 = jieba.cut('在南京市长江大桥研究生命的起源和加入自定义词')
    print('test_sent2', "/".join(test_sent2))
    jieba.suggest_freq('研究生命', True)
    test_sent3 = jieba.cut('在南京市长江大桥研究生命的起源和加入自定义词')
    print('test_sent3', "/".join(test_sent3))
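A quick sketch of the HMM caveat noted in the docstring above (not part of the original example): a frequency tuned to force a split is only guaranteed to be honoured when cutting with HMM=False, because HMM-based new-word discovery may merge the single characters back together.

import jieba

jieba.suggest_freq(('中', '将'), tune=True)  # tune so that "中将" is split in this context
print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False)))  # the tuned split is respected
print('/'.join(jieba.cut('如果放到post中将出错。')))  # with HMM enabled, the split may not take effect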
Example #8
def segment_words(stars):
    comments = None
    if stars == 'all':
        comments = data['Comments']
    else:
        comments = data[data['Star'] == stars]['Comments']
    comments_list = []
    for comment in comments:
        comment = str(comment).strip().replace('span', '').replace(
            'class', '').replace('emoji', '')
        comment = re.compile('1f\d+\w*|[<>/=]').sub('', comment)
        if len(comment) > 0:
            comments_list.append(comment)

    text = ''.join(comments_list)
    # word_list = list(jieba.cut(text))
    jieba.suggest_freq("无问西东", True)
    word_list = jieba.analyse.extract_tags(text,
                                           topK=50,
                                           withWeight=False,
                                           allowPOS=())
    print(word_list)
    c = Counter(word_list)
    print(c)
    common_c = c.most_common(50)
    print(common_c)
    #words = ''.join(word_list)

    return common_c
def _jiebaPOSRule():
    needRetain = [
        '去大脑',
        '广谱',  # as in 广谱抗生素 (broad-spectrum antibiotic)
        '阳转',
    ]
    for retain_word in needRetain:
        jieba.suggest_freq(retain_word, tune=True)

    needExtract = [
        '体格检查',
        '光反应',
        '对光',
        '创伤性',
        '细菌性',
        '行为矫正',
        '粟粒状',
        # '安全性', # TA said don't split XX性, but the given example needs it split
        '应予以',
        '常继发',
        # '迟发性',
        # '灵敏性',
        '若有阳',
        '完全恢复',
    ]
    for del_word in needExtract:
        jieba.del_word(del_word)
Example #10
def split_word(my_str):
    jieba.suggest_freq('中美', True)
    jieba.add_word('中美')
    word_list = jieba.cut(my_str, cut_all=False)
    my_list = " ".join(word_list).split(' ')
    # pprint(my_list)
    return(my_list)
 def set_jieba(self):
     """
     Suggest frequency to jieba
     """
     for line in open('adjust_words').readlines():
         word = line.strip()
         jieba.suggest_freq(word, True)
Example #12
    def writer_newword(self):
        dir_path = os.path.dirname(os.path.abspath(__file__))
        existWord = self.remove_dup('./jieba/industrydict/')
        infile = './jieba/industrydict/newword.dict'
        if os.path.isfile(infile):
            writer = open(infile, 'a+')
        else:
            writer = open(infile, 'wb')
        lword = self.read_newword()

        num = 0
        if lword:
            for sub in lword:
                if sub['word']:
                    word = sub['word'].encode('utf-8',
                                              'ignore').replace(' ', '')
                    sex = sub['word_sex'].encode('utf-8', 'ignore').strip()
                    if word not in existWord and sex:
                        freq = suggest_freq(word, True) if suggest_freq(
                            word, True) else 1
                        num += 1
                        writer.write('%s %s %s\n' % (word, freq, sex))
        logger.info('%s words have been written to the file' % num)
        writer.close()
        logger.info('write newword task is finished')
Example #13
def add_dict():
    # Load the custom user dictionary, built by hand after reviewing earlier segmentation results
    jieba.load_userdict("userdict.txt")
    dict1 = open("userdict.txt","r",encoding='utf8')
    # The frequency of each custom word must be tuned high enough for it to be segmented out.
    # For example, if jieba's built-in dictionary has 双肾 at frequency 400 and 区 at 500, while 双肾区 is added at only 100, the text is still split into "双肾/区" even though the word is in the dictionary.
    [jieba.suggest_freq(line.strip(), tune=True) for line in dict1]
    
    # Load the named-entity recognition dictionary
    dic2 = csv.reader(open("DICT_NOW.csv","r",encoding='utf8'))
    for row in dic2:
        if len(row) ==2:
            jieba.add_word(row[0].strip(),tag=row[1].strip())
            jieba.suggest_freq(row[0].strip(),tune=True)
    
    # Words matched by regular expressions are also added to the dictionary
    fout_regex = open('regex_dict.txt','w',encoding='utf8')
    for file in os.listdir(path=c_root):
        if "txtoriginal.txt" in file:
            fp = open(c_root+file,"r",encoding="utf8")          
            for line in fp.readlines():
                if line.strip() :
                    # Regular-expression matching
                    p1 = re.compile(r'\d+[次度]').findall(line)
                    p2 = re.compile(r'([a-zA-Z0-9+]+[\.^]*[A-Za-z0-9%(℃)]+(?![次度]))').findall(line)
                    p_merge = p1+p2
                    for word in p_merge:
                        jieba.add_word(word.strip())
                        jieba.suggest_freq(word.strip(),tune=True)
                        fout_regex.write(word+'\n')
            fp.close()
    fout_regex.close()
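An illustrative sketch of the frequency point made in the comments above (双肾区 is used as a hypothetical compound, with no assumption about whether it is already in any dictionary): suggest_freq computes a frequency just high enough for the whole word to beat its parts and, with tune=True, installs it.

import jieba

word = '双肾区'
print(jieba.get_FREQ(word))                   # None if the word is not in the dictionary yet
needed = jieba.suggest_freq(word, tune=True)  # frequency needed for the word to win, now installed
print(needed, jieba.get_FREQ(word))
print('/'.join(jieba.cut('双肾区未见明显异常', HMM=False)))  # the compound should now be kept whole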
Example #14
def Initialization():
    jieba.suggest_freq('采购单', True)
    jieba.suggest_freq('采购提交', True)
    jieba.suggest_freq('玻尿酸', True)
    jieba.suggest_freq("新增采购", True)
    jieba.suggest_freq("水泥", True)
    jieba.del_word('采购提交水泥')
Example #15
def _build_vocabulary(dictionary_path='../data/vocabulary.dict',
                      ngram=[2, 3],
                      filter=True):
    '''
    The vocabulary has a big impact; building the term matrix without filtering runs out of memory (OOM).
    '''
    id2laws = pd.read_pickle('../cache/law_vocab.pkl')
    for id, laws in id2laws.items():
        for law in laws:
            jieba.add_word(law)
            jieba.suggest_freq(law)
    with codecs.open('../data/form-laws.txt', encoding='utf-8') as f:
        ls = f.readlines()
        for i, line in enumerate(ls):
            for law in re.findall('【(.*?)】', line):
                for word in law.split(';'):
                    jieba.add_word(word)
                    jieba.suggest_freq(word)

    if os.path.exists(dictionary_path):
        dictionary = corpora.Dictionary().load(dictionary_path)
    else:
        doc_list = pd.read_pickle('../cache/doc_list.pkl')
        doc_list_te = pd.read_pickle('../cache/doc_list_te.pkl')
        doc_list.extend(doc_list_te)

        with codecs.open('../data/form-laws.txt', encoding='utf-8') as f:
            ls = f.readlines()
        doc_list.extend(ls)

        # cor = [ token_extract(remove_previous_convictions(line) ) for line in doc_list ]
        cor = [token_extract(line, ngram=[]) for line in doc_list]
        dictionary = corpora.Dictionary(cor)

        if ngram:
            cor = [token_extract(line, ngram=[2, 3]) for line in doc_list]
            dictionary2 = corpora.Dictionary(cor)
            once_ids = [
                tokenid for tokenid, docfreq in dictionary2.dfs.items()
                if docfreq < 100
            ]
            dictionary2.filter_tokens(once_ids)
            dictionary2.compactify()

            print('len dictionary = %s' %
                  len(dictionary))  # len dictionary = 125156
            dict2_to_dict1 = dictionary.merge_with(dictionary2)
        if filter:
            once_ids = set([
                tokenid for tokenid, docfreq in dictionary.dfs.items()
                if docfreq < 3
            ])
            dictionary.filter_tokens(once_ids.union(stop_words))
            dictionary.compactify()
        print('len dictionary = %s' %
              len(dictionary))  #len dictionary = 125156
        dictionary.save(dictionary_path)
        del doc_list, doc_list_te
        gc.collect()
    return dictionary
Example #16
    def transform_json(self, json_content):
        """ 将json存储为字典
        """
        result_dict = {}
        # topic_keywords_list eg: "悠闲": ["惬意", "休闲", "发呆", "放松", "宁静"]
        topic_keyword_dict = self.get_topic_keyword(
            pkg_resources.resource_filename(
                'HotelSentimentTagAnalysis.resource', 'topic_keyword.txt'))

        for json_line in json_content:
            for topic_word, keyword_critical_reverse in json_line.items():
                topic_word = topic_word.encode('utf8')
                # keywords associated with this topic
                keywords = set(topic_keyword_dict.get(topic_word, []))
                # critical words, e.g. 好 / 差 / 漂亮
                critical_word_set = set(
                    keyword_critical_reverse.get('critical_words', []))
                defaultValue = int(keyword_critical_reverse.get('default', 0))

                for each_keyword in keywords:
                    jieba.suggest_freq(each_keyword, True)
                    result_dict[each_keyword.decode('utf8')] = {
                        'topic_word': topic_word,
                        'default_value': defaultValue,
                        'critical_word_set': critical_word_set
                    }

        return result_dict
Example #17
    def load_suggest_words(self, suggestwords_file=None):
        """
        Argument:

        suggestwords_file : suggest words file name

        Loads the suggested words.
        """

        if type(suggestwords_file) is str:
            filename = suggestwords_file
        else:
            filename = util.get_default_suggest_words_file()

        f = codecs.open(filename, 'r', 'utf-8')
        for word in f:
            word = util.normalize_sentence(word.strip())
            if len(word):
                if '\t' in word:
                    w = word.split('\t')
                    jieba.suggest_freq((w[0].strip(), w[1].strip()), True)
                else:
                    jieba.suggest_freq(word.strip(), True)
Example #18
def jieba_diy():
    diywords_filepath = PathUtil().diy_words_filepath
    jieba.load_userdict(diywords_filepath)
    # print(psg.lcut('什么是职业分类表'))
    for word_iter in jieba_add_words():
        jieba.add_word(word_iter)
    jieba.suggest_freq(['那', '不买'], tune=True)
def suggest_words(file_name,seg_list):
    #with open("suggest.txt", 'r', encoding='UTF-8') as f:
    with open(file_name, 'r', encoding='UTF-8') as f:
        lines=f.readlines()
        for data in lines:
            data=data.strip()
            jieba.suggest_freq(data, True)
Example #21
def sep_words():
    sentence_treated_list = []
    stopwords = stopwords_list()
    jieba.load_userdict("/Users/chensx/Desktop/大学/毕业设计文献/THUOCL/THUOCL_it.txt")
    jieba.suggest_freq("下单", True)
    jieba.suggest_freq("WiFi", True)
    # Read review data
    sentence_csv = pd.read_csv('review_crawler/review.csv', index_col=0)
    sentence_list = sentence_csv['text']
    for sentence in sentence_list:
        sentence = sentence.rstrip('\n')
        sentence = emoji_filter(sentence)
        sentence_after = jieba.cut(sentence, cut_all=False)
        outstr = ''
        # If word in stopwords, delete
        for word in sentence_after:
            if word not in stopwords and word != ' ':
                outstr += word
                outstr += ' '
        print(outstr)
        sentence_treated_list.append(outstr.rstrip())

    # Write into csv
    data = pd.read_csv('review_crawler/review.csv', encoding='utf-8')
    data['after_treatment'] = sentence_treated_list
    data.to_csv('review_crawler/review.csv', encoding='utf-8', index=False)
Example #22
    def segment_test(self, types=2, mode=True, noThu=True):
        """
        word segmentation
        @param types: 0->pkuseg, 1->jieba, 2->jieba_v2, 3->thulac, 4->thulac_v2, 5->pkuseg_v2
        @param mode: True-> prediction, False-> test
        """
        origin_word = self.origin_word if mode else self.test_origin
        # origin_word = ['小儿脑性瘫痪']

        '''pkuseg'''
        seg = pkuseg.pkuseg(model_name='medicine')
        pos_pkuseg = [' '.join(seg.cut(ii)) for ii in origin_word]
        seg = pkuseg.pkuseg(model_name='medicine',
                            user_dict='{}medicine_dict.txt'.format(pickle_dir))
        pos_pkuseg_v2 = [' '.join(seg.cut(ii)) for ii in origin_word]

        '''jieba'''
        pos_jieba = [' '.join(jieba.cut(ii)) for ii in origin_word]
        jieba.load_userdict(self.medicine_dict)
        jieba.suggest_freq('$$_', True)
        for ii in self.del_dict:
            jieba.del_word(ii)
        pos_jieba_v2 = [' '.join(jieba.cut(ii)) for ii in origin_word]

        '''thulac'''
        if not noThu:
            thu1 = thulac.thulac(seg_only=True)
            pos_thulac = [thu1.cut(ii, text=True) for ii in origin_word]
            thu2 = thulac.thulac(
                seg_only=True, user_dict='%smedicine_dict.txt' % pickle_dir)
            pos_thulac_v2 = [thu2.cut(ii, text=True) for ii in origin_word]

        if not mode:
            print('Pkuseg\n', pos_pkuseg)
            self.evaluation_pos(pos_pkuseg, self.test_seg)
            print('Pkuseg & medicine\n', pos_pkuseg_v2)
            self.evaluation_pos(pos_pkuseg_v2, self.test_seg)
            print('Jieba\n', pos_jieba)
            self.evaluation_pos(pos_jieba, self.test_seg)
            print('Jieba & medicine\n', pos_jieba_v2)
            self.evaluation_pos(pos_jieba_v2, self.test_seg)
            if not noThu:
                print('Thulac\n', pos_thulac)
                self.evaluation_pos(pos_thulac, self.test_seg)
                print('Thulac & medicine\n', pos_thulac_v2)
                self.evaluation_pos(pos_thulac_v2, self.test_seg)
            print('Reference\n', self.test_seg)

        if not types:
            self.pos_word = pos_pkuseg
        elif types == 1:
            self.pos_word = pos_jieba
        elif types == 2:
            self.pos_word = pos_jieba_v2
        elif types == 3:
            self.pos_word = pos_thulac
        elif types == 4:
            self.pos_word = pos_thulac_v2
        elif types == 5:
            self.pos_word = pos_pkuseg_v2
Example #23
def go():
    comment_list = []
    for x in range(0, 50):
        url = 'https://movie.douban.com/subject/26931786/comments?start={}&limit=20&sort=new_score&status=P'.format(
            x * 20)
        text = requests.get(url).text
        html = etree.HTML(text)
        result = html.xpath("//div[@class='comment']//p//span/text()")
        if len(result) > 0:
            for comment in result:
                comment_list.append(comment)
    join = "".join(comment_list)
    jieba.suggest_freq(('钢铁侠'), True)
    cut_text = " ".join(jieba.cut(join))
    stopwords = set(STOPWORDS)
    stopwords.add('一部')
    stopwords.add('就是')
    stopwords.add('还是')
    stopwords.add('一个')
    stopwords.add('不过')
    stopwords.add('电影')
    wordcloud = WordCloud(stopwords=stopwords,
                          background_color="white",
                          width=1000,
                          font_path='simsun.ttc',
                          height=860,
                          margin=2).generate(cut_text)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
Example #24
def preprocess_regardless_stopwords():
    jieba.enable_paddle()
    for name in names:
        jieba.suggest_freq(name, tune=True)

    with codecs.open("corpus.txt", 'w', 'utf-8') as standard:
        standard.seek(0)
        standard.truncate()

        for novel in os.listdir('resources/'):
            path = 'resources/' + novel
            print("novel " + novel + " start loading")

            with open(path, 'r', encoding='utf-8') as f:
                text = f.read()
                sentences = re.split("(。|!|\!|\.|?|\?)", text)
                print("there are " + str(len(sentences)) +
                      " sentences in this novel")

                new_sents = []

                for i in range(int(len(sentences) / 2)):
                    sent = sentences[2 * i] + sentences[2 * i + 1]
                    new_sents.append(remove_punctuation(sent))

                for sent in new_sents:
                    if sent != '':
                        split_sent = ' '.join(
                            entity_mapping(jieba.cut(sent, use_paddle=True)))
                        standard.write(split_sent + '\n')
            print("novel " + novel + " finished")
Example #25
    def get_cipin(self):
        result = dict()
        cutResult = self.cut()
        jieba.suggest_freq(('服务', '质量'), True)
        resultList = list(cutResult)  # materialize the generator into a list
        resultSet = set(resultList)  # unique tokens
        # The list is materialized so it can be turned into a set; the set gives the
        # unique tokens to count in the loop below.
        # Build a dict of counts: each token is a key, its number of occurrences the value.
        resultForstatistic = dict()

        # Count only the words in the keyword list
        keywords = {'服务', '关系', '质量', '忠诚度', '满意', '顾客', '员工'}
        # keywords = resultSet  # count every segmented word instead
        for item in resultSet:
            resultForstatistic[item] = resultList.count(item)
        # Words missing from the counts dict get a count of 0.
        for keyword in keywords:
            try:
                keyCounts = resultForstatistic[keyword]
            except KeyError:
                keyCounts = 0
            result[keyword] = keyCounts
            # result.append((keyword, keyCounts))
        self.key_cipin = result
        return result
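A side note rather than part of the example: the same per-keyword counts can be produced in a single pass with collections.Counter; count_keywords below is an illustrative helper, not code from the project.

from collections import Counter

def count_keywords(tokens, keywords):
    counts = Counter(tokens)  # one pass instead of calling list.count() once per unique token
    return {kw: counts.get(kw, 0) for kw in keywords}

# count_keywords(jieba.cut(text), {'服务', '关系', '质量', '忠诚度', '满意', '顾客', '员工'})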
Example #26
File: main.py Project: cwscc/NLP
def adjust_jieba_dict(adjust_word_file):
    f = open(adjust_word_file, encoding='utf-8')
    adjust_list = f.readlines()
    for i in adjust_list:
        jieba.suggest_freq(i.strip(), True)

    f.close()
Example #27
 def other_discomm(self, third_house):
     # Collect all community names to use as stop words (PropertyCommunity values plus names extracted from HouseDesc)
     stop_word_comm = third_house['PropertyCommunity'].unique().tolist()
     # Extract community names from the description
     ext_comm = []
     cond = third_house['PropertyCommunity'].isnull()
     for desc in third_house[cond]['HouseDesc'].unique():
         pattern = re.compile('.*?·(.*?)\s')
         res = pattern.findall(desc)
         try:
             ext_comm.append(res[0])
         except:
             ext_comm.append(desc)
     # Merge the communities extracted from the descriptions with those taken directly from PropertyCommunity
     finally_comm = set()
     for comm in ext_comm + stop_word_comm:
         patt = re.compile('(.*?)[\(\(]')
         try:
             #         print(comm)
             res = patt.findall(comm)
             finally_comm.add(res[0])
         except:
             finally_comm.add(comm)
     lj_total_comm = pd.DataFrame(data=finally_comm, columns=['comm'])
     lj_total_comm.to_csv(self.community, index=None)
     # Read the stop-word file back so these names can be kept from being split
     stop_word_set = set(pd.read_csv(self.community)['comm'].to_list())
     # Also load it as a jieba user dictionary
     jieba.load_userdict(self.community)
     # Tune the dictionary so that each community name is kept as one token
     for word in stop_word_set:
         if word != None:
             jieba.suggest_freq(str(word), True)
Example #28
def add_org(filename):
    with open(filename,'r',encoding='utf8') as f:
        lines = f.readlines()
        for row in lines:
            row =row.strip().split(",")
            if len(row)==2:
                jieba.add_word(row[0],tag=row[1])
                jieba.suggest_freq(row[0])
def segmentation(sentence,lists=[]):
    jieba.suggest_freq(u"刘超", True)
    # Segment the sentence
    sentence = jieba.cut(sentence,cut_all=False,HMM=False)
    # Join with commas, then split back into a list
    format_sentence=",".join(sentence)
    lists=format_sentence.split(",")     
    return lists
Example #30
def wordseg(sentence, alter):
    """
    Word segmentation function
    :return:
    """
    jieba.suggest_freq(alter, True)
    seg_list = jieba.cut(sentence, cut_all=False)
    return " ".join(seg_list).split(" ")
Example #31
def load_suggest_freq():
    if os.path.exists('./suggest_freq.txt'):
        f = open('./suggest_freq.txt', 'r')
        lines = f.readlines()
        for line in lines:
            words = line.strip().split(' ')
            jieba.suggest_freq((words[0], words[1]), True)
        f.close()
Example #32
    def init_jieba(self, seg_dic, userdic):

        """
        jieba custom setting.
        """
        jieba.load_userdict(userdic)
        jieba.set_dictionary(seg_dic)
        with open(userdic,'r',encoding='utf-8') as input:
            for word in input:
                word = word.strip('\n')
                jieba.suggest_freq(word, True)
def __init():
    user_dict_path = os.path.join(root_filepath, "f_seg/user_dict.txt")
    jieba.load_userdict(user_dict_path)
    jieba.add_word("快递", 10000)
    jieba.suggest_freq(("面", "太厚"))
    jieba.suggest_freq(("价格", "便宜"))
    jieba.suggest_freq(("服务", "周到"))
    jieba.suggest_freq(("速度", "快"))
def segmentation(sentence,dicts=None):
    """
    Take a raw string and return its token list, updating the given frequency dict with unigram and bigram counts.
    """
    jieba.suggest_freq("BOS", True)
    jieba.suggest_freq("EOS", True)
    sentence = jieba.cut(sentence,HMM=False)
    # format_sentence=",".join(sentence)
    # (older approach: split the joined string on "," into word_list[])
    # lists=format_sentence.split(",")

    # Count frequencies: +1 if the word is already in the dict, otherwise start at 1
    lists = [t for t in sentence]
    if isinstance(dicts, dict):
        for index, word in enumerate(lists):
            if index != 0:
                # Bigram: record how often this word follows the previous one
                word_2 = '{}_{}'.format(lists[index-1], word)
                dicts.setdefault(word_2, 0)
                dicts[word_2] += 1
            dicts.setdefault(word, 0)
            dicts[word] += 1

    return lists
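A brief usage sketch for the function above (the sentence is only an illustration): the return value is the token list, and the dict passed in accumulates unigram counts plus bigram keys of the form previous_current.

freqs = {}
tokens = segmentation('今天天气不错', freqs)
# tokens -> the segmented word list
# freqs  -> a count per token, plus keys such as '今天_天气' for adjacent pairs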
Example #35
# Import WordCloud for building the word-cloud image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator



if __name__ == '__main__':
    # Collect all comments
    comments = []
    with open('/Users/youpeng/zhizhi/beastcancer/qas.txt', mode='r', encoding='utf-8') as f:
        rows = f.readlines()
        i=0
        for row in rows:
            comments.append(row)

    # Configure the segmenter
    jieba.suggest_freq('他莫昔芬', True)
    jieba.suggest_freq('他莫西芬', True)
    jieba.suggest_freq('它莫昔芬', True)
    jieba.suggest_freq('阿那曲唑', True)
    jieba.suggest_freq('诺雷德', True)
    jieba.suggest_freq('导管癌', True)
    jieba.suggest_freq('赫赛汀', True)
    jieba.suggest_freq('戈舍瑞林', True)




    comment_after_split = jieba.cut(str(comments), cut_all=False)  # accurate (non-full) mode, cut_all=False

    words = ' '.join(comment_after_split)  # join the tokens with spaces
    print(words)
Example #36
print( '/'.join( words ) )

print( "="*40 )

result = pseg.cut( test_sent )

for w in result:
    print( w.word, "/", w.flag, ", ", end = ' ' )

print( "\n" + "="*40 )

terms = jieba.cut( 'easy_install is great' )
print( '/'.join( terms ) )
terms = jieba.cut( 'python 的正则表达式是好用的' )
print( '/'.join( terms ) )

print( "="*40 )
# test frequency tune
testlist = [
( '今天天气不错', ( '今天', '天气' ) ),
( '如果放到post中将出错。', ( '中', '将' ) ),
( '我们中出了一个叛徒', ( '中', '出' ) ),
]

for sent, seg in testlist:
    print( '/'.join( jieba.cut( sent, HMM = False ) ) )
    word = ''.join( seg )
    print( '%s Before: %s, After: %s' % ( word, jieba.FREQ[word], jieba.suggest_freq( seg, True ) ) )
    print( '/'.join( jieba.cut( sent, HMM = False ) ) )
    print( "-"*40 )
# ### Adjusting the dictionary

# Use add_word(word, freq=None, tag=None) and del_word(word) to modify the dictionary dynamically at runtime.
# Use suggest_freq(segment, tune=True) to tune the frequency of a single word so that it can (or cannot) be segmented out.
# Note: the automatically computed frequency may not take effect when HMM new-word discovery is in use.

# In[7]:

print("/".join(jieba.cut("如果放到post中将出错。", HMM = False)))


# In[8]:

# Tune the frequency so that 中 and 将 are both segmented out
jieba.suggest_freq(("中", "将"), tune = True)


# In[9]:

print("/".join(jieba.cut("如果放到post中将出错。", HMM = False)))


# In[16]:

Original = "/".join(jieba.cut("江州市长江大桥参加了长江大桥的通车仪式。", HMM = False))
print "Original: " + Original


# In[21]:
Example #38
print('/'.join(terms))
terms = jieba.cut('python 的正则表达式是好用的')
print('/'.join(terms))

print("="*40)
# test frequency tune
testlist = [
('今天天气不错', ('今天', '天气')),
('如果放到post中将出错。', ('中', '将')),
('我们中出了一个叛徒', ('中', '出')),
]

for sent, seg in testlist:
    print('/'.join(jieba.cut(sent, HMM=False)))
    word = ''.join(seg)
    print('%s Before: %s, After: %s' % (word, jieba.get_FREQ(word), jieba.suggest_freq(seg, True)))
    print('/'.join(jieba.cut(sent, HMM=False)))
    print("-"*40)

# quit()
jieba.add_word('石墨烯')
seg_list = jieba.cut(p, cut_all=True)
print("Full Mode: " + "/ ".join(seg_list))  # 全模式

seg_list = jieba.cut(p, cut_all=False)
print("Default Mode: " + "/ ".join(seg_list))  # 精确模式

seg_list = jieba.cut(p)  # 默认是精确模式
print(", ".join(seg_list))

seg_list = jieba.cut_for_search(p)  # 搜索引擎模式
words = jieba.cut(test_sent)
#print('/'.join(words))

#print("="*40)

result = pseg.cut(test_sent)

#for w in result:
#    print(w.word, "/", w.flag, ", ", end=' ')

#print("\n" + "="*40)

terms = jieba.cut('easy_install is great')
#print('/'.join(terms))
terms = jieba.cut('python 的正则表达式是好用的')
#print('/'.join(terms))

#print("="*40)
# test frequency tune
testlist = [
('今天天气不错', ('今天', '天气')),
('如果放到post中将出错。', ('中', '将')),
('我们中出了一个叛徒', ('中', '出')),
]

for sent, seg in testlist:
#    print('/'.join(jieba.cut(sent, HMM=False)))
    word = ''.join(seg)
    print('%s Before: %s, After: %s' % (word, jieba.get_FREQ(word), jieba.suggest_freq(seg, True)))
    print('/'.join(jieba.cut(sent, HMM=False)))
#    print("-"*40)
Example #40
        '点击','客服','QQ','微信'

       ]

    file_clean2 = open("all_text_clean2.txt",'w')

    with open("all_text_clean.txt",'r') as f:
        text_clean  = f.readlines()
        f.close()

    for line in text_clean:
        #print line
        find = False
        for words in key_words:
            if line.find(words) != -1 :
                find = True
                break
        if not find:
            file_clean2.write(line)
    file_clean2.close()

    with open("all_text_clean2.txt",'r') as f:
        text_clean2  = f.read()
        f.close()
    jieba.suggest_freq('融魔方',True)
    tags = jieba.analyse.extract_tags(text_clean2, topK=20)
    print(",".join(tags))

if __name__ == "__main__":
    main()
#!/usr/bin/python3
# coding: utf-8
import jieba
##################################################################
## suggest_freq(segment, tune=True) tunes the frequency of a single word so it can (or cannot) be segmented out
# each call to suggest_freq() increases the stored frequency by 1
print(jieba.get_FREQ(('中', '将')))  # None
print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False)))  # 如果/放到/post/中将/出错/。
print(jieba.suggest_freq(('中', '将'), True))  # 494; this means the two characters of 中将 should be split apart
print(jieba.get_FREQ('中'), jieba.get_FREQ('将'))  # 243191 122305
print(jieba.get_FREQ('中', '将'))  # 243191; prints the frequency of 中 (the second argument is only a default)
print(jieba.get_FREQ(('中', '将')))  # None, not meaningful
print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False)))  # 如果/放到/post/中/将/出错/。

print(jieba.get_FREQ('台中'))
print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False)))  # 「/台/中/」/正确/应该/不会/被/切开
print(jieba.suggest_freq('台中', True))  # 69; the value grows if executed repeatedly...
print(jieba.get_FREQ('台中'))
print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False)))  # 「/台中/」/正确/应该/不会/被/切开
##################################################################
## "台中"总是被切成"台 中"; P(台中) < P(台) x P(中), "台中"词频不够导致其成词概率较低
# 解决方法: 强制调高词频
# jieba.add_word('台中') 或者 jieba.suggest_freq('台中', True)
##################################################################
## test frequency tune
testlist = [
    ('今天天气不错', ('今天', '天气')),
    ('如果放到post中将出错。', ('中', '将')),
    ('我们中出了一个叛徒', ('中', '出')),
]
for sent, seg in testlist:
    print('/'.join(jieba.cut(sent, HMM=False)))
    word = ''.join(seg)
    print('%s Before: %s, After: %s' % (word, jieba.get_FREQ(word), jieba.suggest_freq(seg, True)))
    print('/'.join(jieba.cut(sent, HMM=False)))
    print('-' * 40)

seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print("Default Mode: " + "/ ".join(seg_list))  # 默认模式

seg_list = jieba.cut("他来到了网易杭研大厦")
print(", ".join(seg_list))

seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")  # search-engine mode
print(", ".join(seg_list))

print('='*40)
print('2. 添加自定义词典/调整词典')
print('-'*40)

print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False)))
#如果/放到/post/中将/出错/。
print(jieba.suggest_freq(('中', '将'), True))
#494
print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False)))
#如果/放到/post/中/将/出错/。
print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False)))
#「/台/中/」/正确/应该/不会/被/切开
print(jieba.suggest_freq('台中', True))
#69
print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False)))
#「/台中/」/正确/应该/不会/被/切开

print('='*40)
print('3. 关键词提取')
print('-'*40)
print(' TF-IDF')
print('-'*40)
 def suggest_usr_dict(path):
     with open(path, 'r') as f:
         for l in f:
             word1, word2 = l.split(',')[0].rstrip(), l.split(',')[1].rstrip()
             jieba.suggest_freq((word1, word2), True)
f = open('/Users/xenia/Desktop/text.txt', 'r', buffering=-1, encoding='utf-8')
for line in f:
    print (line)
f.close()
"""

### Segment the Chinese document in content, write one word per line into revised, and count the total number of words (punctuation excluded)
import jieba
import sys
sys.path.append("../")
import jieba.posseg
import jieba.analyse

content = open('/Users/xenia/Desktop/txt_file/text_1_1225_policy.txt', 'rb').read()
revised = open('/Users/xenia/Desktop/txt_file/out.txt', 'w', buffering=-1, encoding='utf-8')
jieba.suggest_freq(('民進黨'), True)  # for new words or proper nouns, raise the frequency here (or use add_word)
words=jieba.cut(content, cut_all=False)
term = 0
for word in words:
    revised.write(word)
    revised.write('\n')
    if word not in {',', '。', ':', ';', '、', '「', '」', '?', '!', ' ', '\n', '(', ')', '”', '“'}:
        term = term + 1

revised.close()

### Search the segmented document in revised for a target word and compute its occurrence count and term frequency
### Because of a jieba limitation, counting 「我」 requires subtracting the count of 「我們」
import re
wordnumber = list()
regex = re.compile('立倫')  # target word goes here
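The counting step itself is cut off here; a minimal sketch of how it could continue, assuming out.txt holds one token per line (regex, term and the file path come from the snippet above, everything else is illustrative).

with open('/Users/xenia/Desktop/txt_file/out.txt', 'r', encoding='utf-8') as fin:
    tokens = fin.read().splitlines()

occurrences = sum(1 for w in tokens if regex.search(w))  # tokens containing the target word
tf = occurrences / term if term else 0                   # term = total word count computed above
print(occurrences, tf)
# For a short target such as 「我」, subtract the count of 「我們」 so the longer word is not counted twice.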
Example #45
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print("Default Mode: " + "/ ".join(seg_list))  # 默认模式

seg_list = jieba.cut("他来到了网易杭研大厦")
print(", ".join(seg_list))

seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")  # search-engine mode
print(", ".join(seg_list))

print("=" * 40)
print("2. 添加自定义词典/调整词典")
print("-" * 40)

print("/".join(jieba.cut("如果放到post中将出错。", HMM=False)))
# 如果/放到/post/中将/出错/。
print(jieba.suggest_freq(("中", "将"), True))
# 494
print("/".join(jieba.cut("如果放到post中将出错。", HMM=False)))
# 如果/放到/post/中/将/出错/。
print("/".join(jieba.cut("「台中」正确应该不会被切开", HMM=False)))
# 「/台/中/」/正确/应该/不会/被/切开
print(jieba.suggest_freq("台中", True))
# 69
print("/".join(jieba.cut("「台中」正确应该不会被切开", HMM=False)))
# 「/台中/」/正确/应该/不会/被/切开

print("=" * 40)
print("3. 关键词提取")
print("-" * 40)
print(" TF-IDF")
print("-" * 40)