Code example #1
    def fenci(self):
        print('开始分词...')
        fenciFileName = os.path.join(sys.path[0], self._city + '_分词结果.csv')
        CommentRecord = namedtuple(
            'CommentRecord',
            ['user', 'date', 'eval', 'star', 'votes', 'content'])

        analyse.set_stop_words(os.path.join(sys.path[0], '中文停用词表.txt'))
        content = []
        csvName = os.path.join(sys.path[0], self._city + '.csv')
        for emp in map(
                CommentRecord._make,
                csv.reader(open(csvName, mode='r', encoding='utf-8-sig'))):
            content.append(emp.content)
        tags = analyse.extract_tags(' '.join(content),
                                    topK=100,
                                    withWeight=True)
        with open(fenciFileName, 'w', encoding='utf-8-sig',
                  newline='') as csvfile:
            writer = csv.writer(csvfile)
            for word, weight in tags:
                writer.writerow([word, str(int(weight * 1000))])

        print('分词结束,保存结果在"%s"中...' % fenciFileName)
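The example above writes each keyword and its TF-IDF weight (scaled by 1000) to a CSV file. As a reference, here is a minimal, self-contained sketch of the same set_stop_words / extract_tags pattern; the stop-word file name and the sample sentence are assumptions, not part of the original project.

# Minimal sketch of the pattern used above (hypothetical file name and text).
import jieba.analyse as analyse

analyse.set_stop_words('stopwords.txt')  # assumed to exist: one stop word per line, UTF-8
text = '自然语言处理是人工智能领域的一个重要方向,自然语言处理应用广泛。'
tags = analyse.extract_tags(text, topK=5, withWeight=True)
for word, weight in tags:
    # weight is the TF-IDF score; example #1 stores int(weight * 1000) in its CSV
    print(word, round(weight, 4))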
Code example #2
def getCleanSubjiects(listSujects):
    objUtilString = StringUtil()
    # Stop words for the TF-IDF extractor
    jiebaAns.set_stop_words(PATH_JIEBA_STOP_WORDS)

    listRetSubjects = []
    for dictSubject in listSujects:
        # Start cleaning the data
        oriContent = dictSubject['txt_content']
        if oriContent:
            # Convert full-width characters to half-width
            cleanContent = objUtilString.tfQ2B(oriContent)
            # Strip symbols
            cleanContent = objUtilString.replaceSubjectTag(cleanContent)
            # Write the cleaned text back into the record
            dictSubject['txt_content_par'] = cleanContent
            # Add the keywords field
            listContentKeywords = jiebaAns.extract_tags(cleanContent.lower(),
                                                        topK=20,
                                                        withWeight=False)
            dictSubject['txt_content_keywords'] = ','.join(listContentKeywords)
            # Add the keyword-count field
            dictSubject['txt_content_keywords_len'] = len(listContentKeywords)
            listRetSubjects.append(dictSubject)
        else:
            dictSubject['txt_content_par'] = ''
            dictSubject['txt_content_keywords'] = ''
            dictSubject['txt_content_keywords_len'] = ''
            listRetSubjects.append(dictSubject)
    return listRetSubjects
Code example #3
def countCiYun():
    text = None
    with pymongo.MongoClient(host='127.0.0.1', port=27017) as client:
        comments = client.qichezhijia.qichezhijia1
        print('数据总条数count:', comments.estimated_document_count())
        # pymongo.cursor.Cursor
        cursor = comments.find()
        # Iterate over the documents; only the short comments are used (luckily the data set is not too large)
        text = ''.join(map(lambda doc: doc.get('comment'), cursor))

    # User-defined segmentation dictionary
    jieba.load_userdict(r'../analysis/user_dict.txt')
    # Stop-word (blocked keyword) list
    analyse.set_stop_words(r'../analysis/stopwords.txt')

    m = collections.Counter(text)
    tags = analyse.extract_tags(text, topK=30, withWeight=False)
    # All words that go into the word cloud
    new_text = ' '.join(tags)
    countFinalWordsList(text, new_text)

    # Generate the word cloud from the segmented text
    # A font that supports Chinese must be specified, otherwise Chinese words cannot be rendered
    wc = WordCloud(
        max_words=200,  # maximum number of words in the cloud
        width=1099,  # width and height of the word-cloud image
        height=724,
        # font for the word-cloud text (nicer output and avoids garbled Chinese)
        font_path=r'../example/fonts/FZXingKai-S04S.TTF').generate(new_text)

    # Plot (standard rectangular layout)
    pyplot.imshow(wc, interpolation='bilinear')
    pyplot.figure()
    pyplot.axis('off')
    wc.to_file(r'../static/images/wc.png')
Code example #4
File: catchkeywords.py Project: mozihu/project_3
def tfidf_extract(text, keyword_num=5):
    # Take the PDF text as a string and return keywords ranked by TF-IDF
    tfidf = analyse.extract_tags
    analyse.set_stop_words('stopwords.txt')
    keywords = tfidf(text, keyword_num)

    return keywords
Code example #5
def jiebase_content(readPath, JiebaPath, contentNums):
    fstop = "F:/graduationThesis/dataSet/test/stopWords.txt"
    stopwords = [line.strip("\r\n") for line in open(fstop, 'r').readlines()]
    fstop_sports = "F:/graduationThesis/dataSet/test/stopWords_sports.txt"
    stopwords_sports = [
        line.strip("\r\n") for line in open(fstop_sports, 'r').readlines()
    ]
    stopwordsSet_all = set(stopwords + stopwords_sports)
    f = open(readPath, 'r')
    fwrite = open(JiebaPath, 'w+')
    lineNums = 0
    analyse.set_stop_words(
        "F:/graduationThesis/dataSet/test/stopWords.txt")  # load the stop words
    for line in f.readlines():
        lineNums += 1
        lineList = line.strip("\r\n").split(":::")
        lineClass = lineList[0]
        lineTitle = lineList[1].split(" title ")[0]
        lineContent = lineList[1].split(" title ")[1]
        seg_title = analyse.extract_tags(lineTitle,
                                         topK=10,
                                         allowPOS=usenature)
        seg_content = analyse.extract_tags(lineContent,
                                           topK=contentNums,
                                           allowPOS=usenature)
        fwrite.write(lineClass.encode('utf-8') + ":::")
        for w in seg_title:
            tmpval = w.encode('utf-8')
            if (tmpval not in stopwordsSet_all):
                fwrite.write(tmpval + ",")
        for w_con in seg_content:
            tmpval_con = w_con.encode('utf-8')
            if (tmpval_con not in stopwordsSet_all):
                fwrite.write(tmpval_con + ",")
        fwrite.write("\n")
Code example #6
def jiebase(readPath, JiebaPath):
    fstop = "F:/englisgpaper2/text/words/stopWords.txt"
    stopwords = [line.strip("\r\n") for line in open(fstop, 'r').readlines()]
    fstop_sports = "F:/englisgpaper2/text/words/stopWords_sports.txt"
    stopwords_sports = [
        line.strip("\r\n") for line in open(fstop_sports, 'r').readlines()
    ]
    stopwordsSet_all = set(stopwords + stopwords_sports)
    f = open(readPath, 'r')
    fwrite = open(JiebaPath, 'w+')
    lineNums = 0
    analyse.set_stop_words("F:/englisgpaper2/text/words/stopWords.txt")  # load the stop words
    for line in f.readlines():
        lineNums += 1
        lineList = line.strip("\r\n").split(":::")
        lineClass = lineList[0]
        lineTitle = lineList[1].split(" title ")[0]
        seg_title = analyse.extract_tags(lineTitle, topK=7, allowPOS=usenature)
        # seg_title = list(jieba.cut(lineTitle, cut_all=False))
        fwrite.write(lineClass.encode('utf-8') + ":::")
        for w in seg_title:
            tmpval = w.encode('utf-8')
            # for key, values in synonymdic.items():
            #     if(tmpval in values):
            #         tmpval = key
            if (tmpval not in stopwordsSet_all):
                fwrite.write(tmpval + ",")
        fwrite.write("\n")
Code example #7
File: Keywords.py Project: STHSF/EventsParser
    def run(self, document):

        # tk = Tokenizer()
        # document = tk.token(document)
        # Keyword extraction based on the HanLP library
        print("[Info] keywords by Hanlp:")
        keywords_hanlp = HanLP.extractKeyword(document, 20)
        # print ",".join(keyword for keyword in keywords_hanlp)

        # Keyword extraction based on jieba
        # Register the stop words
        analyse.set_stop_words(self.stopwords_path)
        # TextRank keyword-extraction interface
        textrank = analyse.textrank
        print "[Info] keywords by textrank:"
        # keywords_jieba = textrank(document, 8, allowPOS=['n', 'nr', 'ns', 'vn', 'v'])
        # keywords_jieba = textrank(document, 20, withWeight=True)
        keywords_jieba = textrank(document, 20)
        # Print the extracted keywords
        # print ",".join(keyword for keyword in keywords_jieba)

        # Intersect the two keyword sets
        print "[Info] 两个关键词提取方法取交集:"
        join_set = set(keywords_hanlp).intersection(set(keywords_jieba))
        # print ",".join(item for item in join_set)
        return join_set
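The snippet above (Python 2) intersects HanLP keywords with jieba's TextRank keywords. A rough Python 3 sketch of the jieba-only part of that idea, intersecting TF-IDF and TextRank results instead, is shown below; the file paths are placeholders and HanLP is left out.

# Sketch: keywords agreed on by both of jieba's extractors (paths are placeholders)
import jieba.analyse as analyse

analyse.set_stop_words('stopwords.txt')
with open('document.txt', encoding='utf-8') as f:
    doc = f.read()
kw_tfidf = set(analyse.extract_tags(doc, topK=20))
kw_textrank = set(analyse.textrank(doc, topK=20))
print(kw_tfidf & kw_textrank)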
Code example #8
def CreateTagsByID(weibo_dir, dest_dir, emoji_path, customized_path, stopwords_path, topK = 40):
    '''
    Walk every file in the scraped_data folder, aggregate the posts that share the same ID,
    extract that ID's tags with TF-IDF, and save them in the user_tags folder.
    Args:
        weibo_dir: directory containing the scraped Weibo post data
        dest_dir: directory where each user's aggregated TF-IDF tags are saved
        emoji_path: path to the custom emoji dictionary (after merging)
        customized_path: path to the custom user dictionary (after merging)
        stopwords_path: path to the stop-word list (after merging)
        topK: keep at most topK tags
    Returns:
        None
    '''
    jieba.load_userdict(emoji_path)
    jieba.load_userdict(customized_path)
    analyse.set_stop_words(stopwords_path)
    jieba.enable_parallel()
    for filename in os.listdir(weibo_dir):
        res = defaultdict()
        if(filename[0] == "."):
            continue
        raw_data = pd.read_csv(weibo_dir + filename)
        text_per_uid = raw_data.groupby("uid")["weibotxt"].sum()
        for idx in text_per_uid.index:
            res[str(idx)] = jieba.analyse.extract_tags(text_per_uid[idx], topK = topK)
        with open(dest_dir + filename[:-4] + ".json", "w") as f:
            json.dump(res, f)
Code example #9
def countCiYun():

    comments = mongoutil.getCollection1()
    print('数据总条数count:', comments.estimated_document_count())
    # pymongo.cursor.Cursor
    cursor = comments.find()
    # Iterate over the documents and concatenate all comment text
    text = ''.join(map(lambda doc: doc.get('comment'), cursor))

    # User-defined segmentation dictionary
    jieba.load_userdict(r'../analysis/user_dict.txt')
    # Stop-word (blocked keyword) list
    analyse.set_stop_words(r'../analysis/stopwords.txt')

    m = collections.Counter(text)
    tags = analyse.extract_tags(text, topK=40, withWeight=False)
    # All words that go into the word cloud
    new_text = ' '.join(tags)
    #countFinalWordsList(text,new_text)

    # Generate the word cloud from the segmented text
    # A font that supports Chinese must be specified, otherwise Chinese words cannot be rendered
    wc = WordCloud(
        max_words=200,  # maximum number of words in the cloud
        width=1099,  # width and height of the word-cloud image
        height=724,
        # font for the word-cloud text (nicer output and avoids garbled Chinese)
        font_path=r'../example/fonts/FZXingKai-S04S.TTF').generate(new_text)

    # Plot (standard rectangular layout)
    pyplot.imshow(wc, interpolation='bilinear')
    pyplot.figure()
    pyplot.axis('off')
    wc.to_file(r'../static/images/wc_8_changanCS35PLUS.png')
Code example #10
def main():
    # 0. Background image
    back_color = imageio.imread(back_img)  # load the mask image
    # 1. Read the txt text data
    text = get_text()
    # 2. Segment with jieba (accurate mode by default). A custom dictionary userdict.txt can be added with jieba.load_userdict(file_name), where file_name is a file-like object or the path to the dictionary
    # The custom dictionary uses the same format as the default dict.txt: one word per line, each line holding the word, its frequency (optional) and its POS tag (optional), separated by spaces, in that order

    # cut_text = jieba.cut(text)
    # result = "/".join(cut_text)  # the tokens must be joined by a separator to form a string, otherwise the word cloud cannot be drawn
    analyse.set_stop_words("stopwords.txt")
    # withWeight=True returns each term's weight, e.g. [('aa', 0.23), ('a', 0.11)]
    # without it (or with withWeight=False) the result is a plain list like ['a', 'b']
    result = analyse.extract_tags(text, topK=100, withWeight=True)

    # 3. Build the word cloud. Note that WordCloud does not support Chinese by default, so a downloaded Chinese font is needed
    # Without a mask image the pixel size must be given; the default background colour is black; uniform text colour via mode='RGBA' and colormap='pink'

    wc = WordCloud(
        font_path=r"msyh.ttc",
        background_color='white',
        width=800,
        height=600,
        max_font_size=50,
        mask=back_color,  # draw the cloud inside this mask; when it is set, width and height are ignored
        max_words=200)  # ,min_font_size=10)#,mode='RGBA',colormap='pink')
    # wc.generate(result)
    wc.fit_words(dict(result))
    # Generate colours based on the background image
    image_colors = ImageColorGenerator(back_color)
    # 4. Display the image
    plt.imshow(wc)  # show the word cloud as an image
    plt.figure("词云图")  # name the figure
    plt.axis("off")  # hide the axes
    wc.to_file("result.png")  # save the cloud at the configured pixel size (sharper than the on-screen figure)
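The comments in the example above describe both the user-dictionary format and the shape of the withWeight=True output. A compact sketch of the same idea, feeding the weighted tags straight into WordCloud.fit_words, assuming the font and stop-word files exist:

# Sketch: weighted tags -> WordCloud.fit_words (all file names are assumptions)
import jieba.analyse as analyse
from wordcloud import WordCloud

analyse.set_stop_words('stopwords.txt')
with open('input.txt', encoding='utf-8') as f:
    text = f.read()
weighted = analyse.extract_tags(text, topK=100, withWeight=True)  # [('词', 0.23), ...]
wc = WordCloud(font_path='msyh.ttc', width=800, height=600, background_color='white')
wc.fit_words(dict(weighted))  # fit_words expects a {word: weight} mapping
wc.to_file('cloud.png')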
Code example #11
File: catchkeywords.py Project: mozihu/project_3
def textrank_extract(text, keyword_num=5):
    # Take the PDF text as a string and return keywords ranked by TextRank
    textrank = analyse.textrank
    analyse.set_stop_words('stopwords.txt')
    keywords = textrank(text, keyword_num)

    return keywords
Code example #12
def main():
    stopwords = [line.rstrip() for line in open('stopwords_gbk.txt', 'r')]
    # Use a custom stop-word list
    analyse.set_stop_words("stopwords_utf-8.txt")
    # Read the collection from the database
    table = read_from_mongodb()
    # Segment every article and collect all resulting words
    news_content_cut_all = cut_words(table, stopwords)
    # Count all segmented words
    all_words_count = count_words(news_content_cut_all)
    # print(all_words_count)
    # Extract the keywords
    keywords = get_keywords(table)
    # Convert the keywords to a DataFrame
    # keywords_dataframe = DataFrame(keywords[1])
    # keywords_dataframe.to_csv('../keywords_content.csv') # save
    # Count the extracted keywords
    keywords_count = count_words(keywords[0])
    print(keywords_count)
    # Read the indicator file and convert it to a dict for classifying words later
    words_index_dict = get_index_words(filepath='words_index.xlsx')
    # Classify all keywords by indicator
    classify_words_results = classify_words(words_index_dict, keywords_count)
    n = 0
    index_five = []
    for key, values in classify_words_results.items():
        print(key, 'values:', values)
        n += 1
        text_save(values, filename='index_set\{name}指标.txt'.format(name=key))

        if len(values) > 3:
            index_five = index_five + values[:3]
        else:
            index_five = index_five + values
    # Statistics for indicator 5
    text_save(index_five, filename=r'analysis_data\wordcloud.txt')
    print('指标个数:', n)
    print('classify_words_results:', classify_words_results)
    # Count all news sources
    medias = count_medias(table)
    count_list_sort = count_words(medias)
    print(count_list_sort)
    n = 0
    for i in count_list_sort:
        if i[1] < 5:
            n += 1
    print(n)
    text_save(count_list_sort[:20], filename='news_source.txt')

    # Statistics for indicator 1
    total_count(table, medias, filepath='index_set\市级地点指标.txt')
    # Statistics for indicator 2
    top10_medias(count_list_sort)
    # Statistics for indicator 3
    news_timeseries(
        table, count_list_sort=count_list_sort, top_n=10, freq='A-DEC'
    )  # 'M' = monthly, 'W' = weekly, 'Q-DEC' = standard quarters, 'A-DEC' = years ending on the last calendar day of December
    # Total news volume for indicator 11
    news_timeseries(table, top_n=0, freq='A-DEC')
Code example #13
    def __textrank_keyword_extractor(self,
                                     seg_list,
                                     keyword_num,
                                     stopword_file=None):
        if stopword_file is not None:
            analyse.set_stop_words(stopword_file)
        textrank = analyse.textrank

        return textrank("".join(seg_list), keyword_num)
Code example #14
        def extract_keywords(x):
            uid = x[0]
            # prod
            # ja.set_stop_words("/home/spark/hxkTest/movie_data/stopwords_cn.txt")
            ja.set_stop_words("/home/spark/hxkTest/spark_script/weibo_user_summary/stopwords_cn.txt")

            # local test
            # ja.set_stop_words("stopwords_cn.txt")

            keywords = ja.extract_tags(x[1])
            return (uid, keywords)
Code example #15
File: Functions.py Project: Meowwwwwwwww/HeyCar
def wipeoff(car_comments_list):
    ana.set_stop_words("C:\\Users\\adminsistrator\\Desktop\\汽车销售数据文件\\停用词.txt")
    jieba_wipeoff_list = []
    for comment_list in car_comments_list:
        jieba_one_list = []
        for comments in comment_list:
            after_jieba = ana.extract_tags(comments, topK=20)
            for word in after_jieba:
                jieba_one_list.append(word)
        jieba_wipeoff_list.append(jieba_one_list)

    return jieba_wipeoff_list
Code example #16
def textrank_extract(text, keyword_num):
    textrank = analyse.textrank
    analyse.set_stop_words(stopwords_path)
    keywords = textrank(text, keyword_num)
    # Format the extracted keywords for output
    word1 = ""
    count = 1
    for keyword in keywords:
        if (count == len(keywords)):
            word1 = word1 + keyword
        else:
            word1 = word1 + (keyword + "/ ")
        count += 1
    return word1
Code example #17
def feature_laws():
    # law_name = ""
    #
    # part_no = 0
    # part_feature = []
    #
    # chapter_no = 0
    # chapter_feature = []
    #
    # section_no = 0
    # section_feature = []
    #
    # article_no = 0
    # article_feature = []

    ana.set_stop_words("./stopwords_law_feature.txt")
    laws = collection_law.find({}, {"_id": 0})
    for law in laws:
        # print json.dumps(law, ensure_ascii=False)
        law_name = law[u"名称"]
        print law_name
        # feature = ana.extract_tags(law_name, allowPOS=('n', 'v'))
        # print json.dumps(feature, ensure_ascii=False)
        parts = law[u"编"]
        for part in parts:
            part_no = part[u"编号"]
            part_feature = get_feature(part_no, part)
            # if part_feature:
            #     print json.dumps(part_feature, ensure_ascii=False)
            chapters = part[u"章"]
            for chapter in chapters:
                chapter_no = chapter[u"章号"]
                chapter_feature = get_feature(chapter_no, chapter)
                # if chapter_feature:
                #     print json.dumps(chapter_feature, ensure_ascii=False)
                # default to an empty list so the loop below is skipped when a chapter has no "节"
                sections = []
                try:
                    sections = chapter[u"节"]
                except KeyError:
                    print chapter_no
                for section in sections:
                    section_no = section[u'节号']
                    section_feature = get_feature(section_no, section)
                    articles = section[u"条"]
                    for article in articles:
                        article_no = article[u"条号"]
                        article_feature = get_feature(article_no, article)
                        item = {"法名": law_name, "编号": part_no, "章号": chapter_no, \
                                "节号": section_no, "条号": article_no, "词": article_feature}
                        collection_article_featured.insert(item)
Code example #18
File: word_cloud.py Project: cuiyulin77/other
def get_words_weight(this_today, tomorrow_day, es, key_list, filler):

    # Note: the parameter is "filler", not "filter"; filter is a Python built-in name
    if filler == '2':
        must_list = []
        for key in key_list:
            key_term = {"term": {"content": key}}
            must_list.append(key_term)

        query_json = {
            "bool": {
                "must": must_list, "filter": [{"range": {"publish_time": {
                    "gte": this_today,
                    "lt": tomorrow_day}}}]
            }
        }
    else:
        query_json = {
            "bool": {
                "must": [
                    {"terms": {
                        "content": key_list
                    }}
                ], "filter": [{"range": {"publish_time": {
                    "gte": this_today,
                    "lt": tomorrow_day}}}]
            }
        }

    res = es.search(index="spider", doc_type='article', body={"query": query_json, "size": 0})
    res_num = res["hits"]["total"]
    print('文章数量',res_num)
    txts = es.search(index='spider', doc_type='article', body={'query': query_json, 'size': res_num})
    hits = txts['hits']['hits']
    contents = []
    for hit in hits:
        # print(hit['_source']['content'],'\n')
        contents.append(hit['_source']['content'])
    # print(''.join(contents))

    words_str = ''.join(contents).replace("\n", '').replace(" ", '')

    txt_path = os.path.dirname(os.path.abspath(__file__))+'/停用词.txt'
    ana.set_stop_words(txt_path)
    words_list = ana.extract_tags(words_str, topK=20, withWeight=True)
    # print(len(words_list))
    print(words_list)

    return words_list
Code example #19
def extract_keywords(articles, savePath, key_wordNum=10, isSave=False):
    corpus = []
    print(len(articles))
    analyse.set_stop_words('./停用词.txt')  # load the stop words once instead of per article
    for article in articles:
        key_list = analyse.extract_tags(article, topK=key_wordNum)
        # stop_list = readDictList('./停用词.txt')
        # key_list = [w for w in jieba.cut(article) if w not in stop_list]
        # print(key_list)
        corpus.append(key_list)
        key_words = ','.join(key_list)
        if isSave:
            with open(savePath, 'a') as fp:
                fp.write(key_words + '\r\n')
    return corpus
Code example #20
def frequency(text, topN):
    """
    TD-IDF算法进行词频统计
    :param text: 待统计文本
    :return: 关键词及词频
    """

    # 加载停用词
    set_stop_words("stopWordDict.txt")

    # 提取词频前N个的关键词存储到列表tags
    tags = extract_tags(sentence=text, topK=topN, withWeight=True)

    for item in tags:
        print(item[0], item[1])
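A possible call to the frequency() helper above might look like this; the sample sentence is made up and stopWordDict.txt is assumed to be present next to the script.

# Usage sketch for frequency() (hypothetical input text)
sample = '机器学习和深度学习是人工智能的两个核心方向,机器学习的应用非常广泛。'
frequency(sample, topN=5)  # prints each keyword together with its TF-IDF weight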
Code example #21
def analyse_textRank(matierals, extracts):
    """textRank算法下的关键词提取"""
    for extract in extracts:
        matirel = matierals[extract]
        text = ""
        text_item = ""
        for texts in matirel:
            text_item = "".join(texts)
        text += text_item
        text += " "
        anls.set_stop_words('../source/stopwords.txt')
        tags = anls.textrank(text, topK=100, withWeight=True)
        with open("../source/textrank.txt", "w", encoding="utf-8") as f:
            f.write("分析的章节:{}\n".format(extract))
            f.write("\n".join(
                ["{},rate:{}".format(tag[0], tag[1]) for tag in tags]))
Code example #22
File: text.py Project: actlea/TopicalCrawler
    def tf_if_parse(content, keywords=None, topK = 50):
        """ keywords must be include
        """
        import jieba.analyse as analyse
        import jieba

        tfidf_path = os.path.join(resource_dir,'resources','text','tf-idf.txt')

        user_dict_path = os.path.join(resource_dir,'resources','text','user_dict.txt')
        stopwords_path = os.path.join(resource_dir,'resources','text','stopwords-zh.txt')

        jieba.load_userdict(user_dict_path)
        analyse.set_stop_words(stopwords_path)
        analyse.set_idf_path(tfidf_path)
        tags = analyse.extract_tags(content, topK=topK)
        return tags
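Example #22 also swaps in a custom IDF corpus via analyse.set_idf_path. As far as I can tell, the IDF file follows the same layout as jieba's bundled idf.txt, one "word idf_value" pair per line; the sketch below builds a tiny one on the fly, and the file names and values are invented for illustration.

# Sketch: custom IDF corpus (format assumed to match jieba's bundled idf.txt)
import jieba.analyse as analyse

with open('my_idf.txt', 'w', encoding='utf-8') as f:
    f.write('汽车 8.5\n')      # one "word idf" pair per line
    f.write('发动机 10.2\n')

analyse.set_idf_path('my_idf.txt')
print(analyse.extract_tags('这款汽车的发动机噪音很小', topK=3, withWeight=True))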
Code example #23
File: JBNLPImplement.py Project: gaohuan2020/KBQA
 def KeyWordsExtraction(self, sentence):
     if sentence == '' or sentence is None:
         return []
     if config.KeyWordsExtractionConfig is not None:
         if "stopWordsPath" in config.KeyWordsExtractionConfig:
             if config.KeyWordsExtractionConfig["stopWordsPath"] != "":
                 stopWordsPath = config.KeyWordsExtractionConfig["stopWordsPath"]
                 if os.path.exists(stopWordsPath):
                     analyse.set_stop_words(stopWordsPath)
         if "topK" in config.KeyWordsExtractionConfig:        
             if config.KeyWordsExtractionConfig["topK"] != "" and config.KeyWordsExtractionConfig["topK"] > 0:
                 self.topk = int(config.KeyWordsExtractionConfig["topK"])
         if "withWeight" in config.KeyWordsExtractionConfig:  
             if config.KeyWordsExtractionConfig["withWeight"] != "":
                 self.withWeight = int(config.KeyWordsExtractionConfig["withWeight"])
     return analyse.extract_tags(sentence, topK=self.topk, withWeight=self.withWeight)
Code example #24
def make_word_vector(AJJBQK):
    ana.set_stop_words("../data/stop_words.txt")
    tags = ana.extract_tags(AJJBQK,
                            topK=100,
                            withWeight=True,
                            allowPOS=('n', 'v'))
    text_vector = {"word": [], "vector": []}
    for tag in tags:
        word = tag[0]
        tf = tag[1]
        idf = get_idf("law_forecast_minshi1", "doc_word_reference", word)
        # print idf, word
        tf_idf = tf * idf
        text_vector['word'].append(word)
        text_vector['vector'].append(tf_idf)
        # print word, tf_idf
    return text_vector
Code example #25
def tf_idf(filename):
    analyse.set_stop_words(
        'D:\\Python36 Project\\WuHanNLP_Dev\\stop_word\\1915stopwords.txt')
    with open(filename, 'rb') as f:
        data = json.loads(f.read())
        record = data['RECORDS']
        comment = []
        for item in record:
            print(item['id'])
            single_comment = item['comment_content']
            comment.append(single_comment)
    complete_text = ','.join(comment)
    seg = analyse.extract_tags(complete_text,
                               topK=20,
                               withWeight=True,
                               allowPOS=())
    for tag, weight in seg:
        print("%s %s" % (tag, weight))
Code example #26
File: weibo.py Project: qinyuanpei/weibo-spider
    def generateWordsFrequency(self, text):
        freqs = {}
        for token in text.split(','):
            if token in freqs:
                freqs[token] += 1
            else:
                freqs[token] = 1
        print(freqs)

        analyse.set_stop_words("stopword.txt")
        # analyse.set_idf_path('idf.txt')
        tags = analyse.extract_tags(text,
                                    topK=100,
                                    withWeight=True,
                                    allowPOS=('ns', 'n', 'vn', 'v'))
        for tag in tags:
            print(tag)
        for x, w in jieba.analyse.textrank(text,
                                           withWeight=True,
                                           allowPOS=('ns', 'n', 'vn', 'v')):
            print('%s %s' % (x, w))
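The method above runs both extract_tags and textrank over the same text with the same POS filter. A stripped-down sketch of that comparison, with a made-up sentence and without the stop-word file:

# Sketch: TF-IDF vs TextRank on the same text with a POS filter (sentence is made up)
import jieba.analyse as analyse

text = '上海是中国的经济中心,吸引了大量企业和人才来到这座城市发展。'
pos = ('ns', 'n', 'vn', 'v')
print(analyse.extract_tags(text, topK=10, allowPOS=pos))  # TF-IDF ranking
print(analyse.textrank(text, topK=10, allowPOS=pos))      # TextRank ranking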
Code example #27
def warning():
    result = {"pro": 0.0, "url": ""}
    code = request.args.get("keyword", '600000')
    date = datetime.strftime(datetime.now() - timedelta(days=180), "%Y-%m-%d")
    qs = db.session.query(Sh_A_Share.title).filter(Sh_A_Share.bulletindate >= date,
                                                   Sh_A_Share.stockcode == code).all()
    data = ''
    for i in qs:
        data = data + i.title + '\n'
    if data:
        jieba.load_userdict(os.path.join(os.path.join(os.path.dirname(__file__), 'static'), 'CompanyName.txt'))
        analyse.set_stop_words(os.path.join(os.path.join(os.path.dirname(__file__), 'static'), 'StopWords.txt'))
        cut_keyword = jieba.analyse.textrank(data, topK=100, withWeight=True, allowPOS=('n', 'g', 'a', 'ad', 'an'))
        keywords = dict()
        for key in cut_keyword:
            keywords[key[0]] = key[1]
        result["pro"] = calculate_probability(keywords=keywords)
        result["url"] = "http://39.108.60.79/image/{}.png".format(code)
        cloud(code=code, keywords=keywords)

    return result
Code example #28
def build_revise_ranking_dict(tablename):
    """
    建立倒排表
    :return: List(tuple(str,int))
    """
    import jieba
    import jieba.analyse as analyse
    jieba.load_userdict(userdict)
    analyse.set_stop_words(stopwords_path)

    retrieval_index = []
    sql = "SELECT que_id, content from %s" % tablename
    mycursor.execute(sql)
    data = mycursor.fetchall()

    for i, val in data:
        words = list(analyse.extract_tags(val.strip(), topK=20))

        for w in words:
            retrieval_index.append((w, int(i)))

    return retrieval_index
Code example #29
def ciYunType(cartype, msgtype):
    print 'cartype=' + cartype + ',msgtype=' + msgtype
    comments = mongoutil.getCollectionKoubei()
    cursor = comments.find({"type": cartype})
    typestr = cartype + '_' + msgtype
    print '车型消息类型typestr=' + typestr
    picname = 'wc_' + typestr + '.png'
    print '图片名称=' + picname
    text = ''.join(map(lambda doc: doc.get(msgtype + '_msg'), cursor))
    print text
    jieba.load_userdict(r'../config/userdict.txt')
    analyse.set_stop_words(r'../config/stopwords_' + msgtype + '.txt')

    m = collections.Counter(text)
    tags = analyse.extract_tags(text, topK=40, withWeight=False)
    # All words that go into the word cloud
    new_text = ' '.join(tags)
    validtext = new_text.replace(' ', '')
    if validtext is None or validtext == '':
        return
    # circular-mask setup (begin)
    #x,y = np.ogrid[:300,:300]
    #mask = (x-150) ** 2 + (y-150) ** 2 > 130 ** 2
    #mask = 355 * mask.astype(int)
    # circular-mask setup (end)

    wc = WordCloud(
        max_words=200,
        background_color='white',
        scale=1,  # defaults to 1; effectively the pixel density of the generated image: larger values give a denser, sharper picture
        width=1099,
        height=724,
        font_path=r'../example/fonts/FZXingKai-S04S.TTF').generate(new_text)

    # Plot (standard rectangular layout)
    pyplot.imshow(wc, interpolation='bilinear')
    pyplot.figure()
    pyplot.axis('off')
    wc.to_file(r'../static/images/' + picname)
Code example #30
    def __init__(self,
                 idf_path: str = None,
                 user_dict_path: str = os.path.join(curr_dir, 'userdict.txt'),
                 stop_words_path: str = os.path.join(curr_dir,
                                                     'stop_words.txt'),
                 default_method: str = 'jieba.extract_tags'):
        """
        Methods:

        tfidf: customized TFIDF
        jieba.textrank: jieba's textrank
        jieba.extract_tags: jieba's TF-IDF (the same function as jieba.tfidf)
        jieba.tfidf: jieba's TF-IDF
        """
        if user_dict_path:
            jieba.load_userdict(user_dict_path)
        if idf_path:
            analyse.set_idf_path(idf_path)
        if stop_words_path:
            analyse.set_stop_words(stop_words_path)

        self.default_method = default_method
Code example #31
def jiebaSe(featureNums, fPath):
    f = open('F:/graduationThesis/dataSet/corpus4_sougou/sports.txt', 'r')
    fwrite = open(fPath, 'w+')
    keyWordsSet = set()
    linenums = 1
    lines = f.readlines()
    analyse.set_stop_words(
        "F:/graduationThesis/dataSet/test/stopWords.txt")  # load once, outside the loop
    for i in range(18000, len(lines)):
        if (linenums >= 2000):
            break
        line = lines[i]
        linenums += 1
        seg_list = analyse.extract_tags(line,
                                        topK=featureNums,
                                        withWeight=True,
                                        allowPOS=usenature)
        for i in range(len(seg_list)):
            keyWordsSet.add(seg_list[i][0].encode('utf-8'))
            fwrite.write(seg_list[i][0].encode('utf-8') + ":" +
                         str(round(seg_list[i][1], 2)) + ",")
        fwrite.write("\n")
    return list(set(keyWordsSet))
Code example #32
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer , TfidfTransformer
from sklearn.decomposition import NMF
from collections import defaultdict
from bs4 import BeautifulSoup, NavigableString

from lxml.html.soupparser import fromstring
from bs4 import UnicodeDammit

import lxml.etree as ET

import csv  # needed for csv.DictReader below

import jieba
import jieba.posseg as pseg
from jieba import analyse

jieba.load_userdict("new.dict_all")
analyse.set_stop_words('./stopword.txt')

# import jieba.analyse
# jieba.analyse.load_stop_words("stop_words_list.txt")


def uri_to_file_name(uri):
    return uri.replace("/", "-")

sessions = {}

xpath_abstract = '''//div[@class='panel-body']/div[1]/text()'''

with open("data/sc.csv", "r") as sessions_file:
    for row in csv.DictReader(sessions_file, ['title', 'link', 'speaker']):  
        session_id = (row['title'])
Code example #33
File: pb.py Project: lqe/scrapyStudy
 #coding:utf-8

import codecs
import re
import hashlib
from collections import defaultdict
import jieba
import jieba.analyse
from jieba.analyse import set_stop_words
set_stop_words('stopword.txt')

def gen_keys_word_pbTXT():
    stop_word = set()
    with codecs.open('stopword.txt','rb','utf-8') as f:
        for line in f:
            stop_word.add(line.strip())
    contentLis = []
    an_contentLis = []
    with codecs.open('newsDataBref.txt','rb','utf-8') as f:
        for line in f:
            tmp = line.split('\t')
            if len(tmp) != 2:
                continue
            if tmp[0].strip() == '1':
                contentLis.append(re.sub(r'[<>\w0-9a-zA-Z]','',tmp[1].strip()))
            else:
                an_contentLis.append(re.sub(r'[<>\w0-9a-zA-Z]','',tmp[1].strip()))
    content = ''.join(contentLis)
    wordsGen = jieba.cut(content, cut_all=True)
    wordMap = defaultdict(int)
    for word in wordsGen:
Code example #34
File: text.py Project: actlea/TopicalCrawler
        stripped_input = content.decode('utf-8').translate(trans_table)

        return stripped_input

    def candidate_words(self, stripped_input):
        return stripped_input.split(' ')


import jieba
import jieba.analyse as analyse

user_dict_path = os.path.join(resource_dir,'resources','text','user_dict.txt')
stopwords_path = os.path.join(resource_dir,'resources','text','stopwords-zh.txt')

jieba.load_userdict(user_dict_path)
analyse.set_stop_words(stopwords_path)

class StopWordsChinese(StopWords):
    """Chinese segmentation
    """
    def __init__(self, language='zh'):
        super(StopWordsChinese, self).__init__(language='zh')

    def candidate_words(self, stripped_input):
        # jieba builds a tree that takes a while. avoid building
        # this tree if we don't use the chinese language

        return jieba.cut(stripped_input, cut_all=True)


def seg_text(content):
Code example #35
File: demo.py Project: denisyq/code
jieba.set_dictionary('dict.txt.big')#jieba use new dict words to cut
jieba.load_userdict('userdict.txt')#user dict defines specific words

# POS tagging
import jieba.posseg as pseg
words = pseg.cut(f.read().strip())
for word in words:
    print word.word, word.flag  # the word and its POS tag (adjective, etc.)

#jieba TF-IDF
import jieba
from jieba import analyse
tf_idf = analyse.extract_tags
tags = jieba.analyse.extract_tags(sentence, topK=20, withWeight=False, allowPOS=())  # text to analyse, return the topK keywords, withWeight controls whether weights are returned as well (note: the keyword is allowPOS, not allowPos)
# For this feature jieba falls back on its own stop-word list and its own IDF corpus, so it cannot be used where high precision is required
analyse.set_stop_words("stop_words.txt")  # set your own stop words
analyse.set_idf_path(file_name)
keywords = tf_idf(text)

def stopWordsList(filepath):
    stopwords=[ line.strip() for line in open(filepath,'r',encoding='utf-8').readlines()]
    return stopwords

def seg_sentence(line):
    sentence_seg = jieba.cut_for_search(line.strip())
    stopwords = stopWordsList('/home/luyq/nlp/stopWords_ch.txt')
    outStr=""
    for word in sentence_seg:
        if word not in stopwords:
            if word != '\t':
                outStr += word
Code example #36
File: cut_words.py Project: lqe/scrapyStudy
import json
import jieba
import jieba.analyse


# Clear all segmentation results: set cut_words = null
# While there are still rows where cut_words is null:
#    fetch the article, segment it, and update the cut_words field
#
db = BaseModels().get_db()
cur = db.cursor()


try:
    from jieba.analyse import set_stop_words
    set_stop_words('tmp/stopword.txt')
    print '加载 停用词'
except:
    print 'add stop_words error'
    pass

words=set()
cur.execute("update news set `cut_words` = null;") # 清空所有的分词
cur.execute("select count(1) from news where `cut_words` is null;")
total = cur.fetchone()[0]
skip, length = 0, 100
finished = 0
try:
    while True:
        affected_rows = cur.execute("select id, content from news where cut_words is null limit %s,%s", (skip, length))
        if affected_rows == 0:
Code example #37
File: clustering.py Project: wlf061/nlp
    x = [transformedCorpus2Vec(item, num_topics) for item in x]

    # Standardise the features
    x = StandardScaler().fit_transform(x)

    # Cluster analysis
    #do_cluster_dbscan(x, articles_content)
    do_cluster(x, articles_content)







if __name__ == '__main__':

    # Set the global stop-word list
    analyse.set_stop_words("stopwords.txt")

    do_lda_usemulticore()







Code example #38
File: text.py Project: actlea/TopicalCrawler
def load_stopwords(stopword_path):
    import jieba.analyse as analyse
    stopwords_path = os.path.join(resource_dir, 'resources', 'text', 'stopwords-zh.txt')
    analyse.set_stop_words(stopwords_path)
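Taken together, these examples repeat one configuration pattern: load an optional user dictionary, register stop words, optionally swap in a custom IDF corpus, then extract keywords. A consolidated sketch of that pattern, with placeholder file names:

# Consolidated sketch of the recurring setup (all paths are placeholders)
import jieba
import jieba.analyse as analyse

jieba.load_userdict('user_dict.txt')     # optional: domain-specific vocabulary
analyse.set_stop_words('stopwords.txt')  # words to exclude from the keyword ranking
# analyse.set_idf_path('idf.txt')        # optional: custom IDF corpus for extract_tags


def keywords(text, k=20, use_textrank=False):
    """Return the top-k keywords with the configured stop words applied."""
    if use_textrank:
        return analyse.textrank(text, topK=k)
    return analyse.extract_tags(text, topK=k)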