Example #1
import multiprocessing
import time

import jieba
import pandas as pd
from jieba import analyse


def getkeywords():

    jieba.load_userdict("../user_dic")
    analyse.set_idf_path("idf.txt")  # the argument is the path to a custom IDF corpus
    #    analyse.set_stop_words("stop_words.txt")  # the argument is the path to a custom stop-word file

    string = pd.read_csv('100w.csv', header=None)
    data = string.loc[:, :1].dropna()
    clfs = data[0].unique()

    k = len(clfs) // 12  # process the categories in batches of 12 worker processes
    for i in range(k):
        print("...")
        t1 = time.time()
        Processes = []
        if i == k - 1:
            # last batch: take all remaining categories
            for clf in clfs[i * 12:]:
                pc = multiprocessing.Process(target=extract, args=(clf, data))
                Processes.append(pc)
        else:
            for clf in clfs[i * 12:(i + 1) * 12]:
                pc = multiprocessing.Process(target=extract, args=(clf, data))
                Processes.append(pc)

        for p in Processes:
            p.start()

        for p in Processes:
            p.join()

        t2 = time.time()
        print("%d seconds" % (t2 - t1))
Example #2
 def __init__(
         self,
         opt,
         poem_file: str,
         song_file: str,
         keywords: List[str],
         out_file: str,
         base_dir_for_save=r'E:\PycharmProjects\FirstDayOnMS2\Data\Poem_Song',
         save_dir_for_songVec="songVec.pkl",
         save_dir_for_poemVec="poemVec.pkl",
         idf_path=None,
         additional_key_words_path=None,
         out_txt_dir=None,
         seg_point_field_name="seg_point"):
     super(MatchSeggedPoemSong,
           self).__init__(opt, poem_file, song_file, keywords,
                          base_dir_for_save, save_dir_for_songVec,
                          save_dir_for_poemVec)
     self.base_dir_for_save = base_dir_for_save
     print("PoemMatchSong.__init__(): keywords = ", self.keywords)
     self.out_file = out_file
     self.idf_path = idf_path
     self.seg_point_field_name = seg_point_field_name
     if self.idf_path is not None:
         analyse.set_idf_path(self.idf_path)
     self.additional_key_words = dict()
     if additional_key_words_path is not None:
         self.additional_key_words = self.load_additional_key_words(
             additional_key_words_path)
     self.modelSearchSong = SearchSong()
     self.out_txt_dir = out_txt_dir  # written out to a txt file for the yaml pipeline to consume
Example #3
    def __init__(
            self,
            opt,
            poem_file: str,
            song_file: str,
            keywords: List[str],
            base_dir_for_save=r'E:\PycharmProjects\FirstDayOnMS2\Data\Poem_Song',
            save_dir_for_songVec="songVec.pkl",
            save_dir_for_poemVec="poemVec.pkl",
            idf_path=None):
        self.poem_file = poem_file
        self.song_file = song_file

        self.opt = opt

        self.base_dir_for_save = base_dir_for_save
        self.save_dir_for_songVec = save_dir_for_songVec
        self.save_dir_for_poemVec = save_dir_for_poemVec
        self.keywords = keywords
        self.poems = []
        self.sub_poems = []
        self.songs = []
        self.sub_songs = []
        self.songVecs = []
        self.poemVecs = []
        self.model = g.Doc2Vec.load(gensim_weight_path)
        self.idf_path = idf_path
        if idf_path is not None:
            analyse.set_idf_path(idf_path)
Example #4
 def __init__(self, prose_path, idf_path=None, top_n=10, save_path=None):
     super(StaticPoem, self).__init__()
     self.idf_path = idf_path
     self.top_n = top_n
     if idf_path is not None:
         analyse.set_idf_path(idf_path)
     self.poems = json.load(open(prose_path, "r", encoding="utf-8"))
     self.save_path = save_path
Example #5
    def tf_if_parse(content, keywords=None, topK=50):
        """Extract up to topK keywords from content; any supplied keywords must be included.
        """
        import jieba.analyse as analyse
        import jieba

        tfidf_path = os.path.join(resource_dir, 'resources', 'text', 'tf-idf.txt')

        user_dict_path = os.path.join(resource_dir, 'resources', 'text', 'user_dict.txt')
        stopwords_path = os.path.join(resource_dir, 'resources', 'text', 'stopwords-zh.txt')

        jieba.load_userdict(user_dict_path)
        analyse.set_stop_words(stopwords_path)
        analyse.set_idf_path(tfidf_path)
        tags = analyse.extract_tags(content, topK=topK)
        return tags
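Note that the keywords argument above is accepted but never used, so the docstring's "must be included" contract is not actually enforced. A purely illustrative variant (the name tf_idf_parse_required is hypothetical, not from the original code) that would honor it:

    def tf_idf_parse_required(content, keywords=None, topK=50):
        # Illustrative only: run jieba's TF-IDF extraction, then force any
        # caller-supplied keywords that occur in the text to the front of the result.
        import jieba.analyse as analyse
        required = [w for w in (keywords or []) if w in content]
        tags = analyse.extract_tags(content, topK=topK)
        return required + [t for t in tags if t not in required]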
Example #6
    def __init__(self,
                 seed_word_path,
                 prose_path,
                 proseSelcted=None,
                 top_n=10,
                 max_iter_num=3,
                 threshold=0.001,
                 ExtractMode="tfidf",
                 idf_path=""):
        '''
        :param seed_word_path: path to the seed-word file
        :param sheet_name: sheet name in the seed-word Excel file
        :param prose_path: path to the prose file
        :param top_n: keep the top_n seed words
        :param max_iter_num: maximum number of iterations
        '''
        if seed_word_path is None or not os.path.exists(seed_word_path):
            raise Exception('An initial seed-word file must be provided')

        self.seed_word_path = seed_word_path
        self.prose_path = prose_path
        self.proseSelcted = proseSelcted
        self.top_n = top_n
        self.max_iter_num = max_iter_num
        self.threshold = threshold
        self.ExtractMode = ExtractMode
        if ExtractMode == "tfidf" and idf_path:
            analyse.set_idf_path(idf_path)
        self.seed_words = set()
        self.seed_words |= self.loadSeedWord(seed_word_path)

        self.prosePastSelected = set()
        if proseSelcted is not None:
            self.prosePastSelected |= self.loadProseNumber(path=proseSelcted)

        #self.poems = self.getProse(prose_path)
        self.poems = pickle.load(open("poemsIdf.pkl", "rb"))
        print("散文加载成功")

        self.proseCurrentSelected = set()  #用当前这些关键词检索出来的文章
        self.wordCurrentSelected = set()  #用当前的这些文章检索出来的关键词
        self.wordCurrentSelectedDict = dict()

        self.ILLEGAL_CHARACTERS_RE = re.compile(
            r'[\000-\010]|[\013-\014]|[\016-\037]')
        self.PoemID2Index()
        print("初始化结束")
Example #7
    def test_extract_tags(self):
        """
        3. 关键词抽取
        """
        topic = 'keyword extraction'
        split_line = self.get_split_line(topic=topic)
        self.logger.info(split_line)

        term = '我们时人中国的可是is of super man'
        res = analyse.extract_tags(term)
        self.logger.info('{topic}_standard extraction: {term} -> {msg}'.format(topic=topic,
                                                                               term=term,
                                                                               msg=res))

        # TODO: extraction with the custom corpus does not work correctly
        user_idf_path = os.path.dirname(__file__) + '/jieba_dict/idf.txt.big'
        analyse.set_idf_path(user_idf_path)
        res = analyse.extract_tags(term)
        self.logger.info('{topic}_custom inverse document frequency: {term} -> {msg}'.format(
            topic=topic, term=term, msg=res))
Example #8
import multiprocessing
import time

import jieba
import pandas as pd
from jieba import analyse
from pandas import DataFrame
from pymongo import MongoClient


def getkeywords2():

    jieba.load_userdict("../user_dic")
    analyse.set_idf_path("idf.txt")  # the argument is the path to a custom IDF corpus
    #    analyse.set_stop_words("stop_words.txt")  # the argument is the path to a custom stop-word file

    print "connecting"
    mgclient = MongoClient(host='127.0.0.1')
    db_auth = mgclient.npl
    db_auth.authenticate("npl", "npl")
    db = mgclient.npl
    sale_info = db.npl_sale_info
    print "connected to npl_sale_info"

    cursor = sale_info.find({}, {
        "_id": 0,
        "sec_cate": 1,
        "detail": 1
    },
                            limit=10000000)
    results = DataFrame(list(cursor), columns=["sec_cate", "detail"])
    data = results[:].dropna()  # after dropna the index may not be contiguous
    data.columns = [0, 1]
    print(data[:10])
    clfs = pd.read_csv('recalc.txt', header=None)

    t1 = time.time()
    Processes = []
    for clf in clfs[0]:
        pc = multiprocessing.Process(target=extract, args=(clf, data))
        Processes.append(pc)

    for p in Processes:
        p.start()

    for p in Processes:
        p.join()

    t2 = time.time()
    print "%d seconds" % (t2 - t1)
Example #9
    def __init__(self,
                 idf_path: str = None,
                 user_dict_path: str = os.path.join(curr_dir, 'userdict.txt'),
                 stop_words_path: str = os.path.join(curr_dir,
                                                     'stop_words.txt'),
                 default_method: str = 'jieba.extract_tags'):
        """
        Methods:

        tfidf: customized TF-IDF
        jieba.textrank: jieba's TextRank
        jieba.extract_tags: jieba's TF-IDF (same as jieba.tfidf here)
        jieba.tfidf: jieba's TF-IDF
        """
        if user_dict_path:
            jieba.load_userdict(user_dict_path)
        if idf_path:
            analyse.set_idf_path(idf_path)
        if stop_words_path:
            analyse.set_stop_words(stop_words_path)

        self.default_method = default_method
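The extractor class's name and its extraction method are not part of this excerpt. A hedged usage sketch, assuming the class is importable as KeywordExtractor (a hypothetical name): because __init__ registers the user dictionary, IDF file and stop words globally, plain jieba.analyse calls made afterwards already honor them.

import jieba.analyse as analyse

extractor = KeywordExtractor(idf_path='my_idf.txt')  # hypothetical class name and IDF file
tags = analyse.extract_tags('机器学习让计算机从数据中学习规律', topK=5, withWeight=True)
for word, weight in tags:
    print(word, weight)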
Example #10
 def ExtractKeyWord(self):
     if self.idf_path is not None:
         analyse.set_idf_path(self.idf_path)
     for i, poem_dict in enumerate(self.poems):
         new_paras = []
         for j, para_dict in enumerate(poem_dict['paras']):
             all_content = '\n'.join(para_dict['para_content'])
             key_words = analyse.textrank(
                 all_content,
                 topK=10,
                 withWeight=True,
                 allowPOS=('ns', 'n', 'vn', 'v'))  # ns: place name; n: noun; vn: deverbal noun (e.g. 思索); v: verb
             idf_key_words = analyse.extract_tags(
                 all_content,
                 topK=10,
                 withWeight=True,
                 allowPOS=('ns', 'n', 'vn', 'v'))  # same POS filter as above
             para_dict['key_words'] = key_words  # a list of (word, weight) tuples
             para_dict['idf_key_words'] = idf_key_words
             new_paras.append(para_dict)
         self.poems[i]['paras'] = new_paras
Example #11
import jieba

sent = '好丑的证件照片'
print('/ '.join(jieba.cut(sent, HMM=False)))

jieba.suggest_freq('证件照片', True)
print('/ '.join(jieba.cut(sent, HMM=False)))

import jieba.analyse as aly

content = '''
自然语言处理(NLP)是计算机科学,人工智能,语言学关注计算机和人类(自然)语言之间的相互作用的领域。
因此,自然语言处理是与人机交互的领域有关的。在自然语言处理面临很多挑战,包括自然语言理解,因此,自然语言处理涉及人机交互的面积。
在NLP诸多挑战涉及自然语言理解,即计算机源于人为或自然语言输入的意思,和其他涉及到自然语言生成。
'''

# load a custom IDF dictionary
aly.set_idf_path('./data/idf.txt.big')
# load a custom stop-word dictionary
aly.set_stop_words('./data/stop_words.utf8')

# first argument: the text to extract keywords from
# second argument: number of keywords to return, ordered from most to least important
# third argument: whether to also return each keyword's weight
# fourth argument: POS filter; an empty tuple means no filtering, otherwise only keywords with the listed POS tags are returned
keywords = aly.extract_tags(content, topK=10, withWeight=True, allowPOS=())

for item in keywords:
    # the keyword and its corresponding weight
    print(item[0], item[1])
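For instance, restricting the fourth argument to a few noun-like POS tags (an illustrative variation on the call above, not from the original snippet):

# only keep nouns (n), place names (ns) and deverbal nouns (vn), together with weights
noun_keywords = aly.extract_tags(content, topK=10, withWeight=True, allowPOS=('n', 'ns', 'vn'))
for word, weight in noun_keywords:
    print(word, weight)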
    
Example #12
import jieba.analyse as aly
import sys
import os
from gensim.models import word2vec
import copy
import jieba
import math
from jieba import analyse
from math import sqrt
import jieba.posseg as pseg

if sys.version_info[0] < 3:  # Python 2 only: force a UTF-8 default encoding
    reload(sys)
    sys.setdefaultencoding('utf-8')
# model = word2vec.Word2Vec.load('Top250Word')
real_dir = os.path.split(os.path.realpath(__file__))[0]

analyse.set_idf_path(real_dir+"/idf_value.txt")
analyse.set_stop_words(real_dir+"/stop.txt")


class KeywordHandler:
    textrank = analyse.textrank
    tf_idf = analyse.extract_tags
    POS = ['ns', 'nt', 'nz', 'n', 'vn', 'an', 'a']  # POS filter: place/organization/other proper nouns, nouns, deverbal nouns, adjectives
    filter_set = set()
    allow_set = set()
    with open(real_dir + '/douban_tag.txt', encoding='utf-8') as f:
        lines = f.readlines()
        for word in lines:
            allow_set.add(word.rstrip('\n'))

    print("Loading Idf Value")
Example #13
def extract_keywords_by_tfidf(content, idf_path=None):
    if idf_path:
        analyse.set_idf_path(idf_path)
    keywords = "  ".join(analyse.extract_tags(content, topK=30, withWeight=False, allowPOS=()))
    return keywords
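A minimal way to call the helper above; the sample sentence and the custom IDF file name are illustrative, and analyse is assumed to be jieba.analyse imported at module level, as the function requires:

from jieba import analyse

text = '自然语言处理是人工智能的一个重要方向'
print(extract_keywords_by_tfidf(text))                           # built-in IDF table
print(extract_keywords_by_tfidf(text, idf_path='my_idf.txt'))    # custom IDF table (hypothetical file)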
Example #14
import jieba

jieba.load_userdict('userdict.txt')  # the user dict defines domain-specific words

# part-of-speech tagging
import jieba.posseg as pseg
words = pseg.cut(f.read().strip())  # f: an already-opened text file from the surrounding code
for word in words:
    print(word.word, word.flag)  # the POS tag, e.g. adjective

#jieba TF-IDF
import jieba
from jieba import analyse
tf_idf = analyse.extract_tags
tags = jieba.analyse.extract_tags(sentence, topK=20, withWeight=False, allowPOS=())  # text to analyse, number of top keywords to return, whether to also return weights
# By default jieba uses its own stop-word list and its own IDF corpus for this feature,
# so out of the box it is not suitable where high precision is required.
analyse.set_stop_words("stop_words.txt")  # set your own stop words
analyse.set_idf_path(file_name)
keywords = tf_idf(text)

def stopWordsList(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        stopwords = [line.strip() for line in f.readlines()]
    return stopwords

def seg_sentence(line):
    sentence_seg = jieba.cut_for_search(line.strip())
    stopwords = stopWordsList('/home/luyq/nlp/stopWords_ch.txt')
    outStr=""
    for word in sentence_seg:
        if word not in stopwords:
            if word != '\t':
                outStr += word
                outStr += " "
Example #15
#-*- coding:utf-8 -*-
from __future__ import print_function
from __future__ import division
import os
import sys
curdir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(curdir)

if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding("utf-8")

import jieba.analyse as analyzer
JIEBA_ANALYZER_IDF = os.path.join(curdir, os.path.pardir, os.path.pardir, "resources", "similarity.vocab.idf")
JIEBA_ANALYZER_STOPWORDS = os.path.join(curdir, os.path.pardir, os.path.pardir, "resources", "jieba_ext", "stop_words.txt")
analyzer.set_idf_path(JIEBA_ANALYZER_IDF)
analyzer.set_stop_words(JIEBA_ANALYZER_STOPWORDS)


def keywords(content, topK=10, vendor="tfidf", title=None):
    words = []
    scores = []
    if vendor == 'tfidf':
        for x, y in analyzer.extract_tags(content, topK=topK, withWeight=True):
            words.append(x)
            scores.append(y)
    else:
        raise ValueError("Invalid vendor")
    return words, scores
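A hedged usage sketch of the keywords helper above; the sample sentence is illustrative, and the IDF and stop-word files are the ones registered when the module is imported:

words, scores = keywords('自然语言处理是人工智能的重要方向', topK=5)
for w, s in zip(words, scores):
    print(w, s)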

Example #16
 def self_define(self):
     jb_analyse.set_stop_words("../data/stopwords")
     jb_analyse.set_idf_path("../data/sk_tfidf.txt")