Code example #1
File: summary.py Project: liyaguo6/NLP
 def get_result(self, paragraph):
     self.paragraph = paragraph
     self.segments = pynlpir.segment(self.paragraph,
                                     pos_names='all',
                                     pos_tagging=False)
     self.key_words = pynlpir.get_key_words(self.paragraph,
                                            weighted=False,
                                            max_words=20)
     self.new_sentence_wordlist = [0] * len(self.key_words)
     key_words = pynlpir.get_key_words(self.paragraph,
                                       max_words=20,
                                       weighted=True)
     self.key_weight = [item[1] for item in key_words]
     sentence_dict = self.cal_text_simliarity()
     keys = list(sentence_dict.keys())
     val = list(sentence_dict.values())
     temp = sorted(
         list(map(val.index, heapq.nlargest(self.maxSumarySize, val))))
     for i in temp[:2]:
         if keys[i] != self.sentence()[0]:
             self.result.append(keys[i])
     self.result.insert(0, self.sentence()[0])
     if len(",".join(self.result)) < self.length:
         self.result.append(keys[temp[2]])
     return ",".join(self.result)
Code example #2
 def test_get_key_words(self):
     """Tests that the get_key_words() function works as expected."""
     s = '我们都是美国人。'
     key_words = pynlpir.get_key_words(s)
     weighted_key_words = pynlpir.get_key_words(s, weighted=True)
     expected_key_words = ['美国']
     expected_weighted_key_words = [('美国', 0.01)]
     self.assertEqual(expected_key_words, key_words)
     self.assertEqual(expected_weighted_key_words, weighted_key_words)
Code example #3
File: test_init.py Project: jiangdong123/pynlpir
 def test_get_key_words(self):
     """Tests that the get_key_words() function works as expected."""
     s = '我们都是美国人。'
     key_words = pynlpir.get_key_words(s)
     weighted_key_words = pynlpir.get_key_words(s, weighted=True)
     expected_key_words = ['美国']
     expected_weighted_key_words = [('美国', 0.01)]
     self.assertEqual(expected_key_words, key_words)
     self.assertEqual(expected_weighted_key_words, weighted_key_words)
Code example #4
File: test_init.py Project: iamxiaomu/pynlpir
 def test_get_key_words(self):
     """Tests that the get_key_words() function works as expected."""
     s = "我们都是美国人。"
     key_words = pynlpir.get_key_words(s)
     weighted_key_words = pynlpir.get_key_words(s, weighted=True)
     expected_key_words = ["美国"]
     expected_weighted_key_words = [("美国", 2.2)]
     self.assertEqual(expected_key_words, key_words)
     self.assertEqual(expected_weighted_key_words, weighted_key_words)
Code example #5
def load_doc_list():
    pynlpir.open()
    doc_list = os.listdir(SOURCE_DOC_DIR_PATH)
    segment_list = []
    for doc in doc_list:
        fr = codecs.open(SOURCE_DOC_DIR_PATH + doc, 'r', 'utf-8')
        line_list = fr.read()
        fr.close()
        '''
        line_list = line_list.split(NEW_LINE)
        line_list.pop()
        # seg_str = ''
        for i in range(len(line_list)):
            segment = pynlpir.segment(line_list[i], pos_tagging=False)
            seg_str = ''
            for seg in segment:
                seg_str += seg + ' '
            line_list[i] = seg_str.strip()
        # segment_list.append(' '.join(line_list))
        temp_str = ' '.join(line_list)
        '''
        key_word_list = pynlpir.get_key_words(line_list, max_words=10, weighted=True)
        for key_word in key_word_list:
            print(key_word[0], '\t', key_word[1])
        pynlpir.close()
        exit(0)
Code example #6
def GetKeyWorld(filePath):  # implemented with PyNLPIR get_key_words
    #filePath='/home/yuanzhu/Desktop/NewsData/20190603/20190603419.json'
    try:
        pr.open()
        #filePath='/home/yuanzhu/Desktop/NewsData/20190501/20190501181.json'
        dicNews = GetDictFromJsonFile(filePath)
        content = dicNews['content']
        # segs=pr.segment(content)
        # for seg in segs:
        #     print(seg)
        tupkeywords = pr.get_key_words(
            content, weighted=True)  # extract keywords with TF-IDF (seems to work reasonably well)
        keywords = []
        for i, w in enumerate(tupkeywords):
            keywords.append(w[0])
            if i == 9:  # keep only the top 10 keywords
                break
    except Exception as e:
        strLogErr = 'Get  {}  keyword error :{}'.format(filePath, e)
        print(strLogErr)
        return None
    print("FilePath=", filePath)
    print('获取热点:', keywords)
    return keywords
Code example #7
File: demo.py Project: liyaguo6/NLP
 def __init__(self, paragraph, maxSumarySize=2):
     self.paragraph = paragraph
     self.maxSumarySize = maxSumarySize
     self.segments = pynlpir.segment(paragraph,
                                     pos_names='all',
                                     pos_tagging=False)
     self.key_words = pynlpir.get_key_words(paragraph,
                                            weighted=False,
                                            max_words=20)
     self.new_sentence_wordlist = [0] * len(self.key_words)
     key_words = pynlpir.get_key_words(paragraph,
                                       max_words=20,
                                       weighted=True)
     self.key_weight = [item[1] for item in key_words]
     self.sentence_simlarity = {}
     self.result = []
Code example #8
File: main.py Project: zhang-weiming/school_app
def get_key_words():
    s = ''
    max_words = MAX_WORDS_DEFAULT
    max_hot_words = MAX_HOT_WORDS_DEFAULT
    update_hot_word = UPDATE_HOT_WORD_DEFAULT
    # get doc
    if request.method == 'POST':
        s = request.form.get('s', type=str, default='')
        update_hot_word = request.form.get('update_hot_word', type=str, default=UPDATE_HOT_WORD_DEFAULT)  # whether to update the hot_word table
        try:
            max_words = request.form.get('max_words', type=str, default=MAX_WORDS_DEFAULT)
            if max_words != '':  # max_words was provided (may be the default '3')
                print('[POST] max_words yes')
                max_words = int(max_words.strip())
                print('\tmax_words =', max_words)
            else:
                max_words = MAX_WORDS_DEFAULT
                print('[POST] max_words no')
        except:  # bad max_words value, fall back to the default
            max_words = MAX_WORDS_DEFAULT
        try:
            max_hot_words = request.form.get('max_hot_words', type=str, default=MAX_HOT_WORDS_DEFAULT)
            if max_hot_words != '':
                max_hot_words = int(max_hot_words.strip())
            else:
                max_hot_words = MAX_HOT_WORDS_DEFAULT
        except:
            max_hot_words = MAX_HOT_WORDS_DEFAULT
    elif request.method == 'GET':
        s = request.args.get('s')
        update_hot_word = request.args.get('update_hot_word')
        if update_hot_word != 'False':
            update_hot_word = 'True'
        try:
            max_words = int(request.args.get('max_words').strip())
        except:
            max_words = MAX_WORDS_DEFAULT
        try:
            max_hot_words = int(request.args.get('max_hot_words').strip())
        except:
            max_hot_words = MAX_HOT_WORDS_DEFAULT
    # get key words
    if s == '':  # empty input, nothing to analyze
        return 'null'
    else:  # extract the key words
        pynlpir.open()
        key_word_list = pynlpir.get_key_words(s, max_words=max_words, weighted=False)
        # temp_str = ''
        for i in range(len(key_word_list)):
            key_word_list[i] = key_word_list[i]
        pynlpir.close()
        if update_hot_word == 'True':
            # update the database in a separate thread
            print('[update_hot_word] True')
            t = threading.Thread(target=db_helper.update_tables, args=(','.join(key_word_list), max_hot_words))
            t.setDaemon(True)
            t.start()
        else:
            print('[update_hot_word] False')
        return ','.join(key_word_list)
Code example #9
def readF(path, n=0):
    for file in os.listdir(path):
        f=open(path+file,'r')
        s=f.read()
        x=pynlpir.get_key_words(s,weighted=True)
        dic={}
        for i in x:
            dic[i[0]]=i[1]
        vct.append(dic)
Code example #10
File: word.py Project: zjc-enigma/react-demo
def allword_by_pynlpir(inputfile, word_dict, max_words=1000):

    weighted_word_list = pynlpir.get_key_words(inputfile, weighted=True, max_words=max_words)

    for word, weight in weighted_word_list:
        try:
            word_dict.setdefault(word, 0)
            word_dict[word] += weight
        except Exception as e:
            print (e)
Code example #11
File: jieba&nlpir.py Project: Zhaoyu620/First-pyltp
def work2():  # NLPIR POS tagging and keyword extraction
    pynlpir.open()
    s = '因为明天是周三,所以我要有数据结构课,然而这课好难。'
    segments = pynlpir.segment(s, pos_names='all', pos_english=False)  # full POS analysis
    for segment in segments:
        print(segment[0], '\t', segment[1])
    key_words = pynlpir.get_key_words(s, weighted=True)  # keyword extraction
    for key_word in key_words:
        print(key_word[0], '\t', key_word[1])
    pynlpir.close()
Code example #12
File: NLPtools.py Project: hongyuzhou/nltk
def get_key_words(text):
    pynlpir.open()
    result = []
    keywords = pynlpir.get_key_words(text, weighted=True)
    if len(keywords) == 0:
        return result
    for i in range(len(keywords)):
        keyword = keywords[i][0]
        result.append(keyword)
    pynlpir.close()
    return result
Code example #13
File: key_word.py Project: lvleilei/screen
def nlpir_keywords(text, n):
    pynlpir.open()
    # print '关键词测试:\n'
    key_words = list(pynlpir.get_key_words(text, n, weighted=False))
    # for key_word in key_words:
    #     print key_word[0], '\t', key_word[1]

    pynlpir.close()

    print key_words
    return key_words
Code example #14
File: key_word.py Project: SwoJa/ruman
def nlpir_keywords(text,n):
	pynlpir.open()
	# print '关键词测试:\n'
	key_words = list(pynlpir.get_key_words(text,n,weighted=False))
	# for key_word in key_words:
	#     print key_word[0], '\t', key_word[1]
	 
	pynlpir.close()
	
	print key_words
	return key_words
Code example #15
File: NLPtools.py Project: hongyuzhou/nltk
def get_key_words(text):
    pynlpir.open()
    result = []
    keywords = pynlpir.get_key_words(text, weighted=True)
    if len(keywords) == 0:
        return result
    for i in range(len(keywords)):
        keyword = keywords[i][0]
        result.append(keyword)
    pynlpir.close()
    return result
Code example #16
File: word.py Project: stantoxt/react-demo
def allword_by_pynlpir(inputfile, word_dict, max_words=1000):

    weighted_word_list = pynlpir.get_key_words(inputfile,
                                               weighted=True,
                                               max_words=max_words)

    for word, weight in weighted_word_list:
        try:
            word_dict.setdefault(word, 0)
            word_dict[word] += weight
        except Exception as e:
            print(e)
Code example #17
def find_keyword(text, keyword_dict):
    keyword_list = []
    keyword_pair_list = pynlpir.get_key_words(text, weighted=True)
    for keyword_pair in keyword_pair_list:
        keyword_list.append(keyword_pair[0])
    keyword_id_list = []
    for keyword in keyword_list:
        try:
            keyword_id_list.append(keyword_dict[keyword])
        except:
            keyword_id_list.append(keyword)
    return keyword_id_list
Code example #18
File: word.py Project: zjc-enigma/react-demo
def allclass_by_pynlpir(inputfile, word_dict, max_words=1000):

    weighted_word_list = pynlpir.get_key_words(inputfile, weighted=True, max_words=max_words)

    for word, weight in weighted_word_list:
        try:
            word_class = word_to_class(word)
            k = word + "\t" + word_class
            word_dict.setdefault(k, 0)
            word_dict[k] += weight
        except Exception as e:
            print (e)
Code example #19
File: doc_process.py Project: xiaol/ml_recommend
def cut_pos_nlpir(doc, topK=20):
    #s = filter_tags(doc)
    soup = BeautifulSoup(doc, 'lxml')
    s = soup.get_text()
    try:
        s = ''.join(s.split())
        ws = pynlpir.get_key_words(s, topK)
        return ' '.join(ws).encode('utf-8')
    except:
        print 'error:  ' + s
        traceback.print_exc()
        raise
Code example #20
File: word.py Project: stantoxt/react-demo
def allclass_by_pynlpir(inputfile, word_dict, max_words=1000):

    weighted_word_list = pynlpir.get_key_words(inputfile,
                                               weighted=True,
                                               max_words=max_words)

    for word, weight in weighted_word_list:
        try:
            word_class = word_to_class(word)
            k = word + "\t" + word_class
            word_dict.setdefault(k, 0)
            word_dict[k] += weight
        except Exception as e:
            print(e)
Code example #21
File: word.py Project: zjc-enigma/react-demo
def word_by_pynlpir(inputfile, word_dict, max_words=1000):

    weighted_word_list = pynlpir.get_key_words(inputfile, weighted=True, max_words=max_words)

    for word, weight in weighted_word_list:
        try:
            word_class = word_to_class(word)
            if word_class in ['time word', 'numeral', 'adverb', 'verb', \
                'locative word', 'distinguishing word']: continue
            if len(word) < 2: continue
            word_dict.setdefault(word, 0)
            word_dict[word] += weight
        except Exception as e:
            print ("exception " , e)
Code example #22
def st_WordCloud():
    # generate a word cloud for the novel 三体 (The Three-Body Problem)
    in_text = codecs.open('data/st.txt', 'r', encoding='UTF-8').read()
    pynlpir.open()

    nlpir.AddUserWord(c_char_p("三体".encode()))
    nlpir.AddUserWord(c_char_p("罗辑".encode()))
    key_words = pynlpir.get_key_words(in_text, max_words=300, weighted=True)
    # stop words
    stopwords = pd.read_csv("data/stop_words.txt",
                            index_col=False,
                            quoting=3,
                            sep="\n",
                            names=['stopword'],
                            encoding='utf-8')
    words = [word for word, weight in key_words]
    keywords_df = pd.DataFrame({'keywords': words})
    # drop the stop words
    keywords_df = keywords_df[~keywords_df.keywords.isin(stopwords.stopword.
                                                         tolist())]

    word_freq = []
    for word in keywords_df.keywords.tolist():
        for w, k in key_words:
            if word == w:
                word_freq.append((word, k))

    pynlpir.close()
    print(word_freq)

    font = r'C:\Windows\Fonts\msyh.ttc'  # a font must be specified, otherwise WordCloud raises an error
    # color_mask = imread("resource/ge.jpg")  # read the background image
    color_mask = imread("resource/timg.jpg")  # read the background image
    wcloud = WordCloud(
        font_path=font,
        # background color
        background_color="white",
        # word-cloud mask shape
        mask=color_mask,
        # maximum number of words
        max_words=2000,
        # maximum font size
        max_font_size=80)

    wcloud.generate_from_frequencies(dict(word_freq))
    # display the image
    plt.imshow(wcloud)
    plt.axis("off")
    plt.show()
    wcloud.to_file("data/wcimage/三体词云_2.png")
Code example #23
def main():
    pynlpir.open()
    # pynlpir.nlpir.AddUserWord(c_char_p("手机壳".encode()))
    # pynlpir.nlpir.AddUserWord(c_char_p("炫亮".encode()))
    # text = '弗洛米iPhone7/7plus手机壳/保护套苹果7plus超薄全包硅胶透明电镀软壳5.5英寸炫亮黑☆炫亮电镀'
    # text="“赶考”五年成绩非凡 全面从严治党永远在路上"
    text = codecs.open('data/new.txt', 'r', encoding='UTF-8').read()
    r_out = pynlpir.segment(text, pos_english=False)
    key_words = pynlpir.get_key_words(text, weighted=True)

    pynlpir.close()
    print(key_words)

    for x in r_out:
        print(x)
Code example #24
File: queryExpansion1.py Project: colinsongf/NLPCC
def qe(row):
    question = row["question"]
    html = getHtmlbyQuestion(question)
    if html == None:
        return 0  # test
    properties = getProperties(html)
    discription = getDiscription(html)

    keywords = pynlpir.get_key_words(question, weighted=False)  #True
    weightedDict1 = matchKeyWords(keywords, discription)
    weightedDict2 = proExpansion(keywords, properties)
    dictMerged = weightedDict1.copy()
    dictMerged.update(weightedDict2)  # dict of all {expanded keyword: weight} pairs

    answer = row["answer"]
    return calScore(question, answer, dictMerged)
Code example #25
def getKeyWords(string, words=10, way=1):
    keywords = []
    if (way == 1):
        pynlpir.open()
        text = string.encode('utf-8')
        wordslist = pynlpir.get_key_words(text, words, False)
        for each in wordslist:
            # print(each)
            keywords.append(each)
    if (way == 2):
        textrank = analyse.textrank
        wordslist = textrank(string)
        for each in wordslist[0:words]:
            # print(each)
            keywords.append(each)
    return keywords
Code example #26
File: word.py Project: stantoxt/react-demo
def word_by_pynlpir(inputfile, word_dict, max_words=1000):

    weighted_word_list = pynlpir.get_key_words(inputfile,
                                               weighted=True,
                                               max_words=max_words)

    for word, weight in weighted_word_list:
        try:
            word_class = word_to_class(word)
            if word_class in ['time word', 'numeral', 'adverb', 'verb', \
                'locative word', 'distinguishing word']:
                continue
            if len(word) < 2: continue
            word_dict.setdefault(word, 0)
            word_dict[word] += weight
        except Exception as e:
            print("exception ", e)
Code example #27
def ie():
    """
    Information-entropy weighting
    :return:
    """
    # fetch the queries
    db_tool = tool.DBtool()
    queries = db_tool.select_queries()
    segmented_queries = db_tool.select_segmented_queries()
    # get keyword weights
    pynlpir.open()
    temp = {}
    for query in queries:
        temp[query['query_id']] = pynlpir.get_key_words(query['query'],
                                                        weighted=True,
                                                        max_words=100)
    pynlpir.close()
    # build the weight string
    for query in segmented_queries:
        weight = temp[query['query_id']]
        dic = {}
        min_w = 2.0
        for w in weight:
            dic[w[0].encode('utf-8')] = w[1]
            if w[1] < min_w:
                min_w = w[1]

        words = query['segmented_query'].strip().split(' ')
        weight = []
        for word in words:
            if word in dic:
                weight.append(dic[word])
            else:
                weight.append(min_w / 2)
        s = sum(weight)
        weight = ['%.5f' % (x / s) for x in weight]
        # update the database
        db_tool.update_weight(query['query_id'], ' '.join(weight), 'weight_ie')
    db_tool.close()
Code example #28
    def post(self, request):
        obj_id = request.POST['obj_id']
        school = MySchool.objects.get(id=int(obj_id))
        feeds = []

        # weibo
        # App Key:802677147
        # App Secret:f75be23800d779cc9dbbf6b467b7ff61
        # Redirect url: https://api.weibo.com/oauth2/default.html
        # code: 4ccb7879bf204466b80e02c106d09727

        # read baidu
        params = {'keyword': school.name}

        # send a 3rd party service request
        baidu_consumer.delay(params)

        # read saved feeds
        feeds = MyBaiduStream.objects.filter(
            school=school).order_by('-last_updated')[:100]
        content = loader.get_template(self.template_name)
        tieba_html = content.render(Context({
            'obj': school,
            'feeds': feeds,
        }))

        # hot topics
        pynlpir.open()  # must have this line!
        topics = feeds[:50]
        content = loader.get_template(self.newsticker_template_name)
        newsticker_html = content.render(Context({
            'objs': topics,
            'keywords': pynlpir.get_key_words(''.join([f.name + f.description for f in feeds]), max_words=50, weighted=True)
        }))
        pynlpir.close()
        #newsticker_html = ''

        return HttpResponse(json.dumps({'bd_html': tieba_html, 'news_html': newsticker_html}),
                            content_type='application/javascript')
Code example #29
File: pyNLPIR.py Project: iris93/gratest
def myseg_get_keywords(filename2w, filename2seg, limitlist):
    dataMat = []
    labelMat = []
    fr = open(filename2w)
    fl = open(filename2seg, 'w')
    limits = open(limitlist)
    arrayLimits = limits.readlines()
    lengthLimits = len(arrayLimits)
    arrayOLines = fr.readlines()
    length = len(arrayOLines)
    for j in range(length):
        flag = 1
        lineArr = arrayOLines[j].strip().split(';')
        for li in range(lengthLimits):
            limitsArr = arrayLimits[li].strip().split(';')
            if str(lineArr[1]) == str(limitsArr[1]):
                flag = 0
        if flag == 0:
            pass
        else:
            if len(lineArr) < 3:
                pass
            else:
                seg = pynlpir.get_key_words(lineArr[1], weighted=True)
                fl.write(str(j))
                fl.write(";")
                fl.write(str(lineArr[1]))
                fl.write(";")
                fl.write(str(lineArr[2]))
                fl.write(";")
                for item in seg:
                    fl.write(str(item[0]))
                    fl.write(":")
                    fl.write(str(item[1]))
                    fl.write(",")
                fl.write(";\n")
    fl.close()
    pynlpir.close()
Code example #30
def pynlpir():
    import pynlpir
    # read corpus
    pos_text = configReader(section='path', option='pos_text')
    fileList = getFileList(pos_text)
    corpus = [dataReader(pos_text + f, 'r+') for f in fileList]

    # pynlpir open
    pynlpir.open()
    for i in range(len(corpus)):
        fileName = ext_path + os.path.splitext(fileList[i])[0] + '_ext.txt'
        print "Writing pynlpir extraction: %s" % fileName
        try:
            keys = pynlpir.get_key_words(corpus[i], weighted=True)
        except:
            RaiseErr('extract', fileName)
            continue  # skip this file if keyword extraction failed
        # output
        data = ["%-20s %-10.8f\n" % (key[0], key[1]) for key in keys]
        try:
            dataWriter(data, fileName, 'w')
        except:
            RaiseErr('rit_ext', fileName)
    pynlpir.close()
Code example #31
File: views.py Project: xiaomin0208/gkp
	def post(self,request):
		obj_id = request.POST['obj_id']
		school = MySchool.objects.get(id=int(obj_id))
		feeds = []

		# weibo
		# App Key:802677147
		# App Secret:f75be23800d779cc9dbbf6b467b7ff61		
		# Redirect url: https://api.weibo.com/oauth2/default.html
		# code: 4ccb7879bf204466b80e02c106d09727

		# read baidu
		params = {'keyword':school.name}

		# send a 3rd party service request
		baidu_consumer.delay(params)

		# read saved feeds
		feeds = MyBaiduStream.objects.filter(school=school).order_by('-last_updated')[:100]
		content = loader.get_template(self.template_name)
		tieba_html= content.render(Context({
			'obj':school,
			'feeds': feeds,
			}))

		# hot topics
		pynlpir.open() # must have this line!
		topics = feeds[:50]
		content = loader.get_template(self.newsticker_template_name)
		newsticker_html= content.render(Context({
			'objs':topics,
			'keywords': pynlpir.get_key_words(''.join([f.name+f.description for f in feeds]), max_words=50, weighted=True)
			}))
		pynlpir.close()

		return HttpResponse(json.dumps({'bd_html':tieba_html,'news_html':newsticker_html}), 
			content_type='application/javascript')
Code example #32
import pynlpir

pynlpir.open()
str = "聊天机器人到底该怎么做?"
segs = pynlpir.segment(str)
segments = pynlpir.segment(str,
                           pos_tagging=True,
                           pos_names="all",
                           pos_english=False)
for seg in segs:
    pass
    #print(seg[0],'\t',seg[1])

key_words = pynlpir.get_key_words(s, weighted=True)
for key_word in key_words:
    pass
    #print(key_word[0],'\t',key_word[1])

for segment in segments:
    print(segment[0], "\t", segment[1])

k_ws = pynlpir.get_key_words(s, weighted=True)
for k_w in k_ws:
    print(k_w[0], '\t', k_w[1])
Code example #33
File: test_init.py Project: jiangdong123/pynlpir
 def test_issue_23(self):
     """Tests for issue #20 -- get key words with no count returned."""
     s = '我们很好,你呢'
     weighted_key_words = pynlpir.get_key_words(s, weighted=True)
     expected_weighted_key_words = [('我们', -1.00)]
     self.assertEqual(expected_weighted_key_words, weighted_key_words)
Code example #34
import pynlpir
from pynlpir import nlpir


import sys
# print sys.getdefaultencoding()
# reload(sys)
#
# sys.setdefaultencoding("utf-8")

pynlpir.open()

# pynlpir.open(encoding='utf-8')

s = 'NLPIR分词系统前身为2000年发布的ICTCLAS词法分析系统,从2009年开始,为了和以前工作进行大的区隔,并推广NLPIR自然语言处理与信息检索共' \
    '享平台,调整命名为NLPIR分词系统。'
print (s)
print (pynlpir.segment(s,pos_tagging=False))
print (pynlpir.get_key_words(s, weighted=True))

c=pynlpir.segment(s,pos_tagging=False)
c=str("|".join(c)).encode('utf-8')
print (c)



import jieba

tokens = jieba.cut(s, cut_all=False)
print ('精确模式:')
print("|".join(tokens))
Code example #35
# -*- coding: utf-8 -*-
# @Time    : 2019/4/8 17:57
# @Author  : Mr.Robot
# @Site    :
# @File    : progress_segment.py
# @Software: PyCharm

import pynlpir

pynlpir.open()

if __name__ == "__main__":
    s = "服药便溏,大便呈糊状,大便日2次,胸闷胸痛,最近胸腔积液2次抽取,干咳少痰,难咯,口干欲饮,纳差,形瘦面灰,鼻准红赤,尿少色黄,有乙肝,肝硬化病史,"
    segment = pynlpir.segment(s, pos_tagging=False)
    print(segment)
    key_words = pynlpir.get_key_words(s)
    print(key_words)
    pynlpir.close()
Code example #36
File: divWords.py Project: Ciciskyonefly/QIPARobot
#coding:utf-8

import sys
reload(sys)
sys.setdefaultencoding("utf-8")

import pynlpir



pynlpir.open()
s = '聊天机器人到底该怎么做呢?'
segments = pynlpir.segment(s)
for segment in segments:
    print segment[0], '\t', segment[1]

#extracting keywords
key_words = pynlpir.get_key_words(s, weighted=True)
for key_word in key_words:
    print key_word[0], '\t', key_word[1]


##extracting all information
s = '海洋是如何形成的'
segments = pynlpir.segment(s, pos_names='all')
for segment in segments:
    print segment[0], '\t', segment[1]

pynlpir.close()
Code example #37
File: test.py Project: zhang-weiming/school_app
import pynlpir

if __name__ == '__main__':
    pynlpir.open()
    s = '我爱你中国'
    # segment_list = pynlpir.segment(s, pos_tagging=False)
    # for seg in segment_list:
    #     print(seg)

    key_word_list = pynlpir.get_key_words(s, max_words=10, weighted=True)
    for key_word in key_word_list:
        print(key_word[0], '\t', key_word[1])

    pynlpir.close()
Code example #38
def Participle(list_datas, filename_stopwords):
    # word segmentation
    time_start = time.time()
    print("正在分词...")

    list_garbagesT.clear()
    list_words_stop = GetListWords(filename_stopwords)

    pynlpir.open()
    for data in list_datas:
        # segments = pynlpir.segment(data.content, pos_names='all',pos_english=False)
        # file_nlp.write('\n')
        # for segment in segments:
        #     file_nlp.write("[ %s : %s ]" % (segment[0], segment[1]))
        #     file_nlp.write('\n')

        if len(data.content) < 8:
            data.error = "内容过短"
            list_garbagesT.append(data)
            continue
        list_words = pynlpir.get_key_words(data.content, max_words=70)
        if len(list_words) == 0:
            data.error = "没有分词结果"
            list_garbagesT.append(data)
            continue
        #print("开始停词")
        for word in list_words:
            if word in list_words_stop:
                #print("停了个词" + word)
                continue
            if word == '':
                data.error = "包含空白分词"
                list_garbagesT.append(data)
                break
            # count the word's frequency in the original text
            contentT = data.content
            count = 0
            while contentT.find(word) > -1:
                contentT = contentT.replace(word, '', 1)
                count += 1
            if count == 0:
                data.error = "分词不属于原文"
                list_garbagesT.append(data)
                break
            # store the frequency count
            data.dict_words_tfidf[word] = count
        if len(data.dict_words_tfidf) == 0:
            data.error = "词频统计结果为空"
            list_garbagesT.append(data)
            continue

    # remove garbage records
    for data in list_garbagesT:
        list_datas.remove(data)
        list_garbages.append(data)
    list_garbagesT.clear()

    pynlpir.close()

    time_end = time.time()
    print("用时 : %.2f s" % (time_end - time_start))
    return list_datas
Code example #39
def partition(input_path, output_path):
    '''
    Segment the text files under input_path and write the results to output_path
    :param input_path: path to the source text files
    :param output_path: path for the segmentation results
    :return: number of words that raised encoding errors
    '''
    f3 = tempfile.NamedTemporaryFile(mode='w+t', encoding='utf-8')
    f3_name = f3.name

    stop_set = []
    f_stop_list = open(
        'C:/Users/i-zhanghaoran/Desktop/Extract_main_word&Sentiment_anaylsis/extract_main_word/stop_list.txt',
        'r',
        encoding='utf-8')
    for line in f_stop_list:
        stop_set.append(line.split()[0])
    stop_set = set(stop_set)

    os.chdir(input_path)
    f_lst = os.listdir(os.getcwd())
    cnt1 = 0
    nlpir = pynlpir.nlpir
    pynlpir.open()
    nlpir.ImportUserDict(
        b'C:/Users/i-zhanghaoran/Desktop/Extract_main_word&Sentiment_anaylsis/new_bigdic.txt'
    )
    for item in f_lst:
        ans_lst = []
        f = open(item, 'r', encoding='utf-8')
        s = bytes(f.read(), encoding='utf-8')
        f.close()

        size = ctypes.c_int()
        result = nlpir.ParagraphProcessA(s, ctypes.byref(size), True)
        result_t_vector = ctypes.cast(result, ctypes.POINTER(nlpir.ResultT))
        words = []

        for i in range(0, size.value):
            r = result_t_vector[i]
            word = s[r.start:r.start + r.length]
            words.append((word, r.sPOS))

        f2 = open(output_path + item, 'w', encoding='utf-8')
        for word, pos in words:
            # try:
            if word.decode('utf-8') not in stop_set:
                if pos.decode('utf-8') > b'z'.decode('utf-8') or pos.decode(
                        'utf-8').upper() == pos.decode(
                            'utf-8') and pos.decode('utf-8') != '':
                    ans_lst.append((pos.decode('utf-8'), word.decode('utf-8')))
                f2.write(
                    (word.decode('utf-8') + '  ' + pos.decode('utf-8') + '\n'))
                f3.write(
                    (word.decode('utf-8') + '  ' + pos.decode('utf-8') + '\n'))
                # except:
                #     cnt1+=1
                # else:
                #     f2.write(word.decode('utf-8') + '\n')

        keys = pynlpir.get_key_words(s, max_words=10, weighted=False)
        ans_set = list(set(ans_lst))
        feqrence = [0 for k in range(len(ans_set))]
        for k in range(len(ans_set)):
            for item in ans_lst:
                if item == ans_set[k]:
                    feqrence[k] += 1
        f2.write('\n\nMy tags: ')
        type_lst = []
        for item in ans_set:  # ans_set:  ('COMPANY_OF_INDUSTRY_56', '兴业银行')
            if item[0] not in type_lst:
                type_lst.append(item[0])
        type_lst.sort()

        ans_s = ''
        for k in range(len(type_lst)):
            ans_s += str(type_lst[k]) + ': '
            for l in range(len(ans_set)):
                if ans_set[l][0] == type_lst[k]:
                    # call a helper here to express the relationship between stocks and funds
                    ans_s += stock2fund(ans_set, feqrence, l)
                    # ans_s+=' ('+str(ans_set[l][1])+': '+str(feqrence[l])+')'
            ans_s += '\n'
        f2.write(ans_s)
        f2.write('\n\nkeyword: ')

        # count the frequencies of the keywords returned by the segmenter
        keys_f = [0 for l in range(len(keys))]

        commen_last_name = [
            '王', '李', '张', '刘', '陈', '杨', '黄', '赵', '吴', '周', '徐', '孙', '马',
            '朱', '胡', '郭', '何', '高', '林', '郑', '谢', '罗', '梁', '宋', '唐', '许',
            '韩', '冯', '邓', '曹', '彭', '曾', '蕭', '田', '董', '袁', '潘', '于', '蒋',
            '蔡', '余', '杜', '叶', '程', '苏', '魏', '吕', '丁', '任', '沈', '姚', '卢',
            '姜', '崔', '钟', '谭', '陆', '汪', '范', '金', '石', '廖', '贾', '夏', '韦',
            '付', '方', '白', '邹', '孟', '熊', '秦', '邱', '江', '尹', '薛', '闫', '段',
            '雷', '侯', '龙', '史', '陶', '黎', '贺', '顾', '毛', '郝', '龚', '邵', '万',
            '钱', '严', '覃', '武', '戴', '莫', '孔', '向', '汤'
        ]
        ans3 = ''

        f3.seek(0)
        for line in f3:
            if len(line.split()) == 2:
                name = line.split()[0]
                pos = line.split()[1]
                for l in range(len(keys)):
                    if name == keys[l]:
                        keys_f[l] += 1
                if name[0] in commen_last_name and name not in [
                        '万元', '周一', '周二', '周三', '周四', '周五', '周六', '周日', '周天'
                ] and len(name) in [2, 3] and pos == 'nr':
                    ans3 += '  ' + name

        ans2 = ''
        for l in range(len(keys)):
            ans2 += str(keys[l]) + ': ' + str(keys_f[l]) + '  '

        f2.write(ans2)
        f2.write('\n\nRelated person: ' + ans3)
        f2.close()

    pynlpir.close()
    return cnt1
Code example #40
File: word.py Project: zhangli344236745/bigdata
__author__ = 'gohper'
# -*- coding:utf-8 -*-

import pynlpir

pynlpir.open()

s = "聊天机器人到底该怎么做呢?"
segs = pynlpir.segment(s)
for seg in segs:
    print seg[0], '\t', seg[1]

print("_____")

s1 = "海洋是如何形成的"
segs = pynlpir.segment(s1, pos_names='all')
for seg in segs:
    print seg[0], '\t', seg[1]

print("_________")
key_words = pynlpir.get_key_words(s, weighted=True)
for key_word in key_words:
    print key_word[0], '\t', key_word[1]

pynlpir.close()
Code example #41
def extract_tags(s, topK=5, weighted=False):
    return pynlpir.get_key_words(s, topK, weighted)
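All of the snippets above follow the same pattern: pynlpir.open(), one or more pynlpir.get_key_words() calls, then pynlpir.close(). The following is a minimal sketch that distills that pattern into a reusable helper, releasing the NLPIR resources in a finally block even when extraction fails; the helper name extract_keywords, the sample sentence, and the max_words value are illustrative assumptions rather than code from any project listed above.

import pynlpir


def extract_keywords(text, max_words=10):
    """Return (keyword, weight) pairs for text, always releasing NLPIR resources."""
    pynlpir.open()
    try:
        # weighted=True yields (keyword, weight) tuples instead of bare strings
        return pynlpir.get_key_words(text, max_words=max_words, weighted=True)
    finally:
        pynlpir.close()


if __name__ == '__main__':
    # sample sentence reused from the test examples above
    for word, weight in extract_keywords('我们都是美国人。'):
        print(word, '\t', weight)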