def bosonnlp_segmentation(str_list):
    nlp = BosonNLP('NBSC61pl.10304.Fnwc_rUz9fyw')
    result = nlp.tag(str_list)
    for tag_map in result:
        word_tokens = tag_map['word']
        for word in word_tokens:
            print word.encode("utf-8") + "|",
        print "\n"
Esempio n. 2
0
File: nlp.py Progetto: polarbird/ai
class YoNLP:
    """Thin facade over a BosonNLP client created from an API token."""

    def __init__(self, boson_api_token):
        self._nlp = BosonNLP(boson_api_token)

    def sentiment(self, contents):
        """Forward ``contents`` to the client's ``sentiment`` call and return its result."""
        analysis = self._nlp.sentiment(contents)
        return analysis

    def tag(self, contents):
        """Forward ``contents`` to the client's ``tag`` call and return its result."""
        tagged = self._nlp.tag(contents)
        return tagged
Esempio n. 3
0
class CNSegment:
    """
    Wrapper around a word-segmentation tool.
    Uses the API provided by bosonnlp.
    """

    # Stop-word list consulted by denoisingOne when useStopWord is true.
    stopwords = []

    def __init__(self):
        self.nlp = BosonNLP(bosonkey)

    def get_tags(self, sentences):
        """
        Run segmentation / tagging.
        :param sentences: a sentence or a list of sentences to segment
        :return: list of tagging results
        """
        return self.nlp.tag(sentences)

    def denoisingOne(self, tagdict, uTag=None, useStopWord=False):
        """Remove noise from one tagging result using POS tags and stop words.

        :param tagdict: one tagging result with parallel 'word' and 'tag' lists
        :param uTag: leading tag characters whose words are discarded;
            ('w', 'o', 'y', 'u') is used when omitted or empty
        :param useStopWord: when true, words found in ``self.stopwords`` are
            discarded as well
        :return: list of the words that were kept
        """
        uselessTag = uTag if uTag else ('w', 'o', 'y', 'u')
        word_list = []
        for word, tag in zip(tagdict['word'], tagdict['tag']):
            # Only the first character of the tag decides the word class.
            if tag[0] in uselessTag:
                continue
            if useStopWord and word in self.stopwords:
                continue
            word_list.append(word)
        return word_list

    def cut(self, sentences):
        """
        Segment and denoise.
        :param sentences: corpus to segment
        :return: one denoised word list per tagging result
        """
        return [self.denoisingOne(sentence) for sentence in self.get_tags(sentences)]

    def depenPars(self, sentences):
        """Return the client's ``depparser`` result for ``sentences``."""
        return self.nlp.depparser(sentences)
Esempio n. 4
0
def getAnswerNounKeys(text_set, api_key):
    """Join every word tagged exactly 'n' and hand the string to getAnswerKeys.

    :param text_set: value passed unchanged to ``BosonNLP.tag``
    :param api_key: BosonNLP token, also forwarded to ``getAnswerKeys``
    :return: whatever ``getAnswerKeys`` returns for the joined nouns
    """
    tagged_docs = BosonNLP(api_key).tag(text_set)
    nouns = [
        word
        for doc in tagged_docs
        for word, pos in zip(doc['word'], doc['tag'])
        if pos == 'n'
    ]
    return getAnswerKeys(''.join(nouns), api_key)
Esempio n. 5
0
def segment_tag(text):
    """Tag ``text`` with BosonNLP and return the first result's words and tags.

    :return: tuple ``(words, tags)``; both sequences are asserted to have
        the same length
    """
    first = BosonNLP('2DgGSC-8.33497.8yeNchBP6L9n').tag(text)[0]
    words, tags = first['word'], first['tag']
    assert len(words) == len(tags)
    return words, tags
Esempio n. 6
0
def getAnswerNounKeys(text_set, api_key):
    """Collect the words tagged exactly 'n' and pass them on to getAnswerKeys.

    The words are concatenated without a separator before being passed,
    together with ``api_key``, to ``getAnswerKeys``.
    """
    client = BosonNLP(api_key)
    noun_parts = []
    for doc in client.tag(text_set):
        for token, pos in zip(doc['word'], doc['tag']):
            if pos != 'n':
                continue
            noun_parts.append(token)
    return getAnswerKeys(''.join(noun_parts), api_key)
Esempio n. 7
0
    def _boson_seg(self, text):
        """Segment a text or a list of texts with BosonNLP, 100 texts per request.

        :param text: a single string or a list of strings
        :return: list holding one word list per input text
        """
        nlp = BosonNLP('g8lQg9Mv.25818.fAbbwt6TYhh8')
        if isinstance(text, str):
            text = [text]

        batch_size = 100
        word = []
        # Stepping by batch_size never yields an empty trailing batch, so no
        # request is sent for an empty corpus or after an exact multiple of 100.
        for start in range(0, len(text), batch_size):
            result = nlp.tag(text[start:start + batch_size])
            word.extend(seg['word'] for seg in result)

        return word
Esempio n. 8
0
def ScentenceSimilar(str1, str2):
    """Return the similarity of str1 and str2, computed as cosine similarity.

    Segmentation is done by bosonnlp, so network access is required.
    """
    client = BosonNLP('wx3Ua05Y.21658.Ch876jBfuqIH')

    # Tag both lower-cased strings first, then build a term-frequency dict
    # from the first tagging result of each.
    tagged_pair = [client.tag(text.lower()) for text in (str1, str2)]
    tf_first, tf_second = (
        getTFdict(Denoising(tagged[0])) for tagged in tagged_pair
    )

    return getSimilar_by_cos(tf_first, tf_second)
Esempio n. 9
0
def words_cut(txt_lines, isJieba=True):
    """Segment every line and return the results as a list of strings.

    :param txt_lines: iterable of text lines
    :param isJieba: when true, segment with ``cut`` (after stripping the
        line); otherwise segment the raw line with BosonNLP
    :return: list of space-joined token strings, each ending in a newline
    """
    if isJieba:
        # Per the original note, cut() returns a one-shot generator; join
        # consumes it right away.
        return [" ".join(cut(raw.strip())) + "\n" for raw in txt_lines]

    client = BosonNLP('QhCMB7FS.33943.0OYvhfw0JCx8')
    segmented = []
    for raw in txt_lines:
        # tag() returns a nested list shaped like [{'word': [tokens], ...}]
        tokens = client.tag(raw)[0]['word']
        segmented.append(" ".join(tokens) + '\n')
    return segmented
Esempio n. 10
0
def Text_Segmentation_5_1():
    """Segment a fixed input file line by line and append the words to a fixed output file.

    Each input line is tagged with BosonNLP and the words of the first
    result are appended, space-separated, as one line of the output file.
    Nothing is created or requested when the input file is empty.
    """
    # Some files are GBK encoded and need an explicit encoding argument when
    # read; Notepad files saved as ANSI raise an error when read with
    # encoding='utf-8'.
    with open('static/files/方滨兴_互动百科.txt', 'r', encoding='utf-8') as input_txt:
        lines = input_txt.readlines()

    if not lines:
        return

    # One client and one output handle serve every line.
    nlp = BosonNLP('QhCMB7FS.33943.0OYvhfw0JCx8')
    with open('static/files/方滨兴_互动百科_split_unattributed.txt',
              mode='a',
              encoding='utf-8') as output_txt:
        for line in lines:
            result = nlp.tag(line)[0]['word']
            # Written as plain text rather than as the list's repr.
            output_txt.write('{}\n'.format(' '.join(result)))
Esempio n. 11
0
def words_cut(filename, isJieba=True):
    """Segment a UTF-8 text file line by line and return the results as a list.

    :param filename: path of the file to read
    :param isJieba: when true, segment with ``cut`` (after stripping the
        line); otherwise segment the raw line with BosonNLP
    :return: list of space-joined token strings, each ending in a newline
    """
    with open(filename, 'r', encoding='utf-8') as source:
        raw_lines = source.readlines()

    if isJieba:
        # Per the original note, cut() returns a one-shot generator; join
        # consumes it right away.
        return [" ".join(cut(raw.strip())) + "\n" for raw in raw_lines]

    client = BosonNLP('QhCMB7FS.33943.0OYvhfw0JCx8')
    segmented = []
    for raw in raw_lines:
        # tag() returns a nested list shaped like [{'word': [tokens], ...}]
        tokens = client.tag(raw)[0]['word']
        segmented.append(" ".join(tokens) + '\n')
    return segmented
Esempio n. 12
0
class Scanner(object):
    """
    Chinese word segmentation through bosonnlp.
    """
    def __init__(self):
        self.nlp_handler = BosonNLP(API_TOKEN_BOSONNLP)

    def get_tag(self, content, remove_punctuations=False):
        """
        Return the segmented words of ``content`` as a list.

        :param content: text handed to the client's ``tag`` call; only the
            first result is used
        :param remove_punctuations: when true, words whose tag starts with
            'w' are left out
        """
        result = self.nlp_handler.tag(content)[0]
        if remove_punctuations:
            return [
                word for word, tag in zip(result['word'], result['tag'])
                if not tag.startswith('w')
            ]
        return result['word']

    def get_key_word(self, content, segmented=False):
        """Extract the key word(s) of ``content``.

        The client is asked for the top two keywords, each given as a
        (weight, word) pair.

        :return: the first word alone when its weight exceeds the second's
            by more than 0.3 or when it is the only keyword; both words
            joined by a space otherwise; '' when no keyword was returned
        """
        keywords = self.nlp_handler.extract_keywords(content, 2, segmented)
        # Short inputs may yield fewer than two keywords.
        if not keywords:
            return ''
        firstkey = keywords[0]
        if len(keywords) == 1:
            return firstkey[1]
        secondkey = keywords[1]
        if (firstkey[0] - secondkey[0]) > 0.3:
            return firstkey[1]
        return ' '.join([firstkey[1], secondkey[1]])
Esempio n. 13
0
    # '纪检部门仍在调查之中。成都商报记者 姚永忠']
    # result = nlp.ner(s)
    # print result
    # print ' '.join([x for x in result[0]['word']])

    fname = 'D:\\Github\\Sentiment-Analysis\\data\\nlpcc_emotion\\train\\neg_raw'
    all_texts = [x.strip() for x in open(fname).readlines()]
    for i in range(7000):
        print "handing "+str(i+1)+"th 100 documents....."
        start = i*100
        end = start+100
        if start >= len(all_texts):
            break
        texts = all_texts[start:end]
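        # Keep only one of consecutive spaces, convert Traditional to Simplified Chinese, set the new-word enumeration strength to 3 (fairly strong), and do not convert special characters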
        result = nlp.tag(texts, space_mode=1, oov_level=3, t2s=1, special_char_conv=0)
        f1 = open(fname + '_fenci', 'a')
        f2 = open(fname + '_pos', 'a')
        f3 = open(fname + '_cobine', 'a')
        for d in result:
            fenci_text = ' '.join([x.encode('utf8') for x in d['word']])
            pos_text = ' '.join(['tag_'+x.encode('utf8') for x in d['tag']])
            cobine_text = ' '.join([x.encode('utf8') + '/' + y.encode('utf8') for x, y in zip(d['word'], d['tag'])])
            f1.write(fenci_text + '\n')
            f2.write(pos_text + '\n')
            f3.write(cobine_text + '\n')
        f1.close()
        f2.close()
        f3.close()
    print 'over'
Esempio n. 14
0
# -*- encoding: utf-8 -*-
from bosonnlp import BosonNLP
import os

#reference from http://bosonnlp-py.readthedocs.io/#bosonnlp-py

# Create the client; the string literal is presumably a placeholder for a
# real API token — replace before running.
nlp = BosonNLP('bosonnlp的API')
# or nlp = BosonNLP(os.environ['BOSON_API_TOKEN'])
# Call ner() on a single string (with a sensitivity keyword), then on a list
# of strings; both return values are discarded.
nlp.ner('你好啊', sensitivity=2)
nlp.ner(['成都商报记者 姚永忠', '微软XP操作系统今日正式退休'])
# Call tag() on a single string and pass the first result on.
result = nlp.tag('成都商报记者 姚永忠')
# NOTE(review): format_tag_result is neither defined nor imported in this
# snippet — confirm where it comes from.
format_tag_result(result[0])
# tag() also accepts a list and keyword options; each call below overwrites
# `result` and nothing reads it afterwards.
result = nlp.tag(['亚投行意向创始成员国确定为57个', '“流量贵”频被吐槽'], oov_level=0)
result = nlp.tag("成都商报记者 姚永忠", space_mode=2)
Esempio n. 15
0
class _BosonNLPWrapper(object):
    """
    NLP object using the BosonNLP API Python SDK.
    """

    news_categories = ['physical education', 'education', 'finance', 'society', 'entertainment', 'military',
                       'domestic', 'science and technology', 'the internet', 'real estate', 'international',
                       'women', 'car', 'game']

    def __init__(self, api_token=None):
        try:
            assert api_token is not None, "Please provide an API token"
        except AssertionError as e:
            raise

        self.token = api_token
        self.nlp = BosonNLP(self.token)


    def get_sentiment(self, text):
        """
        Performs sentiment analysis on a text passage (works for Chinese text).
        See: http://docs.bosonnlp.com/sentiment.html

        Parameters
        ----------
        text (string): text passage to be analyzed for sentiment


        Returns
        -------
        dictionary with 'positive' and 'negative' as keys with their respective weights as values

        >>> nlp = BosonNLPWrapper('')
        >>> nlp.get_sentiment('不要打擾我')
        {'positive': 0.3704911989140307, 'negative': 0.6295088010859693}
        >>> nlp.get_sentiment('我很高興跟你見面')
        {'positive': 0.856280735624867, 'negative': 0.14371926437513308}
        """
        pos, neg = self.nlp.sentiment(text)[0]

        return {'positive': pos, 'negative': neg}


    def classify_news(self, text):
        """
        Classifies news text into 14 different categories.
        See: http://docs.bosonnlp.com/classify.html

        Parameters
        ----------
        text (string): text passage to classify into news categories defined in news_categories

        Returns
        -------
        one of the 14 categories in news_categories that the text was classified into
        """
        numbering = range(len(_BosonNLPWrapper.news_categories))
        cats_dict = dict(zip(numbering, _BosonNLPWrapper.news_categories))

        clsfy_num = self.nlp.classify(text)[0]

        return cats_dict[clsfy_num]


    def extract_keywords(self, text, top_k=3):
        """
        Extracts the top k keywords and the weight of each word in the text.
        See: http://docs.bosonnlp.com/keywords.html

        Parameters
        ----------
        text (string): text passage from which to extract keywords
        top_k (integer): number of keywords to return

        Returns
        -------
        list of key-value pairs {word: weight}


        >>> nlp = BosonNLPWrapper('')
        >>> nlp.extract_keywords('我最愛老虎堂,奶茶香醇,波霸彈Q 好香的黑糖味')
        [{'波霸彈': 0.5980681967308248}, {'黑糖': 0.4699792421671365}, {'香醇': 0.4497614275300947}]
        """
        result = self.nlp.extract_keywords(text, top_k)  # outputs in sorted order of weight

        return [{result[i][1]: result[i][0]} for i in range(len(result))]


    def segment_words_and_tag(self, text):
        """
        Splits up text into segments of "words" and tags them with their respective part of speech.
        See: http://docs.bosonnlp.com/tag.html

        Parameters
        ----------
        text (string): text passage to segment into separate "words" and tags them with parts of speech

        Returns
        -------
        list of key-value pairs {word: part-of-speech-tag}
        """
        result = self.nlp.tag(text)[0]
        words = result['word']
        tags = result['tag']

        return [{words[i]: tags[i]} for i in range(len(words))]


    def get_summary(self, content, title='', pct_limit=0.2):
        """
        Extracts a new digest (summary) of the content.
        See: http://docs.bosonnlp.com/summary.html

        Parameters
        ----------
        text (string): text passage to summarize
        title (string): title of the passage (optional, may provide more accurate results)
        pct_limit (float): max length of the summary in terms of percentage of the original word count

        Returns
        -------
        string containing the summary of the passage
        """
        summary = self.nlp.summary(title, content, pct_limit)

        return summary
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals
from bosonnlp import BosonNLP
import json

# Read the whole corpus file as UTF-8 text.
file_path = r"C:\workspace\Taikor_NLP_service\Thirdparty_NLP_WebAPI\Bosson\corpos\msr_test.txt"
with open(file_path, "r", encoding="utf8") as f:
    s = f.read()

# Tag the entire text in a single call.
nlp = BosonNLP("2ZmFSLeL.3212.Y6W7eOViuyZZ")
pos = nlp.tag(s)

# Serialize the tagging result as JSON ...
dump = json.dumps(pos)

# ... and store it in a file named "pos" in the current working directory.
with open("pos", "w") as f:
    f.write(dump)
Esempio n. 17
0
class C2E(object):
    """
    """
    def __init__(self, opt):
        self.opt = opt
        self.sep = opt.seprator + " "
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)
        self.bpe = BPE(codecs.open(self.opt.bpe_codes, 'r', encoding="UTF-8"),
                       self.opt.seprator, None, None)

        self.translator = onmt.Translator(opt)

        self.nlp = BosonNLP("NGhNiav2.16134.DvyEDmGzYd2S")

    def seg(self, doc):
        res = ""
        try:
            print "using boson....."
            boson_res = self.nlp.tag(l)
            res = boson[0]['word']
        except:
            res = jieba.cut(doc, cut_all=False)
        return " ".join(res)

    def truecase(self, text):
        text = text.encode('utf-8')
        truecase_sents = []
        tagged_sent = nltk.pos_tag(
            [word.lower() for word in nltk.word_tokenize(text)])
        normalize_sent = [
            w.captitalize() if t in ['NN', 'NNS'] else w
            for (w, t) in tagged_sent
        ]
        normalize_sent[0] = normalize_sent[0].capitalize()
        pretty_string = re.sub(" (?=[\.,'!?:;])", "",
                               ' '.join(normalized_sent))
        return pretty_string

    def tokenDoc(self, doc):
        doc = doc.strip()
        sentenceList = re.split(PAT, doc.decode('utf-8'))
        assert len(sentenceList) >= 1
        if sentenceList[-1].strip() == "":
            sentenceList = sentenceList[:-1]
        punctuaList = re.findall(PAT, doc.decode('utf-8'))
        punctuaList += (len(sentenceList) - len(punctuaList)) * [' ']
        sents = [
            sent + punc for (sent, punc) in zip(sentenceList, punctuaList)
        ]
        sents = [sent.strip() for sent in sents]
        print 'c2e sentenceList : ', sentenceList
        tokens = []
        for sent in sents:
            sent = sent.lower()
            #sent = self.detokenizer.unescape_xml(self.tokenizer.tokenize(sent, return_str=True))
            sent = self.seg(sent)
            if self.opt.bpe_codes != "":
                sent = self.bpe.segment(sent).strip()
            token = sent.split()
            tokens += [token]
        print 'c2e tokens : ', tokens
        return tokens

    def translate(self, doc):
        batch = self.tokenDoc(doc)
        pred, _, _, _, _ = self.translator.translate(batch, None)
        rstr = ""
        for idx in range(len(pred)):
            pred_sent = ' '.join(pred[idx][0]).replace(' @-@ ', '-').replace(
                self.sep, '')
            #pred_sent = self.truecase(pred_sent)
            pred_sent = pred_sent.capitalize()
            rstr += pred_sent + "\n"
        print 'c2e rstr : ', rstr.strip()
        return rstr.strip()
Esempio n. 18
0
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals
from bosonnlp import BosonNLP


def stopwordlist(path=r'G:\python\_files\error_dc\ChangeStopWords.txt'):
    """Return the stop words stored one per line in a UTF-8 text file.

    :param path: file to read; defaults to the original hard-coded location
    :return: list with every line of the file, stripped of surrounding
        whitespace (blank lines become empty strings)
    """
    # The context manager closes the file, which the original never did.
    with open(path, encoding='UTF-8') as stop_file:
        return [line.strip() for line in stop_file]


word_list = []
with open(r'G:\pycharm\DataSet\_1\_train_small.csv', 'r', encoding='utf-8') as fR:
    sent = fR.read()

nlp = BosonNLP('your API token')
sent_list = nlp.tag(sent)

# Load the stop words once; a set gives constant-time membership tests.
stopwords = set(stopwordlist())

# tag() yields one dict per input text with the segmented words under
# 'word'; filter those words, not the dicts themselves.
for tagged in sent_list:
    for t in tagged['word']:
        if t not in stopwords:
            word_list.append(t)

with open(r'G:\python\out\_test_jieba_big_02.csv', 'w') as fW:
    fW.write(' '.join(word_list))
Esempio n. 19
0
news_dir = '/home/ewan/PycharmProjects/news_spider/news/'
file_list = os.listdir(news_dir)

nlp = BosonNLP('FuHSE7Vf.13924.jadflTdrQLWx')
splitted_titles = []

SIZE = len(file_list)
count = 0
# Report progress about every 1% of the files; the floor of 1 prevents a
# modulo-by-zero when there are fewer than 100 files.
progress_step = max(SIZE // 100, 1)

# A word is dropped when any of these strings occurs in its POS tag.
stop_tags = ['w', 't', 'q', 'u', 'k', 'h', 'o', 'y', 'c', 'p', 'd', 'r']

for title in file_list:
    count += 1
    if count % progress_step == 0:
        print(count)
    result = nlp.tag(title)
    tagged = result[0]
    kept = [
        word for word, pos in zip(tagged['word'], tagged['tag'])
        if not any(tag in pos for tag in stop_tags)
    ]
    # Every kept word is followed by a single space, as before.
    words = ''.join(word + ' ' for word in kept)

    splitted_titles.append(words)


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
Esempio n. 20
0
class _BosonNLPWrapper(object):
    """
    NLP object using the BosonNLP API Python SDK.
    """

    news_categories = [
        'physical education', 'education', 'finance', 'society',
        'entertainment', 'military', 'domestic', 'science and technology',
        'the internet', 'real estate', 'international', 'women', 'car', 'game'
    ]

    def __init__(self, api_token=None):
        try:
            assert api_token is not None, "Please provide an API token"
        except AssertionError as e:
            raise

        self.token = api_token
        self.nlp = BosonNLP(self.token)

    def get_sentiment(self, text):

        pos, neg = self.nlp.sentiment(text)[0]

        return {'positive': pos, 'negative': neg}

    def classify_news(self, text):

        numbering = range(len(_BosonNLPWrapper.news_categories))
        cats_dict = dict(zip(numbering, _BosonNLPWrapper.news_categories))

        clsfy_num = self.nlp.classify(text)[0]

        return cats_dict[clsfy_num]

    def extract_keywords(self, text, top_k=3):

        result = self.nlp.extract_keywords(
            text, top_k)  # outputs in sorted order of weight

        return [{result[i][1]: result[i][0]} for i in range(len(result))]

    def segment_words_and_tag(self, text):
        """
        Splits up text into segments of "words" and tags them with their respective part of speech.
        See: http://docs.bosonnlp.com/tag.html

        Parameters
        ----------
        text (string): text passage to segment into separate "words" and tags them with parts of speech

        Returns
        -------
        list of key-value pairs {word: part-of-speech-tag}
        """
        result = self.nlp.tag(text)[0]
        words = result['word']
        tags = result['tag']

        return [{words[i]: tags[i]} for i in range(len(words))]

    def get_summary(self, content, title='', pct_limit=0.2):
        """
        Extracts a new digest (summary) of the content.
        See: http://docs.bosonnlp.com/summary.html

        Parameters
        ----------
        text (string): text passage to summarize
        title (string): title of the passage (optional, may provide more accurate results)
        pct_limit (float): max length of the summary in terms of percentage of the original word count

        Returns
        -------
        string containing the summary of the passage
        """
        summary = self.nlp.summary(title, content, pct_limit)

        return summary
Esempio n. 21
0
# -*- coding: utf-8 -*-
import os
import sys
import datetime
import time
from bosonnlp import BosonNLP

myApiToken = "X0njNWj2.5612.pYnhvqV02Kgn"
nlp = BosonNLP(myApiToken)
# Tag the file line by line and print each raw result. The context manager
# closes the file even when a request fails.
with open("simple.txt") as source:
    for eachLine in source:
        result = nlp.tag(eachLine)
        print(result)
# print nlp.sentiment("这家味道还不错")
# print nlp.extract_keywords("instructor.txt")
### 1. 基本用法

from bosonnlp import BosonNLP

words_list = []

nlp = BosonNLP('g8lQg9Mxx.25818.fAbbwt6TYhh8')  # authenticate with the token
result = nlp.tag('承德市长江大桥')
first = result[0]

# Raw result, then the word list and the tag list of the first entry.
print(result)
print(first['word'])
print(first['tag'])

# Each word/tag pair followed by a space, then the line is ended.
for word, pos in zip(first['word'], first['tag']):
    print(word + '/' + pos, end=' ')
print()

# The same pairs again, joined into one space-separated string.
print(' '.join(word + '/' + pos for word, pos in zip(first['word'], first['tag'])))

### 2. 一次处理5篇文章的方法

from bosonnlp import BosonNLP
import requests

tokens = ['g8lQxxMv.25818.fAbbwt6TYhh8',]  # boson api token

# Check Usage Time
# The first token is sent in the X-Token header of a GET request to the
# rate-limit status endpoint; the JSON response is kept in `result`.
HEADERS = {'X-Token': tokens[0]}
RATE_LIMIT_URL = 'http://api.bosonnlp.com/application/rate_limit_status.json'
result = requests.get(RATE_LIMIT_URL, headers=HEADERS).json()
# Value of limits -> tag -> count-limit-remaining in the response;
# presumably the number of tag calls still available — verify with the API docs.
canUseTime = result['limits']['tag']['count-limit-remaining']
Esempio n. 23
0
def tag(entity):
    """Tag ``entity`` with BosonNLP, print every result and return the raw result.

    Each result is printed on one line as space-separated ``word/tag`` pairs.
    """
    client = BosonNLP(boson_token)
    tagged = client.tag(entity)
    for doc in tagged:
        pairs = ['%s/%s' % pair for pair in zip(doc['word'], doc['tag'])]
        print(' '.join(pairs))
    return tagged
Esempio n. 24
0
# coding: utf-8
"""
function:利用bosonnlp的api实现文本的分词和词性标注
token:***************************
author:[email protected]
"""
from bosonnlp import BosonNLP
import os
nlp = BosonNLP('****************************')
# Read the text; the context manager closes the file even if reading or
# decoding fails.
with open("NBA.txt", "r") as f:
    string = f.read().decode("utf-8")
result = nlp.tag(string)
# The full call with every parameter looks like this:
# result = nlp.tag(s, space_mode=0, oov_level=3, t2s=0, special_char_conv=0)
# With space_mode set to 1:
# result = nlp.tag(s, space_mode=1, oov_level=3, t2s=0, special_char_conv=0)
# With oov_level set to 1:
# result = nlp.tag(s, space_mode=0, oov_level=1, t2s=0, special_char_conv=0)
# With t2s set to 1:
# result = nlp.tag(s, space_mode=0, oov_level=3, t2s=1, special_char_conv=0)
# With special-character conversion set to 1:
# result = nlp.tag(s, space_mode=0, oov_level=3, t2s=0, special_char_conv=1)
for d in result:
    print(' '.join(d['word']))  # segmentation result
for t in result:
    for word, tag in zip(t['word'], t['tag']):
        print(word + " " + tag)  # POS tagging result
Esempio n. 25
0
    if html.has_key('error'):
        return 0
    return float(html['similarity'])


def hownet_sentence_sim(s1, s2):
    """Return the similarity of two sentences as reported by the yuzhinlp API.

    Both sentences are POSTed to the getShortSimilarityApi endpoint.

    :param s1: first sentence (sent as 'text1')
    :param s2: second sentence (sent as 'text2')
    :return: ``float`` of the response's 'success' field, or 0 when the
        response contains an 'error' key
    """
    data = {'apiKey': "vpze450m", 'text1': s1, 'text2': s2}
    url = 'http://yuzhinlp.com/api/getShortSimilarityApi.do'
    # The response text is already decoded, so json.loads needs no encoding
    # argument (it was removed in Python 3.9).
    html = json.loads(requests.post(url, data).text)
    if 'error' in html:
        return 0
    return float(html['success'])


if __name__ == "__main__":
    scentences1 = readFFile(r"testSet/trainSet.txt")
    scentences2 = readFFile(r"testSet/testSet1.txt")

    nlp = BosonNLP('wx3Ua05Y.21658.Ch876jBfuqIH')

    # Get the segmentation of the two sentences being compared. Tagging a
    # single sentence returns a one-element list, so the result is read at
    # index 0 (the original indexed it with 16 and raised IndexError).
    tags1 = nlp.tag(scentences1[16])
    tags2 = nlp.tag(scentences2[16])
    wordA = Denoising(tags1[0])
    wordB = Denoising(tags2[0])

    print(hownet_sentence_sim(scentences1[16], scentences2[16]))
Esempio n. 26
0
# -*- encoding: utf-8 -*-
from __future__ import print_function, unicode_literals
from bosonnlp import BosonNLP

# Some files are GBK encoded and need an explicit encoding argument when
# read; Notepad files saved as ANSI raise an error when read with
# encoding='utf-8'.
with open('邬贺铨_搜狗百科.txt', 'r', encoding='utf-8') as input_txt:
    lines = input_txt.readlines()

if lines:
    # One client and one output handle serve every line.
    nlp = BosonNLP('QhCMB7FS.33943.0OYvhfw0JCx8')
    with open('邬贺铨_搜狗百科_split_unattributed.txt',
              mode='a',
              encoding='utf-8') as output_txt:
        for line in lines:
            result = nlp.tag(line)[0]['word']
            # Written as plain text rather than as the list's repr.
            output_txt.write('{}\n'.format(' '.join(result)))

# # 注意:在测试时请更换为您的API token。
# nlp = BosonNLP('QhCMB7FS.33943.0OYvhfw0JCx8')
# s = '游戏很喜欢,希望赶紧过了无法打开家园这一块,要不买之前给哥提醒也可以,买完之后告诉玩家进不去要等审核有一点小生气。'
# result = nlp.tag(s)[0]['word']
# print(' '.join(result))
# print(result)

# 注意:在测试时请更换为您的API token。
# nlp = BosonNLP('QhCMB7FS.33943.0OYvhfw0JCx8')
#
# s = ['亚投行意向创始成员国确定为57个', '“流量贵”频被吐槽']
#
def _write_labelled_facts(nlp, source, sink):
    """Write one labelled, segmented line to ``sink`` per accepted record of ``source``.

    A record is skipped when it has two or more relevant articles, two or
    more accusations, or an imprisonment value above 180. The running count
    of written records is printed after each write.
    """
    separator = '     '
    written = 0
    for item in jl.Reader(source):
        meta = item['meta']
        relevant_articles = meta['relevant_articles']
        if len(relevant_articles) >= 2:
            continue
        relevant_articles = relevant_articles[0]
        accusation = meta['accusation']
        if len(accusation) >= 2:
            continue
        accusation = accusation[0]
        term = meta['term_of_imprisonment']
        imprisonment = term['imprisonment']
        if imprisonment > 180:
            continue
        # Tag only after every filter has passed, so skipped records do not
        # cost an API request.
        sentence = nlp.tag(item['fact'])[0]['word']
        # Death penalty and life imprisonment are both written as 400.
        if (term['death_penalty'] is True) or (term['life_imprisonment'] is True):
            label = 400
        else:
            label = imprisonment
        sink.write(' '.join(sentence) + separator +
                   str(relevant_articles) + separator +
                   accusation + separator + str(label) + '\n')
        written += 1
        print(written)


def load_data(filepath):
    """Convert train.json and test.json under ``filepath`` into segmented text files.

    :param filepath: prefix prepended to the file names (no path separator
        is added). Reads ``train.json`` and ``test.json``; writes
        ``data_train.txt`` and ``data_test.txt``, truncating existing files.

    Each output line holds the space-joined words of the record's 'fact',
    the relevant article, the accusation and the term, separated by five
    spaces.
    """
    with open(filepath + 'train.json', 'r', encoding='utf-8') as data_train, \
            open(filepath + 'test.json', 'r', encoding='utf-8') as data_test, \
            open(filepath + 'data_train.txt', 'w+', encoding='utf-8') as train_data, \
            open(filepath + 'data_test.txt', 'w+', encoding='utf-8') as test_data:
        nlp = BosonNLP('sPB-JflO.34520.7EXOGbw_13LD')
        _write_labelled_facts(nlp, data_train, train_data)
        _write_labelled_facts(nlp, data_test, test_data)
Esempio n. 28
0
class BosonNlpp:
    """Convenience wrapper exposing several BosonNLP calls under one object."""

    def __init__(self):
        self.bonlp = BosonNLP('IKBIoANy.14545.A7GCYBnT9jIB')

    # Sentiment analysis
    def testSentiment(self, s):
        """Return the client's ``sentiment`` result for ``s``."""
        return self.bonlp.sentiment(s)

    # Named-entity recognition
    def lexicalAnalysis(self, s):
        """Return the first entry of the client's ``ner`` result for ``s``."""
        return self.bonlp.ner(s)[0]

    # Dependency parsing
    def textDependency(self, s):
        """Return the client's ``depparser`` result for ``s``."""
        return self.bonlp.depparser(s)

    # Keyword extraction
    def testKeywords(self, s):
        """Return the client's ``extract_keywords`` result for ``s`` with top_k=10."""
        return self.bonlp.extract_keywords(s, top_k=10)

    # News classification
    def textClassify(self, s):
        """Return the category name for the first classification result of ``s``."""
        resultlist = self.bonlp.classify(s)
        classifys = {
            0: '体育',
            1: '教育',
            2: '财经',
            3: '社会',
            4: '娱乐',
            5: '军事',
            6: '国内',
            7: '科技',
            8: '互联网',
            9: '房产',
            10: '国际',
            11: '女人',
            12: '汽车',
            13: '游戏'
        }
        return (classifys[resultlist[0]])

    # Semantic association
    def lexicalSynonym(self, term):
        """Return the client's ``suggest`` result for ``term`` with top_k=10."""
        return self.bonlp.suggest(term, top_k=10)

    # Word segmentation and POS tagging
    def fenci(self, s):
        """Return the client's ``tag`` result for ``s``."""
        return self.bonlp.tag(s)

    def newssubstract(self, s):
        """Return the client's ``summary`` result for ``s`` with an empty title.

        :param s: text to summarize; UTF-8 encoded bytes are decoded first,
            already-decoded text is passed through unchanged
        """
        # Decode only byte strings: calling .decode on text fails.
        if isinstance(s, bytes):
            s = s.decode('utf-8')
        return self.bonlp.summary('', s)
Esempio n. 29
0
 def Text_Seg_By_BosonNLP(line):
     """Return the word list of the first BosonNLP tagging result for ``line``."""
     client = BosonNLP('QhCMB7FS.33943.0OYvhfw0JCx8')
     first_result = client.tag(line)[0]
     return first_result['word']
Esempio n. 30
0
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a news.pkl that comes from a trusted source.
with open('news.pkl', 'rb') as f:
    news = pickle.load(f)

nlp = BosonNLP('FuHSE7Vf.13924.jadflTdrQLWx')
splitted_news = {}

SIZE = len(news)
count = 0
# Report progress about every 1% of the items; the floor of 1 prevents a
# modulo-by-zero when there are fewer than 100 items.
progress_step = max(SIZE // 100, 1)

# A word is kept when any of these strings occurs in its POS tag.
green_tags = ['n', 's', 'v']

for key, value in news.items():
    count += 1
    if count % progress_step == 0:
        print(count)
    result = nlp.tag([value])
    tagged = result[0]
    kept = [
        word for word, pos in zip(tagged['word'], tagged['tag'])
        if any(tag in pos for tag in green_tags)
    ]
    # Every kept word is followed by a single space, as before.
    words = ''.join(word + ' ' for word in kept)

    splitted_news[key] = words