def bosonnlp_segmentation(str_list):
    nlp = BosonNLP('NBSC61pl.10304.Fnwc_rUz9fyw')
    result = nlp.tag(str_list)
    for tag_map in result:
        word_tokens = tag_map['word']
        for word in word_tokens:
            print word.encode("utf-8") + "|",
        print "\n"
class YoNLP:
    def __init__(self, boson_api_token):
        self._nlp = BosonNLP(boson_api_token)

    def sentiment(self, contents):
        return self._nlp.sentiment(contents)

    def tag(self, contents):
        return self._nlp.tag(contents)
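# A minimal usage sketch for the YoNLP wrapper above; the token string is a
# hypothetical placeholder, not a working credential.
nlp = YoNLP('YOUR_BOSON_API_TOKEN')
print(nlp.sentiment('这家味道还不错'))   # [[positive_weight, negative_weight]]
print(nlp.tag('成都商报记者 姚永忠'))    # [{'word': [...], 'tag': [...]}]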
class CNSegment:
    """Wraps a word segmentation tool, using the API provided by bosonnlp."""

    # stop word list
    stopwords = []

    def __init__(self):
        self.nlp = BosonNLP(bosonkey)

    def get_tags(self, sentences):
        """Segment words.

        :param sentences: a sentence or list of sentences to segment
        :return: list of segmentation results
        """
        result = self.nlp.tag(sentences)
        return result

    def denoisingOne(self, tagdict, uTag=None, useStopWord=False):
        """Remove noise by part-of-speech tag and stop words.

        :param tagdict: one result dict produced by segmentation
        :param uTag: POS tags to filter out; defaults to ('w', 'o', 'y', 'u')
        :return: list of remaining words
        """
        if uTag:
            uselessTag = uTag
        else:
            uselessTag = ('w', 'o', 'y', 'u')
        word_list = []
        for index, it in enumerate(tagdict['tag']):
            if it[0] not in uselessTag:
                if not useStopWord:
                    word_list.append(tagdict['word'][index])
                elif tagdict['word'][index] not in self.stopwords:
                    word_list.append(tagdict['word'][index])
        return word_list

    def cut(self, sentences):
        """Segment a corpus.

        :param sentences: the corpus to segment
        :return: list of denoised word lists
        """
        tags = self.get_tags(sentences)
        cutedSentences = []
        for sentence in tags:
            cutedSentences.append(self.denoisingOne(sentence))
        return cutedSentences

    def depenPars(self, sentences):
        return self.nlp.depparser(sentences)
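# A minimal sketch of how CNSegment might be used, assuming `bosonkey` is
# defined elsewhere and holds a valid BosonNLP API token:
seg = CNSegment()
print(seg.cut(['亚投行意向创始成员国确定为57个', '“流量贵”频被吐槽']))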
def getAnswerNounKeys(text_set, api_key):
    nlp = BosonNLP(api_key)
    result = nlp.tag(text_set)
    words = ''
    for d in result:
        for it in zip(d['word'], d['tag']):
            if it[1] == 'n':
                words += it[0]
                # print(' '.join(['%s/%s' % it]))
    return getAnswerKeys(words, api_key)
def segment_tag(text):
    nlp = BosonNLP('2DgGSC-8.33497.8yeNchBP6L9n')
    result = nlp.tag(text)
    words = result[0]['word']
    tags = result[0]['tag']
    assert len(words) == len(tags)
    return words, tags
def _boson_seg(self, text):
    nlp = BosonNLP('g8lQg9Mv.25818.fAbbwt6TYhh8')
    if type(text) == str:
        text = [text]
    corpus_len = len(text)
    word, tag = [], []
    # send the corpus in chunks of at most 100 documents per API call
    for idx in range(corpus_len // 100 + 1):
        curr_idx = idx * 100
        result = nlp.tag(text[curr_idx:min(curr_idx + 100, corpus_len)])
        for seg in result:
            word.append(seg['word'])
            tag.append(seg['tag'])
    return word
def ScentenceSimilar(str1, str2):
    """Return the similarity of str1 and str2, computed as cosine similarity.

    Uses bosonnlp for word segmentation; requires a network connection.
    """
    nlp = BosonNLP('wx3Ua05Y.21658.Ch876jBfuqIH')
    # fetch the segmentation results
    tags1 = nlp.tag(str1.lower())
    tags2 = nlp.tag(str2.lower())
    tfdict1 = getTFdict(Denoising(tags1[0]))
    tfdict2 = getTFdict(Denoising(tags2[0]))
    return getSimilar_by_cos(tfdict1, tfdict2)
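# getTFdict and getSimilar_by_cos are not shown in this snippet. A minimal
# sketch of what they might look like (hypothetical implementations, not the
# original helpers):
from collections import Counter
import math

def getTFdict(words):
    # term-frequency dictionary: word -> count
    return Counter(words)

def getSimilar_by_cos(tf1, tf2):
    # cosine similarity between two sparse term-frequency vectors
    dot = sum(tf1[w] * tf2[w] for w in set(tf1) & set(tf2))
    norm1 = math.sqrt(sum(c * c for c in tf1.values()))
    norm2 = math.sqrt(sum(c * c for c in tf2.values()))
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot / (norm1 * norm2)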
def words_cut(txt_lines, isJieba=True):
    # segment words; returns a list of segmented lines
    text_cut = []
    if isJieba:
        for line in txt_lines:
            line = line.strip()   # strip whitespace
            seg_line = cut(line)  # returns a generator; can only be traversed once
            line_str = " ".join(seg_line) + "\n"
            text_cut.append(line_str)
        return text_cut
    nlp = BosonNLP('QhCMB7FS.33943.0OYvhfw0JCx8')
    for line in txt_lines:
        # tag() returns a nested list shaped like [{'word': [...], 'tag': [...]}]
        line_list = nlp.tag(line)[0]['word']
        line_str = " ".join(line_list) + '\n'  # join the word list into a string
        text_cut.append(line_str)
    return text_cut
def Text_Segmentation_5_1():
    # Some files need an explicit encoding='utf-8' argument when opened;
    # ANSI-encoded notepad files may instead raise an error with it.
    input_txt = open('static/files/方滨兴_互动百科.txt', 'r', encoding='utf-8')
    lines = input_txt.readlines()
    input_txt.close()
    nlp = BosonNLP('QhCMB7FS.33943.0OYvhfw0JCx8')  # build the client once, outside the loop
    for line in lines:
        result = nlp.tag(line)[0]['word']
        output_txt = open('static/files/方滨兴_互动百科_split_unattributed.txt',
                          mode='a', encoding='utf-8')
        # output_txt.write('{}\n'.format(result))          # write as a list literal
        output_txt.write('{}\n'.format(' '.join(result)))  # write as plain text
        output_txt.close()
def words_cut(filename, isJieba=True):
    # segment words; returns a list of segmented lines
    text_cut = []
    if isJieba:
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f.readlines():
                line = line.strip()   # strip whitespace
                seg_line = cut(line)  # returns a generator; can only be traversed once
                line_str = " ".join(seg_line) + "\n"
                text_cut.append(line_str)
        return text_cut
    nlp = BosonNLP('QhCMB7FS.33943.0OYvhfw0JCx8')
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            # tag() returns a nested list shaped like [{'word': [...], 'tag': [...]}]
            line_list = nlp.tag(line)[0]['word']
            line_str = " ".join(line_list) + '\n'  # join the word list into a string
            text_cut.append(line_str)
    return text_cut
class Scanner(object):
    """Chinese word segmentation via bosonnlp."""

    def __init__(self):
        self.nlp_handler = BosonNLP(API_TOKEN_BOSONNLP)

    def get_tag(self, content, remove_punctuations=False):
        """Return the segmentation result as a list of words."""
        result = self.nlp_handler.tag(content)[0]
        if remove_punctuations:
            return [
                x for x, y in zip(result['word'], result['tag'])
                if y[0] != 'w'
            ]
        return result['word']

    def get_key_word(self, content, segmented=False):
        """Extract keywords."""
        keywords = self.nlp_handler.extract_keywords(content, 2, segmented)
        firstkey, secondkey = keywords[0], keywords[1]
        return firstkey[1] if (firstkey[0] - secondkey[0]) > 0.3\
            else ' '.join([firstkey[1], secondkey[1]])
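# A minimal usage sketch; API_TOKEN_BOSONNLP is assumed to be defined
# elsewhere with a valid token.
scanner = Scanner()
print(scanner.get_tag('成都商报记者 姚永忠', remove_punctuations=True))
print(scanner.get_key_word('微软XP操作系统今日正式退休'))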
# '纪检部门仍在调查之中。成都商报记者 姚永忠']
# result = nlp.ner(s)
# print result
# print ' '.join([x for x in result[0]['word']])

fname = 'D:\\Github\\Sentiment-Analysis\\data\\nlpcc_emotion\\train\\neg_raw'
all_texts = [x.strip() for x in open(fname).readlines()]
for i in range(7000):
    print "handling " + str(i + 1) + "th 100 documents....."
    start = i * 100
    end = start + 100
    if start >= len(all_texts):
        break
    texts = all_texts[start:end]
    # collapse runs of spaces to a single space; convert traditional to
    # simplified; new-word detection strength 3 (strong); no special-character
    # conversion
    result = nlp.tag(texts, space_mode=1, oov_level=3, t2s=1,
                     special_char_conv=0)
    f1 = open(fname + '_fenci', 'a')
    f2 = open(fname + '_pos', 'a')
    f3 = open(fname + '_cobine', 'a')
    for d in result:
        fenci_text = ' '.join([x.encode('utf8') for x in d['word']])
        pos_text = ' '.join(['tag_' + x.encode('utf8') for x in d['tag']])
        cobine_text = ' '.join([x.encode('utf8') + '/' + y.encode('utf8')
                                for x, y in zip(d['word'], d['tag'])])
        f1.write(fenci_text + '\n')
        f2.write(pos_text + '\n')
        f3.write(cobine_text + '\n')
    f1.close()
    f2.close()
    f3.close()
print 'over'
# -*- encoding: utf-8 -*-
from bosonnlp import BosonNLP
import os

# reference: http://bosonnlp-py.readthedocs.io/#bosonnlp-py
nlp = BosonNLP('your BosonNLP API token')
# or: nlp = BosonNLP(os.environ['BOSON_API_TOKEN'])
nlp.ner('你好啊', sensitivity=2)
nlp.ner(['成都商报记者 姚永忠', '微软XP操作系统今日正式退休'])
result = nlp.tag('成都商报记者 姚永忠')
format_tag_result(result[0])
result = nlp.tag(['亚投行意向创始成员国确定为57个', '“流量贵”频被吐槽'], oov_level=0)
result = nlp.tag("成都商报记者 姚永忠", space_mode=2)
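# format_tag_result is not defined in this snippet; a sketch consistent with
# how it is called above (an assumption based on the bosonnlp-py
# documentation examples, not the original helper):
def format_tag_result(tagged):
    # render one tag result as 'word/tag word/tag ...'
    return ' '.join('%s/%s' % x for x in zip(tagged['word'], tagged['tag']))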
class _BosonNLPWrapper(object):
    """
    NLP object using the BosonNLP API Python SDK.
    """

    news_categories = ['physical education', 'education', 'finance', 'society',
                       'entertainment', 'military', 'domestic',
                       'science and technology', 'the internet', 'real estate',
                       'international', 'women', 'car', 'game']

    def __init__(self, api_token=None):
        try:
            assert api_token is not None, "Please provide an API token"
        except AssertionError:
            raise

        self.token = api_token
        self.nlp = BosonNLP(self.token)

    def get_sentiment(self, text):
        """
        Performs sentiment analysis on a text passage (works for Chinese text).
        See: http://docs.bosonnlp.com/sentiment.html

        Parameters
        ----------
        text (string): text passage to be analyzed for sentiment

        Returns
        -------
        dictionary with 'positive' and 'negative' as keys with their respective weights as values

        >>> nlp = BosonNLPWrapper('')
        >>> nlp.get_sentiment('不要打擾我')
        {'positive': 0.3704911989140307, 'negative': 0.6295088010859693}
        >>> nlp.get_sentiment('我很高興跟你見面')
        {'positive': 0.856280735624867, 'negative': 0.14371926437513308}
        """
        pos, neg = self.nlp.sentiment(text)[0]

        return {'positive': pos, 'negative': neg}

    def classify_news(self, text):
        """
        Classifies news text into 14 different categories.
        See: http://docs.bosonnlp.com/classify.html

        Parameters
        ----------
        text (string): text passage to classify into news categories defined in news_categories

        Returns
        -------
        one of the 14 categories in news_categories that the text was classified into
        """
        numbering = range(len(_BosonNLPWrapper.news_categories))
        cats_dict = dict(zip(numbering, _BosonNLPWrapper.news_categories))

        clsfy_num = self.nlp.classify(text)[0]

        return cats_dict[clsfy_num]

    def extract_keywords(self, text, top_k=3):
        """
        Extracts the top k keywords and the weight of each word in the text.
        See: http://docs.bosonnlp.com/keywords.html

        Parameters
        ----------
        text (string): text passage from which to extract keywords
        top_k (integer): number of keywords to return

        Returns
        -------
        list of key-value pairs {word: weight}

        >>> nlp = BosonNLPWrapper('')
        >>> nlp.extract_keywords('我最愛老虎堂,奶茶香醇,波霸彈Q 好香的黑糖味')
        [{'波霸彈': 0.5980681967308248}, {'黑糖': 0.4699792421671365}, {'香醇': 0.4497614275300947}]
        """
        result = self.nlp.extract_keywords(text, top_k)  # outputs in sorted order of weight

        return [{result[i][1]: result[i][0]} for i in range(len(result))]

    def segment_words_and_tag(self, text):
        """
        Splits up text into segments of "words" and tags them with their respective part of speech.
        See: http://docs.bosonnlp.com/tag.html

        Parameters
        ----------
        text (string): text passage to segment into separate "words" and tags them with parts of speech

        Returns
        -------
        list of key-value pairs {word: part-of-speech-tag}
        """
        result = self.nlp.tag(text)[0]
        words = result['word']
        tags = result['tag']

        return [{words[i]: tags[i]} for i in range(len(words))]

    def get_summary(self, content, title='', pct_limit=0.2):
        """
        Extracts a news digest (summary) of the content.
        See: http://docs.bosonnlp.com/summary.html

        Parameters
        ----------
        content (string): text passage to summarize
        title (string): title of the passage (optional, may provide more accurate results)
        pct_limit (float): max length of the summary in terms of percentage of the original word count

        Returns
        -------
        string containing the summary of the passage
        """
        summary = self.nlp.summary(title, content, pct_limit)

        return summary
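# A minimal usage sketch for the wrapper above; replace the argument with a
# valid API token before running.
nlp = _BosonNLPWrapper('YOUR_BOSON_API_TOKEN')
print(nlp.get_sentiment('我很高興跟你見面'))
print(nlp.classify_news('亚投行意向创始成员国确定为57个'))
print(nlp.segment_words_and_tag('成都商报记者 姚永忠'))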
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals
from bosonnlp import BosonNLP
import json

file_path = r"C:\workspace\Taikor_NLP_service\Thirdparty_NLP_WebAPI\Bosson\corpos\msr_test.txt"

with open(file_path, "r", encoding="utf8") as f:
    s = f.read()

nlp = BosonNLP("2ZmFSLeL.3212.Y6W7eOViuyZZ")
pos = nlp.tag(s)
dump = json.dumps(pos)
with open("pos", "w") as f:
    f.write(dump)
class C2E(object):
    """ """

    def __init__(self, opt):
        self.opt = opt
        self.sep = opt.seprator + " "
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)
        self.bpe = BPE(codecs.open(self.opt.bpe_codes, 'r', encoding="UTF-8"),
                       self.opt.seprator, None, None)
        self.translator = onmt.Translator(opt)
        self.nlp = BosonNLP("NGhNiav2.16134.DvyEDmGzYd2S")

    def seg(self, doc):
        res = ""
        try:
            print "using boson....."
            boson_res = self.nlp.tag(doc)
            res = boson_res[0]['word']
        except:
            # fall back to local segmentation if the API call fails
            res = jieba.cut(doc, cut_all=False)
        return " ".join(res)

    def truecase(self, text):
        text = text.encode('utf-8')
        tagged_sent = nltk.pos_tag(
            [word.lower() for word in nltk.word_tokenize(text)])
        normalized_sent = [
            w.capitalize() if t in ['NN', 'NNS'] else w
            for (w, t) in tagged_sent
        ]
        normalized_sent[0] = normalized_sent[0].capitalize()
        pretty_string = re.sub(" (?=[\.,'!?:;])", "", ' '.join(normalized_sent))
        return pretty_string

    def tokenDoc(self, doc):
        doc = doc.strip()
        sentenceList = re.split(PAT, doc.decode('utf-8'))
        assert len(sentenceList) >= 1
        if sentenceList[-1].strip() == "":
            sentenceList = sentenceList[:-1]
        punctuaList = re.findall(PAT, doc.decode('utf-8'))
        punctuaList += (len(sentenceList) - len(punctuaList)) * [' ']
        sents = [
            sent + punc for (sent, punc) in zip(sentenceList, punctuaList)
        ]
        sents = [sent.strip() for sent in sents]
        print 'c2e sentenceList : ', sentenceList
        tokens = []
        for sent in sents:
            sent = sent.lower()
            #sent = self.detokenizer.unescape_xml(self.tokenizer.tokenize(sent, return_str=True))
            sent = self.seg(sent)
            if self.opt.bpe_codes != "":
                sent = self.bpe.segment(sent).strip()
            token = sent.split()
            tokens += [token]
        print 'c2e tokens : ', tokens
        return tokens

    def translate(self, doc):
        batch = self.tokenDoc(doc)
        pred, _, _, _, _ = self.translator.translate(batch, None)
        rstr = ""
        for idx in range(len(pred)):
            pred_sent = ' '.join(pred[idx][0]).replace(' @-@ ', '-').replace(
                self.sep, '')
            #pred_sent = self.truecase(pred_sent)
            pred_sent = pred_sent.capitalize()
            rstr += pred_sent + "\n"
        print 'c2e rstr : ', rstr.strip()
        return rstr.strip()
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals
from bosonnlp import BosonNLP


def stopwordlist():
    stopwords = [
        line.strip() for line in open(
            'G:\python\_files\error_dc\ChangeStopWords.txt',
            encoding='UTF-8').readlines()
    ]
    return stopwords


word_list = []
fR = open('G:\pycharm\DataSet\_1\_train_small.csv', 'r', encoding='utf-8')
nlp = BosonNLP('your API token')
sent = fR.read()
# tag() returns [{'word': [...], 'tag': [...]}]; iterate over the words,
# not over the result dicts themselves
sent_list = nlp.tag(sent)[0]['word']
stopwords = stopwordlist()  # load the stop word list once, not per word
for t in sent_list:
    if t not in stopwords:
        word_list.append(t)
fW = open('G:\python\out\_test_jieba_big_02.csv', 'w')
fW.write(' '.join(word_list))
fR.close()
fW.close()
news_dir = '/home/ewan/PycharmProjects/news_spider/news/'
file_list = os.listdir(news_dir)
nlp = BosonNLP('FuHSE7Vf.13924.jadflTdrQLWx')
splitted_titles = []
SIZE = len(file_list)
count = 0
stop_tags = ['w', 't', 'q', 'u', 'k', 'h', 'o', 'y', 'c', 'p', 'd', 'r']
for title in file_list:
    count += 1
    if count % (SIZE / 100) == 0:
        print count
    result = nlp.tag(title)
    words = ''
    for index, word in enumerate(result[0]['word']):
        stop = False
        for tag in stop_tags:
            if tag in result[0]['tag'][index]:
                stop = True
                break
        if stop is False:
            words += word + ' '
    splitted_titles.append(words)


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
class _BosonNLPWrapper(object):
    """
    NLP object using the BosonNLP API Python SDK.
    """

    news_categories = [
        'physical education', 'education', 'finance', 'society',
        'entertainment', 'military', 'domestic', 'science and technology',
        'the internet', 'real estate', 'international', 'women', 'car', 'game'
    ]

    def __init__(self, api_token=None):
        try:
            assert api_token is not None, "Please provide an API token"
        except AssertionError:
            raise

        self.token = api_token
        self.nlp = BosonNLP(self.token)

    def get_sentiment(self, text):
        pos, neg = self.nlp.sentiment(text)[0]
        return {'positive': pos, 'negative': neg}

    def classify_news(self, text):
        numbering = range(len(_BosonNLPWrapper.news_categories))
        cats_dict = dict(zip(numbering, _BosonNLPWrapper.news_categories))
        clsfy_num = self.nlp.classify(text)[0]
        return cats_dict[clsfy_num]

    def extract_keywords(self, text, top_k=3):
        result = self.nlp.extract_keywords(
            text, top_k)  # outputs in sorted order of weight
        return [{result[i][1]: result[i][0]} for i in range(len(result))]

    def segment_words_and_tag(self, text):
        """
        Splits up text into segments of "words" and tags them with their respective part of speech.
        See: http://docs.bosonnlp.com/tag.html

        Parameters
        ----------
        text (string): text passage to segment into separate "words" and tags them with parts of speech

        Returns
        -------
        list of key-value pairs {word: part-of-speech-tag}
        """
        result = self.nlp.tag(text)[0]
        words = result['word']
        tags = result['tag']
        return [{words[i]: tags[i]} for i in range(len(words))]

    def get_summary(self, content, title='', pct_limit=0.2):
        """
        Extracts a news digest (summary) of the content.
        See: http://docs.bosonnlp.com/summary.html

        Parameters
        ----------
        content (string): text passage to summarize
        title (string): title of the passage (optional, may provide more accurate results)
        pct_limit (float): max length of the summary in terms of percentage of the original word count

        Returns
        -------
        string containing the summary of the passage
        """
        summary = self.nlp.summary(title, content, pct_limit)
        return summary
# -*- coding: utf-8 -*-
import os
import sys
import datetime
import time
from bosonnlp import BosonNLP

myApiToken = "X0njNWj2.5612.pYnhvqV02Kgn"
nlp = BosonNLP(myApiToken)

for eachLine in open("simple.txt"):
    # print eachLine, type(eachLine)
    # break
    # print nlp.extract_keywords(eachLine)
    result = nlp.tag(eachLine)
    print result
    # print nlp.sentiment("这家味道还不错")
    # print nlp.extract_keywords("instructor.txt")
### 1. Basic usage
from bosonnlp import BosonNLP

words_list = list()
nlp = BosonNLP('g8lQg9Mxx.25818.fAbbwt6TYhh8')  # use your token
result = nlp.tag('承德市长江大桥')
print(result)
print(result[0]['word'])
print(result[0]['tag'])
for i in range(len(result[0]['word'])):
    print(result[0]['word'][i] + '/' + result[0]['tag'][i], end=' ')
print()
print(' '.join([a + '/' + b
                for a, b in zip(result[0]['word'], result[0]['tag'])]))

### 2. Processing five articles at a time
from bosonnlp import BosonNLP
import requests

tokens = ['g8lQxxMv.25818.fAbbwt6TYhh8', ]  # boson api token

# check remaining usage
HEADERS = {'X-Token': tokens[0]}
RATE_LIMIT_URL = 'http://api.bosonnlp.com/application/rate_limit_status.json'
result = requests.get(RATE_LIMIT_URL, headers=HEADERS).json()
canUseTime = result['limits']['tag']['count-limit-remaining']
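# A hedged sketch of using the remaining-quota figure read above before a
# batched tag() call; the exact unit of 'count-limit-remaining' (documents
# vs. requests) is an assumption here.
docs = ['亚投行意向创始成员国确定为57个', '“流量贵”频被吐槽']
if canUseTime >= len(docs):
    nlp = BosonNLP(tokens[0])
    for d in nlp.tag(docs):
        print(' '.join(d['word']))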
def tag(entity):
    nlp = BosonNLP(boson_token)
    result = nlp.tag(entity)
    for d in result:
        print(' '.join(['%s/%s' % it for it in zip(d['word'], d['tag'])]))
    return result
# coding: utf-8
"""
function: word segmentation and POS tagging via the bosonnlp API
token: ***************************
author: [email protected]
"""
from bosonnlp import BosonNLP
import os

nlp = BosonNLP('****************************')
f = open("NBA.txt", "r")  # read the text
string = f.read().decode("utf-8")
result = nlp.tag(string)
# The full call with all parameters looks like this:
# result = nlp.tag(s, space_mode=0, oov_level=3, t2s=0, special_char_conv=0)
# With space_mode changed to 1:
# result = nlp.tag(s, space_mode=1, oov_level=3, t2s=0, special_char_conv=0)
# With oov_level changed to 1:
# result = nlp.tag(s, space_mode=0, oov_level=1, t2s=0, special_char_conv=0)
# With t2s changed to 1:
# result = nlp.tag(s, space_mode=0, oov_level=3, t2s=1, special_char_conv=0)
# With special_char_conv changed to 1:
# result = nlp.tag(s, space_mode=0, oov_level=3, t2s=0, special_char_conv=1)
f.close()

for d in result:
    print(' '.join(d['word']))  # segmentation result

for t in result:
    for word, tag in zip(t['word'], t['tag']):
        print word + " " + tag  # POS tagging result
    if html.has_key('error'):
        return 0
    return float(html['similarity'])


def hownet_sentence_sim(s1, s2):
    data = {'apiKey': "vpze450m", 'text1': s1, 'text2': s2}
    url = 'http://yuzhinlp.com/api/getShortSimilarityApi.do'
    html = requests.post(url, data).text
    s = requests.session()
    s.keep_alive = False
    html = json.loads(html, encoding='utf-8')
    if html.has_key('error'):
        return 0
    return float(html['success'])


if __name__ == "__main__":
    scentences1 = readFFile(r"testSet/trainSet.txt")
    scentences2 = readFFile(r"testSet/testSet1.txt")
    nlp = BosonNLP('wx3Ua05Y.21658.Ch876jBfuqIH')
    # fetch the segmentation results; tagging a single sentence yields a
    # one-element list, so index it with [0], not [16]
    tags1 = nlp.tag(scentences1[16])
    tags2 = nlp.tag(scentences2)
    wordA = Denoising(tags1[0])
    wordB = Denoising(tags2[16])
    print hownet_sentence_sim(scentences1[16], scentences2[16])
# -*- encoding: utf-8 -*-
from __future__ import print_function, unicode_literals
from bosonnlp import BosonNLP

# Some files need an explicit encoding='utf-8' argument when opened;
# ANSI-encoded notepad files may instead raise an error with it.
input_txt = open('邬贺铨_搜狗百科.txt', 'r', encoding='utf-8')
lines = input_txt.readlines()
input_txt.close()

nlp = BosonNLP('QhCMB7FS.33943.0OYvhfw0JCx8')  # build the client once, outside the loop
for line in lines:
    result = nlp.tag(line)[0]['word']
    output_txt = open('邬贺铨_搜狗百科_split_unattributed.txt',
                      mode='a', encoding='utf-8')
    # output_txt.write('{}\n'.format(result))          # write as a list literal
    output_txt.write('{}\n'.format(' '.join(result)))  # write as plain text
    output_txt.close()

# # Note: replace with your own API token when testing.
# nlp = BosonNLP('QhCMB7FS.33943.0OYvhfw0JCx8')
# s = '游戏很喜欢,希望赶紧过了无法打开家园这一块,要不买之前给哥提醒也可以,买完之后告诉玩家进不去要等审核有一点小生气。'
# result = nlp.tag(s)[0]['word']
# print(' '.join(result))
# print(result)

# Note: replace with your own API token when testing.
# nlp = BosonNLP('QhCMB7FS.33943.0OYvhfw0JCx8')
#
# s = ['亚投行意向创始成员国确定为57个', '“流量贵”频被吐槽']
#
def load_data(filepath):
    with open(filepath + 'train.json', 'r', encoding='utf-8') as data_train, \
            open(filepath + 'test.json', 'r', encoding='utf-8') as data_test, \
            open(filepath + 'data_train.txt', 'w+', encoding='utf-8') as train_data, \
            open(filepath + 'data_test.txt', 'w+', encoding='utf-8') as test_data:
        nlp = BosonNLP('sPB-JflO.34520.7EXOGbw_13LD')
        i = 0
        for item in jl.Reader(data_train):
            # sentence = nlp.tag(item['fact'])[0]['word']
            relevant_articles = item["meta"]["relevant_articles"]
            if len(relevant_articles) >= 2:
                continue
            else:
                relevant_articles = relevant_articles[0]
            accusation = item['meta']['accusation']
            if len(accusation) >= 2:
                continue
            else:
                accusation = accusation[0]
            sentence = nlp.tag(item['fact'])[0]['word']
            imprisonment = item['meta']['term_of_imprisonment']['imprisonment']
            if imprisonment > 180:
                continue
            death_penalty = item['meta']['term_of_imprisonment']['death_penalty']
            life_imprisonment = item['meta']['term_of_imprisonment']['life_imprisonment']
            if (death_penalty is True) or (life_imprisonment is True):
                train_data.write(' '.join(sentence) + ' ' + str(relevant_articles) +
                                 ' ' + accusation + ' ' + str(400) + '\n')
            else:
                train_data.write(' '.join(sentence) + ' ' + str(relevant_articles) +
                                 ' ' + accusation + ' ' + str(imprisonment) + '\n')
            i += 1
            print(i)
        j = 0
        for item in jl.Reader(data_test):
            # sentence = nlp.tag(item['fact'])[0]['word']
            relevant_articles = item["meta"]["relevant_articles"]
            if len(relevant_articles) >= 2:
                continue
            else:
                relevant_articles = relevant_articles[0]
            accusation = item['meta']['accusation']
            if len(accusation) >= 2:
                continue
            else:
                accusation = accusation[0]
            sentence = nlp.tag(item['fact'])[0]['word']
            imprisonment = item['meta']['term_of_imprisonment']['imprisonment']
            if imprisonment > 180:
                continue
            death_penalty = item['meta']['term_of_imprisonment']['death_penalty']
            life_imprisonment = item['meta']['term_of_imprisonment']['life_imprisonment']
            if (death_penalty is True) or (life_imprisonment is True):
                test_data.write(' '.join(sentence) + ' ' + str(relevant_articles) +
                                ' ' + accusation + ' ' + str(400) + '\n')
            else:
                test_data.write(' '.join(sentence) + ' ' + str(relevant_articles) +
                                ' ' + accusation + ' ' + str(imprisonment) + '\n')
            j += 1
            print(j)
class BosonNlpp:
    def __init__(self):
        self.bonlp = BosonNLP('IKBIoANy.14545.A7GCYBnT9jIB')

    # sentiment analysis
    def testSentiment(self, s):
        result = self.bonlp.sentiment(s)
        return result
        # print(result)

    # named entity recognition
    def lexicalAnalysis(self, s):
        result = self.bonlp.ner(s)[0]
        return result

    # dependency parsing
    def textDependency(self, s):
        result = self.bonlp.depparser(s)
        return result

    # keyword extraction
    def testKeywords(self, s):
        result = self.bonlp.extract_keywords(s, top_k=10)
        return result

    # news classification
    def textClassify(self, s):
        resultlist = self.bonlp.classify(s)
        classifys = {
            0: 'sports', 1: 'education', 2: 'finance', 3: 'society',
            4: 'entertainment', 5: 'military', 6: 'domestic',
            7: 'science and technology', 8: 'the internet', 9: 'real estate',
            10: 'international', 11: 'women', 12: 'car', 13: 'game'
        }
        return classifys[resultlist[0]]

    # semantic suggestion
    def lexicalSynonym(self, term):
        result = self.bonlp.suggest(term, top_k=10)
        return result

    # word segmentation and POS tagging
    def fenci(self, s):
        result = self.bonlp.tag(s)
        return result

    # news summarization
    def newssubstract(self, s):
        # s = s.encode('utf8')
        s = s.decode('utf-8')
        result = self.bonlp.summary('', s)
        return result
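# A minimal usage sketch for the wrapper above (the embedded token must still
# be valid for these calls to succeed):
nlpp = BosonNlpp()
print(nlpp.testSentiment('这家味道还不错'))
print(nlpp.fenci('成都商报记者 姚永忠'))
print(nlpp.testKeywords('病毒式媒体网站:让新闻迅速蔓延'))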
def Text_Seg_By_BosonNLP(line):
    nlp = BosonNLP('QhCMB7FS.33943.0OYvhfw0JCx8')
    words = nlp.tag(line)[0]['word']
    # leftover alternatives from the file-writing version of this helper:
    # output_txt.write('{}\n'.format(words))  # write as a list literal
    # seg_words = ' '.join(words)             # join into plain text
    return words
with open('news.pkl', 'rb') as f:
    news = pickle.load(f)

nlp = BosonNLP('FuHSE7Vf.13924.jadflTdrQLWx')
splitted_news = {}
SIZE = len(news)
count = 0
green_tags = ['n', 's', 'v']
for key, value in news.items():
    count += 1
    if count % (SIZE / 100) == 0:
        print count
    result = nlp.tag([value])
    words = ''
    for index, word in enumerate(result[0]['word']):
        green = False
        for tag in green_tags:
            if tag in result[0]['tag'][index]:
                green = True
                break
        if green:
            words += word + ' '
    # print words
    # splitted_news[key] = re.split(ur'\s+', words)
    # print sys.getdefaultencoding()
    splitted_news[key] = words