from bosonnlp import BosonNLP


def getAnswerKeys(text_set, api_key):
    """Extract the top 30 keywords and scale each weight to an integer."""
    keys = []
    nlp = BosonNLP(api_key)
    result = nlp.extract_keywords(text_set, top_k=30)
    # Each result item is a [weight, word] pair.
    for weight, word in result:
        keys.append((word, int(weight * 10)))
    return keys
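# A minimal usage sketch for getAnswerKeys above. The token string and the
# sample sentence are placeholders (assumptions), not values from the original.
if __name__ == '__main__':
    sample = '苹果公司发布了新款手机,市场反应热烈。'
    for word, score in getAnswerKeys(sample, 'YOUR_BOSONNLP_TOKEN'):
        print(word, score)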
def getKeyWords(command): nlp = BosonNLP("IrtCRUKX.4360.giOuq49VR3V-") r = nlp.extract_keywords(command, top_k=3) l = [] for k, v in r: v = v.encode('utf8') l.append(v) return l
def getKeyWords(command): nlp = BosonNLP("IrtCRUKX.4360.giOuq49VR3V-") r = nlp.extract_keywords(command, top_k=3) l = [] for k, v in r: v = v.encode("utf8") l.append(v) return l
def getKeyWords(command): nlp = BosonNLP("ofW2OZMI.4712.UzT0VvLGGkdi") r = nlp.extract_keywords(command, top_k=3) l = [] for k,v in r: v = v.encode('utf8') l.append(v) return l
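# Usage sketch for the getKeyWords variants above (the token is whichever the
# snippet hard-codes). On Python 2 the returned list holds UTF-8 byte strings;
# on Python 3 you would skip the encode and get str values instead.
words = getKeyWords('今天天气很好,适合出去散步')
print(words)  # three keywords, ordered by descending weight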
from bosonnlp import BosonNLP


def extract_keywords(text, top_num=10):
    """Extract keywords."""
    # Note: replace with your own API token when testing.
    nlp = BosonNLP('')
    result = nlp.extract_keywords(text, top_k=top_num)
    # result items are [weight, word]; invert them into a {word: weight} dict.
    result_dict = {k: v for (v, k) in result}
    return result_dict
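# Sketch of what this helper returns, assuming a valid token were filled in
# above: a dict mapping each keyword to its weight, e.g. {'手机': 0.59, ...}.
kw = extract_keywords('苹果公司发布了新款手机', top_num=5)
for word, weight in sorted(kw.items(), key=lambda x: -x[1]):
    print(word, weight)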
from bosonnlp import BosonNLP


class Scanner(object):
    """Chinese word segmentation via bosonnlp."""

    def __init__(self):
        # API_TOKEN_BOSONNLP is defined elsewhere in the original project.
        self.nlp_handler = BosonNLP(API_TOKEN_BOSONNLP)

    def get_tag(self, content, remove_punctuations=False):
        """Return the segmentation result as a list of words."""
        result = self.nlp_handler.tag(content)[0]
        if remove_punctuations:
            # POS tags starting with 'w' mark punctuation.
            return [x for x, y in zip(result['word'], result['tag'])
                    if y[0] != 'w']
        return result['word']

    def get_key_word(self, content, segmented=False):
        """Extract keywords."""
        keywords = self.nlp_handler.extract_keywords(content, 2, segmented)
        firstkey, secondkey = keywords[0], keywords[1]
        # Return only the top keyword when it clearly dominates; otherwise both.
        return (firstkey[1] if (firstkey[0] - secondkey[0]) > 0.3
                else ' '.join([firstkey[1], secondkey[1]]))
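# Usage sketch for Scanner, assuming API_TOKEN_BOSONNLP is defined (it comes
# from the original project's configuration; the sample text is illustrative):
scanner = Scanner()
print(scanner.get_tag('今天天气很好。', remove_punctuations=True))
print(scanner.get_key_word('苹果公司发布了新款手机'))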
from bosonnlp import BosonNLP


def Key_word(text):
    nlp = BosonNLP("x-gOGutn.27554.G6_6QvdJafES")
    rest = nlp.extract_keywords(text, top_k=20)
    return rest
import pymysql
from collections import OrderedDict

from bosonnlp import BosonNLP
from gensim import similarities

# Passage is the Django model used below; it is imported from the project's
# models module in the original source.


def get_similar_text(text):
    BosonNLPKey = "BMRivntt.8194.5zwvLwj_ygkV"  # ZfXxO6kv.10841.LZ_TDcJiiwrl
    nlp = BosonNLP(BosonNLPKey)
    # Extract keywords, e.g. [[0.8391345017584958, '病毒式'], [0.3802418301341705, '蔓延']]
    keyword_result = nlp.extract_keywords(text, top_k=10)
    print(keyword_result)
    # keyword_result = [[0.36443758441765906, '卖艺'], [0.26732109821670036, '路学院'], [0.2667011448568187, '挣钱'], [0.23801264121689353, '天目山'], [0.22618147770853975, '走时'], [0.22553389408212396, '孩子'], [0.21006863070452944, '一老一少'], [0.18830464004379927, '儿子'], [0.18298766176875855, '育才'], [0.17494405471962107, '红绿灯']]

    # Keep only sufficiently weighted, reasonably short keywords.
    key_word_list = []
    for key_item in keyword_result:
        if key_item[0] > 0.15 and len(key_item[1]) < 10:
            key_word_list.append((key_item[1], key_item[0]))

    conn = pymysql.connect(host='127.0.0.1', user='******', passwd='Ryan',
                           db="omdb", charset='UTF8')
    cur = conn.cursor()

    # Build the query text's sparse tf-idf vector as (keyword id, weight) pairs.
    vec_tfidf = []
    for key in key_word_list:
        sql_str = "SELECT kid from opinionmonitor_keyword WHERE word = '{0}'".format(key[0])
        cur.execute(sql_str)
        for ii in cur:
            vec_tfidf.append((ii[0], key[1]))

    # Collect the ids of passages that share any of these keywords.
    pid_list = []
    for key_id in vec_tfidf:
        sql_str = "SELECT pid_id from opinionmonitor_passagekeyword WHERE kid_id = {0}".format(key_id[0])
        cur.execute(sql_str)
        for ii in cur:
            pid_list.append(ii[0])

    # Count how often each passage id occurs.
    pid_set = set(pid_list)
    corpus_tfidf = []
    p_id_list = []
    for pid in pid_set:
        c = pid_list.count(pid)
        # Include the passage if it shares at least one keyword
        # (the original comment said "greater than 2", but the code tests > 0).
        if c > 0:
            sql_str = "SELECT kid_id, ratio from opinionmonitor_passagekeyword WHERE pid_id = {0}".format(pid)
            cur.execute(sql_str)
            kr_list = []
            for ii in cur:
                kr_list.append((ii[0], ii[1]))
            corpus_tfidf.append(kr_list)
            p_id_list.append(pid)

    cur.close()    # close the cursor
    conn.commit()  # commit any pending transaction; a no-op without transaction support
    conn.close()   # close the connection and release database resources

    # Compute cosine similarity between the query vector and each passage.
    print("comparison matrix")
    print(corpus_tfidf)
    print("text vector")
    print(vec_tfidf)
    if corpus_tfidf == []:
        return dict()
    sim_index = similarities.MatrixSimilarity(corpus_tfidf)
    sims = sim_index[vec_tfidf]

    r_pid_dict = dict()
    sim_result_list = list(sims)
    for idx, item in enumerate(sim_result_list):
        if item > 0.5:
            r_pid_dict[p_id_list[idx]] = item

    result_list = []
    result_order_dict = OrderedDict()
    r_pid_dict = OrderedDict(sorted(r_pid_dict.items(), key=lambda x: x[1]))
    print(r_pid_dict)
    for k, v in r_pid_dict.items():
        passage_list = Passage.objects.filter(pid=k)
        passage = passage_list[0]
        passage.st = v
        item_tuple = passage.to_tuple()
        result_order_dict[item_tuple[0]] = item_tuple[1]
        # result_list.append(passage.to_tuple())
    return result_order_dict
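# Context for the gensim call above: MatrixSimilarity expects a corpus of
# sparse vectors, i.e. lists of (feature_id, weight) pairs, which is exactly
# what corpus_tfidf and vec_tfidf hold. A self-contained illustration with
# made-up feature ids and weights:
from gensim import similarities

corpus = [[(0, 0.8), (2, 0.3)],   # passage A
          [(1, 0.5), (2, 0.9)]]   # passage B
query = [(0, 0.7), (2, 0.4)]
index = similarities.MatrixSimilarity(corpus, num_features=3)
print(list(index[query]))  # cosine similarity of the query to A and B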
# -*- coding: utf-8 -*-
import os
import json
import codecs
from StringIO import StringIO
import xml.etree.ElementTree as ET

from evernote.api.client import EvernoteClient
from evernote.edam.notestore import NoteStore
import evernote.edam.type.ttypes as Types
from bosonnlp import BosonNLP

# DEBUG, data_file, note_file, get_current_sync_state, Tags and Ner are
# project-local helpers defined elsewhere in the original source.


def main():
    # The original declared last_extrect_tag_time twice; last_update_count
    # is the other module-level name actually used below.
    global last_update_count, last_extrect_tag_time
    token = json.load(open('./config/token.json', 'r'))
    if DEBUG:
        client = EvernoteClient(token=token['en_token'], sandbox=True)
    else:
        client = EvernoteClient(token=token['en_token'])
        client.service_host = 'app.yinxiang.com'
    print 'Current server:', client._get_endpoint()

    # BosonNLP
    nlp = BosonNLP(token['boson_nlp_token'])
    note_store = client.get_note_store()

    # Load the last sync state
    if os.path.exists(data_file('sync_state')):
        last_sync_state = json.load(open(data_file('sync_state'), 'r'))
        last_update_count = last_sync_state['updateCount']
        last_extrect_tag_time = last_sync_state['currentTime']

    # Get the current sync state
    currnet_sync_state = get_current_sync_state(note_store)
    if currnet_sync_state.updateCount > last_update_count:
        new_updated_count = currnet_sync_state.updateCount - last_update_count
        print currnet_sync_state.__dict__
        new_note_filter = NoteStore.NoteFilter()
        new_note_filter.order = Types.NoteSortOrder.CREATED
        new_notes = note_store.findNotes(new_note_filter, 0, new_updated_count)
        print 'totalNumber:%d\tNoteListNum:%d' % (new_notes.totalNotes, len(new_notes.notes))
    else:
        print('No new updates...')
        exit(1)

    # Get all of the user's tags
    tags = Tags(note_store=note_store)
    alltags = tags.tags
    print 'Tag cloud:\n'
    print '\t'.join(alltags.keys())

    # Process the new notes
    for note in new_notes.notes:
        # Skip notes created before the last sync time
        if note.created <= last_extrect_tag_time:
            continue
        print '\n' + '*' * 120
        content = note_store.getNoteContent(note.guid)
        print "guid:%s\ntitle:%s\ncreated:%s\nauthor:%s" % (note.guid, note.title, note.created, note.attributes.author)
        print 'author:%s\nsource:%s\nsourceURL:%s\nsourceApplication:%s' % (note.attributes.author, note.attributes.source, note.attributes.sourceURL, note.attributes.sourceApplication)
        if not note.attributes.sourceURL:
            continue
        print "Existing tags: %s" % (",".join(note_store.getNoteTagNames(note.guid)))
        print '-' * 120
        # print "Content (%d): created:%s,\n%s" % (note.contentLength, note.created, content)

        # Parse the note XML and extract all of its text
        try:
            parser = ET.XMLParser()
            parser.entity['nbsp'] = ''
            parser.entity['ldquo'] = ''
            parser.entity['rdquo'] = ''
            parser.entity['hellip'] = ''
            tree = ET.parse(StringIO(content), parser=parser)
        except Exception as data:
            print 'ElementTree parser error'
            print content
            print 'errorData:'
            print data
            print 'exception:'
            print Exception
            exit(1)
        en_note = tree.findall('.')[0]
        content_string = ''.join(en_note.itertext())

        # Write the extracted text to a file
        with codecs.open(note_file(note.guid), 'w+', encoding='utf-8') as f:
            f.write(content_string)

        # Get the article's named entities via BosonNLP
        ner_tag_guid_list = []
        ner_tag_name_list = []
        ner = Ner(content_string).process(nlp)
        entites = ner.collect_type_entity(count=1)
        for entity in entites:
            tag = tags.add(entity)
            ner_tag_guid_list.append(tag.guid)
            ner_tag_name_list.append(tag.name)

        # Get the article's keywords via BosonNLP
        extract_keywords = nlp.extract_keywords(content_string, top_k=20)
        keywords = [item[1].upper() for item in extract_keywords]
        print 'Top 20 keywords from BosonNLP extract_keywords:'
        for keyword in extract_keywords:
            print '%s \t %s' % (keyword[1], keyword[0])
        print '-' * 120

        # Intersect the keywords with the user's existing tags to find tag GUIDs
        keywords_tag_guid_list = []
        newKeyWords = []
        for keyword in keywords:
            if tags.exist(keyword):
                existTag = tags.get(keyword)
                keywords_tag_guid_list.append(existTag.guid)
                newKeyWords.append(existTag.name)
        print '\nIntersection of extract_keywords with existing tags:'
        print '\t'.join(newKeyWords)

        # Append the new tags to the note
        new_tag_guid_list = list(set(keywords_tag_guid_list).union(set(ner_tag_guid_list)))
        print 'Tags from extract_keywords plus NER:'
        newKeyWords.extend(ner_tag_name_list)
        print '\t'.join(newKeyWords)
        if note.tagGuids:
            note.tagGuids.extend(new_tag_guid_list)
        else:
            note.tagGuids = new_tag_guid_list
        note_store.updateNote(note)
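# The script above persists sync progress between runs. A minimal sketch of
# the helpers it relies on, under assumed names and layout (data_file,
# save_sync_state and the sync_state JSON fields are inferred from usage,
# not taken from the original project):
def data_file(name):
    return os.path.join('./data', name)

def save_sync_state(note_store):
    # Evernote's getSyncState returns an object with updateCount and
    # currentTime, which main() reads back as last_update_count and
    # last_extrect_tag_time on the next run.
    state = note_store.getSyncState()
    with open(data_file('sync_state'), 'w') as f:
        json.dump({'updateCount': state.updateCount,
                   'currentTime': state.currentTime}, f)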
import pprint

from pymongo import MongoClient
from bosonnlp import BosonNLP


# The def line of this helper was truncated in the source; the signature below
# is inferred from the call decodeResult(cursor, "jobDescription") further down.
def decodeResult(_cursor, _attribution):
    results = []
    for item in _cursor:
        results.append(item[_attribution])
        # pprint.pprint(item[_attribution])
    return results
    # Unreachable debug branch, kept from the source:
    for item in _cursor:
        pprint.pprint(item)
    return


# Connect to mongodb
client = MongoClient("mongodb://localhost:27017/")
db = client.lagou
collection = db.lagouJob

# Find by company name
cursor = collection.find({"companyName": "深圳市贝利美维软件有限公司"})
# Find by Job Title
# cursor = collection.find({"jobTitle": {"$regex": u"区块链"}})

result = decodeResult(cursor, "jobDescription")
resultContent = "".join(result)

# nlp was created elsewhere in the original source; the token is a placeholder.
nlp = BosonNLP('YOUR_BOSONNLP_TOKEN')
nlpResult = nlp.extract_keywords(resultContent, top_k=100)

f = open('companyKeywords.txt', 'w')
for weight, word in nlpResult:
    outString = str(weight) + " " + str(word) + "\n"
    print(weight, word)
    f.write(outString)
f.close()
import pandas as pd
from bosonnlp import BosonNLP

# rlaq_u2 is a DataFrame loaded elsewhere in the original source; nlp is a
# BosonNLP client created there from an API token.

i = 77
id = rlaq_u2.loc[i, 'id']
ajbh = rlaq_u2.loc[i, 'ajbh']
fssj = pd.to_datetime(rlaq_u2.loc[i, 'time'])
txt = rlaq_u2.loc[i, 'jyaq']
txt0 = txt
place = rlaq_u2.loc[i, 'place']
print(txt0)

# Extract times, places and people via named entity recognition
result = nlp.ner(txt)[0]
words = result['word']
entities = result['entity']
Btime = []
Bplace = []
Bpeople = []
# Each entity is a (start_index, end_index, type) triple over the word list.
for entity in entities:
    if entity[2] == 'time':
        Btime.append(''.join(words[entity[0]:entity[1]]))
    if entity[2] == 'location':
        Bplace.append(''.join(words[entity[0]:entity[1]]))
    if entity[2] == 'person_name':
        Bpeople.append(''.join(words[entity[0]:entity[1]]))
print('Times:', ' | '.join(Btime))
print('Places:', ' | '.join(Bplace))
print('People:', ' | '.join(Bpeople))

# Extract keywords
kw = nlp.extract_keywords(txt, top_k=10)
for w, k in kw:
    print('Keyword:', k, ',', 'weight:', w)
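# For reference, the shape of nlp.ner output the loop above relies on: a dict
# with parallel 'word' and 'tag' lists plus 'entity' triples of word indices.
# A minimal sketch, assuming a valid token (the sample text is illustrative):
nlp = BosonNLP('YOUR_BOSONNLP_TOKEN')
ner_result = nlp.ner('王小明昨天在北京')[0]
print(ner_result['word'])    # segmented words
print(ner_result['entity'])  # (start, end, type) triples over those words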
from bosonnlp import BosonNLP


class BosonNlpp:
    def __init__(self):
        self.bonlp = BosonNLP('IKBIoANy.14545.A7GCYBnT9jIB')

    # Sentiment analysis
    def testSentiment(self, s):
        result = self.bonlp.sentiment(s)
        return result
        # print(result)

    # Named entity recognition
    def lexicalAnalysis(self, s):
        result = self.bonlp.ner(s)[0]
        return result

    # Dependency parsing
    def textDependency(self, s):
        result = self.bonlp.depparser(s)
        return result

    # Keyword extraction
    def testKeywords(self, s):
        result = self.bonlp.extract_keywords(s, top_k=10)
        return result

    # News classification
    def textClassify(self, s):
        resultlist = self.bonlp.classify(s)
        classifys = {
            0: 'sports', 1: 'education', 2: 'finance', 3: 'society',
            4: 'entertainment', 5: 'military', 6: 'domestic',
            7: 'science and technology', 8: 'the internet', 9: 'real estate',
            10: 'international', 11: 'women', 12: 'car', 13: 'game'
        }
        return classifys[resultlist[0]]

    # Semantic suggestion
    def lexicalSynonym(self, term):
        result = self.bonlp.suggest(term, top_k=10)
        return result

    # Word segmentation and POS tagging
    def fenci(self, s):
        result = self.bonlp.tag(s)
        return result

    # News summarization
    def newssubstract(self, s):
        # s = s.encode('utf8')
        s = s.decode('utf-8')  # Python 2 idiom; on Python 3, s is already str
        result = self.bonlp.summary('', s)
        return result
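# A minimal usage sketch for BosonNlpp; the sample sentences are illustrative,
# and the class uses the token hard-coded in its constructor.
nlpp = BosonNlpp()
print(nlpp.testKeywords('苹果公司发布了新款手机'))    # [[weight, word], ...]
print(nlpp.textClassify('国足在世界杯预选赛中获胜'))  # one of the category names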
from bosonnlp import BosonNLP


class _BosonNLPWrapper(object):
    """
    NLP object using the BosonNLP API Python SDK.
    """

    news_categories = ['physical education', 'education', 'finance', 'society',
                       'entertainment', 'military', 'domestic',
                       'science and technology', 'the internet', 'real estate',
                       'international', 'women', 'car', 'game']

    def __init__(self, api_token=None):
        try:
            assert api_token is not None, "Please provide an API token"
        except AssertionError as e:
            raise
        self.token = api_token
        self.nlp = BosonNLP(self.token)

    def get_sentiment(self, text):
        """
        Performs sentiment analysis on a text passage (works for Chinese text).
        See: http://docs.bosonnlp.com/sentiment.html

        Parameters
        ----------
        text (string): text passage to be analyzed for sentiment

        Returns
        -------
        dictionary with 'positive' and 'negative' as keys with their respective weights as values

        >>> nlp = BosonNLPWrapper('')
        >>> nlp.get_sentiment('不要打擾我')
        {'positive': 0.3704911989140307, 'negative': 0.6295088010859693}
        >>> nlp.get_sentiment('我很高興跟你見面')
        {'positive': 0.856280735624867, 'negative': 0.14371926437513308}
        """
        pos, neg = self.nlp.sentiment(text)[0]
        return {'positive': pos, 'negative': neg}

    def classify_news(self, text):
        """
        Classifies news text into 14 different categories.
        See: http://docs.bosonnlp.com/classify.html

        Parameters
        ----------
        text (string): text passage to classify into news categories defined in news_categories

        Returns
        -------
        one of the 14 categories in news_categories that the text was classified into
        """
        numbering = range(len(_BosonNLPWrapper.news_categories))
        cats_dict = dict(zip(numbering, _BosonNLPWrapper.news_categories))
        clsfy_num = self.nlp.classify(text)[0]
        return cats_dict[clsfy_num]

    def extract_keywords(self, text, top_k=3):
        """
        Extracts the top k keywords and the weight of each word in the text.
        See: http://docs.bosonnlp.com/keywords.html

        Parameters
        ----------
        text (string): text passage from which to extract keywords
        top_k (integer): number of keywords to return

        Returns
        -------
        list of key-value pairs {word: weight}

        >>> nlp = BosonNLPWrapper('')
        >>> nlp.extract_keywords('我最愛老虎堂,奶茶香醇,波霸彈Q 好香的黑糖味')
        [{'波霸彈': 0.5980681967308248}, {'黑糖': 0.4699792421671365}, {'香醇': 0.4497614275300947}]
        """
        result = self.nlp.extract_keywords(text, top_k)  # outputs in sorted order of weight
        return [{result[i][1]: result[i][0]} for i in range(len(result))]

    def segment_words_and_tag(self, text):
        """
        Splits up text into segments of "words" and tags them with their respective part of speech.
        See: http://docs.bosonnlp.com/tag.html

        Parameters
        ----------
        text (string): text passage to segment into separate "words" and tags them with parts of speech

        Returns
        -------
        list of key-value pairs {word: part-of-speech-tag}
        """
        result = self.nlp.tag(text)[0]
        words = result['word']
        tags = result['tag']
        return [{words[i]: tags[i]} for i in range(len(words))]

    def get_summary(self, content, title='', pct_limit=0.2):
        """
        Extracts a news digest (summary) of the content.
        See: http://docs.bosonnlp.com/summary.html

        Parameters
        ----------
        text (string): text passage to summarize
        title (string): title of the passage (optional, may provide more accurate results)
        pct_limit (float): max length of the summary in terms of percentage of the original word count

        Returns
        -------
        string containing the summary of the passage
        """
        summary = self.nlp.summary(title, content, pct_limit)
        return summary
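# A minimal usage sketch for _BosonNLPWrapper; the token is a placeholder and
# the sample texts are illustrative, not outputs from the original project.
nlp = _BosonNLPWrapper('YOUR_BOSONNLP_TOKEN')
print(nlp.get_sentiment('我很高興跟你見面'))
print(nlp.extract_keywords('我最愛老虎堂,奶茶香醇', top_k=2))
print(nlp.segment_words_and_tag('今天天气很好'))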