def get_bert(str1, bertNum):
    """Extract multi-character entities from ``str1`` via a BERT NER server.

    The text is upper-cased, every whitespace character is removed, and the
    remaining characters are sent to the server as one pre-tokenized sentence.

    Args:
        str1: Raw input text.
        bertNum: ``0`` selects the first configured server
            (``Config.bert.port1`` / ``Config.bert.out_port1``); any other
            value selects the second (``port2`` / ``out_port2``).

    Returns:
        The items produced by ``get_word`` that are longer than one
        character. Blank input yields ``[]`` without contacting a server.
    """
    # Upper-casing and dropping every whitespace character is equivalent to
    # the original strip/replace chain followed by the per-character filter.
    tokens = [ch for ch in str1.upper() if not ch.isspace()]
    if not tokens:
        # Nothing to tag; avoid sending an empty token list to the server.
        return []

    if bertNum == 0:
        port, port_out = Config.bert.port1, Config.bert.out_port1
    else:
        port, port_out = Config.bert.port2, Config.bert.out_port2

    with BertClient(ip=Config.bert.host,
                    port=port,
                    port_out=port_out,
                    show_server_config=False,
                    check_version=False,
                    check_length=False) as bc:
        rst = bc.encode([tokens], is_tokenized=True)

    entitys = [item for item in get_word(rst[0], tokens) if len(item) > 1]
    # Log the extracted entities together with the raw server output.
    logging.info("实体:" + "\t".join(entitys) + "\n" + str(rst[0]))
    return entitys
def __init__(self, model_name, ip=None):
    """Load the NER backend selected by ``model_name``.

    Args:
        model_name: ``"fasthan"``, ``"stanford"`` or ``"bbc"`` trigger
            loading of the matching backend; other names load nothing here.
        ip: Address of the BERT server; required when ``model_name`` is
            ``"bbc"``.

    Raises:
        ValueError: If ``model_name`` is ``"bbc"`` and ``ip`` is empty.
    """
    config = Config()
    self.model_name = model_name
    if self.model_name == "fasthan":
        self.nltk_model = FastHan(model_type="base")
    if self.model_name == "stanford":
        # Join the components separately: the former literal
        # 'model\stanford-...' relied on an invalid "\s" escape and only
        # formed a real sub-directory path on Windows.
        model_dir = os.path.join(config.project_dir, 'model',
                                 'stanford-corenlp-full-2016-10-31')
        self.stanford_model = StanfordCoreNLP(model_dir, lang='zh')
    if self.model_name == "bbc":
        if not ip:
            raise ValueError("bbc模型必须填入ip")
        self.bbc_model = BertClient(ip,
                                    ner_model_dir=None,
                                    show_server_config=False,
                                    check_version=False,
                                    check_length=False,
                                    mode='NER')
def predict_def(sentences):
    """Classify ``sentences`` with the classification server on port 5610.

    Args:
        sentences: Sentences passed unchanged to ``BertClient.encode``.

    Returns:
        A list with one ``1``/``0`` per score entry (``1`` when the second
        value of the entry exceeds 0.9), or ``0`` when the reachability
        probe fails.
    """
    # One-second TCP probe so an unavailable server is reported instead of
    # blocking inside the client.
    # NOTE(review): the probe targets 159.226.125.191 while the client below
    # uses its default address - confirm both refer to the same host.
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
            probe.settimeout(1)
            probe.connect(('159.226.125.191', 5610))
    except OSError:
        print('def server not available')
        return 0

    with BertClient(port=5610,
                    port_out=5611,
                    show_server_config=False,
                    check_version=False,
                    check_length=False,
                    mode='CLASS') as bc:
        result = bc.encode(sentences)

    labels = []
    for item in result:
        for score in item['score']:
            labels.append(1 if score[1] > 0.9 else 0)
    return labels
def info_retrieval(segments, targets):
    """Group the characters of each segment by NER tag and add them to ``targets``.

    Every segment is sent character by character to the NER server. Runs of
    consecutive characters sharing a tag suffix (the part after the last
    ``-``) are concatenated; a label starting with ``B`` always opens a new
    run. Each run is appended to ``targets[<tag>]``.

    Args:
        segments: Strings to analyse.
        targets: Dict mapping tag to list of strings; updated in place.

    Returns:
        ``targets`` with the ``"O"`` entry removed.
    """
    if not segments:
        # Nothing to analyse: skip configuration and the server connection.
        targets.pop("O", None)
        return targets

    cfg = MyConfig()
    # One connection serves all segments instead of reconnecting per segment.
    with BertClient(
            ip=cfg.ner_ip_addr.value,
            port=cfg.ner_port_in.value,
            port_out=cfg.ner_port_out.value,
            mode="NER",
    ) as bc:
        for target_string in segments:
            chars = list(target_string)
            if not chars:
                continue
            rst = bc.encode([chars], is_tokenized=True)

            rolling_content = ""
            rolling_tag = "O"
            for char, label in zip(chars, rst[0]):
                current_tag = label.split("-")[-1]
                if current_tag != rolling_tag or label.startswith("B"):
                    # Close the current run before starting the next one.
                    targets.setdefault(rolling_tag, []).append(rolling_content)
                    rolling_tag = current_tag
                    rolling_content = ""
                rolling_content += char
            if rolling_content:
                targets.setdefault(rolling_tag, []).append(rolling_content)

    # pop() rather than del: "O" is absent when no segment produced output.
    targets.pop("O", None)
    return targets
def ner_test(str):
    """Send ``str`` twice in one batch to the NER server and print the outcome.

    Prints the raw result and the seconds elapsed since the client connected.
    """
    client_options = dict(show_server_config=False,
                          check_version=False,
                          check_length=False,
                          mode='NER')
    with BertClient(**client_options) as client:
        began = time.perf_counter()
        batch = [str] * 2
        tagged = client.encode(batch)
        print('rst:', tagged)
        print(time.perf_counter() - began)
def ner_on_work(str_input):
    """Return the text of the single entity found in ``str_input``.

    The sentence is tagged by the NER server. When exactly one of ``B-LOC``,
    ``B-ORG`` and ``B-PER`` appears once and the other two never appear, the
    characters at every position whose tag is not ``'O'`` are joined and
    returned. Otherwise a notice is printed and ``None`` is returned.
    """
    client_options = dict(show_server_config=False,
                          check_version=False,
                          check_length=False,
                          mode='NER')
    with BertClient(**client_options) as client:
        began = time.perf_counter()
        chars = list(str_input)
        tags = list(client.encode([str_input])[0])
        print('rst:', tags)
        print(time.perf_counter() - began)

        # Counts are non-negative, so a total of one means exactly one
        # entity start across the three tag types.
        starts = sum(tags.count(marker)
                     for marker in ('B-LOC', 'B-ORG', 'B-PER'))
        if starts != 1:
            print('属于多实体问题,需要单独进行处理')
            return None

        return "".join(chars[pos]
                       for pos, tag in enumerate(tags)
                       if tag != 'O')
def predict_sentences(sentences):
    """Score each sentence both in masked context and on its own.

    For every sentence a context string is built in which that sentence is
    replaced by a fixed mask; those contexts followed by the sentences
    themselves are sent to the classification server on port 5730.

    Args:
        sentences: Sequence of sentence strings (sliced, so it must be a
            sequence rather than a one-shot iterator).

    Returns:
        ``(results, sentences)`` where ``results`` is the element-wise sum of
        the first half and the second half of all returned scores, as a
        plain list; ``(0, 0)`` when the reachability probe fails.
    """
    # One-second TCP probe so an unavailable server is reported instead of
    # blocking inside the client.
    # NOTE(review): the probe targets 159.226.125.191 while the client below
    # uses its default address - confirm both refer to the same host.
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
            probe.settimeout(1)
            probe.connect(('159.226.125.191', 5730))
    except OSError:
        print('mask sentence server not available')
        return 0, 0

    mask = ' AAAA ' + 'AAAA ' * 23 + 'AAAA. '
    total = len(sentences)
    # The first and last positions need no special case: joining an empty
    # slice yields '' so one expression covers all three former branches.
    predict_examples = [
        ' '.join(sentences[:pos]) + mask + ' '.join(sentences[pos + 1:])
        for pos in range(total)
    ]
    predict_examples.extend(sentences)

    with BertClient(port=5730,
                    port_out=5731,
                    show_server_config=False,
                    check_version=False,
                    check_length=False,
                    mode='CLASS') as bc:
        result = bc.encode(predict_examples)

    scores = [score for item in result for score in item['score']]
    # First `total` scores belong to the masked contexts, the rest to the
    # stand-alone sentences.
    results = np.array(scores[:total]) + np.array(scores[total:])
    return results.tolist(), sentences
def class_pred():
    """Classify the sentences of the current JSON request.

    Reads ``id`` and ``text`` from ``request.json``, sends ``text`` to the
    classification server and returns the first result entry with the
    request id stored under ``"id"``.
    """
    # Fetch the request parameters.
    payload = request.json
    print(payload)
    request_id = payload["id"]
    sentences = payload["text"]
    print("total setance: %d" % (len(sentences)))

    client_options = dict(ip='192.168.50.131',
                          port=5575,
                          port_out=5576,
                          show_server_config=False,
                          check_version=False,
                          check_length=False,
                          timeout=10000,
                          mode='CLASS')
    with BertClient(**client_options) as client:
        began = time.perf_counter()
        outcome = client.encode(sentences)
        first = outcome[0]
        first["id"] = request_id
        print('result:', outcome)
        print('time used:{}'.format(time.perf_counter() - began))
    # The first entry carries 'pred_label' and 'score' lists, per the
    # original author's note, plus the 'id' added above.
    return first
def extract_contact(self):
    """Tag ``self.contact_info`` character by character and process the tags.

    Spaces in the contact text are replaced with commas, the characters are
    sent to the server configured by ``bert_153``, and the resulting label
    list is passed to ``self.contact_from_pos``. Nothing happens (and no
    connection is opened) when the contact text is empty.
    """
    contact_info = self.contact_info.replace(' ', ',')
    if not contact_info:
        # No text to tag, so do not open a server connection at all.
        return

    with BertClient(**bert_153) as bc:
        poses = list(bc.encode([list(contact_info)], is_tokenized=True)[0])
    print(poses)
    self.contact_from_pos(poses)
def get_ner_list(sentences):
    """Run NER on ``sentences`` and convert each result to its JSON form.

    Args:
        sentences: Sentences passed unchanged to ``BertClient.encode``.

    Returns:
        A list with one ``NER_Result.result_to_json`` output per sentence,
        built from the tokenized sentence and its server result.
    """
    args = get_args_parser()
    bert_dir = r'NER_model/chinese_L-12_H-768_A-12'
    tokenizer = tokenization.FullTokenizer(
        vocab_file=os.path.join(bert_dir, 'vocab.txt'),
        do_lower_case=args.do_lower_case)

    # Context manager so the connection is released; it was never closed.
    with BertClient(show_server_config=False,
                    check_version=False,
                    check_length=False,
                    mode='NER') as bc:
        rst = bc.encode(sentences)

    res = NER_Result()
    return [
        res.result_to_json(tokenizer.tokenize(one_str), one_rst)
        for one_str, one_rst in zip(sentences, rst)
    ]
def class_test():
    """Send three fixed news paragraphs to the classifier on port 5557.

    Prints the raw result and the seconds elapsed since the client connected.
    """
    samples = [
        '北京时间2月17日凌晨,第69届柏林国际电影节公布主竞赛单元获奖名单,王景春、咏梅凭借王小帅执导的中国影片《地久天长》连夺最佳男女演员双银熊大奖,这是中国演员首次包揽柏林电影节最佳男女演员奖,为华语影片刷新纪录。与此同时,由青年导演王丽娜执导的影片《第一次的别离》也荣获了本届柏林电影节新生代单元国际评审团最佳影片,可以说,在经历数个获奖小年之后,中国电影在柏林影展再次迎来了高光时刻。',
        '受粤港澳大湾区规划纲要提振,港股周二高开,恒指开盘上涨近百点,涨幅0.33%,报28440.49点,相关概念股亦集体上涨,电子元件、新能源车、保险、基建概念多数上涨。粤泰股份、珠江实业、深天地A等10余股涨停;中兴通讯、丘钛科技、舜宇光学分别高开1.4%、4.3%、1.6%。比亚迪电子、比亚迪股份、光宇国际分别高开1.7%、1.2%、1%。越秀交通基建涨近2%,粤海投资、碧桂园等多股涨超1%。其他方面,日本软银集团股价上涨超0.4%,推动日经225和东证指数齐齐高开,但随后均回吐涨幅转跌东证指数跌0.2%,日经225指数跌0.11%,报21258.4点。受芯片制造商SK海力士股价下跌1.34%拖累,韩国综指下跌0.34%至2203.9点。澳大利亚ASX 200指数早盘上涨0.39%至6089.8点,大多数行业板块均现涨势。在保健品品牌澳佳宝下调下半财年的销售预期后,其股价暴跌超过23%。澳佳宝CEO亨弗里(Richard Henfrey)认为,公司下半年的利润可能会低于上半年,主要是受到销售额疲弱的影响。同时,亚市早盘澳洲联储公布了2月会议纪要,政策委员将继续谨慎评估经济增长前景,因前景充满不确定性的影响,稳定当前的利率水平比贸然调整利率更为合适,而且当前利率水平将有利于趋向通胀目标及改善就业,当前劳动力市场数据表现强势于其他经济数据。另一方面,经济增长前景亦令消费者消费意愿下滑,如果房价出现下滑,消费可能会进一步疲弱。在澳洲联储公布会议纪要后,澳元兑美元下跌近30点,报0.7120 。美元指数在昨日触及96.65附近的低点之后反弹至96.904。日元兑美元报110.56,接近上一交易日的低点。',
        '新京报快讯 据国家市场监管总局消息,针对媒体报道水饺等猪肉制品检出非洲猪瘟病毒核酸阳性问题,市场监管总局、农业农村部已要求企业立即追溯猪肉原料来源并对猪肉制品进行了处置。两部门已派出联合督查组调查核实相关情况,要求猪肉制品生产企业进一步加强对猪肉原料的管控,落实检验检疫票证查验规定,完善非洲猪瘟检测和复核制度,防止染疫猪肉原料进入食品加工环节。市场监管总局、农业农村部等部门要求各地全面落实防控责任,强化防控措施,规范信息报告和发布,对不按要求履行防控责任的企业,一旦发现将严厉查处。专家认为,非洲猪瘟不是人畜共患病,虽然对猪有致命危险,但对人没有任何危害,属于只传猪不传人型病毒,不会影响食品安全。开展猪肉制品病毒核酸检测,可为防控溯源工作提供线索。',
    ]
    client_options = dict(port=5557,
                          port_out=5558,
                          show_server_config=False,
                          check_version=False,
                          check_length=False,
                          mode='CLASS')
    with BertClient(**client_options) as client:
        began = time.perf_counter()
        outcome = client.encode(samples)
        print('rst:', outcome)
        print('time used:{}'.format(time.perf_counter() - began))
def ner_test():
    """Tag a fixed land-auction notice character by character.

    Prints the raw result, the length of the first result row and the
    seconds elapsed since the client connected.
    """
    sample = '溧阳市国土资源局国有土地使用权招拍挂出让成交公示按照《土地管理法》、《城市房地产管理法》、《招标拍卖挂牌出让国有土地使用权规定》和《招标拍卖挂牌出让国有土地使用权规范》等有关法律法规,遵循公开、公正、公平的原则。我局于至挂牌出让1宗国有土地使用权。现将有关情况公示如下:一、地块基本情况:地块编号NO.2012—77地块位置戴埠镇罛北工业园区杨树垛路东侧六号地块土地用途工业用地土地面积(公顷)2.232出让年限50年成交价(万元)724受让单位江苏宏盛建设机械有限公司土地使用条件:净地;地块位置戴埠镇罛北工业园区杨树垛路东侧六号地块,备注受让单位江苏宏盛建设机械有限公司土地使用条件:二、公示期:2012年07月06日至2012年07月12日三、该宗地双方已签订成交确认书,在30日内签订出让合同,相关事宜在合同中约定四、'
    client_options = dict(show_server_config=False,
                          check_version=False,
                          check_length=False,
                          mode='NER')
    with BertClient(**client_options) as client:
        began = time.perf_counter()
        # One sentence, already split into single characters.
        tagged = client.encode([list(sample)], is_tokenized=True)
        print('rst:', tagged)
        print(len(tagged[0]))
        print(time.perf_counter() - began)
def ner_test():
    """Tag four fixed smart-home commands one at a time on port 5455.

    For each command, prints the command with its raw result, the length of
    the first result row, and the seconds elapsed since the client connected.
    """
    commands = ('把储藏室的灯打开', '关闭警报', '打开窗帘', '总共有多少茶')
    client_options = dict(port=5455,
                          port_out=5456,
                          show_server_config=False,
                          check_version=False,
                          check_length=False,
                          mode='NER')
    with BertClient(**client_options) as client:
        began = time.perf_counter()
        for command in commands:
            # Each command is sent as one sentence of single characters.
            tagged = client.encode([list(command)], is_tokenized=True)
            print('str: %s, rst: %s' % (command, tagged))
            print(len(tagged[0]))
            print(time.perf_counter() - began)
def ner_cu_seg():
    """Tag a fixed sentence using caller-side character segmentation.

    The sentence is split into single characters before being sent, so the
    request is marked as already tokenized. Prints the raw result, the
    length of the first result row and the elapsed seconds.

    :return: None
    """
    sample = '1月24日,新华社对外发布了中央对雄安新区的指导意见,洋洋洒洒1.2万多字,17次提到北京,4次提到天津,信息量很大,其实也回答了人们关心的很多问题。'
    client_options = dict(show_server_config=False,
                          check_version=False,
                          check_length=False,
                          mode='NER')
    with BertClient(**client_options) as client:
        began = time.perf_counter()
        characters = list(sample)
        tagged = client.encode([characters], is_tokenized=True)
        print('rst:', tagged)
        print(len(tagged[0]))
        print(time.perf_counter() - began)
def bertModel(id, list_text):
    """Classify ``list_text`` and return the first result tagged with ``id``.

    Args:
        id: Identifier stored under the ``"id"`` key of the returned entry.
        list_text: Sentences passed unchanged to ``BertClient.encode``.

    Returns:
        The first entry of the server result, with ``"id"`` added.
    """
    client_options = dict(ip='192.168.50.131',
                          port=5575,
                          port_out=5576,
                          show_server_config=False,
                          check_version=False,
                          check_length=False,
                          timeout=10000,
                          mode='CLASS')
    with BertClient(**client_options) as client:
        outcome = client.encode(list_text)
        outcome[0]["id"] = id
        return outcome[0]
def ner_test():
    """Tag a fixed procurement-notice title character by character.

    Prints the raw result, the length of the first result row and the
    seconds elapsed since the client connected.
    """
    sample = '驻马店市第一人民医院彩色多普勒超声诊断仪结果公示'
    client_options = dict(show_server_config=False,
                          check_version=False,
                          check_length=False,
                          mode='NER')
    with BertClient(**client_options) as client:
        began = time.perf_counter()
        # One sentence, already split into single characters.
        tagged = client.encode([list(sample)], is_tokenized=True)
        print('rst:', tagged)
        print(len(tagged[0]))
        print(time.perf_counter() - began)
def class_pred_expert(text):
    """Classify every sentence of every line in ``text`` into five buckets.

    Each line is split into sentences with ``cut_sent``; the sentences are
    classified by the server and grouped by their predicted label.

    Args:
        text: Multi-line input; one result entry is produced per line.

    Returns:
        A list with one entry per line. Each entry is a list of five lists;
        bucket ``k`` holds the sentences whose predicted label is ``k``.
        Lines without any sentence yield five empty buckets.
    """
    lst_Result = []
    lstTxt = text.splitlines()
    print("total Lines: %d" % (len(lstTxt)))

    # Context manager so the connection is released; it was never closed.
    with BertClient(ip='192.168.15.111',
                    port=5565,
                    port_out=5566,
                    show_server_config=False,
                    check_version=False,
                    check_length=False,
                    timeout=10000,
                    mode='CLASS') as bc:
        start_t = time.perf_counter()
        for txt in lstTxt:
            # Split the line into sentences.
            list_text = cut_sent(txt)
            lstLineResult = [[], [], [], [], []]
            if list_text:
                rst = bc.encode(list_text)
                # Per the original author's note, the first entry carries
                # parallel 'pred_label' and 'score' lists.
                pred_label = rst[0]["pred_label"]
                # NOTE(review): labels above 4 would raise IndexError here -
                # confirm the model only emits labels 0-4.
                for label, sentence in zip(pred_label, list_text):
                    lstLineResult[int(label)].append(sentence)
            lst_Result.append(lstLineResult)
        print('time used:{}'.format(time.perf_counter() - start_t))
    return lst_Result
def ner_test():
    """Tag a fixed news sentence character by character.

    Prints the raw result, the length of the first result row and the
    seconds elapsed since the client connected.
    """
    # Only the last of the three former sample assignments was ever used.
    str1 = '1月24日,新华社对外发布了中央对雄安新区的指导意见,洋洋洒洒1.2万多字,17次提到北京,4次提到天津,信息量很大,其实也回答了人们关心的很多问题。'
    with BertClient(show_server_config=False,
                    check_version=False,
                    check_length=False,
                    mode='NER') as bc:
        start_t = time.perf_counter()
        # is_tokenized=True expects each sentence as a list of tokens, so
        # the string is split into characters instead of being sent whole.
        rst = bc.encode([list(str1)], is_tokenized=True)
        print('rst:', rst)
        print(len(rst[0]))
        print(time.perf_counter() - start_t)
def simi_test():
    """Send two sentence pairs, each joined with ``|||``, to the classifier.

    Prints the raw result and the seconds elapsed since the client connected.
    """
    pairs = [
        ('我想开通花呗', '我也想开通花呗'),
        ("你好", "你好呀"),
    ]
    client_options = dict(show_server_config=False,
                          check_version=False,
                          check_length=False,
                          mode='CLASS')
    with BertClient(**client_options) as client:
        began = time.perf_counter()
        queries = ["|||".join(pair) for pair in pairs]
        outcome = client.encode(queries)
        print('rst:', outcome)
        print('time used:{}'.format(time.perf_counter() - began))
def class_test():
    """Send eight fixed encyclopedia-style sentences to the classifier at 10.0.46.99.

    Prints the raw result and the seconds elapsed since the client connected.
    """
    samples = [
        '如何演好自己的角色,请读《演员自我修养》《喜剧之王》周星驰崛起于穷困潦倒之中的独门秘笈',
        "茶树茶网蝽,Stephanitis chinensis Drake,属半翅目网蝽科冠网椿属的一种昆虫",
        "丝角蝗科,Oedipodidae,昆虫纲直翅目蝗总科的一个科",
        "爱德华·尼科·埃尔南迪斯(1986-),是一位身高只有70公分哥伦比亚男子,体重10公斤,只比随身行李高一些,2010年获吉尼斯世界纪录正式认证,成为全球当今最矮的成年男人",
        "《逐风行》是百度文学旗下纵横中文网签约作家清水秋风创作的一部东方玄幻小说,小说已于2014-04-28正式发布",
        "禅意歌者刘珂矣《一袖云》中诉知己…绵柔纯净的女声,将心中的万水千山尽意勾勒于这清素画音中",
        "《娘家的故事第二部》是张玲执导,林在培、何赛飞等主演的电视剧",
        "史雪梅,女,1962年生于陕西三原县,本科毕业,是咸阳市妇女书画协会副会长,就职于陕西省宝鸡峡管理局",
    ]
    client_options = dict(ip="10.0.46.99",
                          show_server_config=False,
                          check_version=False,
                          check_length=False,
                          mode='CLASS')
    with BertClient(**client_options) as client:
        began = time.perf_counter()
        outcome = client.encode(samples)
        print('rst:', outcome)
        print('time used:{}'.format(time.perf_counter() - began))
def class_test(ip="localhost", port=5555, port_out=5556):
    """Send three fixed product reviews to a classification server.

    Args:
        ip: Server address.
        port: Port passed to the client as ``port``.
        port_out: Port passed to the client as ``port_out``.

    Prints the raw result and the seconds elapsed since the client connected.
    """
    # Three product reviews; the earlier news-article samples that were kept
    # here as commented-out code are no longer carried along.
    reviews = [
        '手机还行,但是手机刚开箱时屏幕和背面有很多指纹痕迹,手机壳跟**在地上磨过似的,好几条印子。要不是看在能把这些痕迹擦掉,和闲退货麻烦,就给退了。就不能规规矩矩做生意么。还有送的都是什么吊东西,运动手环垃圾一比,贴在手机后面的固定手环还**是塑料的渡了一层银色,耳机也和图片描述不符,碎屏险已经注册,不知道怎么样。讲真的,要不就别送或者少送,要不,就规规矩矩的,不然到最后还让人觉得不舒服。其他没什么。',
        '手机是不是正品不确定后台说话是真不要脸。赠品膜没有次只有更次。耳机十元一只的。全******商家买东西真不能贪百二八十的一个耳机就差出来啦要不是双十一错过去啦绝对不能买她家的什么玩意。手机还发热都怀疑是改装的大家别上当**老板**商家最该死',
        '这款手机我之前就用过,感觉不错,所以就在京东上下单了,准备再入手一部,结果到货之后,手机充电口被使用过应该是退货的机子,找客服人员,客服人员不管,这态度也没谁了',
    ]
    client_options = dict(ip=ip,
                          port=port,
                          port_out=port_out,
                          show_server_config=False,
                          check_version=False,
                          check_length=False,
                          mode='CLASS')
    with BertClient(**client_options) as client:
        began = time.perf_counter()
        outcome = client.encode(reviews)
        print('rst:', outcome)
        print('time used:{}'.format(time.perf_counter() - began))
def class_pred(list_text):
    """Classify ``list_text`` on the local server and convert the result.

    Args:
        list_text: Sentences passed unchanged to ``BertClient.encode``.

    Returns:
        Whatever ``list_to_json`` produces for the raw server result.
    """
    client_options = dict(ip='localhost',
                          port=5575,
                          port_out=5576,
                          show_server_config=False,
                          check_version=False,
                          check_length=False,
                          timeout=10000,
                          mode='CLASS')
    with BertClient(**client_options) as client:
        return list_to_json(client.encode(list_text))
def __init__(self, max_concurrency=10, **kwargs):
    """Create a pool of ``BertClient`` objects connected to a BertServer.

    The server must be ready when this is called. If you are not sure
    whether it is, set ``check_version=False`` and ``check_length=False``.

    :type max_concurrency: int
    :param max_concurrency: the maximum number of concurrent connections
        allowed; this many clients are created up front
    :param kwargs: forwarded unchanged to every ``BertClient``
    :raises ImportError: if ``bert_base.client`` cannot be imported
    """
    try:
        from bert_base.client import BertClient
    except ImportError as err:
        # Sentences are now separated by spaces and the original import
        # failure is kept as the cause.
        # NOTE(review): the hint names "bert-serving-client" although the
        # import is from bert_base - confirm the correct package name.
        raise ImportError(
            'BertClient module is not available, it is required for serving HTTP requests. '
            'Please use "pip install -U bert-serving-client" to install it. '
            'If you do not want to use it as an HTTP server, '
            'then remove "-http_port" from the command line.') from err

    self.available_bc = [BertClient(**kwargs) for _ in range(max_concurrency)]
    self.max_concurrency = max_concurrency
def predict_move_masked_labels_model(text):
    """Predict one label index per sentence of ``text``.

    ``msm_results`` supplies the examples and the sentences. With fewer
    than four sentences the label is the arg-max of each example directly;
    otherwise the examples are sent to the classification server on port
    5720 and the label is the arg-max of each returned score entry.

    Args:
        text: Input passed unchanged to ``msm_results``.

    Returns:
        ``(labels, sentences)``, or ``(0, 0)`` when the reachability probe
        fails or ``msm_results`` yields no sentences.
    """
    # One-second TCP probe so an unavailable server is reported instead of
    # blocking inside the client.
    # NOTE(review): the probe targets 159.226.125.191 while the client below
    # uses its default address - confirm both refer to the same host.
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
            probe.settimeout(1)
            probe.connect(('159.226.125.191', 5720))
    except OSError:
        print('mask labels server not available')
        return 0, 0

    predict_examples, sentences = msm_results(text)
    if not sentences:
        return 0, 0

    # Fewer than four sentences: use the MSM results directly.
    if len(sentences) < 4:
        labels = [scores.index(max(scores)) for scores in predict_examples]
        return labels, sentences

    with BertClient(port=5720,
                    port_out=5721,
                    show_server_config=False,
                    check_version=False,
                    check_length=False,
                    mode='CLASS') as bc:
        results = bc.encode(predict_examples)

    labels = [
        scores.index(max(scores))
        for item in results
        for scores in item['score']
    ]
    return labels, sentences
class Ner(object):
    """Named-entity recognition facade over several interchangeable backends."""

    def __init__(self, model_name, ip=None):
        """Load the NER backend selected by ``model_name``.

        Args:
            model_name: ``"fasthan"``, ``"stanford"`` or ``"bbc"`` trigger
                loading of the matching backend; ``"nltk"`` needs no setup
                here.
            ip: Address of the BERT server; required when ``model_name`` is
                ``"bbc"``.

        Raises:
            ValueError: If ``model_name`` is ``"bbc"`` and ``ip`` is empty.
        """
        config = Config()
        self.model_name = model_name
        if self.model_name == "fasthan":
            self.nltk_model = FastHan(model_type="base")
        if self.model_name == "stanford":
            # Join the components separately: the former literal
            # 'model\stanford-...' relied on an invalid "\s" escape and only
            # formed a real sub-directory path on Windows.
            model_dir = os.path.join(config.project_dir, 'model',
                                     'stanford-corenlp-full-2016-10-31')
            self.stanford_model = StanfordCoreNLP(model_dir, lang='zh')
        if self.model_name == "bbc":
            if not ip:
                raise ValueError("bbc模型必须填入ip")
            self.bbc_model = BertClient(ip,
                                        ner_model_dir=None,
                                        show_server_config=False,
                                        check_version=False,
                                        check_length=False,
                                        mode='NER')

    def get_ner(self, string):
        """Return the entities found in ``string`` by the configured backend.

        Args:
            string: Text to analyse.

        Returns:
            Backend-specific result: the first element of the FastHan
            output, the second value returned by ``fool.analysis``, a list
            of ``(token, tag)`` pairs restricted to PERSON / ORGANIZATION /
            LOCATION for Stanford, or ``result_to_json`` output for the
            BERT server.

        Raises:
            ValueError: If ``self.model_name`` is not a supported backend.
        """
        if self.model_name == "fasthan":
            answer = self.nltk_model(string, target="NER")
            return answer[0]
        if self.model_name == "nltk":
            _, answer = fool.analysis(string)
            return answer
        if self.model_name == "stanford":
            wanted = ("PERSON", "ORGANIZATION", "LOCATION")
            return [(token, tag)
                    for token, tag in self.stanford_model.ner(string)
                    if tag in wanted]
        if self.model_name == "bbc":
            # The sentence is sent as a list of single characters.
            rst = self.bbc_model.encode([list(string)], is_tokenized=True)
            return result_to_json(string, rst[0])
        raise ValueError("model_name只能是fasthan、nltk、stanford、bbc四种之一")
   File Name:     api_service_new
   Description :  API client that sends requests to the server and returns labels
   Author :       逸轩
   date:          2019/10/12
"""
import json
import re
import time
from bert_base.client import BertClient

# Module-level client shared by this file; created (and therefore connected)
# at import time.
bc = BertClient(ip='192.168.9.23', port=5575, port_out=5576, show_server_config=False, check_version=False, check_length=False, mode='CLASS')
print('BertClient连接成功')


# Split text into segments.
def cut_sent(txt):
    # Returns the non-blank pieces of `txt` obtained by splitting on ";".
    # Pre-processing: collapse every run of spaces/tabs into one space.
    txt = re.sub('([ \t]+)', r" ", txt)  # blank word
    txt = txt.rstrip()  # drop trailing whitespace (e.g. a stray newline) at the end of the paragraph
    nlist = txt.split(";")
    nnlist = [x for x in nlist if x.strip() != '']  # filter out empty pieces
    return nnlist
import time
from bert_base.client import BertClient

# Script: tag three cooking questions on port 6666 and collect the spans
# labelled B-ING / I-ING for each sentence.
with BertClient(show_server_config=False,
                port=6666,
                port_out=6667,
                check_version=False,
                check_length=False,
                mode='NER') as bc:
    start_t = time.perf_counter()
    str1 = '我有鸡蛋和黄瓜可以做什么?'
    str2 = '不辣的川菜。'
    str3 = '推荐一个有香菇的菜'
    strs = [str1, str2, str3]
    rst = bc.encode(strs)
    print('rst:', rst)
    for st, sen in zip(strs, rst):
        tmp = []     # spans collected for the current sentence
        flag = False  # True while the previous position belonged to a span
        for i, tag in enumerate(sen):
            if tag == 'I-ING' and flag:
                # Continuation of the span that is currently open.
                tmp[-1] = tmp[-1] + st[i]
            elif tag in ('B-ING', 'I-ING'):
                # B-ING, or an I-ING with no open span, starts a new span.
                tmp.append(st[i])
                flag = True
            else:
                flag = False
from flask import Flask, url_for, request
from bert_base.client import BertClient
try:
    # Current location of ProxyFix.
    from werkzeug.middleware.proxy_fix import ProxyFix
except ImportError:
    # Older Werkzeug releases only provide the contrib location.
    from werkzeug.contrib.fixers import ProxyFix
import json

app = Flask(__name__)
# NOTE(review): one client is shared by all requests - confirm it is safe
# with the way this app is served.
bc = BertClient(show_server_config=False,
                check_version=False,
                check_length=False,
                mode='NER')


@app.route('/bert/check', methods=['POST'])
def check():
    """Tag the ``text`` field of the JSON body and return it as a JSON string."""
    print(str(request.json))
    result = bc.encode([str(request.json['text'])])
    print(result)
    # The extra debug encode of a fixed 'hello world.' string on every
    # request has been dropped; it added a second server round trip.
    t = {'result': str(result)}
    return json.dumps(t)


@app.route('/test', methods=['POST'])
def hello():
    """Return a fixed payload; usable as a liveness check.

    Named ``hello`` because a second view called ``check`` collides with
    the endpoint of the route above when it is registered.
    """
    t = {'result': 'hello'}
    return json.dumps(t)


if __name__ == '__main__':
    app.wsgi_app = ProxyFix(app.wsgi_app)
    app.run(host='0.0.0.0', port=8055)
import time from tqdm import tqdm print('start') try: from bert_base.client import BertClient except ImportError: raise ImportError('BertClient module is not available, it is required for serving HTTP requests.' 'Please use "pip install -U bert-serving-client" to install it.' 'If you do not want to use it as an HTTP server, ' 'then remove "-http_port" from the command line.') # 指定服务器的IP 127.0.0.1:49164 5555 # BertClient(ip='xxx', ner_model_dir='D:\Projects\Wunianyue\BERT-BiLSTM-CRF-NER\output', show_server_config=False, check_version=False, check_length=False, mode='NER') with BertClient(mode='NER') as bc: start_t = time.perf_counter() # text = text.replace(' ', '-') data句子间不能有空格。 df_path = r'data/add_data/yizhu_301_1000.txt' # data数据最后一行要为空 df = open(df_path, 'r+', encoding='utf-8') list = [] l=[] # 要把每个字用空格分隔,放入训练? for line in df: if line!='\n': l.append(' '.join(line)) list.append(line[:len(line) - 1]) print(len(list)) print('start') rst = bc.encode(l) # 测试同时输入两个句子,多个输入同理 k = 0 with open("annotationdata/301_1000_BIO.txt", "w", encoding='utf-8') as f: for index in tqdm(range(0,len(rst))):
from bert_base.client import BertClient

# Script: classify four fixed product-review sentences on the local server
# and print the raw result.
attrs = [
    '质量很好,和详情描述一致。',
    '大小比TX2小好多,还好没有买TX2。',
    '这款升级版的价格更低,性能也更好,显示屏很垃圾,用树莓派的时候画质很差。',
    '用这个板子发现真心不亏这个价格,不愧是英伟达出品。',
]

with BertClient(
        ip='127.0.0.1',
        port=5575,
        port_out=5576,
        show_server_config=False,
        check_version=False,
        check_length=False,
        timeout=50000,
        mode='CLASS',
) as bc:
    res1 = bc.encode(attrs)
    print(res1)