def get_hanlp_entity_weight_dict(prep_article, entity_type, sentence_type='original', sentence_count=4):
    """Extract subject/object/predicate words from an article's title and
    central sentences, then compute a per-word weight dict for that type.

    :param prep_article: a PreprocessArticle instance
    :param entity_type: which words to extract: 'sub' / 'obj' / 'predicate'
        (subject, object, predicate respectively)
    :param sentence_type: ordering of the candidate sentences:
        'original' keeps article order, 'score' uses the descending
        sentence-score order
    :param sentence_count: number of central sentences to use; 0 means
        extract from the title only
    :return: {word: weight} dict for the requested entity type
    """
    # Dispatch table replaces the three identical if-chains that were
    # duplicated for title / original-order / score-order in the original.
    extractors = {
        'sub': get_hanlp_sub_entity,
        'obj': get_hanlp_obj_entity,
        'predicate': get_hanlp_predicate_entity,
    }
    extract = extractors.get(entity_type)
    entities = []

    def _collect(text):
        # Parse one piece of text and append its entities (no-op for an
        # unknown entity_type, matching the original's silent behavior).
        if extract is not None:
            words = HanLP.parseDependency(text).word
            entities.append(extract(words))

    # The article title is always processed.
    _collect(prep_article.title)

    if sentence_count > 0:
        if sentence_type == 'original':
            # First n sentences in original article order.
            for i, sentence in enumerate(prep_article.sentences):
                if i >= sentence_count:
                    break
                _collect(sentence.text)
        if sentence_type == 'score':
            # Top-n sentences by descending score.
            for i, idx in enumerate(prep_article.descend_sentence_index):
                if i >= sentence_count:
                    break
                _collect(prep_article.sentences[idx].text)

    return calculate_weight(entities)
def dependency_analysis(sent):
    """Run HanLP dependency parsing on *sent* and summarize it as a dict
    with keys 'root', 'sub', 'pre', 'sub_adj', 'pre_adj', 'entity'.

    Missing slots default to the string '空'; list-valued slots are joined
    with '|'.

    :param sent: sentence to analyse
    :return: dict of root / subject / predicate words, their noun
        modifiers, and all noun entities
    """
    result = HanLP.parseDependency(sent)
    # Chinese dependency-relation labels from HanLP's CoNLL output.
    ROOT, SUBJECT, PREDICATE = '核心关系', '主谓关系', '宾'
    res = dict()
    key = ['root', 'sub', 'pre', 'sub_adj', 'pre_adj', 'entity']
    for word in result.iterator():
        rel = str(word.DEPREL)  # renamed from `type`: don't shadow the builtin
        if rel.find(ROOT) >= 0:
            res['root'] = word.LEMMA
        elif rel.find(SUBJECT) >= 0:
            res['sub'] = word.LEMMA
        elif rel.find(PREDICATE) >= 0:
            res['pre'] = word.LEMMA
    res['entity'] = []
    for word in result.iterator():
        # A noun is a POS tag containing 'n' but not 'v'.
        if str(word.CPOSTAG).find('n') >= 0 and str(word.CPOSTAG).find('v') < 0:
            res['entity'].append(word.LEMMA)
            # Nouns whose head is the subject modify it; others are grouped
            # with the predicate modifiers.
            if res.get('sub') and str(word.HEAD.LEMMA) == str(res['sub']):
                res['sub_adj'] = res['sub_adj'] + [word.LEMMA] if res.get(
                    'sub_adj') else [word.LEMMA]
            else:
                res['pre_adj'] = res['pre_adj'] + [word.LEMMA] if res.get(
                    'pre_adj') else [word.LEMMA]
    for k in key:
        res[k] = res.get(k, '空')
        if isinstance(res[k], list):
            res[k] = '|'.join(res[k])
    # Removed leftover debug `print(res)` before the return.
    return res
def dependency_parse(self, sent, standard_name=False, stopwords=None):
    """Dependency parsing via pyhanlp, fused with harvesttext's entity
    recognition mechanism.  High accuracy is not guaranteed.

    :param sent: sentence to parse
    :param standard_name: if True, replace recognized entity tokens with
        their linked standard names; otherwise keep the original surface text
    :param stopwords: optional collection of words to drop from the result
    :return: arcs -- list of lists, one per word:
        [word id, surface text or entity name (controlled by standard_name),
         POS tag, dependency relation, head word id]
    """
    from pyhanlp import HanLP, JClass
    if not self.hanlp_prepared:
        self.hanlp_prepare()
    self.standard_name = standard_name
    entities_info = self.entity_linking(sent)
    # Replace coreferent mentions before parsing.
    sent2 = self.decoref(sent, entities_info)
    # [word.ID-1, word.LEMMA, word.POSTAG, word.DEPREL ,word.HEAD.ID-1]
    arcs = []
    # Index into entities_info; advanced once for each entity-type token
    # encountered, so parse order must match entity_linking order.
    i = 0
    sentence = HanLP.parseDependency(sent2)
    for word in sentence.iterator():
        word0, tag0 = word.LEMMA, word.POSTAG
        if stopwords and word0 in stopwords:
            continue
        if word0 in self.entity_types:
            if self.standard_name:
                word0 = entities_info[i][1][0]  # use the linked entity
            else:
                l, r = entities_info[i][0]  # or the original text span
                word0 = sent[l:r]
            # Strip the surrounding brackets from the stored entity tag.
            tag0 = entities_info[i][1][1][1:-1]
            i += 1
        arcs.append([word.ID-1, word0, tag0, word.DEPREL, word.HEAD.ID-1])
    return arcs
def split_test(self, sentence):
    """Smoke-test HanLP: segmentation, POS tagging, keyword extraction,
    summarization and dependency parsing on canned Chinese examples.

    The *sentence* parameter is shadowed by the demo loop and only kept
    for interface compatibility.
    """
    print(HanLP.segment('你好,欢迎在Python中调用HanLP的API'))
    # Print each term together with its part-of-speech tag.
    for term in HanLP.segment('下雨天地面积水'):
        print('{}\t{}'.format(term.word, term.nature))
    # Classic segmentation-ambiguity test cases.
    testCases = [
        "商品和服务",
        "结婚的和尚未结婚的确实在干扰分词啊",
        "买水果然后来世博园最后去世博会",
        "中国的首都是北京",
        "欢迎新老师生前来就餐",
        "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
        "随着页游兴起到现在的页游繁盛,依赖于存档进行逻辑判断的设计减少了,但这块也不能完全忽略掉。"
    ]
    for sentence in testCases:
        print(HanLP.segment(sentence))
    # Keyword extraction.
    document = "水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露," \
               "根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标," \
               "有部分省超过红线的指标。对一些超过红线的地方,陈明忠表示,对一些取用水项目进行区域的限批," \
               "严格地进行水资源论证和取水许可的批准。"
    print(HanLP.extractKeyword(document, 2))
    # Automatic summarization.
    print(HanLP.extractSummary(document, 3))
    # Dependency parsing.
    print(HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。"))
def parseDependency(sentence):
    """Dependency grammar analysis.

    @param sentence: the sentence to analyse
    @return: dependency tree in CoNLL format
    """
    tree = HanLP.parseDependency(sentence)
    return tree
def parseDependency(self, sent):
    """Syntactic (dependency) parsing.

    :param sent: sentence to parse
    :return: HanLP CoNLL-style dependency parse result
    """
    # Original contained a doubled assignment (`res = res = ...`);
    # return the parse result directly.
    return HanLP.parseDependency(sent)
def get_article_event_detail(prep_article, sentence_type='original', sentence_count=4):
    """Extract subject-verb-object phrases (via dependency parsing) from an
    article's title and its scored central sentences.

    :param prep_article: a PreprocessArticle instance
    :param sentence_type: ordering of the candidate sentences:
        'original' keeps article order, 'score' uses the descending
        sentence-score order
    :param sentence_count: number of central sentences to use; 0 means
        extract from the title only
    :return: list of Phrase instances
    """
    phrases = []

    def _extract(text):
        # Parse one piece of text and collect SVO phrases rooted at
        # verbal core-relation words.  Factored out of the three
        # duplicated loops in the original.
        words = HanLP.parseDependency(text).word
        for word in words:
            # The core-relation word must be a verb, otherwise there is
            # nothing worth extracting.
            if word.DEPREL == '核心关系' and word.CPOSTAG == 'v':
                phrases.extend(extract_event_detail(word, words))

    # The article title is always processed.
    _extract(prep_article.title)

    if sentence_count > 0:
        if sentence_type == 'original':
            # First n sentences in original article order.
            for i, sentence in enumerate(prep_article.sentences):
                if i >= sentence_count:
                    break
                _extract(sentence.text)
        if sentence_type == 'score':
            # Top-n sentences by descending score.
            for i, idx in enumerate(prep_article.descend_sentence_index):
                if i >= sentence_count:
                    break
                _extract(prep_article.sentences[idx].text)

    phrases.reverse()  # extract_event_detail() builds its result recursively
    return phrases
def get_hanlp_spo_weight_dict(prep_article, sentence_type='original', sentence_count=4):
    """Extract subjects, objects and predicates from an article's title and
    central sentences, and compute a per-word weight dict for each.

    :param prep_article: a PreprocessArticle instance
    :param sentence_type: ordering of the candidate sentences:
        'original' keeps article order, 'score' uses the descending
        sentence-score order
    :param sentence_count: number of central sentences to use; 0 means
        extract from the title only
    :return: (subject weights, object weights, predicate weights) dicts
    """
    subs = []
    objs = []
    predicates = []

    def _collect(text):
        # Parse one piece of text and record its subjects, objects and
        # predicates.  Factored out of the three duplicated call sites.
        words = HanLP.parseDependency(text).word
        subs.append(get_hanlp_sub_entity(words))
        objs.append(get_hanlp_obj_entity(words))
        predicates.append(get_hanlp_predicate_entity(words))

    # The article title is always processed.
    _collect(prep_article.title)

    if sentence_count > 0:
        if sentence_type == 'original':
            # First n sentences in original article order.
            for i, sentence in enumerate(prep_article.sentences):
                if i >= sentence_count:
                    break
                _collect(sentence.text)
        if sentence_type == 'score':
            # Top-n sentences by descending score.
            for i, idx in enumerate(prep_article.descend_sentence_index):
                if i >= sentence_count:
                    break
                _collect(prep_article.sentences[idx].text)

    sub_weight_dict = calculate_weight(subs)
    obj_weight_dict = calculate_weight(objs)
    predicate_weight_dict = calculate_weight(predicates)
    return sub_weight_dict, obj_weight_dict, predicate_weight_dict
def __init__(self, text): """ Takes raw text data as input then parse and store processed data :param text: raw text input """ # clean up html format non-sense self.text = html_cleanup(text) self.output_raw = str(HanLP.parseDependency(self.text)) self.parsed, self.core_ind = self.parse() self.core_ind_update()
def generate_feature_v4(self, input_list):
    """Build a per-character dependency-label feature for each sentence.

    Every parsed token contributes its DEPREL label once per character of
    its lemma, so the flattened label sequence aligns 1:1 with the
    sentence's characters.

    :param input_list: iterable of sentences (strings)
    :return: list of per-sentence label lists
    """
    feature = []
    for sentence in input_list:
        flattened = []
        for token in HanLP.parseDependency(sentence):
            # One DEPREL label per character of this token's lemma.
            flattened.extend(token.DEPREL for _ in token.LEMMA)
        # Labels must cover the sentence exactly, character for character.
        assert len(sentence) == len(flattened)
        feature.append(flattened)
    return feature
def hanlp_parse(text):
    """Parse *text* with HanLP and wrap the tokens in a Parse_result,
    mapping HanLP coarse POS tags to Stanford tags via the
    'datasets/ha2stanford' mapping file.

    :param text: sentence to parse
    :return: Parse_result of Word(lemma, stanford_tag, hanlp_tag)
    """
    ha2stanford_dict = {}
    # Explicit encoding: the mapping file's contents must not depend on
    # the platform's default encoding (the original relied on it).
    with open('datasets/ha2stanford', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split('\t')
            ha2stanford_dict[fields[0]] = fields[1]
    ha_parse_result = HanLP.parseDependency(text)
    # KeyError here means a HanLP tag missing from the mapping file.
    words = [Word(tok.LEMMA, ha2stanford_dict[tok.CPOSTAG], tok.CPOSTAG)
             for tok in ha_parse_result.word]
    return Parse_result(words)
def get_sentence_event_detail(sentence, n=-1):
    """Extract subject-verb-object phrases from *sentence* via dependency
    parsing.

    :param sentence: input sentence
    :param n: -1 keeps every phrase of each verbal core; any other value
        keeps only the first phrase per core
    :return: list of Phrase instances
    """
    words = HanLP.parseDependency(sentence).word
    phrases = []
    for word in words:
        # Only verbal core-relation words are worth extracting from.
        if word.DEPREL != '核心关系' or word.CPOSTAG != 'v':
            continue
        detail = extract_event_detail(word, words)
        if n == -1:
            phrases.extend(detail)
        else:
            phrases.append(detail[0])
    return phrases
def do_GET(self):
    """Serve the parse-visualisation page: take the sentence from the query
    string (or pick a random demo title), clip it to its first clause and at
    most 50 characters, then render lexical analysis plus the CoNLL
    dependency parse into TEMPLATE."""
    params = parse_qs(urlparse(self.path).query)
    self._set_headers()
    # {'text': ['I looove iparser!']}
    titles = [
        'HanLP是面向生产环境的自然语言处理工具包。',
        '上海华安工业(集团)公司董事长谭旭光和秘书张晚霞来到美国纽约现代艺术博物馆参观。',
        # '词法分析包括中文分词、词性标注和命名实体识别。',
        # '本页面词法分析采用的是感知机算法。',
        '剑桥分析公司多位高管对卧底记者说,他们确保了唐纳德·特朗普在总统大选中获胜。',
        '收件人在万博·齐都国际绿茵花园(东门)A8栋,靠近泰山护理职业学院。',
        # '双桥街道双桥社区劳动和社会保障工作站地址是扬州市四望亭路293号双桥村4楼。',
        '他在浙江金华出生,他的名字叫金华。',
        '总统普京与特朗普通电话讨论美国太空探索技术公司。',
        '云南丽江多措并举推进“河长制”取得实效',
        '微软公司於1975年由比爾·蓋茲和保羅·艾倫創立,18年啟動以智慧雲端、前端為導向的大改組。',
        '北京大学计算语言学研究所和富士通研究开发中心有限公司,得到了人民日报社新闻信息中心的语料库。',
        # '可以自由设置句法分析模块中的分词算法。',
        # '敬请注意本页面不接受过长的句子。',
        # '徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。',
        '萨哈夫说,伊拉克将同联合国销毁伊拉克大规模杀伤性武器特别委员会继续保持合作。',
        '中央人民政府驻澳门特别行政区联络办公室的张朝阳说商品和服务是三原县鲁桥食品厂的主营业务。'
    ]
    sentence = random.choice(titles)
    if SENTENCE in params:
        s = params[SENTENCE]
        if len(s):
            sentence = s[0].strip()
    # Keep only the text up to (and including) the first sentence-ending
    # punctuation mark, then strip spaces.
    punctuation = re.compile('[。!?!?]')
    sentence = sentence[:len(punctuation.split(sentence)[0]) + 1]
    sentence = sentence.replace(' ', '')
    MAX_LENGTH = 50
    if len(sentence) > MAX_LENGTH:
        sentence = '请输入{}字以内的句子'.format(MAX_LENGTH)
    # Lexical-analysis visualisation needs HanLP >= 1.6.2.
    ann = '词法分析可视化仅支持 HanLP 1.6.2及以上版本'
    if lexical_analyzer:
        ann = lexical_analyzer.analyze(
            sentence).translateCompoundWordLabels().toStandoff(
                True).__str__()
    conll = HanLP.parseDependency(sentence).__str__()
    self.write(
        TEMPLATE.replace('{SENTENCE}', sentence).replace(
            '{CONLL}', quote(conll)).replace('{HANLP_GOOGLE_UA}',
                                             HANLP_GOOGLE_UA,
                                             1).replace('{ANN}', quote(ann)))
def do_GET(self):
    """Serve a dependency-parse page for the sentence in the query string
    (or a default demo sentence), clipped to its first clause and at most
    50 characters."""
    params = parse_qs(urlparse(self.path).query)
    self._set_headers()
    # {'text': ['I looove iparser!']}
    sentence = 'HanLP是面向生产环境的自然语言处理工具包。'
    if SENTENCE in params:
        candidates = params[SENTENCE]
        if len(candidates):
            sentence = candidates[0].strip()
    # Keep only the text up to (and including) the first sentence-ending
    # punctuation mark.
    punctuation = re.compile('[。!?!?]')
    sentence = sentence[:len(punctuation.split(sentence)[0]) + 1]
    MAX_LENGTH = 50
    if len(sentence) > MAX_LENGTH:
        sentence = '请输入{}字以内的句子'.format(MAX_LENGTH)
    conll = quote(HanLP.parseDependency(sentence).__str__())
    page = (TEMPLATE
            .replace('{SENTENCE}', sentence)
            .replace('{CONLL}', conll)
            .replace('{HANLP_GOOGLE_UA}', HANLP_GOOGLE_UA, 1))
    self.write(page)
def do_GET(self):
    """Serve the parse-visualisation page: take the sentence from the query
    string (or pick a random demo title), clip it to its first clause and at
    most 50 characters, then render lexical analysis plus the CoNLL
    dependency parse into TEMPLATE."""
    params = parse_qs(urlparse(self.path).query)
    self._set_headers()
    # {'text': ['I looove iparser!']}
    titles = [
        'HanLP是面向生产环境的自然语言处理工具包。',
        '上海华安工业(集团)公司董事长谭旭光和秘书张晚霞来到美国纽约现代艺术博物馆参观。',
        # '词法分析包括中文分词、词性标注和命名实体识别。',
        # '本页面词法分析采用的是感知机算法。',
        '剑桥分析公司多位高管对卧底记者说,他们确保了唐纳德·特朗普在总统大选中获胜。',
        '收件人在万博·齐都国际绿茵花园(东门)A8栋,靠近泰山护理职业学院。',
        '双桥街道双桥社区劳动和社会保障工作站地址是扬州市四望亭路293号双桥村4楼。',
        # '可以自由设置句法分析模块中的分词算法。',
        # '敬请注意本页面不接受过长的句子。',
        # '徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。',
        '香港特别行政区的张朝阳说商品和服务是三原县鲁桥食品厂的主营业务。'
    ]
    sentence = random.choice(titles)
    if SENTENCE in params:
        s = params[SENTENCE]
        if len(s):
            sentence = s[0].strip()
    # Keep only the text up to (and including) the first sentence-ending
    # punctuation mark.
    punctuation = re.compile('[。!?!?]')
    sentence = sentence[:len(punctuation.split(sentence)[0]) + 1]
    MAX_LENGTH = 50
    if len(sentence) > MAX_LENGTH:
        sentence = '请输入{}字以内的句子'.format(MAX_LENGTH)
    # Lexical-analysis visualisation needs HanLP >= 1.6.2.
    ann = '词法分析可视化仅支持 HanLP 1.6.2及以上版本'
    if lexical_analyzer:
        ann = lexical_analyzer.analyze(
            sentence).translateCompoundWordLabels().toStandoff().__str__()
    conll = HanLP.parseDependency(sentence).__str__()
    self.write(
        TEMPLATE.replace('{SENTENCE}', sentence).replace(
            '{CONLL}', quote(conll)).replace('{HANLP_GOOGLE_UA}',
                                             HANLP_GOOGLE_UA,
                                             1).replace('{ANN}', quote(ann)))
def hannlp_parse(string):
    '''
    HanLP dependency parsing.
    :param string: input sentence
    :return: (tokens, {'f1': dependency path}, {'words', 'prons', 'pron_cnt'})
    '''
    parsed = HanLP.parseDependency(string)
    # Each CoNLL row: id, form, lemma, cpos, pos, feats, head, deprel, ...
    rows = [line.split("\t") for line in str(parsed).strip().split("\n")]
    fenci = []
    first_path = ''
    prons = ''
    pron_cnt = 0
    words = []
    for row in rows:
        token = str(row[1])
        fenci.append(token)
        # '+' when the head comes after this token, '-' otherwise.
        direction = '+' if int(row[0]) < int(row[6]) else '-'
        relation = str(row[7])
        # Skip attachment and punctuation relations when building the path.
        if relation not in ['左附加关系', '右附加关系', '标点符号']:
            first_path += relation + direction
        # Count pronouns; all other tokens go to the content-word list.
        if token in all_pronouns:
            prons += token
            pron_cnt += 1
        else:
            words.append(token)
    represent = {'f1': first_path}
    core_word = {'words': words, 'prons': prons, 'pron_cnt': pron_cnt}
    return fenci, represent, core_word
# 3 的 的 u u _ 2 右附加关系 _ _ # 4 愿望 愿望 n n _ 5 主谓关系 _ _ # 5 是 是 v v _ 0 核心关系 _ _ # 6 什么 什么 r r _ 5 动宾关系 _ _ sentence = '你姐姐的名字是什么?' # 1 你 你 r r _ 2 定中关系 _ _ # 2 姐姐 姐姐 n n _ 4 定中关系 _ _ # 3 的 的 u u _ 2 右附加关系 _ _ # 4 名字 名字 n n _ 5 主谓关系 _ _ # 5 是 是 v v _ 0 核心关系 _ _ # 6 什么 什么 r r _ 5 动宾关系 _ _ # 7 ? ? wp w _ 5 标点符号 _ _ from pyhanlp import HanLP result = HanLP.parseDependency(sentence) print(result) from pronounMatch import hannlp_parse, pronoun_extract print(hannlp_parse(sentence)) print(pronoun_extract.extract(sentence)) # from pronounMatch import PronounMatch # file_path = 'D:\\pronounMatch\\pronoun_dependency_rule.txt' # pronounmatch = PronounMatch(file_path) # print('+' + str(pronounmatch.match(sentence)) + '+') # assert 'rule_0' in pronounmatch.match('你喜欢什么?')['more_pronoun'].keys() # # assert 'rule_1' in pronounmatch.match('你喜欢我?')['more_pronoun'].keys() # assert 'rule_1' in pronounmatch.match('你可以撩我吗?')['more_pronoun'].keys()
def main():
    """Command-line entry point for pyhanlp: segment / parse / serve /
    update sub-commands, plus a -v/--version flag."""
    if len(sys.argv) == 1:
        sys.argv.append('--help')
    arg_parser = argparse.ArgumentParser(
        description='HanLP: Han Language Processing v{}'.format(
            HANLP_JAR_VERSION))
    arg_parser.add_argument('-v', '--version', required=False,
                            action='store_true',
                            help='show installed versions of HanLP')
    task_parser = arg_parser.add_subparsers(dest="task",
                                            help='which task to perform?')
    segment_parser = task_parser.add_parser(name='segment',
                                            help='word segmentation')
    parse_parser = task_parser.add_parser(name='parse',
                                          help='dependency parsing')
    server_parser = task_parser.add_parser(
        name='serve',
        help='start http server',
        description='A http server for HanLP')
    server_parser.add_argument('--port', type=int, default=8765)
    update_parser = task_parser.add_parser(name='update',
                                           help='update jar and data of HanLP')

    def add_args(p):
        # --config is shared by the segment and parse sub-commands.
        p.add_argument("--config", default=PATH_CONFIG,
                       help='path to hanlp.properties')
        # p.add_argument("--action", dest="action", default='predict',
        #                help='Which action (train, test, predict)?')

    add_args(segment_parser)
    add_args(parse_parser)

    # Version info is handled before argparse so it works with any task.
    if '-v' in sys.argv or '--version' in sys.argv:
        print('jar {}: {}'.format(HANLP_JAR_VERSION, HANLP_JAR_PATH))
        data_version = hanlp_installed_data_version()
        print('data {}: {}'.format(data_version if data_version else '自定义',
                                   HANLP_DATA_PATH))
        print('config : {}'.format(
            os.path.join(STATIC_ROOT, 'hanlp.properties')))
        exit(0)

    args = arg_parser.parse_args()

    def die(msg):
        # Report the error to stderr and abort with a non-zero exit status.
        eprint(msg)
        exit(1)

    if hasattr(args, 'config') and args.config:
        if os.path.isfile(args.config):
            JClass('com.hankcs.hanlp.utility.Predefine'
                   ).HANLP_PROPERTIES_PATH = args.config
        else:
            die('Can\'t find config file {}'.format(args.config))

    if args.task == 'segment':
        # Stream stdin line by line through the segmenter.
        for line in sys.stdin:
            line = line.strip()
            print(' '.join(term.toString()
                           for term in HanLP.segment(any2utf8(line))))
    elif args.task == 'parse':
        for line in sys.stdin:
            line = line.strip()
            print(HanLP.parseDependency(any2utf8(line)))
    elif args.task == 'serve':
        if PY == 3:
            from pyhanlp import server
            server.run(port=args.port)
        else:
            die('现在server.py暂时不支持Python2,欢迎参与移植')
    elif args.task == 'update':
        update_hanlp()
def extractKeyword(self, sent, num=2):
    """
    Extract keywords from *sent* with HanLP, dropping stopwords.

    :param sent: input text
    :param num: number of keywords to request from HanLP
    :return: list of keywords not present in self.stopwords
    """
    res = HanLP.extractKeyword(sent, num)
    res_list = []
    for word in res:
        # `in` instead of calling the __contains__ dunder directly.
        if word not in self.stopwords:
            res_list.append(word)
    return res_list


if __name__ == "__main__":
    sr = HanlpSplitor()
    res = HanLP.parseDependency("《灵与肉》的作者是?")
    pinyin = sr.pinyin("国际金融")
    sen = u' :: 王某被取保候审,则他不应:( ) 答案: 要求证人丁某为他说好话, 未经批准到外省出差'
    res = sr.split1list(sen)
    sen = u' 王某被取保候审,则他不应:( ) 答案: 要求证人丁某为他说好话, 未经批准到外省出差'
    res = sr.splitlist_can_repeat(sen)
    # The original used Python 2 print statements (`print res`), which are
    # syntax errors under Python 3 while the rest of this file uses
    # print(); converted to function calls.
    print(res)
    print('split over')
def parser_main(self, sentence):
    """Parse *sentence* with HanLP and build the child-dict representation.

    :param sentence: input sentence
    :return: (words, postags, child_dict_list, format_parse_list)
    """
    word_objs = HanLP.parseDependency(sentence).word
    # build_parse_child_dict already returns the 4-tuple callers expect.
    return self.build_parse_child_dict(word_objs)
        # NOTE(review): this fragment is the tail of build_parse_child_dict;
        # its beginning (which defines words/postags/relation/heads/rely_id)
        # is outside this view.
        for i in range(len(words)):
            # ['定中关系', '李克强', 0, 'nh', '总理', 1, 'n']
            # One row per word: relation, word, index, POS, head word,
            # head index (0-based), head POS.
            a = [relation[i], words[i], i, postags[i], heads[i],
                 rely_id[i]-1, postags[rely_id[i]-1]]
            format_parse_list.append(a)
        return words, postags, child_dict_list, format_parse_list

    '''parser主函数'''
    def parser_main(self, sentence):
        # Parse with HanLP, then delegate to build_parse_child_dict.
        arcs = HanLP.parseDependency(sentence).word
        words, postags, child_dict_list, format_parse_list = self.build_parse_child_dict(arcs)
        return words, postags, child_dict_list, format_parse_list


if __name__ == "__main__":
    sentence = "李克强总理今天来我家了,我感到非常荣幸"
    sentence2 = "以色列国防军20日对加沙地带实施轰炸,造成3名巴勒斯坦武装人员死亡。此外,巴勒斯坦人与以色列士兵当天在加沙地带与以交界地区发生冲突,一名巴勒斯坦人被打死。当天的冲突还造成210名巴勒斯坦人受伤。当天,数千名巴勒斯坦人在加沙地带边境地区继续“回归大游行”抗议活动。部分示威者燃烧轮胎,并向以军投掷石块、燃烧瓶等,驻守边境的以军士兵向示威人群发射催泪瓦斯并开枪射击。"
    # Segmentation and POS tagging (disabled demo):
    # terms = HanLP.segment(sentence)
    # for term in terms:
    #     print(term.word, term.nature)
    # Dependency parsing demo.
    ret_dep = HanLP.parseDependency(sentence)
    print(ret_dep)
    extractor = TripleExtractor()
    svos = extractor.triples_main(sentence)
    print(svos)
from pyhanlp import HanLP

# Quick dependency-parse demo.
s = '会议宣布了首批资深院士名单'
dp = HanLP.parseDependency(s)
print(dp)

from pyhanlp import *

print(HanLP.segment('你好,欢迎在Python中调用HanLP的API'))
# Print each term together with its part-of-speech tag.
for term in HanLP.segment('下雨天地面积水'):
    print('{}\t{}'.format(term.word, term.nature))
# Classic segmentation-ambiguity test cases.
testCases = [
    "商品和服务",
    "结婚的和尚未结婚的确实在干扰分词啊",
    "买水果然后来世博园最后去世博会",
    "中国的首都是北京",
    "欢迎新老师生前来就餐",
    "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
    "随着页游兴起到现在的页游繁盛,依赖于存档进行逻辑判断的设计减少了,但这块也不能完全忽略掉。"
]
for sentence in testCases:
    print(HanLP.segment(sentence))

print("# 关键词提取")
document = "水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露," \
           "根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标," \
           "有部分省超过红线的指标。对一些超过红线的地方,陈明忠表示,对一些取用水项目进行区域的限批," \
           "严格地进行水资源论证和取水许可的批准。"
print(HanLP.extractKeyword(document, 2))

print("# 自动摘要")
print(HanLP.extractSummary(document, 3))

print("# 依存句法分析")
print(HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。"))

doc = "句法分析是自然语言处理中的关键技术之一,其基本任务是确定句子的句法结构或者句子中词汇之间的依存关系。\
主要包括两方面的内容,一是确定语言的语法体系,即对语言中合法的句子的语法结构给与形式化的定义;另一方面是句法分析技术,即根据给定的语法体系,自动推导出句子的句法结构,分析句子所包含的句法单位和这些句法单位之间的关系。"
def Get_dependency(sentence):
    """Return HanLP's CoNLL-format dependency parse of *sentence*."""
    parsed = HanLP.parseDependency(sentence)
    return parsed
parser = Parser() # 初始化实例 parser.load(par_model_path) # 加载模型 from pyltp import SementicRoleLabeller labeller = SementicRoleLabeller() # 初始化实例 labeller.load(srl_model_path) # 加载模型 # arcs 使用依存句法分析的结果 #arcs = parser.parse(words, postags) # 句法分析 #print "依存句法\n", '-'*80, "\n" #print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs) from pyhanlp import HanLP print(HanLP.parseDependency(text)) ''' roles = labeller.label(words, postags, arcs) # 语义角色标注 print "SRL\n", '-'*80, "\n" # 打印结果 for role in roles: print role.index, "".join( ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]) print words[role.index] labeller.release() # 释放模型 '''
def main():
    """Command-line entry point for pyhanlp: segment / parse / keyword /
    summary / serve / update sub-commands, plus a -v/--version flag."""
    if len(sys.argv) == 1:
        sys.argv.append('--help')
    arg_parser = argparse.ArgumentParser(
        description='HanLP: Han Language Processing v{}'.format(
            HANLP_JAR_VERSION))
    arg_parser.add_argument('-v', '--version', required=False,
                            action='store_true',
                            help='show installed versions of HanLP')
    task_parser = arg_parser.add_subparsers(dest="task",
                                            help='which task to perform?')
    segment_parser = task_parser.add_parser(name='segment',
                                            help='word segmentation')
    # --tag / --no-tag are mutually exclusive; tagging defaults to on.
    tag_parser = segment_parser.add_mutually_exclusive_group(required=False)
    tag_parser.add_argument('--tag', dest='tag', action='store_true',
                            help='show part-of-speech tags')
    tag_parser.add_argument('--no-tag', dest='tag', action='store_false',
                            help='don\'t show part-of-speech tags')
    segment_parser.set_defaults(tag=True)
    segment_parser.add_argument(
        '-a', '--algorithm', type=str, default='viterbi',
        help='algorithm of segmentation e.g. perceptron')
    parse_parser = task_parser.add_parser(name='parse',
                                          help='dependency parsing')
    parse_keyword = task_parser.add_parser(name='keyword',
                                           help='dependency Keyword')
    parse_summary = task_parser.add_parser(name='summary',
                                           help='dependency summary')
    server_parser = task_parser.add_parser(
        name='serve',
        help='start http server',
        description='A http server for HanLP')
    server_parser.add_argument('--port', type=int, default=8765)
    update_parser = task_parser.add_parser(name='update',
                                           help='update jar and data of HanLP')

    def add_args(p):
        # --config is shared by all analysis sub-commands.
        p.add_argument("--config", default=PATH_CONFIG,
                       help='path to hanlp.properties')
        # p.add_argument("--action", dest="action", default='predict',
        #                help='Which action (train, test, predict)?')

    add_args(segment_parser)
    add_args(parse_parser)
    add_args(parse_keyword)
    add_args(parse_summary)

    # Version info is handled before argparse so it works with any task.
    if '-v' in sys.argv or '--version' in sys.argv:
        print('jar {}: {}'.format(HANLP_JAR_VERSION, HANLP_JAR_PATH))
        data_version = hanlp_installed_data_version()
        print('data {}: {}'.format(data_version if data_version else '自定义',
                                   HANLP_DATA_PATH))
        print('config : {}'.format(
            os.path.join(STATIC_ROOT, 'hanlp.properties')))
        exit(0)

    args = arg_parser.parse_args()

    def eprint(*args, **kwargs):
        # Print to stderr.
        print(*args, file=sys.stderr, **kwargs)

    def die(msg):
        # Report the error and abort with a non-zero exit status.
        eprint(msg)
        exit(1)

    if hasattr(args, 'config') and args.config:
        if os.path.isfile(args.config):
            JClass('com.hankcs.hanlp.utility.Predefine'
                   ).HANLP_PROPERTIES_PATH = args.config
        else:
            die('Can\'t find config file {}'.format(args.config))

    if args.task == 'segment':
        segmenter = None
        try:
            segmenter = HanLP.newSegment(args.algorithm)
        except JException as e:
            # Distinguish a bad algorithm name from a missing model.
            if isinstance(e, java.lang.IllegalArgumentException):
                die('invalid algorithm {}'.format(args.algorithm))
            elif isinstance(e, java.lang.RuntimeException):
                die('failed to load required model')
            else:
                die('unknown exception {}'.format(repr(e)))
        is_lexical_analyzer = hasattr(segmenter, 'analyze')
        if not args.tag:
            # Suppress POS tags in the output.
            if is_lexical_analyzer:
                segmenter.enablePartOfSpeechTagging(False)
                JClass('com.hankcs.hanlp.HanLP$Config').ShowTermNature = False
            else:
                JClass('com.hankcs.hanlp.HanLP$Config').ShowTermNature = False
        for line in sys.stdin:
            line = line.strip()
            print(' '.join(term.toString()
                           for term in segmenter.seg(any2utf8(line))))
    elif args.task == 'parse':
        for line in sys.stdin:
            line = line.strip()
            print(HanLP.parseDependency(any2utf8(line)))
    elif args.task == 'keyword':
        for line in sys.stdin:
            line = line.strip()
            TextRankKeyword = JClass(
                "com.hankcs.hanlp.summary.TextRankKeyword")
            keyword_list = HanLP.extractKeyword(line, 3)
            print(keyword_list)
            #print(HanLP.parseDependency(any2utf8(line)))
    elif args.task == 'summary':
        for line in sys.stdin:
            line = line.strip()
            TextRankSentence = JClass(
                "com.hankcs.hanlp.summary.TextRankSentence")
            sentence_list = HanLP.extractSummary(line, 3)
            print(sentence_list)
    elif args.task == 'serve':
        if PY == 3:
            from pyhanlp import server
            server.run(port=args.port)
        else:
            die('现在server.py暂时不支持Python2,欢迎参与移植')
    elif args.task == 'update':
        # Manually-installed data cannot be auto-upgraded.
        if hanlp_installed_data_version() == '手动安装':
            die('手动配置不支持自动升级,若要恢复自动安装,请清除HANLP相关环境变量')
        else:
            from pyhanlp.static import update_hanlp
            update_hanlp()
def hannlp_parse(string, pron_cnt=0):
    '''
    HanLP dependency parsing: extracts subject/head/action words and
    pronoun counts; recurses to keep only the last subject-verb clause
    when several are found.
    :param string: input sentence
    :param pron_cnt: running pronoun count carried through recursion
    :return: (tokens, {'f1': path}, core-word dict)
    '''
    result = HanLP.parseDependency(string)
    # Each CoNLL row: id, form, lemma, cpos, pos, feats, head, deprel, ...
    result = [ele.split("\t") for ele in str(result).strip().split("\n")]
    depend = []
    depend_loc = []
    fenci = []
    fenci_loc = []
    fenci_depend_loc = []
    postag = []
    for element in result:
        fenci.append(str(element[1]))
        fenci_loc.append(int(element[0]))
        postag.append(str(element[3]))
        depend.append((str(element[7])))
        depend_loc.append(int(element[6]))
        fenci_depend_loc.append((str(element[1]), int(element[6])))
    # 1-based position of the core-relation (root) word; raises ValueError
    # if the parse has no '核心关系' row.
    head_loc = depend.index('核心关系') + 1
    first_path = ''
    subject_word = ''
    subject_cnt = 0
    last_subject_loc = 0
    head_word = ''
    another_pron = ''
    another_pron_relate = ''
    action_word = ''
    prons = []
    for element in result:
        # '+' when the head comes after this token, '-' otherwise.
        if int(element[0]) < int(element[6]):
            direction = '+'
        else:
            direction = '-'
        # path - subject-verb: build the path from SV/core words.
        if str(element[7]) in ['主谓关系', '核心关系']:
            first_path += str(element[1]) + direction
        if str(element[7]) == '主谓关系':
            subject_word += str(element[1])
            last_subject_loc = int(element[0]) - 1  # 0-based token index
        if str(element[7]) == '核心关系':
            head_word = str(element[1])
        if str(element[1]) in all_pronouns and str(element[7]) == '主谓关系':
            subject_cnt += 1
        # word - pronoun subjects.
        if str(element[1]) in all_pronouns and str(element[7]) == '主谓关系':
            another_pron += str(element[1])
            another_pron_relate += str(element[7])
        # word - non-pronoun object of the core verb.
        if str(element[7]) == '动宾关系' and str(element[3]) not in ['r'] and head_loc == int(element[6]):
            action_word = str(element[1])
        # cnt - total pronoun occurrences.
        if str(element[1]) in all_pronouns:
            pron_cnt += 1
            prons.append(str(element[1]))
    # Multiple subject-verb relations: keep only the last clause and
    # re-parse it (recursion carries the pronoun count forward).
    if subject_cnt > 1:
        return hannlp_parse(''.join(fenci[last_subject_loc:]), pron_cnt=pron_cnt)
    represent = {'f1': first_path}
    core_word = {'subject_word': subject_word, 'head_word': head_word,
                 'another_pron': another_pron,
                 'another_pron_relate': another_pron_relate,
                 'action_word': action_word, 'pron_cnt': pron_cnt}
    return fenci, represent, core_word
# NOTE(review): `start_time`, `CRFnewSegment` and `text` are defined
# earlier in the original script, outside this view.
end_time = time.time()
term_list = CRFnewSegment.seg(text)
print(term_list)
print('分词+词性标注 Took %f second' % (end_time - start_time))
# print([str(i.word) for i in term_list])
# print([str(i.nature) for i in term_list])

# Time default segmentation + POS tagging.
start_time = time.time()
seg_result = HanLP.segment("不要")
end_time = time.time()
print(' '.join('%s/%s' % (term.word, term.nature) for term in seg_result))
print('分词+词性标注 Took %f second' % (end_time - start_time))

# Time dependency parsing.
start_time = time.time()
sentence = HanLP.parseDependency('万有引力是什么')
end_time = time.time()
print(sentence)
print('依存分析 Took %f second' % (end_time - start_time))

# Disabled: direct JPype bootstrap kept for reference.
# import jpype
# from jpype import *
#
# startJVM(getDefaultJVMPath(), "-Djava.class.path=D:\python_projects\zhengzebiaodashi\hanlp\hanlp-1.3.4.jar;D:\python_projects\zhengzebiaodashi\hanlp",
#          "-Xms1g",
#          "-Xmx1g")  # 启动JVM,Linux需替换分号;为冒号:
#
# print("=" * 30 + "HanLP分词" + "=" * 30)
# HanLP = JClass('com.hankcs.hanlp.HanLP')
# # 中文分词
# print(HanLP.segment('你好,欢迎在Python中调用HanLP的API'))
def dependency_relation(sentence):
    """Thin wrapper returning HanLP's dependency parse of *sentence*."""
    return HanLP.parseDependency(sentence)
# jieba.analyse.set_idf_path("../extra_dict/idf.txt.big"); withWeight = True print('\ntf-idf for keys: ') tags = jieba.analyse.extract_tags(sent, topK=10, withWeight=withWeight) if withWeight: for tag in tags: print("tag: %s\t weight: %f" % (tag[0], tag[1])) else: print(tags) # 基于 TextRank 算法的关键词抽取 print('\ntextrank for keys: ') tags_other = jieba.analyse.textrank(sent, withWeight=withWeight) if withWeight: for tag in tags_other: print("tag: %s\t weight: %f" % (tag[0], tag[1])) else: print(tags_other) ## pyhanlp from pyhanlp import HanLP # (繁体->简体,全角->半角,大写->小写) # HanLP.Config.Normalization = True text = '你好,欢迎在Python中调用HanLP的API!' pas = HanLP.parseDependency(text) serializer = [i.LEMMA for i in pas] parsing = [i.HEAD.ID for i in pas] assert len(serializer) == len(parsing) print(serializer) print(parsing)