def extractSummary(document, size, sentence_separator=None):
    """Automatic summarization.

    :param document: the target document
    :param size: how many key sentences to return
    :param sentence_separator: optional regex used to split *document*
        into sentences, e.g. ``[。??!!;;]``; when falsy, HanLP's
        built-in default separator is used
    :return: list of key sentences
    """
    if not sentence_separator:
        return HanLP.extractSummary(document, size)
    return HanLP.extractSummary(document, size, sentence_separator)
def split_test(self, sentence):
    """Demo of HanLP segmentation, keyword extraction, summarization and
    dependency parsing.

    NOTE(review): the ``sentence`` parameter is never used — the demo runs
    entirely on hard-coded text; it is kept for interface compatibility.
    """
    print(HanLP.segment('你好,欢迎在Python中调用HanLP的API'))
    # Print each term together with its part-of-speech tag.
    for term in HanLP.segment('下雨天地面积水'):
        print('{}\t{}'.format(term.word, term.nature))
    testCases = [
        "商品和服务",
        "结婚的和尚未结婚的确实在干扰分词啊",
        "买水果然后来世博园最后去世博会",
        "中国的首都是北京",
        "欢迎新老师生前来就餐",
        "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
        "随着页游兴起到现在的页游繁盛,依赖于存档进行逻辑判断的设计减少了,但这块也不能完全忽略掉。"
    ]
    # Renamed loop variable so it no longer shadows the `sentence` parameter.
    for case in testCases:
        print(HanLP.segment(case))
    # Keyword extraction
    document = "水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露," \
               "根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标," \
               "有部分省超过红线的指标。对一些超过红线的地方,陈明忠表示,对一些取用水项目进行区域的限批," \
               "严格地进行水资源论证和取水许可的批准。"
    print(HanLP.extractKeyword(document, 2))
    # Automatic summarization
    print(HanLP.extractSummary(document, 3))
    # Dependency parsing
    print(HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。"))
from pyhanlp import *

# Basic segmentation demo.
print(HanLP.segment('你好,欢迎在Python中调用HanLP的API'))

# Print each term together with its part-of-speech tag.
for term in HanLP.segment('下雨天地面积水'):
    print(f'{term.word}\t{term.nature}')

testCases = [
    "商品和服务",
    "结婚的和尚未结婚的确实在干扰分词啊",
    "买水果然后来世博园最后去世博会",
    "中国的首都是北京",
    "欢迎新老师生前来就餐",
    "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
    "随着页游兴起到现在的页游繁盛,依赖于存档进行逻辑判断的设计减少了,但这块也不能完全忽略掉。"
]
for case in testCases:
    print(HanLP.segment(case))

# Keyword extraction.
print("# 关键词提取")
document = "水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露," \
           "根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标," \
           "有部分省超过红线的指标。对一些超过红线的地方,陈明忠表示,对一些取用水项目进行区域的限批," \
           "严格地进行水资源论证和取水许可的批准。"
print(HanLP.extractKeyword(document, 2))

# Automatic summarization.
print("# 自动摘要")
print(HanLP.extractSummary(document, 3))

# Dependency parsing.
print("# 依存句法分析")
print(HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。"))

doc = "句法分析是自然语言处理中的关键技术之一,其基本任务是确定句子的句法结构或者句子中词汇之间的依存关系。\
主要包括两方面的内容,一是确定语言的语法体系,即对语言中合法的句子的语法结构给与形式化的定义;另一方面是句法分析技术,即根据给定的语法体系,自动推导出句子的句法结构,分析句子所包含的句法单位和这些句法单位之间的关系。"
print("关键词")
print(HanLP.extractKeyword(doc, 2))

print("# 自动摘要")
print(HanLP.extractSummary(doc, 3))
def main():
    """Command-line entry point for the HanLP tool.

    Builds the argument parser (``segment`` / ``parse`` / ``keyword`` /
    ``summary`` / ``serve`` / ``update`` sub-commands), then dispatches on
    the selected task. The text-processing tasks read one sentence per
    line from stdin and print results to stdout.
    """
    # With no arguments at all, behave as if --help had been passed.
    if len(sys.argv) == 1:
        sys.argv.append('--help')
    arg_parser = argparse.ArgumentParser(
        description='HanLP: Han Language Processing v{}'.format(
            HANLP_JAR_VERSION))
    arg_parser.add_argument('-v', '--version', required=False,
                            action='store_true',
                            help='show installed versions of HanLP')
    task_parser = arg_parser.add_subparsers(dest="task",
                                            help='which task to perform?')
    segment_parser = task_parser.add_parser(name='segment',
                                            help='word segmentation')
    # --tag / --no-tag are mutually exclusive; tagging is on by default.
    tag_parser = segment_parser.add_mutually_exclusive_group(required=False)
    tag_parser.add_argument('--tag', dest='tag', action='store_true',
                            help='show part-of-speech tags')
    tag_parser.add_argument('--no-tag', dest='tag', action='store_false',
                            help='don\'t show part-of-speech tags')
    segment_parser.set_defaults(tag=True)
    segment_parser.add_argument(
        '-a', '--algorithm', type=str, default='viterbi',
        help='algorithm of segmentation e.g. perceptron')
    parse_parser = task_parser.add_parser(name='parse',
                                          help='dependency parsing')
    parse_keyword = task_parser.add_parser(name='keyword',
                                           help='dependency Keyword')
    parse_summary = task_parser.add_parser(name='summary',
                                           help='dependency summary')
    server_parser = task_parser.add_parser(
        name='serve', help='start http server',
        description='A http server for HanLP')
    server_parser.add_argument('--port', type=int, default=8765)
    # Registered purely for its side effect of creating the sub-command;
    # the returned parser object is not needed.
    task_parser.add_parser(name='update',
                           help='update jar and data of HanLP')

    def add_args(p):
        # All text-processing sub-commands share the --config option.
        p.add_argument("--config", default=PATH_CONFIG,
                       help='path to hanlp.properties')

    add_args(segment_parser)
    add_args(parse_parser)
    add_args(parse_keyword)
    add_args(parse_summary)

    if '-v' in sys.argv or '--version' in sys.argv:
        print('jar {}: {}'.format(HANLP_JAR_VERSION, HANLP_JAR_PATH))
        data_version = hanlp_installed_data_version()
        print('data {}: {}'.format(data_version if data_version else '自定义',
                                   HANLP_DATA_PATH))
        print('config : {}'.format(
            os.path.join(STATIC_ROOT, 'hanlp.properties')))
        exit(0)

    args = arg_parser.parse_args()

    def eprint(*args, **kwargs):
        # print() to stderr.
        print(*args, file=sys.stderr, **kwargs)

    def die(msg):
        # Print the message to stderr and terminate with a non-zero status.
        eprint(msg)
        exit(1)

    # Point HanLP at a user-supplied hanlp.properties, if any.
    if hasattr(args, 'config') and args.config:
        if os.path.isfile(args.config):
            JClass('com.hankcs.hanlp.utility.Predefine'
                   ).HANLP_PROPERTIES_PATH = args.config
        else:
            die('Can\'t find config file {}'.format(args.config))

    if args.task == 'segment':
        segmenter = None
        try:
            segmenter = HanLP.newSegment(args.algorithm)
        except JException as e:
            if isinstance(e, java.lang.IllegalArgumentException):
                die('invalid algorithm {}'.format(args.algorithm))
            elif isinstance(e, java.lang.RuntimeException):
                die('failed to load required model')
            else:
                die('unknown exception {}'.format(repr(e)))
        is_lexical_analyzer = hasattr(segmenter, 'analyze')
        if not args.tag:
            # Hide tags in the output in all cases; lexical analyzers
            # additionally need tagging itself switched off.
            # (Deduplicated: the original set ShowTermNature = False in
            # both branches.)
            JClass('com.hankcs.hanlp.HanLP$Config').ShowTermNature = False
            if is_lexical_analyzer:
                segmenter.enablePartOfSpeechTagging(False)
        for line in sys.stdin:
            line = line.strip()
            print(' '.join(term.toString()
                           for term in segmenter.seg(any2utf8(line))))
    elif args.task == 'parse':
        for line in sys.stdin:
            line = line.strip()
            print(HanLP.parseDependency(any2utf8(line)))
    elif args.task == 'keyword':
        # Removed an unused per-line JClass('...TextRankKeyword') lookup
        # that was re-resolved on every iteration but never used.
        for line in sys.stdin:
            line = line.strip()
            print(HanLP.extractKeyword(line, 3))
    elif args.task == 'summary':
        # Removed an unused per-line JClass('...TextRankSentence') lookup
        # that was re-resolved on every iteration but never used.
        for line in sys.stdin:
            line = line.strip()
            print(HanLP.extractSummary(line, 3))
    elif args.task == 'serve':
        if PY == 3:
            from pyhanlp import server
            server.run(port=args.port)
        else:
            die('现在server.py暂时不支持Python2,欢迎参与移植')
    elif args.task == 'update':
        if hanlp_installed_data_version() == '手动安装':
            die('手动配置不支持自动升级,若要恢复自动安装,请清除HANLP相关环境变量')
        else:
            from pyhanlp.static import update_hanlp
            update_hanlp()
def extract_summa(s):
    """Extract up to 15 summary sentences from *s* (sentences delimited
    by spaces) and keep only those whose length is strictly between
    10 and 40 characters."""
    candidates = HanLP.extractSummary(s, 15, r'[ ]')
    return [sent for sent in candidates if 10 < len(sent) < 40]
    # NOTE(review): this `return` is the tail of a function whose `def` is
    # above this chunk — presumably it filters out summaries containing
    # conjunction natures ('c' / 'cc'); confirm against the full function.
    return [s for s in summaries if 'c' not in extract_s_nature(s) and 'cc' not in extract_s_nature(s)]


# Fetch the distinct TapTap game ids to process.
sql = "SELECT distinct game_id FROM game_source.s_game_comments_taptap_game WHERE source='taptap'"
game_id_list = from_sql(sql)
game_s = pd.DataFrame()
from datetime import datetime
# For each game: load its long comments (>300 chars), clean them, extract
# a one-sentence summary per comment, then aggregate per game.
for game_id in tqdm(list(game_id_list['game_id'])):
    sql = """
    SELECT source, game_id, game_name, content
    FROM game_source.s_game_comments_taptap_game
    where game_id = %s and length(content)>300
    """ % (game_id, )
    df = from_sql(sql)
    df['content'] = df['content'].apply(clear_text)
    df['content'] = df['content'].apply(get_help_content)
    # One key sentence per comment; sentences are split on spaces.
    df['summaries']=df['content'].apply(lambda x: list(HanLP.extractSummary(x, 1, r'[ ]')))
    df_new = df[['source', 'game_id', 'game_name', 'summaries']].groupby(['source', 'game_id','game_name']).agg(join).reset_index()
    # Summary extraction is done twice (per-comment above, per-game here).
    # NOTE(review): bare `except:` swallows every error (including
    # KeyboardInterrupt) — consider narrowing to `except Exception:`.
    try:
        df_new['summaries'] = df_new['summaries'].apply(extract_summa)
        df_new['summaries'] = df_new['summaries'].apply(clear_summa)
    except:
        print('summar is []')
        continue
    # Skip games for which no summary survived filtering.
    if df_new.iloc[0]['summaries'] == []:
        continue
    # Target table for the results; the write itself appears to continue
    # beyond this chunk.
    trg_db='game_process'
    trg_table='c_lcs_game_comment_summary'