def split_test(self, sentence):
    """Demo of HanLP segmentation, keyword extraction, summarization and
    dependency parsing.

    NOTE(review): the ``sentence`` parameter is never used — the body runs
    fixed demo inputs only. (Removed a large slab of commented-out jieba
    preprocessing code that was dead.)

    :param sentence: unused; kept for interface compatibility
    """
    print(HanLP.segment('你好,欢迎在Python中调用HanLP的API'))
    for term in HanLP.segment('下雨天地面积水'):
        # word together with its part-of-speech tag
        print('{}\t{}'.format(term.word, term.nature))
    test_cases = [
        "商品和服务",
        "结婚的和尚未结婚的确实在干扰分词啊",
        "买水果然后来世博园最后去世博会",
        "中国的首都是北京",
        "欢迎新老师生前来就餐",
        "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
        "随着页游兴起到现在的页游繁盛,依赖于存档进行逻辑判断的设计减少了,但这块也不能完全忽略掉。"
    ]
    # loop variable renamed so it no longer shadows the `sentence` parameter
    for case in test_cases:
        print(HanLP.segment(case))
    # keyword extraction
    document = "水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露," \
               "根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标," \
               "有部分省超过红线的指标。对一些超过红线的地方,陈明忠表示,对一些取用水项目进行区域的限批," \
               "严格地进行水资源论证和取水许可的批准。"
    print(HanLP.extractKeyword(document, 2))
    # automatic summarization
    print(HanLP.extractSummary(document, 3))
    # dependency parsing
    print(HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。"))
def show_words():
    """Fetch one row from NEWSWB and print its text with 5 HanLP keywords.

    Relies on module-level ``cursor`` and ``lock``.
    Fix: the lock was previously not released when ``cursor.execute``
    raised; the try/finally guarantees release.
    """
    sql = 'SELECT * FROM NEWSWB'
    lock.acquire()
    try:
        cursor.execute(sql)
    finally:
        lock.release()
    news = cursor.fetchone()
    # news[5] is presumably the article body column — TODO confirm schema
    print(news[5], '>>>>>>>', HanLP.extractKeyword(news[5], 5))
def get_keyword(content, keynum=2):
    """Extract keywords from one sentence.

    :param content: the sentence to analyse
    :param keynum: number of keywords to return (default 2)
    :return: a list of keywords
    """
    return HanLP.extractKeyword(content, keynum)
def extractKeyword(document, size):
    """Extract keywords from a document.

    :param document: document text
    :param size: how many keywords are wanted
    :return: a list of keywords
    """
    keywords = HanLP.extractKeyword(document, size)
    return keywords
def get_keywords():
    """For every NEWSWB row, extract 5 keywords from the body (column 5)
    and persist them via ``insert_keywords_into_mysql`` keyed by the row
    id (column 0).

    Relies on module-level ``cursor`` and ``lock``.
    Fix: the lock was previously not released when ``cursor.execute``
    raised; the inner try/finally guarantees release.
    """
    try:
        sql = 'SELECT * FROM NEWSWB'
        lock.acquire()
        try:
            cursor.execute(sql)
        finally:
            lock.release()
        news = cursor.fetchall()
        for n in news:
            insert_keywords_into_mysql(HanLP.extractKeyword(n[5], 5), n[0])
            print(n[0], ' finish')
    except IOError as err:
        # NOTE(review): only IOError is handled, as in the original;
        # DB driver errors will still propagate.
        print(err)
def extractKeyword(self, sent, num=2):
    """Extract up to ``num`` keywords from ``sent``, dropping stopwords.

    :param sent: input sentence
    :param num: number of keywords requested from HanLP (default 2)
    :return: list of keywords not contained in ``self.stopwords``
    """
    # `word in self.stopwords` replaces the unidiomatic dunder call
    # `self.stopwords.__contains__(word)`; a comprehension replaces the
    # manual append loop.
    return [word for word in HanLP.extractKeyword(sent, num)
            if word not in self.stopwords]
def siglerow(text, keyword_num=1):
    """Pick a single keyword for one row of text.

    Extractors are tried in priority order:
      1. jieba textrank restricted to place/noun/verbal-noun POS tags
      2. HanLP keyword extraction
      3. first noun found by HanLP segmentation
      4. unrestricted jieba textrank
    Returns '' when every extractor comes up empty.
    """
    # drop any leading label up to (and including) the first full-width colon
    text = text[text.find(':') + 1:]
    # keep only CJK character runs, re-joined with commas
    text = ','.join(re.compile(r'[\u4e00-\u9fa5]+').findall(text))

    pos_keywords = analyse.textrank(text, keyword_num, allowPOS=('ns', 'n', 'vn', 'b'))
    if pos_keywords:
        return str(pos_keywords[0])

    hanlp_keywords = HanLP.extractKeyword(text, 1)
    if hanlp_keywords:
        return str(hanlp_keywords[0])

    # first noun-tagged term from HanLP segmentation, if any
    for term in HanLP.segment(text):
        if str(term.nature)[0] == 'n':
            return str(term.word)

    any_keywords = analyse.textrank(text, keyword_num)
    if any_keywords:
        return str(any_keywords[0])
    return ''
# Demo script for pyhanlp: word segmentation, keyword extraction,
# automatic summarization and dependency parsing.
from pyhanlp import *

print(HanLP.segment('你好,欢迎在Python中调用HanLP的API'))
for term in HanLP.segment('下雨天地面积水'):
    print('{}\t{}'.format(term.word, term.nature))  # word and its part-of-speech tag
testCases = [
    "商品和服务",
    "结婚的和尚未结婚的确实在干扰分词啊",
    "买水果然后来世博园最后去世博会",
    "中国的首都是北京",
    "欢迎新老师生前来就餐",
    "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
    "随着页游兴起到现在的页游繁盛,依赖于存档进行逻辑判断的设计减少了,但这块也不能完全忽略掉。"
]
for sentence in testCases:
    print(HanLP.segment(sentence))
# keyword extraction
print("# 关键词提取")
document = "水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露," \
           "根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标," \
           "有部分省超过红线的指标。对一些超过红线的地方,陈明忠表示,对一些取用水项目进行区域的限批," \
           "严格地进行水资源论证和取水许可的批准。"
print(HanLP.extractKeyword(document, 2))
# automatic summarization
print("# 自动摘要")
print(HanLP.extractSummary(document, 3))
# dependency parsing
print("# 依存句法分析")
print(HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。"))
doc = "句法分析是自然语言处理中的关键技术之一,其基本任务是确定句子的句法结构或者句子中词汇之间的依存关系。\
主要包括两方面的内容,一是确定语言的语法体系,即对语言中合法的句子的语法结构给与形式化的定义;另一方面是句法分析技术,即根据给定的语法体系,自动推导出句子的句法结构,分析句子所包含的句法单位和这些句法单位之间的关系。"
print("关键词")
print(HanLP.extractKeyword(doc, 2))
print("# 自动摘要")
print(HanLP.extractSummary(doc, 3))
def extract_keyword(item):
    """Return the top HanLP keyword for ``item``, or ``item`` itself when
    extraction yields nothing.

    :param item: text to extract a keyword from
    :return: the first keyword as ``str``, or the original text
    """
    words = HanLP.extractKeyword(item, 1)
    # Test emptiness on the list itself instead of the fragile
    # str(words) == '[]' representation comparison.
    if not words:
        return item
    return str(words[0])
def main():
    """Command-line entry point for the pyhanlp tool.

    Tasks: ``segment`` (word segmentation), ``parse`` (dependency
    parsing), ``keyword`` (keyword extraction), ``summary`` (automatic
    summarization), ``serve`` (HTTP server) and ``update`` (jar/data
    upgrade). Input for the text tasks is read line by line from stdin.

    Fixes: removed the unused ``JClass('...TextRankKeyword')`` /
    ``JClass('...TextRankSentence')`` lookups that were recreated on every
    stdin line; hoisted the duplicated ``ShowTermNature = False``
    assignment out of both branches.
    """
    if len(sys.argv) == 1:
        sys.argv.append('--help')
    arg_parser = argparse.ArgumentParser(
        description='HanLP: Han Language Processing v{}'.format(
            HANLP_JAR_VERSION))
    arg_parser.add_argument('-v', '--version', required=False,
                            action='store_true',
                            help='show installed versions of HanLP')
    task_parser = arg_parser.add_subparsers(dest="task",
                                            help='which task to perform?')
    segment_parser = task_parser.add_parser(name='segment',
                                            help='word segmentation')
    tag_parser = segment_parser.add_mutually_exclusive_group(required=False)
    tag_parser.add_argument('--tag', dest='tag', action='store_true',
                            help='show part-of-speech tags')
    tag_parser.add_argument('--no-tag', dest='tag', action='store_false',
                            help='don\'t show part-of-speech tags')
    segment_parser.set_defaults(tag=True)
    segment_parser.add_argument(
        '-a', '--algorithm', type=str, default='viterbi',
        help='algorithm of segmentation e.g. perceptron')
    parse_parser = task_parser.add_parser(name='parse',
                                          help='dependency parsing')
    parse_keyword = task_parser.add_parser(name='keyword',
                                           help='dependency Keyword')
    parse_summary = task_parser.add_parser(name='summary',
                                           help='dependency summary')
    server_parser = task_parser.add_parser(
        name='serve', help='start http server',
        description='A http server for HanLP')
    server_parser.add_argument('--port', type=int, default=8765)
    update_parser = task_parser.add_parser(name='update',
                                           help='update jar and data of HanLP')

    def add_args(p):
        # every text task accepts a --config pointing at hanlp.properties
        p.add_argument("--config", default=PATH_CONFIG,
                       help='path to hanlp.properties')

    add_args(segment_parser)
    add_args(parse_parser)
    add_args(parse_keyword)
    add_args(parse_summary)

    if '-v' in sys.argv or '--version' in sys.argv:
        print('jar {}: {}'.format(HANLP_JAR_VERSION, HANLP_JAR_PATH))
        data_version = hanlp_installed_data_version()
        print('data {}: {}'.format(data_version if data_version else '自定义',
                                   HANLP_DATA_PATH))
        print('config : {}'.format(
            os.path.join(STATIC_ROOT, 'hanlp.properties')))
        exit(0)

    args = arg_parser.parse_args()

    def eprint(*args, **kwargs):
        # print to stderr so task output on stdout stays clean
        print(*args, file=sys.stderr, **kwargs)

    def die(msg):
        eprint(msg)
        exit(1)

    if hasattr(args, 'config') and args.config:
        if os.path.isfile(args.config):
            JClass('com.hankcs.hanlp.utility.Predefine'
                   ).HANLP_PROPERTIES_PATH = args.config
        else:
            die('Can\'t find config file {}'.format(args.config))

    if args.task == 'segment':
        segmenter = None
        try:
            segmenter = HanLP.newSegment(args.algorithm)
        except JException as e:
            if isinstance(e, java.lang.IllegalArgumentException):
                die('invalid algorithm {}'.format(args.algorithm))
            elif isinstance(e, java.lang.RuntimeException):
                die('failed to load required model')
            else:
                die('unknown exception {}'.format(repr(e)))
        is_lexical_analyzer = hasattr(segmenter, 'analyze')
        if not args.tag:
            # lexical analyzers have their own switch; the global config
            # flag must be cleared in either case
            if is_lexical_analyzer:
                segmenter.enablePartOfSpeechTagging(False)
            JClass('com.hankcs.hanlp.HanLP$Config').ShowTermNature = False
        for line in sys.stdin:
            line = line.strip()
            print(' '.join(term.toString()
                           for term in segmenter.seg(any2utf8(line))))
    elif args.task == 'parse':
        for line in sys.stdin:
            line = line.strip()
            print(HanLP.parseDependency(any2utf8(line)))
    elif args.task == 'keyword':
        for line in sys.stdin:
            line = line.strip()
            keyword_list = HanLP.extractKeyword(line, 3)
            print(keyword_list)
    elif args.task == 'summary':
        for line in sys.stdin:
            line = line.strip()
            sentence_list = HanLP.extractSummary(line, 3)
            print(sentence_list)
    elif args.task == 'serve':
        if PY == 3:
            from pyhanlp import server
            server.run(port=args.port)
        else:
            die('现在server.py暂时不支持Python2,欢迎参与移植')
    elif args.task == 'update':
        if hanlp_installed_data_version() == '手动安装':
            die('手动配置不支持自动升级,若要恢复自动安装,请清除HANLP相关环境变量')
        else:
            from pyhanlp.static import update_hanlp
            update_hanlp()
# Demo of pyhanlp: segmentation with POS tags, keyword extraction,
# automatic summarization and dependency parsing.
from pyhanlp import HanLP

texts = "中国是一个文明古国,拥有56个民族,文化历史厚重。"
# word segmentation
word_cut = HanLP.segment(texts)
print("分词结果:\n", word_cut)
for term in word_cut:
    # each term carries the word and its part-of-speech tag
    print("单词:%s; 词性:%s " % (term.word, term.nature))
testCases = [
    "商品和服务",
    "结婚的和尚未结婚的确实在干扰分词啊",
    "买水果然后来世博园最后去世博会",
    "中国的首都是北京",
    "欢迎新老师生前来就餐",
    "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
    "随着页游兴起到现在的页游繁盛,依赖于存档进行逻辑判断的设计减少了,但这块也不能完全忽略掉。"
]
# for sentence in testCases:
#     print(HanLP.segment(sentence))
# keyword extraction
document = "水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露," \
           "根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标," \
           "有部分省超过红线的指标。对一些超过红线的地方,陈明忠表示,对一些取用水项目进行区域的限批," \
           "严格地进行水资源论证和取水许可的批准。"
print(HanLP.extractKeyword(document, 5))
# automatic summarization
print(HanLP.extractSummary(document, 3))
# dependency parsing
print(HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。"))
def hanlp_keyword(instr):
    """Extract the top 10 TextRank keywords from ``instr`` via HanLP.

    :param instr: input text
    :return: a list of up to 10 keywords
    """
    keywords = HanLP.extractKeyword(instr, 10)
    return keywords