def result_format():
    HanLP.Config.ShowTermNature = False
    seg = HanLP.newSegment()
    print(seg.seg(sentences[0]))

    HanLP.Config.ShowTermNature = True
    seg = HanLP.newSegment()
    term_list = seg.seg(sentences[0])
    print(term_list)
    print([str(i.word) for i in term_list])
    print([str(i.nature) for i in term_list])

def pos_filter(self, s):
    if not s:
        return []
    terms = HanLP.segment(s)
    wds = [w.word for w in terms]
    pos = [str(w.nature) for w in terms if w.nature]
    if len(''.join(wds)) < 2:
        return []
    if 'n' not in pos and 'nhd' not in pos:
        return []
    return ''.join(wds)

def extractSummary(document, size, sentence_separator=None):
    """
    Automatic summarization.

    @param document            target document
    @param size                number of key sentences to extract
    @param sentence_separator  regex used to split the document into sentences, e.g. [。??!!;;]
    @return                    list of key sentences
    """
    if sentence_separator:
        return HanLP.extractSummary(document, size, sentence_separator)
    else:
        return HanLP.extractSummary(document, size)

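# Usage sketch for extractSummary above (illustrative, not from the original source;
# assumes `from pyhanlp import HanLP` has already run and the sample text is made up):
def _demo_extract_summary():
    doc = '今天天气很好。我们一起去公园散步。公园里人很多。大家玩得很开心。'
    print(extractSummary(doc, 2))                   # two key sentences, default splitting
    print(extractSummary(doc, 2, '[。??!!;;]'))  # with an explicit sentence separator
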
def make_index():
    with open(ITEM_INDEX_JSON, 'w', encoding='utf8') as item_index_file, \
            open(ITEM_SOURCE_JSON, 'r', encoding='utf8') as item_file:
        item_js = json.load(item_file)
        all_info = item_js['RECORDS']
        for item in all_info:
            title = item['TITLE']
            ITEM_DICT[item['ENTERPRISE_ID']]['org_id'] = item['ORG_ID']
            if 'items' not in ITEM_DICT[item['ENTERPRISE_ID']]:
                ITEM_DICT[item['ENTERPRISE_ID']]['items'] = set()
            # TODO: segment and filter here.
            segs = HanLP.segment(title)
            for word in segs:
                _word = word.word
                nature = str(word.nature)
                if nature in ['vn', 'vi']:
                    ITEM_DICT[item['ENTERPRISE_ID']]['items'].add(_word)
                elif nature == 'v' and _word in V_SET:
                    ITEM_DICT[item['ENTERPRISE_ID']]['items'].add(_word)
                elif (nature in ['n', 'ng', 'nh', 'nhd', 'nl', 'nm', 'nz', 'nba']
                      and _word not in FIL_SET):
                    ITEM_DICT[item['ENTERPRISE_ID']]['items'].add(_word)
        for key in ITEM_DICT.keys():
            ITEM_DICT[key]['items'] = list(ITEM_DICT[key]['items'])
        js_info = json.dumps(ITEM_DICT)
        item_index_file.write(js_info)

    with open(TYPE_INDEX_JSON, 'w', encoding='utf8') as type_index_file, \
            open(TYPE_SOURCE_JSON, 'r', encoding='utf8') as type_file:
        type_js = json.load(type_file)
        all_info = type_js['RECORDS']
        for item in filter(lambda x: len(x['CODE']) == 9, all_info):
            TYPE_DICT[item['CODE']] = set()
            if item['SERVICETYPEVALUE']:
                value_words = HanLP.segment(item['SERVICETYPEVALUE'])
                for word in value_words:
                    TYPE_DICT[item['CODE']].add(word.word)
            if item['KEYWORD']:
                key_words = HanLP.segment(item['KEYWORD'])
                for word in key_words:
                    TYPE_DICT[item['CODE']].add(word.word)
        # convert set to list
        for k in TYPE_DICT.keys():
            TYPE_DICT[k] = list(TYPE_DICT[k])
        js_info = json.dumps(TYPE_DICT)
        type_index_file.write(js_info)

def getSummary(document, max_length, sentence_separator=None):
    """
    Automatic summarization.

    @param document            target document
    @param max_length          maximum length of the summary
    @param sentence_separator  regex used to split the document into sentences, e.g. [。??!!;;]
    @return                    summary text
    """
    if sentence_separator:
        return HanLP.getSummary(document, max_length, sentence_separator)
    else:
        return HanLP.getSummary(document, max_length)

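# Companion sketch for getSummary above (illustrative; same assumptions as the
# extractSummary demo): unlike extractSummary, this returns a single abridged string.
def _demo_get_summary():
    doc = '今天天气很好。我们一起去公园散步。公园里人很多。大家玩得很开心。'
    print(getSummary(doc, 20))  # summary capped at roughly 20 characters
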
def get_hanlp_entity_weight_dict(prep_article, entity_type,
                                 sentence_type='original', sentence_count=4):
    """
    Extract subjects/objects/predicates from an article's title and central
    sentences, then compute a weight for each extracted word of the given type.

    :param prep_article: a PreprocessArticle instance
    :param entity_type: which words to extract, one of 'sub'/'obj'/'predicate'
                        (subject, object, predicate respectively)
    :param sentence_type: how candidate sentences are ordered, one of 'original'/'score'
                          (original article order vs. sentence-score order)
    :param sentence_count: number of central sentences, default 4; if 0, only the title is used
    :return: word-to-weight dict for the chosen subject/object/predicate type
    """
    entities = []
    # Article title
    words = HanLP.parseDependency(prep_article.title).word
    if entity_type == 'sub':
        entities.append(get_hanlp_sub_entity(words))
    if entity_type == 'obj':
        entities.append(get_hanlp_obj_entity(words))
    if entity_type == 'predicate':
        entities.append(get_hanlp_predicate_entity(words))
    # Article sentences
    if sentence_count > 0:
        # First n sentences in original order
        if sentence_type == 'original':
            for i, sentence in enumerate(prep_article.sentences):
                if i < sentence_count:
                    words = HanLP.parseDependency(sentence.text).word
                    if entity_type == 'sub':
                        entities.append(get_hanlp_sub_entity(words))
                    if entity_type == 'obj':
                        entities.append(get_hanlp_obj_entity(words))
                    if entity_type == 'predicate':
                        entities.append(get_hanlp_predicate_entity(words))
        # First n sentences by descending score
        if sentence_type == 'score':
            for i, idx in enumerate(prep_article.descend_sentence_index):
                if i < sentence_count:
                    words = HanLP.parseDependency(
                        prep_article.sentences[idx].text).word
                    if entity_type == 'sub':
                        entities.append(get_hanlp_sub_entity(words))
                    if entity_type == 'obj':
                        entities.append(get_hanlp_obj_entity(words))
                    if entity_type == 'predicate':
                        entities.append(get_hanlp_predicate_entity(words))
    entity_weight_dict = calculate_weight(entities)
    return entity_weight_dict

def dependency_analysis(sent):
    result = HanLP.parseDependency(sent)
    ROOT, SUBJECT, PREDICATE = '核心关系', '主谓关系', '宾'
    res = dict()
    key = ['root', 'sub', 'pre', 'sub_adj', 'pre_adj', 'entity']
    for word in result.iterator():
        deprel = str(word.DEPREL)
        if deprel.find(ROOT) >= 0:
            res['root'] = word.LEMMA
        elif deprel.find(SUBJECT) >= 0:
            res['sub'] = word.LEMMA
        elif deprel.find(PREDICATE) >= 0:
            res['pre'] = word.LEMMA
    res['entity'] = []
    for word in result.iterator():
        if str(word.CPOSTAG).find('n') >= 0 and str(word.CPOSTAG).find('v') < 0:
            res['entity'].append(word.LEMMA)
            if res.get('sub') and str(word.HEAD.LEMMA) == str(res['sub']):
                res['sub_adj'] = res['sub_adj'] + [word.LEMMA] if res.get('sub_adj') else [word.LEMMA]
            else:
                res['pre_adj'] = res['pre_adj'] + [word.LEMMA] if res.get('pre_adj') else [word.LEMMA]
    for k in key:
        res[k] = res.get(k, '空')
        if isinstance(res[k], list):
            res[k] = '|'.join(res[k])
    print(res)
    return res

def split1list(self, sentence):
    # Strip leading/trailing whitespace from the line.
    line = sentence.strip()
    # Remove digits, whitespace and common Chinese/English punctuation.
    line1 = re.sub(
        "[0-9\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+",
        " ", line)
    # wordList = list(jieba.cut(line1))  # segment each line with jieba
    wordList = HanLP.segment(line1.strip())
    poslist = set()
    for w in wordList:
        length = len(w.word)
        nature = str(w.nature)
        if length < 2 and 'w' in nature:
            continue
        if w.word in self.stopwords:
            continue
        # if self.isFormWord(nature):
        #     continue
        # wordpos = w.word + ' ' + nature
        # self.wordposlist.append(wordpos)
        poslist.add(w.word)
    return poslist

def pinyin(self, sentence):
    pinyinlist = HanLP.convertToPinyinList(sentence)
    res = []
    for pinyin in pinyinlist:
        res.append(str(pinyin))
    return ''.join(res)

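# Standalone sketch of the same pinyin conversion (illustrative; assumes
# `from pyhanlp import HanLP`); `_demo_pinyin` is a hypothetical helper name.
def _demo_pinyin(text='重庆'):
    # str(p) renders each Pinyin object via its toString() representation.
    return ''.join(str(p) for p in HanLP.convertToPinyinList(text))
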
def get_keywords(query, par_dict, sim_dic):
    _words = HanLP.segment(query)
    temp = []
    added = []
    keywords = []
    visited = set()
    for word in _words:
        _word = word.word
        nature = str(word.nature)
        if _word in SAVED:
            temp.append(_word)
        elif nature in ['vn', 'vi']:
            temp.append(_word)
        elif nature == 'v' and _word in V_SET:
            temp.append(_word)
        elif (nature in ['n', 'ng', 'nh', 'nhd', 'nl', 'nm', 'nz', 'nba']
              and _word not in FIL_SET and len(_word) > 1):
            temp.append(_word)
    for item in temp:
        added.append((item, 1.5))
        if item in par_dict:
            added.append((par_dict[item], 1))
        if item in sim_dic[0]:
            for sim in sim_dic[1][sim_dic[0][item]]:
                added.append((sim, 1))
    for item in added:
        # Deduplicate by the word itself, not by the (word, weight) tuple.
        if item[0] not in visited:
            keywords.append(item)
            visited.add(item[0])
    return keywords

def get_abstract_sentence(sentence, vocabulary):
    """
    Sentence abstraction: replace concrete entities with placeholder tags.
        movie title                            -> nm
        actor name                             -> nnt
        movie genre                            -> ng
        actor name right after another actor   -> nnr
        rating                                 -> x
    """
    abstract_sentence = []
    query_dict = {}
    second = False
    for segment in HanLP.segment(sentence):
        word = str(segment.word)
        nature = str(segment.nature)
        if nature == "nm":
            query_dict["nm"] = word
            word = "nm"
        elif nature == "nnt" and not second:
            query_dict["nnt"] = word
            word = "nnt"
            second = True
        elif nature == "ng":
            query_dict["ng"] = word
            word = "ng"
        elif nature == "m":
            query_dict["x"] = word
            word = "x"
        elif nature == "nnt" and second:
            query_dict["nnr"] = word
            word = "nnr"
            second = False
        if word in vocabulary:
            abstract_sentence.append(word)
    return abstract_sentence, query_dict

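# Usage sketch for get_abstract_sentence above (the vocabulary and sentence are
# made-up examples; the actual tags depend on the custom dictionary loaded into HanLP):
def _demo_abstract_sentence():
    vocabulary = {'nm', 'nnt', '谁', '演', '的'}
    template, slots = get_abstract_sentence('英雄本色是谁演的', vocabulary)
    print(template)  # abstracted tokens that appear in the vocabulary, e.g. ['nm', '谁', '演', '的']
    print(slots)     # mapping from tag to original word, e.g. {'nm': '英雄本色'}
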
def __iter__(self):
    """Make each sentence a new line."""
    normed_sent = preprocess(self.strings)
    for sent in split_iter(normed_sent, self.eos_placement):
        sent = ''.join(sent)
        if sent:
            yield [term.word for term in HanLP.segment(sent)]

def ws(filename, convert2zh=False):
    if not os.path.exists(REPO_DIR):
        os.makedirs(REPO_DIR)
    file = os.path.join(REPO_DIR, filename)
    fw = codecs.open(file + '.seg.sc', 'w', encoding='utf-8')
    regex = re.compile(r'[\u4e00-\u9fffa-zA-Z0-9]+')
    with codecs.open(file, 'r', encoding='utf-8') as fr:
        for line in fr:
            line = line.split('\t', 1)[1].strip().replace('“', '').replace('”', '')
            line = clean(line)
            _list = regex.findall(line.strip())
            seq = ''
            for span in _list:
                result = analyzer.analyze(span)
                for terms in result.toSimpleWordList():
                    field = terms.toString().split('/')
                    word = field[0] if not convert2zh else HanLP.convertToTraditionalChinese(field[0])
                    pos = field[1]
                    seq += word.lower() + '_' + pos + ' '
                seq += ',_, '
            fw.write(seq.rsplit('_', 1)[0][:-1] + '。_。\n')
    fw.close()

def dependency_parse(self, sent, standard_name=False, stopwords=None):
    """
    Dependency parsing. Calls pyhanlp's parser and folds in harvesttext's own
    entity-recognition mechanism. High accuracy is not guaranteed.

    :param sent:
    :param standard_name:
    :param stopwords:
    :return: arcs: dependency arcs as a list of lists,
             [[word id, word surface form or entity name (controlled by standard_name),
               POS tag, dependency relation, head word id] for each word]
    """
    from pyhanlp import HanLP, JClass
    if not self.hanlp_prepared:
        self.hanlp_prepare()
    self.standard_name = standard_name
    entities_info = self.entity_linking(sent)
    sent2 = self.decoref(sent, entities_info)
    # [word.ID-1, word.LEMMA, word.POSTAG, word.DEPREL, word.HEAD.ID-1]
    arcs = []
    i = 0
    sentence = HanLP.parseDependency(sent2)
    for word in sentence.iterator():
        word0, tag0 = word.LEMMA, word.POSTAG
        if stopwords and word0 in stopwords:
            continue
        if word0 in self.entity_types:
            if self.standard_name:
                word0 = entities_info[i][1][0]  # use the linked entity
            else:
                l, r = entities_info[i][0]      # or use the original text
                word0 = sent[l:r]
            tag0 = entities_info[i][1][1][1:-1]
            i += 1
        arcs.append([word.ID - 1, word0, tag0, word.DEPREL, word.HEAD.ID - 1])
    return arcs

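# Usage sketch (assumes the method above belongs to harvesttext's HarvestText class
# and that both harvesttext and pyhanlp are installed):
def _demo_dependency_parse():
    from harvesttext import HarvestText
    ht = HarvestText()
    for arc in ht.dependency_parse('我爱自然语言处理'):
        print(arc)  # [word id, token, POS tag, dependency relation, head word id]
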
def input_pipeline(sentence, lang, bpe=None):
    """
    1. word segmentation (zh)
    2. lowercasing (en)
    3. tokenization
    4. BPE
    """
    if lang == 'zh':
        seg = [term.word for term in HanLP.segment(sentence)]
        seg_str = ' '.join(seg)
        # print('after segmentation:', seg)
        mt = MosesTokenizer(lang='zh')
        tokenized_str = mt.tokenize(seg_str, return_str=True)
        # print('after tokenization:', tokenized_str)
        if bpe is not None:
            bpe_str = bpe.apply([tokenized_str])[0]
            # print('after BPE:', bpe_str)
            return bpe_str.split()
        return tokenized_str.split()
    elif lang == 'en':
        lower = sentence.lower()
        # print('after lowercasing:', lower)
        mt = MosesTokenizer(lang='en')
        tokenized_str = mt.tokenize(lower, return_str=True)
        # print('after tokenization:', tokenized_str)
        if bpe is not None:
            bpe_str = bpe.apply([tokenized_str])[0]
            # print('after BPE:', bpe_str)
            return bpe_str.split()
        return tokenized_str.split()
    else:
        raise ValueError("lang must be 'zh' or 'en'")

def show_words():
    sql = 'SELECT * FROM NEWSWB'
    lock.acquire()
    cursor.execute(sql)
    lock.release()
    news = cursor.fetchone()
    print(news[5], '>>>>>>>', HanLP.extractKeyword(news[5], 5))

def get_sentence_mapping(self, overload=False):
    """
    Sentence-to-vector mapping table.

    :return: vec_space, e.g.
             {'我们是中国人,我们爱自己的祖国': [......], '蜀道难,难于上青天': [......]}
    """
    sentence_to_vec_file = current_path + '/sentence_mapping.pkl'
    if not os.path.isfile(sentence_to_vec_file) or overload:
        print('首次加载句子时间较长,请稍等......')
        sentence_to_vec = {}
        for sentence in self.sentence_list:
            tmp = np.zeros(shape=self.dim)
            index = 0
            for obj in HanLP.segment(sentence):
                word = obj.word
                if word in self.char_mapping:
                    tmp += self.char_mapping[word]
                else:
                    tmp += np.zeros(shape=self.dim)
                index += 1
            if index:  # guard against empty segmentation results
                tmp /= index
            sentence_to_vec[sentence] = tmp
        with open(sentence_to_vec_file, 'wb') as f:
            pickle.dump(sentence_to_vec, f)
    else:
        with open(sentence_to_vec_file, 'rb') as f:
            sentence_to_vec = pickle.load(f)
    return sentence_to_vec

def add_to_dictionary(word, part, mod=0):
    result = CustomDictionary.add(word, part)
    if not result and mod:
        CustomDictionary.insert(word, part)
    text = "我用天猫交社保"
    print(HanLP.segment(text))
    return result

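# Usage sketch for add_to_dictionary above (the word/tag pair is made up; assumes
# `from pyhanlp import HanLP, CustomDictionary`): CustomDictionary.add refuses to
# overwrite an existing entry, so mod=1 falls back to insert, which does overwrite.
def _demo_add_to_dictionary():
    add_to_dictionary('攻城狮', 'nz 1024', mod=1)
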
def raw_seg():
    """
    newSegment() supports the following modes; the default is viterbi.

    viterbi    : best balance of speed and quality; shortest-path segmentation,
                 solved with the Viterbi algorithm
    dat        : double-array trie dictionary segmentation, tens of millions of
                 characters per second (POS tags may be unavailable, depending on the dictionary)
    crf        : conditional random field; high accuracy for segmentation, POS tagging
                 and NER, suited to demanding NLP tasks
    perceptron : segmentation, POS tagging and NER, with online learning support
    nshort     : N-shortest path; slightly better NER at the cost of speed
    """
    seg = HanLP.newSegment()
    for st in sentences:
        print(seg.seg(st))

    seg_crf = HanLP.newSegment("crf")
    for st in sentences:
        print(seg_crf.seg(st))

def get_keyword(content, keynum=2):
    """
    Get the keywords of a question; the number of keywords is controlled by keynum.

    :param content: a sentence
    :return: list of keywords
    """
    keywordList = HanLP.extractKeyword(content, keynum)
    return keywordList

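# Usage sketch for get_keyword above (illustrative sentence; assumes
# `from pyhanlp import HanLP`):
def _demo_get_keyword():
    print(get_keyword('如何办理营业执照变更手续', keynum=2))
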
def segment(self, text):
    term_list = HanLP.segment(text)
    word_list = []
    for term in term_list:
        # Read the word and POS tag from the term object instead of splitting its
        # string form on '/', which breaks when the word itself contains '/'.
        word, tag = term.word, str(term.nature)
        if tag == 'n':
            word_list.append(word)
    return word_list

def convertToSimplifiedChinese(traditionalChineseString):
    """
    Traditional-to-Simplified Chinese conversion.

    @param traditionalChineseString  Traditional Chinese text
    @return Simplified Chinese text
    """
    return HanLP.convertToSimplifiedChinese(traditionalChineseString)

def test_custom_dict_forcing(self):
    segment = HanLP.newSegment('viterbi')
    CustomDictionary.insert('川普', 'nr 1')
    self.assertIn('四川/ns, 普通人/n, 与/cc, 川/b, 普通/a, 电话/n',
                  segment.seg('四川普通人与川普通电话').__str__())
    segment.enableCustomDictionaryForcing(True)
    self.assertIn('四川/ns, 普通人/n, 与/cc, 川普/nr, 通电话/vi',
                  segment.seg('四川普通人与川普通电话').__str__())

def load_data(self, file):
    result = []
    with open(file, mode='r', encoding="utf-8") as fp:
        lines = fp.readlines()
        for line in lines:
            words = HanLP.segment(str(line).strip())
            result.append(" ".join([str(i.word) for i in words]))
    return result

def hanlp_recognize(text):
    # segment = HanLP.newSegment().enableNameRecognize(True)
    # segment = HanLP.newSegment().enableTranslatedNameRecognize(True)
    # segment = HanLP.newSegment().enablePlaceRecognize(True)
    segment = HanLP.newSegment().enableOrganizationRecognize(True)
    term_list = segment.seg(text)
    print(term_list)

def segment(text):
    """Segment a Chinese sentence with HanLP."""
    try:
        seg_result = HanLP.segment(text)
        return [term.word for term in seg_result]
    except Exception:
        return text.split()

def parseDependency(sentence):
    """
    Dependency parsing.

    @param sentence  the sentence to analyse
    @return dependency tree in CoNLL format
    """
    return HanLP.parseDependency(sentence)

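# Usage sketch for the parseDependency wrapper above (illustrative; assumes
# `from pyhanlp import HanLP`): the returned CoNLL sentence prints in CoNLL format
# and can also be walked word by word.
def _demo_parse_dependency():
    tree = parseDependency('徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。')
    for word in tree.iterator():
        print(word.LEMMA, word.DEPREL, word.HEAD.LEMMA)
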
def convertToPinyinList(text):
    """
    Convert text to pinyin.

    @param text  the text to convert
    @return a list of pinyin
    """
    return HanLP.convertToPinyinList(text)

def convertToTraditionalChinese(simplifiedChineseString):
    """
    Simplified-to-Traditional Chinese conversion.

    @param simplifiedChineseString  Simplified Chinese text
    @return Traditional Chinese text
    """
    return HanLP.convertToTraditionalChinese(simplifiedChineseString)

def extractKeyword(document, size):
    """
    Extract keywords.

    @param document  document content
    @param size      number of keywords to extract
    @return a list of keywords
    """
    return HanLP.extractKeyword(document, size)

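# Combined usage sketch for the thin wrappers in this section (illustrative; assumes
# `from pyhanlp import HanLP` and that the wrappers above are in scope):
def _demo_wrappers():
    print(extractKeyword('程序员是从事程序开发、维护的专业人员。', 3))
    print(convertToTraditionalChinese('用笔记本电脑写程序'))
    print(convertToSimplifiedChinese('「以後等妳當上皇后,就能買士多啤梨慶祝了」'))
    print(convertToPinyinList('重庆'))
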