def split_test(self, sentence):
    # Basic segmentation demos.
    print(HanLP.segment('你好,欢迎在Python中调用HanLP的API'))
    for term in HanLP.segment('下雨天地面积水'):
        print('{}\t{}'.format(term.word, term.nature))  # word and its part-of-speech tag
    testCases = [
        "商品和服务",
        "结婚的和尚未结婚的确实在干扰分词啊",
        "买水果然后来世博园最后去世博会",
        "中国的首都是北京",
        "欢迎新老师生前来就餐",
        "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
        "随着页游兴起到现在的页游繁盛,依赖于存档进行逻辑判断的设计减少了,但这块也不能完全忽略掉。"
    ]
    for sentence in testCases:
        print(HanLP.segment(sentence))
    # Keyword extraction
    document = "水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露," \
               "根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标," \
               "有部分省超过红线的指标。对一些超过红线的地方,陈明忠表示,对一些取用水项目进行区域的限批," \
               "严格地进行水资源论证和取水许可的批准。"
    print(HanLP.extractKeyword(document, 2))
    # Automatic summarization
    print(HanLP.extractSummary(document, 3))
    # Dependency parsing
    print(HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。"))
def pos_filter(self, s):
    """Keep a string only if it is at least two characters long and contains a noun tag."""
    if not s:
        return []
    terms = HanLP.segment(s)
    wds = [w.word for w in terms]
    pos = [str(w.nature) for w in terms if w.nature]
    if len(''.join(wds)) < 2:
        return []
    if 'n' not in pos and 'nhd' not in pos:
        return []
    return ''.join(wds)
def make_index():
    with open(ITEM_INDEX_JSON, 'w', encoding='utf8') as item_index_file, \
            open(ITEM_SOURCE_JSON, 'r', encoding='utf8') as item_file:
        item_js = json.load(item_file)
        all_info = item_js['RECORDS']
        for item in all_info:
            title = item['TITLE']
            ITEM_DICT[item['ENTERPRISE_ID']]['org_id'] = item['ORG_ID']
            if 'items' not in ITEM_DICT[item['ENTERPRISE_ID']]:
                ITEM_DICT[item['ENTERPRISE_ID']]['items'] = set()
            # TODO: segment and filter here.
            segs = HanLP.segment(title)
            for word in segs:
                _word = word.word
                nature = str(word.nature)
                if nature in ['vn', 'vi']:
                    ITEM_DICT[item['ENTERPRISE_ID']]['items'].add(_word)
                elif nature == 'v' and _word in V_SET:
                    ITEM_DICT[item['ENTERPRISE_ID']]['items'].add(_word)
                elif nature in [
                        'n', 'ng', 'nh', 'nhd', 'nl', 'nm', 'nz', 'nba'
                ] and _word not in FIL_SET:
                    ITEM_DICT[item['ENTERPRISE_ID']]['items'].add(_word)
        for key in ITEM_DICT.keys():
            ITEM_DICT[key]['items'] = list(ITEM_DICT[key]['items'])
        js_info = json.dumps(ITEM_DICT)
        item_index_file.write(js_info)

    with open(TYPE_INDEX_JSON, 'w', encoding='utf8') as type_index_file, \
            open(TYPE_SOURCE_JSON, 'r', encoding='utf8') as type_file:
        type_js = json.load(type_file)
        all_info = type_js['RECORDS']
        for item in filter(lambda x: len(x['CODE']) == 9, all_info):
            TYPE_DICT[item['CODE']] = set()
            if item['SERVICETYPEVALUE']:
                value_words = HanLP.segment(item['SERVICETYPEVALUE'])
                for word in value_words:
                    TYPE_DICT[item['CODE']].add(word.word)
            if item['KEYWORD']:
                key_words = HanLP.segment(item['KEYWORD'])
                for word in key_words:
                    TYPE_DICT[item['CODE']].add(word.word)
        # convert set to list
        for k in TYPE_DICT.keys():
            TYPE_DICT[k] = list(TYPE_DICT[k])
        js_info = json.dumps(TYPE_DICT)
        type_index_file.write(js_info)
def input_pipeline(sentence, lang, bpe=None):
    """
    1. word segmentation (zh)
    2. lowercasing (en)
    3. tokenization
    4. BPE
    """
    if lang == 'zh':
        seg = [term.word for term in HanLP.segment(sentence)]
        seg_str = ' '.join(seg)
        mt = MosesTokenizer(lang='zh')
        tokenized_str = mt.tokenize(seg_str, return_str=True)
        if bpe is not None:
            bpe_str = bpe.apply([tokenized_str])[0]
            return bpe_str.split()
        return tokenized_str.split()
    elif lang == 'en':
        lower = sentence.lower()
        mt = MosesTokenizer(lang='en')
        tokenized_str = mt.tokenize(lower, return_str=True)
        if bpe is not None:
            bpe_str = bpe.apply([tokenized_str])[0]
            return bpe_str.split()
        return tokenized_str.split()
    else:
        raise ValueError('unsupported language: {}'.format(lang))
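# Illustrative usage sketch for input_pipeline (not part of the original file). It assumes
# pyhanlp and sacremoses are installed; the optional `bpe` argument is expected to expose a
# fastBPE-style .apply(list_of_str) method, as the function above implies.
def _demo_input_pipeline():
    print(input_pipeline('欢迎在Python中调用HanLP的API', lang='zh'))
    print(input_pipeline('Hello, welcome to HanLP!', lang='en'))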
def split1list(self, sentence):
    # Strip possible leading/trailing whitespace from the line.
    line = sentence.strip()
    # Remove digits, whitespace and ASCII/full-width punctuation.
    line1 = re.sub(
        r"[0-9\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+",
        " ", line)
    wordList = HanLP.segment(line1.strip())
    poslist = set()
    for w in wordList:
        length = len(w.word)
        nature = str(w.nature)
        if length < 2 and 'w' in nature:  # skip single-character punctuation tokens
            continue
        if w.word in self.stopwords:
            continue
        poslist.add(w.word)
    return poslist
def add_to_dictionary(word, part, mod=0):
    result = CustomDictionary.add(word, part)
    if not result and mod:
        CustomDictionary.insert(word, part)
    text = "我用天猫交社保"
    print(HanLP.segment(text))
    return result
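# Illustrative sketch (not from the original file): forcing a term into HanLP's user
# dictionary before segmenting; "nz 1024" follows CustomDictionary's "<pos> <frequency>" format.
def _demo_add_to_dictionary():
    added = add_to_dictionary('天猫', 'nz 1024', mod=1)
    print('newly added:', added)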
def get_sentence_mapping(self, overload=False):
    """
    Sentence-to-vector mapping table.
    :return: vec_space, e.g. {'我们是中国人,我们爱自己的祖国': [......], '蜀道难,难于上青天': [......]}
    """
    sentence_to_vec_file = current_path + '/sentence_mapping.pkl'
    if not os.path.isfile(sentence_to_vec_file) or overload:
        print('Building the sentence mapping for the first time may take a while, please wait...')
        sentence_to_vec = {}
        for sentence in self.sentence_list:
            tmp = np.zeros(shape=self.dim)
            index = 0
            for obj in HanLP.segment(sentence):
                word = obj.word
                if word in self.char_mapping:
                    tmp += self.char_mapping[word]
                index += 1
            if index > 0:  # avoid division by zero on empty segmentations
                tmp /= index
            sentence_to_vec[sentence] = tmp
        with open(sentence_to_vec_file, 'wb') as f:
            pickle.dump(sentence_to_vec, f)
    else:
        with open(sentence_to_vec_file, 'rb') as f:
            sentence_to_vec = pickle.load(f)
    return sentence_to_vec
def get_keywords(query, par_dict, sim_dic):
    _words = HanLP.segment(query)
    temp = []
    added = []
    keywords = []
    visited = set()
    for word in _words:
        _word = word.word
        nature = str(word.nature)
        if _word in SAVED:
            temp.append(_word)
        elif nature in ['vn', 'vi']:
            temp.append(_word)
        elif nature == 'v' and _word in V_SET:
            temp.append(_word)
        elif nature in ['n', 'ng', 'nh', 'nhd', 'nl', 'nm', 'nz', 'nba'
                        ] and _word not in FIL_SET and len(_word) > 1:
            temp.append(_word)
    for item in temp:
        added.append((item, 1.5))
        if item in par_dict:
            added.append((par_dict[item], 1))
        if item in sim_dic[0]:
            for sim in sim_dic[1][sim_dic[0][item]]:
                added.append((sim, 1))
    for item in added:
        if item[0] not in visited:
            keywords.append(item)
            visited.add(item[0])  # dedupe on the word itself, not the (word, weight) tuple
    return keywords
def get_abstract_sentence(sentence, vocabulary):
    '''
    Abstract a sentence by replacing domain words with their tags:
        movie title                               -> nm
        actor name                                -> nnt
        movie genre                               -> ng
        actor name following another actor name  -> nnr
        rating (number)                           -> x
    '''
    abstract_sentence = []
    query_dict = {}
    second = False
    for segment in HanLP.segment(sentence):
        word = str(segment.word)
        nature = str(segment.nature)
        if nature == "nm":
            query_dict["nm"] = word
            word = "nm"
        elif nature == "nnt" and not second:
            query_dict["nnt"] = word
            word = "nnt"
            second = True
        elif nature == "ng":
            query_dict["ng"] = word
            word = "ng"
        elif nature == "m":
            query_dict["x"] = word
            word = "x"
        elif nature == "nnt" and second:
            query_dict["nnr"] = word
            word = "nnr"
            second = False
        if word in vocabulary:
            abstract_sentence.append(word)
    return abstract_sentence, query_dict
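# Illustrative sketch (not from the original project): calling get_abstract_sentence with a
# hypothetical vocabulary. The nm/nnt/ng/nnr tags only appear if a movie-domain user
# dictionary has been loaded into HanLP; with the default dictionaries this simply returns
# the in-vocabulary words unchanged.
def _demo_get_abstract_sentence():
    demo_vocabulary = {'nm', 'nnt', 'ng', 'x', '的', '评分', '是', '多少'}
    abstract, slots = get_abstract_sentence('英雄本色的评分是多少', demo_vocabulary)
    print(abstract)  # tokens with domain words replaced by their abstract tags
    print(slots)     # e.g. {'nm': '英雄本色'} when the title is tagged as nm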
def __iter__(self):
    """Yield each sentence as a list of words, one sentence per line."""
    normed_sent = preprocess(self.strings)
    for sent in split_iter(normed_sent, self.eos_placement):
        sent = ''.join(sent)
        if sent:
            yield [term.word for term in HanLP.segment(sent)]
def segment(self, text):
    word_tag_list = HanLP.segment(text)
    word_list = []
    for word_tag in word_tag_list:
        word, tag = str(word_tag).split('/')
        if tag == 'n':
            word_list.append(word)
    return word_list
def load_data(self, file):
    result = []
    with open(file, mode='r', encoding="utf-8") as fp:
        lines = fp.readlines()
        for line in lines:
            words = HanLP.segment(str(line).strip())
            result.append(" ".join([str(i.word) for i in words]))
    return result
def wordSeg(text):
    wordPostag = HanLP.segment(text)
    words, postags = [], []
    for line in wordPostag:
        line = str(line)
        word, postag = line.split('/')
        words.append(word)
        postags.append(postag)
    return words, postags
def segment(text):
    '''
    Segment a Chinese sentence with HanLP, falling back to whitespace splitting on failure.
    '''
    try:
        seg_result = hanlp.segment(text)
        return [term.word for term in seg_result]
    except Exception:
        return text.split()
def remove(test_text):
    a = HanLP.segment(test_text)
    rem = dict()
    curs = 0
    for i in a:
        if str(i.nature) in ['ns', 'nz']:
            rem[str(i.word)] = [(curs, curs + len(str(i.word)) - 1)]
        curs += len(str(i.word))  # advance the character offset for every token
    return rem
def word_segment(self, sentence):
    word_tag_list = HanLP.segment(sentence)
    words = []
    for word_tag in word_tag_list:
        word_tag = str(word_tag).split('/')
        if len(word_tag) == 2:
            word, tag = word_tag
            if 'n' == tag and word not in self.stop_words:
                words.append(word)
    return set(words)
def generate_feature(data, word2vec):
    """
    Build feature vectors.

    Args:
        data: the dataset (a pandas DataFrame)
        word2vec: word-to-vector lookup
    Returns:
        a DataFrame of concatenated prefix/title/tag features
    """
    features = []
    end = len(data.columns)
    for idx, row in data.iterrows():
        prefix_vec = np.zeros(DIM)
        title_vec = np.zeros(DIM)
        tag_vec = row.iloc[3: end]
        count = 0
        try:
            for word in HanLP.segment(row['prefix']):
                word = str(word).split('/')[0]
                try:
                    prefix_vec += word2vec[word]
                    count += 1
                except KeyError:
                    print('word %s not in vocab' % word)
            if count > 0:
                prefix_vec = np.true_divide(prefix_vec, count)
            count = 0
            for word in HanLP.segment(row['title']):
                word = str(word).split('/')[0]
                try:
                    title_vec += word2vec[word]
                    count += 1
                except KeyError:
                    print('word %s not in vocab' % word)
            if count > 0:
                title_vec = np.true_divide(title_vec, count)
        except Exception as e:
            print(e)
        feature = np.concatenate((prefix_vec, title_vec, tag_vec))
        features.append(feature)
    return pd.DataFrame(features)
def get_job_address(self, source):
    soup = BeautifulSoup(
        source.text.encode('iso-8859-1').decode('gbk'), 'lxml')
    address = soup.find('p', {
        'class': 'msg ltype'
    }).text.split('\xa0\xa0|\xa0\xa0')[0]
    if 'ns' in HanLP.segment(address).toString():
        return address
    else:
        return ''
def delete_stop_words(item, stop_words):
    result_word = ""
    words = HanLP.segment(item)
    for word in words:
        word = str(word).split('/')[0]
        if word in stop_words:
            continue
        result_word += word
    return result_word
def __iter__(self):
    for sentence in get_sentence(self.fname):
        seg_list = [str(word).split('/')[0] for word in HanLP.segment(sentence)]
        # seg_list = char_list_cheaner(seg_list)
        if seg_list:
            yield seg_list
def get_sentence_vector(self, sentence):
    words = [item.word for item in HanLP.segment(sentence)]
    cnt = 0
    vec_fin = np.zeros(self.wv.vector_size)
    for w in words:
        if w in self.wv:
            vec_fin += self.get_word_vector(w)
            cnt += 1
    if cnt > 0:
        vec_fin = vec_fin / cnt
    return vec_fin
def replace_samePinyin(content, same_pinyin, word_freq_vocab, replace_num=1):
    """
    Replace a key character in `content` with a homophone (the rule is to replace the
    character whose homophones have the highest frequency).
    :param content: the text to modify
    :param same_pinyin: vocabulary of characters that share the same pinyin
    :param word_freq_vocab: character-frequency table for Chinese
    :param replace_num: number of replacements; this version only replaces one character per content
    :return: the text with one character replaced by its homophone
    """
    segmentationList = HanLP.segment(content)
    word_list_of_content = list(content)
    if len(set(segmentationList)) <= 2:
        keynum = 1
    elif len(segmentationList) > 2 and len(set(segmentationList)) <= 6:
        keynum = 2
    else:
        keynum = 4
    keywordList = get_keyword(content, keynum)  # extract keywords
    key_character = []
    for word in keywordList:  # collect the individual characters of the keywords
        key_character += list(word)
    key_character = list(set(key_character))  # drop duplicate characters
    # keep only characters that appear in the same_pinyin vocabulary
    key_character = [word for word in key_character if word in same_pinyin]
    word_freq = []
    for i in key_character:  # for each key character, pick its most frequent homophone
        samePinyin_list = same_pinyin[i]  # all characters with the same pinyin
        samePinyin_freq = []
        for j in samePinyin_list:
            if j in word_freq_vocab:
                samePinyin_freq.append(word_freq_vocab[j])
            else:
                samePinyin_freq.append(1)
        word_freq.append(samePinyin_list[samePinyin_freq.index(max(samePinyin_freq))])
    freq = []
    if len(word_freq) != 0:
        for i in word_freq:
            if i in word_freq_vocab:
                freq.append(word_freq_vocab[i])
            else:
                freq.append(1)
        same_pinyin_HighFreq_word = word_freq[freq.index(max(freq))]
        replace_word = key_character[freq.index(max(freq))]
        replace_index = word_list_of_content.index(replace_word)
        word_list_of_content[replace_index] = same_pinyin_HighFreq_word
        new_content = "".join(word_list_of_content)
        return new_content
    else:
        return content
def split(self, sentence):
    # Strip possible leading/trailing whitespace from the line.
    line = sentence.strip()
    # Remove digits, whitespace and ASCII/full-width punctuation.
    line1 = re.sub(
        r"[0-9\s+\.\!\/_,$%^*()?;;::“”-【】+\"\']+|[+——!,;::“”。?、~@#¥%……&*()]+",
        " ", line)
    wordList = HanLP.segment(line1)
    # self.process(wordList)
    return self.wordnetlist
def hanlp_cut(self):
    lines = []
    df = pd.read_excel(self.excel_path)
    sentences = df['sentences']
    for sentence in sentences:
        if pd.notna(sentence):  # skip empty cells
            cuts = HanLP.segment(sentence)
            lines.append(' '.join(cut.word for cut in cuts))
    self.save_to_excel(lines)
def extract_locations(text):
    """
    extract locations from texts
    eg: extract_locations('我家住在陕西省安康市汉滨区。')
    :param: raw_text<string>
    :return: location_list<list> eg: ['陕西省安康市汉滨区', '安康市汉滨区', '汉滨区']
    """
    if text == '':
        return []
    seg_list = [(str(t.word), str(t.nature)) for t in HanLP.segment(text)]
    location_list = get_location(seg_list)
    return location_list
def prefix_cut_in_title(item):
    """
    Segment the prefix and the title, and check whether every prefix word appears in the title.
    """
    prefix = item["prefix"]
    title = item["title"]
    til_list = []
    words_til = HanLP.segment(title)
    for word_til in words_til:
        word_til = str(word_til).split('/')[0]
        til_list.append(word_til)
    words_pre = HanLP.segment(prefix)
    for word_pre in words_pre:
        word_pre = str(word_pre).split('/')[0]
        if word_pre not in til_list:
            return 0
    return 1
def tokenize(self, text):
    # type: (Text) -> List[Token]
    from pyhanlp import HanLP

    terms = HanLP.segment(text)
    running_offset = 0
    tokens = []
    for term in terms:
        word_offset = text.index(term.word, running_offset)
        word_len = len(term.word)
        running_offset = word_offset + word_len
        tokens.append(Token(term.word, word_offset))
    logging.debug(terms)
    return tokens
def title_cut_word(item, stop_words):
    click = []
    words = HanLP.segment(item)
    for word in words:
        word = str(word).split('/')[0]
        if (word in stop_words) or (word == '') or (word not in title_count_dict):
            continue
        click.append(word)
    if len(click) == 0:
        return 0.0
    return str(click)
def title_cut_maxclick(item, stop_words):
    click = []
    words = HanLP.segment(item)
    for word in words:
        word = str(word).split('/')[0]
        if (word in stop_words) or (word == '') or (word not in title_count_dict):
            continue
        click.append(title_click_dict[word])
    if len(click) == 0:
        return 0.0
    return np.max(click)
def seg_with_han176(in_file, out_file_path, manual_seg_file):
    # save the segmentation results
    corpus = construct_corpus(in_file)
    with open(out_file_path, "w", encoding='utf-8') as f:
        for line in corpus:
            result_h176 = "=".join("%s" % t.word for t in HanLP.segment(line))  # each line is one sentence
            f.write(result_h176 + "\n")
            f.flush()

    # test QPS
    corpus = construct_corpus(in_file, 500)
    start = time.time()
    for line in corpus:
        _ = HanLP.segment(line)
    end = time.time()
    qps = round(len(corpus) / (end - start), 2)

    # test accuracy
    p, r, f1, line_aver_length = evaluate(out_file_path, manual_seg_file)
    return qps, p, r, f1, line_aver_length