def wrapper(self, *args, **kwargs):
    # the appkey is retrieved by index
    if self.appkey_obj_list is not None:
        count = 0
        while count <= self.appkey_num:
            self.appkey_obj = self.appkey_obj_list[self.appkey_index]
            count += 1
            try:
                f = func(self, *args, **kwargs)
                break
            except Exception as err:
                # rotate the index to the next key
                if self.appkey_index == self.appkey_num - 1:
                    self.appkey_index = 0
                else:
                    self.appkey_index += 1

                # if the number of attempts reaches the number of keys,
                # every key has been tried, so give up;
                # otherwise warn and try the next key
                if count < self.appkey_num:
                    logging.warning(
                        'The appkey {} of `{}` is invalid.'.format(
                            json.dumps(self.appkey_obj, ensure_ascii=False),
                            self.__class__.__name__))
                else:
                    logging.error(err)
                    raise Exception(err)
    else:
        f = func(self, *args, **kwargs)

    return f
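# For context: `wrapper` above is the inner function of a key-rotation
# decorator. Below is a minimal, condensed sketch of how it might be wired
# up; `retry_with_next_appkey`, `TranslationApi` and `fetch` are assumed
# names for illustration, not taken from the source.
import functools
import json
import logging

def retry_with_next_appkey(func):
    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        if self.appkey_obj_list is None:
            return func(self, *args, **kwargs)
        for count in range(self.appkey_num):
            self.appkey_obj = self.appkey_obj_list[self.appkey_index]
            try:
                return func(self, *args, **kwargs)
            except Exception:
                # rotate to the next key; re-raise once every key has failed
                self.appkey_index = (self.appkey_index + 1) % self.appkey_num
                if count == self.appkey_num - 1:
                    raise
                logging.warning('The appkey {} of `{}` is invalid.'.format(
                    json.dumps(self.appkey_obj, ensure_ascii=False),
                    self.__class__.__name__))
    return wrapper

class TranslationApi:
    def __init__(self, appkey_obj_list):
        self.appkey_obj_list = appkey_obj_list  # list of key dicts, or None
        self.appkey_num = len(appkey_obj_list)
        self.appkey_index = 0  # index of the key currently in use

    @retry_with_next_appkey
    def fetch(self, text):
        # would call the remote API here; raises when the key is rejected
        raise NotImplementedError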
def __call__(self, id_card):
    if self.china_locations is None:
        self._prepare()

    # check that the input looks like an ID-card number at all
    match_flag = self.id_card_check_pattern.match(id_card)
    if match_flag is None:
        logging.error('the id card is wrong.')
        return None

    # resolve the 6-digit administrative division code, falling back to
    # the city-level (4-digit) and province-level (2-digit) prefixes
    if id_card[:6] in self.china_locations:
        prov, city, county = self.china_locations[id_card[:6]]
    elif id_card[:4] + '0' * 2 in self.china_locations:
        prov, city, county = self.china_locations[id_card[:4] + '0' * 2]
    elif id_card[:2] + '0' * 4 in self.china_locations:
        prov, city, county = self.china_locations[id_card[:2] + '0' * 4]
    else:
        # the administrative division code is entirely invalid
        logging.error('the administration code of id card is wrong.')
        return None

    # the second-to-last digit encodes gender: odd = male, even = female
    gender = '男' if int(id_card[-2]) % 2 else '女'
    check_code = id_card[-1]
    if check_code == 'X':
        check_code = 'x'

    return {
        'province': prov,
        'city': city,
        'county': county,
        'birth_year': id_card[6:10],
        'birth_month': id_card[10:12],
        'birth_day': id_card[12:14],
        'gender': gender,
        'check_code': check_code,
    }
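# Hypothetical usage sketch; `IDCardParser` is an assumed name for the
# class that owns the __call__ above, and the number below is a synthetic,
# format-valid example, not a real ID.
parser = IDCardParser()
info = parser('110101199003070975')
if info is not None:
    # returns a dict with province/city/county, the birth date fields,
    # gender and the trailing check code
    print(info['province'], info['birth_year'], info['gender'])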
def __call__(self, text, summary_length=200, lead_3_weight=1.2,
             topic_theta=0.2, allow_topic_weight=True):
    # input check
    if type(text) is not str:
        raise ValueError('type of `text` should only be str')

    try:
        # lazy initialisation on first call
        if self.unk_topic_prominence_value == 0.:
            self._prepare()

        if lead_3_weight < 1:
            raise ValueError(
                'the param `lead_3_weight` should not be less than 1.0')
        if len(text) <= summary_length:
            return text

        # step 0: clean the text
        text = clean_text(text)

        # step 1: split into sentences
        sentences_list = split_sentence(text)

        # step 2: word segmentation and POS tagging, sentence by sentence
        sentences_segs_dict = dict()
        counter_segs_list = list()
        for idx, sen in enumerate(sentences_list):
            if not check_chinese_char(sen):
                # skip sentences without any Chinese character
                continue
            sen_segs = self.seg.cut(sen)
            sentences_segs_dict.update({sen: [idx, sen_segs, list(), 0]})
            counter_segs_list.extend(sen_segs)

        # step 3: compute term frequencies
        total_length = len(counter_segs_list)
        freq_dict = dict()
        for word_pos in counter_segs_list:
            word, pos = word_pos
            if word in freq_dict:
                freq_dict[word][1] += 1
            else:
                freq_dict.update({word: [pos, 1]})

        # step 4: compute a weight for every token
        for sen, sen_segs in sentences_segs_dict.items():
            sen_segs_weights = list()
            for word_pos in sen_segs[1]:
                word, pos = word_pos
                if pos not in self.pos_name and word in self.stop_words:
                    # function words get zero weight
                    weight = 0.0
                else:
                    weight = freq_dict[word][1] * self.idf_dict.get(
                        word, self.median_idf) / total_length
                sen_segs_weights.append(weight)
            sen_segs[2] = sen_segs_weights
            # share of non-zero-weight tokens in the sentence
            sen_segs[3] = len([w for w in sen_segs_weights if w != 0]) / len(sen_segs_weights) \
                if len(sen_segs_weights) != 0 else 0

        # step 5: compute a weight for every sentence
        for sen, sen_segs in sentences_segs_dict.items():
            # tfidf weight
            tfidf_weight = sum(sen_segs[2]) / len(sen_segs[2])

            # topic-model weight
            if allow_topic_weight:
                topic_weight = 0.0
                for item in sen_segs[1]:
                    topic_weight += self.topic_prominence_dict.get(
                        item[0], self.unk_topic_prominence_value)
                topic_weight = topic_weight / len(sen_segs[1])
            else:
                topic_weight = 0.0
            sen_weight = topic_weight * topic_theta + tfidf_weight

            # down-weight sentences that are too short or too long
            if len(sen) < 15 or len(sen) > 70:
                sen_weight = 0.7 * sen_weight

            # LEAD-3 weight
            if sen_segs[0] < 3:
                sen_weight *= lead_3_weight
            sen_segs[3] = sen_weight

        # step 6: re-weight with the MMR algorithm to penalise redundancy
        sentences_info_list = sorted(sentences_segs_dict.items(),
                                     key=lambda item: item[1][3],
                                     reverse=True)
        mmr_list = list()
        for sentence_info in sentences_info_list:
            # similarity against the sentences already selected
            sim_ratio = self._mmr_similarity(sentence_info, mmr_list)
            sentence_info[1][3] = (1 - sim_ratio) * sentence_info[1][3]
            mmr_list.append(sentence_info)

        # step 7: sort by importance and pick sentences for the summary
        if len(sentences_info_list) == 1:
            return sentences_info_list[0][0]

        total_length = 0
        summary_list = list()
        for idx, item in enumerate(sentences_info_list):
            if len(item[0]) + total_length > summary_length:
                if idx == 0:
                    return item[0]
                else:
                    # restore the original sentence order
                    summary_list = sorted(summary_list,
                                          key=lambda item: item[1][0])
                    summary = ''.join([item[0] for item in summary_list])
                    return summary
            else:
                summary_list.append(item)
                total_length += len(item[0])
                if idx == len(sentences_info_list) - 1:
                    summary_list = sorted(summary_list,
                                          key=lambda item: item[1][0])
                    summary = ''.join([item[0] for item in summary_list])
                    return summary

        return text[:summary_length]

    except Exception as e:
        logging.error('the text is illegal. \n{}'.format(e))
        return ''
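# Hypothetical usage sketch; `ChineseSummaryExtractor` is an assumed name
# for the class that owns the __call__ above. Each sentence is scored as
#     sen_weight = topic_weight * topic_theta + tfidf_weight
# and then adjusted for length, LEAD-3 position and MMR redundancy, so
# topic_theta trades topic prominence off against pure tfidf.
extractor = ChineseSummaryExtractor()
text = '...'  # any Chinese document longer than summary_length
summary = extractor(
    text,
    summary_length=150,      # upper bound on the summary, in characters
    lead_3_weight=1.2,       # boost for the first three sentences (>= 1.0)
    topic_theta=0.2,
    allow_topic_weight=True)
print(summary)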
def __call__(self, text, top_k=5, with_weight=False,
             func_word_num=1, stop_word_num=0, max_phrase_len=25,
             topic_theta=0.5, allow_pos_weight=True, strict_pos=True,
             allow_length_weight=True, allow_topic_weight=True,
             without_person_name=False, without_location_name=False,
             remove_phrases_list=None, remove_words_list=None,
             specified_words=dict(), bias=None):
    try:
        # lazy initialisation on first call
        if self.unk_topic_prominence_value == 0.:
            self._prepare()

        # configure the POS sets for location and person names
        if without_location_name:
            if 'ns' in self.strict_pos_name:
                self.strict_pos_name.remove('ns')
            if 'ns' in self.pos_name:
                self.pos_name.remove('ns')
        else:
            if 'ns' not in self.strict_pos_name:
                self.strict_pos_name.append('ns')
            if 'ns' not in self.pos_name:
                self.pos_name.append('ns')

        if without_person_name:
            if 'nr' in self.strict_pos_name:
                self.strict_pos_name.remove('nr')
            if 'nr' in self.pos_name:
                self.pos_name.remove('nr')
        else:
            if 'nr' not in self.strict_pos_name:
                self.strict_pos_name.append('nr')
            if 'nr' not in self.pos_name:
                self.pos_name.append('nr')

        # step 0: clean the text
        text = clean_text(text)

        # step 1: split into sentences; segmentation and POS tagging use
        # Peking University's pkuseg
        sentences_list = split_sentence(text, criterion='fine')
        sentences_segs_list = list()
        counter_segs_list = list()
        for sen in sentences_list:
            sen_segs = self.seg.cut(sen)
            sentences_segs_list.append(sen_segs)
            counter_segs_list.extend(sen_segs)

        # step 2: compute term frequencies
        total_length = len(counter_segs_list)
        freq_dict = dict()
        for word_pos in counter_segs_list:
            word, pos = word_pos
            if word in freq_dict:
                freq_dict[word][1] += 1
            else:
                freq_dict.update({word: [pos, 1]})

        # step 3: compute a tfidf weight for every token
        sentences_segs_weights_list = list()
        for sen, sen_segs in zip(sentences_list, sentences_segs_list):
            sen_segs_weights = list()
            for word_pos in sen_segs:
                word, pos = word_pos
                if pos in self.pos_name:
                    if word in self.stop_words:
                        # stop words get zero weight
                        weight = 0.0
                    elif word in specified_words:
                        # boost tokens the caller explicitly specified
                        if bias is None:
                            weight = freq_dict[word][1] * self.idf_dict.get(
                                word, self.median_idf) / total_length \
                                + 1 / specified_words[word]
                        else:
                            weight = freq_dict[word][1] * self.idf_dict.get(
                                word, self.median_idf) / total_length + bias
                    else:
                        weight = freq_dict[word][1] * self.idf_dict.get(
                            word, self.median_idf) / total_length
                else:
                    # function words and other POS tags get zero weight
                    weight = 0.0
                sen_segs_weights.append(weight)
            sentences_segs_weights_list.append(sen_segs_weights)

        # step 4: collect candidate phrases and their weights by rule
        candidate_phrases_dict = dict()
        for sen_segs, sen_segs_weights in zip(sentences_segs_list,
                                              sentences_segs_weights_list):
            sen_length = len(sen_segs)
            for n in range(1, sen_length + 1):  # n-grams
                for i in range(0, sen_length - n + 1):
                    candidate_phrase = sen_segs[i:i + n]

                    # pkuseg tags dates as `n` rather than `t`, so drop
                    # candidates that end with a date
                    res = self.extra_date_ptn.match(candidate_phrase[-1][0])
                    if res is not None:
                        continue

                    # filter candidates with either strict or loose rules
                    if not strict_pos:
                        rule_flag = self._loose_candidate_phrases_rules(
                            candidate_phrase,
                            func_word_num=func_word_num,
                            max_phrase_len=max_phrase_len,
                            stop_word_num=stop_word_num)
                    else:
                        rule_flag = self._strict_candidate_phrases_rules(
                            candidate_phrase,
                            max_phrase_len=max_phrase_len)
                    if not rule_flag:
                        continue

                    # pkuseg occasionally tags junk symbols as n, v or adj,
                    # so drop candidates containing them
                    redundant_flag = False
                    for item in candidate_phrase:
                        matched = self.redundant_strict_pattern.search(item[0])
                        if matched is not None:
                            redundant_flag = True
                            break
                        matched = self.redundant_loose_pattern.search(item[0])
                        if matched is not None and matched.group() == item[0]:
                            redundant_flag = True
                            break
                    if redundant_flag:
                        continue

                    # skip phrases containing explicitly unwanted words
                    if remove_words_list is not None:
                        unwanted_phrase_flag = False
                        for item in candidate_phrase:
                            if item[0] in remove_words_list:
                                unwanted_phrase_flag = True
                                break
                        if unwanted_phrase_flag:
                            continue

                    # skip phrases in which no token appears in the
                    # specified vocabulary
                    if specified_words != dict():
                        with_specified_words_flag = False
                        for item in candidate_phrase:
                            if item[0] in specified_words:
                                with_specified_words_flag = True
                                break
                        if not with_specified_words_flag:
                            continue

                    # condition 6: multiply the phrase weight by a POS weight
                    if allow_pos_weight:
                        start_end_pos = None
                        if len(candidate_phrase) == 1:
                            start_end_pos = candidate_phrase[0][1]
                        elif len(candidate_phrase) >= 2:
                            start_end_pos = candidate_phrase[0][1] + '|' \
                                + candidate_phrase[-1][1]
                        pos_weight = self.pos_combine_weights_dict.get(
                            start_end_pos, 1.0)
                    else:
                        pos_weight = 1.0

                    # condition 7: multiply the phrase weight by a length weight
                    if allow_length_weight:
                        length_weight = self.phrases_length_control_dict.get(
                            len(sen_segs_weights[i:i + n]),
                            self.phrases_length_control_none)
                    else:
                        length_weight = 1.0

                    # condition 8: add a topic-prominence weight
                    if allow_topic_weight:
                        topic_weight = 0.0
                        for item in candidate_phrase:
                            topic_weight += self.topic_prominence_dict.get(
                                item[0], self.unk_topic_prominence_value)
                        topic_weight = topic_weight / len(candidate_phrase)
                    else:
                        topic_weight = 0.0

                    candidate_phrase_weight = sum(sen_segs_weights[i:i + n])
                    candidate_phrase_weight *= length_weight * pos_weight
                    candidate_phrase_weight += topic_weight * topic_theta

                    candidate_phrase_string = ''.join(
                        [tup[0] for tup in candidate_phrase])
                    if remove_phrases_list is not None:
                        if candidate_phrase_string in remove_phrases_list:
                            continue
                    if candidate_phrase_string not in candidate_phrases_dict:
                        candidate_phrases_dict.update({
                            candidate_phrase_string:
                                [candidate_phrase, candidate_phrase_weight]})

        # step 5: filter out phrases that overlap too much with others
        # (replacing a duplicated longer phrase with a shorter, higher-weight
        # one was tried and worked poorly, so it was dropped)
        candidate_phrases_list = sorted(candidate_phrases_dict.items(),
                                        key=lambda item: len(item[1][0]),
                                        reverse=True)
        de_duplication_candidate_phrases_list = list()
        for item in candidate_phrases_list:
            sim_ratio = self._mmr_similarity(
                item, de_duplication_candidate_phrases_list)
            if sim_ratio != 1:
                item[1][1] = (1 - sim_ratio) * item[1][1]
                de_duplication_candidate_phrases_list.append(item)

        # step 6: sort by importance and take the top_k phrases
        candidate_phrases_list = sorted(
            de_duplication_candidate_phrases_list,
            key=lambda item: item[1][1], reverse=True)
        if with_weight:
            if top_k != -1:
                final_res = [(item[0], item[1][1])
                             for item in candidate_phrases_list[:top_k]
                             if item[1][1] > 0]
            else:
                final_res = [(item[0], item[1][1])
                             for item in candidate_phrases_list
                             if item[1][1] > 0]
        else:
            if top_k != -1:
                final_res = [item[0]
                             for item in candidate_phrases_list[:top_k]
                             if item[1][1] > 0]
            else:
                final_res = [item[0]
                             for item in candidate_phrases_list
                             if item[1][1] > 0]
        return final_res

    except Exception as e:
        logging.error('the text is illegal. \n{}'.format(e))
        return list()
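# Hypothetical usage sketch; `ChineseKeyPhrasesExtractor` is an assumed
# name for the class that owns the __call__ above, and '电力' is a made-up
# example token.
extractor = ChineseKeyPhrasesExtractor()
text = '...'  # the Chinese document to mine for key phrases
phrases = extractor(
    text,
    top_k=5,                     # number of phrases to return; -1 keeps all
    with_weight=True,            # return (phrase, weight) tuples
    topic_theta=0.5,             # weight of the topic-prominence term
    specified_words={'电力': 1},  # favoured tokens; with bias=None the boost
                                 # per token is 1 / specified_words[word]
    without_person_name=True)    # drop person names ('nr') from candidates
for phrase, weight in phrases:
    print(phrase, round(weight, 4))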