Example #1
    def wrapper(self, *args, **kargs):
        # retrieve the appkey at the current index
        if self.appkey_obj_list is not None:
            count = 0
            while count < self.appkey_num:
                self.appkey_obj = self.appkey_obj_list[self.appkey_index]

                count += 1
                try:
                    f = func(self, *args, **kargs)
                    break

                except Exception as err:

                    # rotate to the next appkey index
                    if self.appkey_index == self.appkey_num - 1:
                        self.appkey_index = 0
                    else:
                        self.appkey_index += 1

                    # if the loop count has reached the number of keys, every
                    # key has been tried, so give up; otherwise warn and try
                    # the next key
                    if count < self.appkey_num:
                        logging.warning(
                            'The appkey {} of `{}` is invalid.'.format(
                                json.dumps(self.appkey_obj, ensure_ascii=False),
                                self.__class__.__name__))
                    else:
                        logging.error(err)
                        raise

        else:
            f = func(self, *args, **kargs)

        return f
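
The snippet above is only the inner `wrapper` of a retry decorator: `func` and the appkey bookkeeping (`appkey_obj_list`, `appkey_num`, `appkey_index`) come from the enclosing scope. A minimal sketch of how that enclosing decorator might look, assuming a method on a client class; the decorator name and the `functools.wraps` usage are assumptions, not part of the original source:

    import functools
    import json
    import logging

    def rotate_appkey(func):
        # Hypothetical decorator; only the inner wrapper appears in the
        # original example, so the surrounding context is assumed.
        @functools.wraps(func)
        def wrapper(self, *args, **kargs):
            if self.appkey_obj_list is None:
                return func(self, *args, **kargs)
            for count in range(self.appkey_num):
                self.appkey_obj = self.appkey_obj_list[self.appkey_index]
                try:
                    return func(self, *args, **kargs)
                except Exception as err:
                    # rotate to the next key
                    self.appkey_index = (self.appkey_index + 1) % self.appkey_num
                    if count < self.appkey_num - 1:
                        logging.warning(
                            'The appkey {} of `{}` is invalid.'.format(
                                json.dumps(self.appkey_obj, ensure_ascii=False),
                                self.__class__.__name__))
                    else:
                        logging.error(err)
                        raise
        return wrapper
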
Example #2
    def __call__(self, id_card):
        if self.china_locations is None:
            self._prepare()

        # check whether the input looks like an ID card number
        match_flag = self.id_card_check_pattern.match(id_card)

        if match_flag is None:
            logging.error('the id card is wrong.')
            return None

        # fall back from county-level to city-level to province-level codes
        if id_card[:6] in self.china_locations:
            prov, city, county = self.china_locations[id_card[:6]]
        elif id_card[:4] + '00' in self.china_locations:
            prov, city, county = self.china_locations[id_card[:4] + '00']
        elif id_card[:2] + '0000' in self.china_locations:
            prov, city, county = self.china_locations[id_card[:2] + '0000']
        else:
            # the first six digits match no administrative division
            logging.error('the administration code of id card is wrong.')
            return None
        gender = '男' if int(id_card[-2]) % 2 else '女'  # odd digit: male ('男'); even: female ('女')
        check_code = id_card[-1]
        if check_code == 'X':
            check_code = 'x'

        return {
            'province': prov,
            'city': city,
            'county': county,
            'birth_year': id_card[6:10],
            'birth_month': id_card[10:12],
            'birth_day': id_card[12:14],
            'gender': gender,
            'check_code': check_code
        }
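
A hedged usage sketch of the parser above. The class name and instantiation are assumptions (the original shows only the `__call__` method; the jionlp toolkit exposes a similar `parse_id_card` helper), and the ID number below is synthetic:

    # All names here are illustrative; the ID number is made up.
    parser = IDCardParser()  # hypothetical class name
    info = parser('110101199003070975')
    if info is not None:
        # keys: province, city, county, birth_year, birth_month,
        # birth_day, gender, check_code
        print(info['province'], info['birth_year'], info['gender'])
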
Example #3
    def __call__(self,
                 text,
                 summary_length=200,
                 lead_3_weight=1.2,
                 topic_theta=0.2,
                 allow_topic_weight=True):

        # input check
        if not isinstance(text, str):
            raise ValueError('type of `text` should only be str')
        try:
            # lazy initialization on first call
            if self.unk_topic_prominence_value == 0.:
                self._prepare()

            if lead_3_weight < 1:
                raise ValueError(
                    'the param `lead_3_weight` should not be less than 1.0')
            if len(text) <= summary_length:
                return text

            # step 0: clean the text
            text = clean_text(text)

            # step 1: split into sentences and clean each one
            sentences_list = split_sentence(text)

            # step 2: word segmentation and POS tagging
            sentences_segs_dict = dict()
            counter_segs_list = list()
            for idx, sen in enumerate(sentences_list):
                if not check_chinese_char(sen):  # skip sentences without Chinese characters
                    continue

                sen_segs = self.seg.cut(sen)
                sentences_segs_dict.update({sen: [idx, sen_segs, list(), 0]})
                counter_segs_list.extend(sen_segs)

            # step 3: compute term frequencies
            total_length = len(counter_segs_list)
            freq_dict = dict()
            for word_pos in counter_segs_list:
                word, pos = word_pos
                if word in freq_dict:
                    freq_dict[word][1] += 1
                else:
                    freq_dict.update({word: [pos, 1]})

            # step 4: compute a weight for every token
            for sen, sen_segs in sentences_segs_dict.items():
                sen_segs_weights = list()
                for word_pos in sen_segs[1]:
                    word, pos = word_pos
                    if pos not in self.pos_name or word in self.stop_words:  # function words and stop words get zero weight
                        weight = 0.0
                    else:
                        weight = freq_dict[word][1] * self.idf_dict.get(
                            word, self.median_idf) / total_length
                    sen_segs_weights.append(weight)

                sen_segs[2] = sen_segs_weights
                sen_segs[3] = len([w for w in sen_segs_weights if w != 0]) / len(sen_segs_weights) \
                    if len(sen_segs_weights) != 0 else 0

            # step 5: compute a weight for every sentence
            for sen, sen_segs in sentences_segs_dict.items():
                # tf-idf weight
                tfidf_weight = sum(sen_segs[2]) / len(sen_segs[2])

                # topic-model weight
                if allow_topic_weight:
                    topic_weight = 0.0
                    for item in sen_segs[1]:
                        topic_weight += self.topic_prominence_dict.get(
                            item[0], self.unk_topic_prominence_value)
                    topic_weight = topic_weight / len(sen_segs[1])
                else:
                    topic_weight = 0.0

                sen_weight = topic_weight * topic_theta + tfidf_weight

                # reduce the weight of sentences that are too short or too long
                if len(sen) < 15 or len(sen) > 70:
                    sen_weight = 0.7 * sen_weight

                # LEAD-3 weight
                if sen_segs[0] < 3:
                    sen_weight *= lead_3_weight

                sen_segs[3] = sen_weight

            # step 6: re-compute the weights with the MMR algorithm and filter
            # out unwanted sentences
            sentences_info_list = sorted(sentences_segs_dict.items(),
                                         key=lambda item: item[1][3],
                                         reverse=True)

            mmr_list = list()
            for sentence_info in sentences_info_list:
                # similarity against the sentences already selected
                sim_ratio = self._mmr_similarity(sentence_info, mmr_list)
                sentence_info[1][3] = (1 - sim_ratio) * sentence_info[1][3]
                mmr_list.append(sentence_info)

            # step 7: sort by importance and pick sentences for the summary
            if len(sentences_info_list) == 1:
                return sentences_info_list[0][0]
            total_length = 0
            summary_list = list()
            for idx, item in enumerate(sentences_info_list):
                if len(item[0]) + total_length > summary_length:
                    if idx == 0:
                        return item[0]
                    else:
                        # restore the original sentence order
                        summary_list = sorted(summary_list,
                                              key=lambda item: item[1][0])
                        summary = ''.join([item[0] for item in summary_list])
                        return summary
                else:
                    summary_list.append(item)
                    total_length += len(item[0])
                    if idx == len(sentences_info_list) - 1:
                        summary_list = sorted(summary_list,
                                              key=lambda item: item[1][0])
                        summary = ''.join([item[0] for item in summary_list])
                        return summary

            return text[:summary_length]
        except Exception as e:
            logging.error('the text is illegal. \n{}'.format(e))
            return ''
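
Both this method and the next example call a `_mmr_similarity` helper that is not shown. A minimal sketch of what such a helper typically computes, a token-overlap ratio against the items already selected, written against this summary function's data layout (the keyphrase example stores its tokens at `item[1][0]` instead); the actual implementation in the source may differ:

    def _mmr_similarity(self, sentence_info, selected_list):
        # Sketch only: the fraction of this sentence's tokens that already
        # occur in the selected sentences, in [0, 1]. The caller scales the
        # sentence weight by (1 - ratio), which is the MMR-style penalty.
        if not selected_list:
            return 0.0
        candidate_tokens = set(word for word, pos in sentence_info[1][1])
        if not candidate_tokens:
            return 0.0
        selected_tokens = set()
        for item in selected_list:
            selected_tokens.update(word for word, pos in item[1][1])
        return len(candidate_tokens & selected_tokens) / len(candidate_tokens)
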
Example #4
    def __call__(self,
                 text,
                 top_k=5,
                 with_weight=False,
                 func_word_num=1,
                 stop_word_num=0,
                 max_phrase_len=25,
                 topic_theta=0.5,
                 allow_pos_weight=True,
                 strict_pos=True,
                 allow_length_weight=True,
                 allow_topic_weight=True,
                 without_person_name=False,
                 without_location_name=False,
                 remove_phrases_list=None,
                 remove_words_list=None,
                 specified_words=dict(),
                 bias=None):
        try:
            # lazy initialization on first call
            if self.unk_topic_prominence_value == 0.:
                self._prepare()

            # configure parameters: toggle location (ns) and person (nr) POS tags
            if without_location_name:
                if 'ns' in self.strict_pos_name:
                    self.strict_pos_name.remove('ns')
                if 'ns' in self.pos_name:
                    self.pos_name.remove('ns')
            else:
                if 'ns' not in self.strict_pos_name:
                    self.strict_pos_name.append('ns')
                if 'ns' not in self.pos_name:
                    self.pos_name.append('ns')

            if without_person_name:
                if 'nr' in self.strict_pos_name:
                    self.strict_pos_name.remove('nr')
                if 'nr' in self.pos_name:
                    self.pos_name.remove('nr')
            else:
                if 'nr' not in self.strict_pos_name:
                    self.strict_pos_name.append('nr')
                if 'nr' not in self.pos_name:
                    self.pos_name.append('nr')

            # step 0: clean the text, removing noise
            text = clean_text(text)

            # step 1: split into sentences; pkuseg (the Peking University
            # segmenter) handles segmentation and POS tagging
            sentences_list = split_sentence(text, criterion='fine')

            sentences_segs_list = list()
            counter_segs_list = list()
            for sen in sentences_list:
                sen_segs = self.seg.cut(sen)
                sentences_segs_list.append(sen_segs)
                counter_segs_list.extend(sen_segs)

            # step 2: compute term frequencies
            total_length = len(counter_segs_list)
            freq_dict = dict()
            for word_pos in counter_segs_list:
                word, pos = word_pos
                if word in freq_dict:
                    freq_dict[word][1] += 1
                else:
                    freq_dict.update({word: [pos, 1]})

            # step 3: compute a tf-idf weight for every token
            sentences_segs_weights_list = list()
            for sen, sen_segs in zip(sentences_list, sentences_segs_list):
                sen_segs_weights = list()
                for word_pos in sen_segs:
                    word, pos = word_pos
                    if pos in self.pos_name:  # only content-word POS tags get a nonzero weight
                        if word in self.stop_words:  # stop words get zero weight
                            weight = 0.0
                        else:
                            if word in specified_words:  # boost user-specified words
                                tfidf = freq_dict[word][1] * self.idf_dict.get(
                                    word, self.median_idf) / total_length
                                if bias is None:
                                    weight = tfidf + 1 / specified_words[word]
                                else:
                                    weight = tfidf + bias
                            else:
                                weight = freq_dict[word][1] * self.idf_dict.get(
                                    word, self.median_idf) / total_length
                    else:
                        weight = 0.0
                    sen_segs_weights.append(weight)
                sentences_segs_weights_list.append(sen_segs_weights)

            # step 4: collect candidate phrases and their weights by rule
            candidate_phrases_dict = dict()
            for sen_segs, sen_segs_weights in zip(sentences_segs_list,
                                                  sentences_segs_weights_list):
                sen_length = len(sen_segs)

                for n in range(1, sen_length + 1):  # n-grams
                    for i in range(0, sen_length - n + 1):
                        candidate_phrase = sen_segs[i:i + n]

                        # pkuseg sometimes mis-tags dates as n instead of t,
                        # so drop candidates that end in a date
                        res = self.extra_date_ptn.match(
                            candidate_phrase[-1][0])
                        if res is not None:
                            continue

                        # candidates are filtered by either strict or loose POS rules
                        if not strict_pos:
                            rule_flag = self._loose_candidate_phrases_rules(
                                candidate_phrase,
                                func_word_num=func_word_num,
                                max_phrase_len=max_phrase_len,
                                stop_word_num=stop_word_num)
                        else:
                            rule_flag = self._strict_candidate_phrases_rules(
                                candidate_phrase,
                                max_phrase_len=max_phrase_len)
                        if not rule_flag:
                            continue

                        # pkuseg sometimes tags junk symbols as n, v or adj,
                        # so such candidates must be removed
                        redundant_flag = False
                        for item in candidate_phrase:
                            matched = self.redundant_strict_pattern.search(item[0])
                            if matched is not None:
                                redundant_flag = True
                                break
                            matched = self.redundant_loose_pattern.search(item[0])
                            if matched is not None and matched.group() == item[0]:
                                redundant_flag = True
                                break
                        if redundant_flag:
                            continue

                        # skip phrases that contain any unwanted word
                        if remove_words_list is not None:
                            unwanted_phrase_flag = False
                            for item in candidate_phrase:
                                if item[0] in remove_words_list:
                                    unwanted_phrase_flag = True
                                    break
                            if unwanted_phrase_flag:
                                continue

                        # skip phrases in which no token belongs to the specified vocabulary
                        if specified_words:
                            with_specified_words_flag = False
                            for item in candidate_phrase:
                                if item[0] in specified_words:
                                    with_specified_words_flag = True
                                    break
                            if not with_specified_words_flag:
                                continue

                        # condition 6: multiply the phrase weight by a POS weight
                        if allow_pos_weight:
                            start_end_pos = None
                            if len(candidate_phrase) == 1:
                                start_end_pos = candidate_phrase[0][1]
                            elif len(candidate_phrase) >= 2:
                                start_end_pos = (candidate_phrase[0][1] + '|' +
                                                 candidate_phrase[-1][1])
                            pos_weight = self.pos_combine_weights_dict.get(
                                start_end_pos, 1.0)
                        else:
                            pos_weight = 1.0

                        # condition 7: multiply the phrase weight by a length weight
                        if allow_length_weight:
                            length_weight = self.phrases_length_control_dict.get(
                                len(sen_segs_weights[i:i + n]),
                                self.phrases_length_control_none)
                        else:
                            length_weight = 1.0

                        # condition 8: add the topic-prominence weight to the phrase weight
                        if allow_topic_weight:
                            topic_weight = 0.0
                            for item in candidate_phrase:
                                topic_weight += self.topic_prominence_dict.get(
                                    item[0], self.unk_topic_prominence_value)
                            topic_weight = topic_weight / len(candidate_phrase)
                        else:
                            topic_weight = 0.0

                        candidate_phrase_weight = sum(sen_segs_weights[i:i + n])
                        candidate_phrase_weight *= length_weight * pos_weight
                        candidate_phrase_weight += topic_weight * topic_theta

                        candidate_phrase_string = ''.join(
                            [tup[0] for tup in candidate_phrase])
                        if remove_phrases_list is not None:
                            if candidate_phrase_string in remove_phrases_list:
                                continue
                        if candidate_phrase_string not in candidate_phrases_dict:
                            candidate_phrases_dict.update({
                                candidate_phrase_string:
                                [candidate_phrase, candidate_phrase_weight]
                            })

            # step 5: de-duplicate phrases that overlap too much
            # (replacing longer duplicated phrases with shorter, higher-weight
            # ones was tried, but worked poorly, so it was dropped)
            candidate_phrases_list = sorted(candidate_phrases_dict.items(),
                                            key=lambda item: len(item[1][0]),
                                            reverse=True)

            de_duplication_candidate_phrases_list = list()
            for item in candidate_phrases_list:
                sim_ratio = self._mmr_similarity(
                    item, de_duplication_candidate_phrases_list)
                if sim_ratio != 1:
                    item[1][1] = (1 - sim_ratio) * item[1][1]
                    de_duplication_candidate_phrases_list.append(item)

            # step 6: sort by importance and take the top_k
            candidate_phrases_list = sorted(
                de_duplication_candidate_phrases_list,
                key=lambda item: item[1][1],
                reverse=True)

            if top_k != -1:
                candidate_phrases_list = candidate_phrases_list[:top_k]
            if with_weight:
                final_res = [(item[0], item[1][1])
                             for item in candidate_phrases_list
                             if item[1][1] > 0]
            else:
                final_res = [item[0] for item in candidate_phrases_list
                             if item[1][1] > 0]
            return final_res

        except Exception as e:
            logging.error('the text is illegal. \n{}'.format(e))
            return list()
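
A hedged usage sketch of the keyphrase extractor above. Every name here is an assumption (the original shows only the `__call__` method), and the extracted phrases depend on the loaded segmenter, IDF table and topic model:

    # All names here are illustrative, not the library's public API.
    extractor = KeyphraseExtractor()  # hypothetical constructor
    text = '...'  # any Chinese document
    phrases = extractor(text, top_k=5, with_weight=True,
                        without_location_name=True)
    for phrase, weight in phrases:
        print(phrase, round(weight, 4))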