Example #1
0
    def rule_filter(self, name_position_data, text):
        '''
        Rule-based filtering for novel (xiaoshuo) IP words.

        :param name_position_data: dict mapping each matched ip_word to its
            list of (start, end) positions in ``text``.
        :param text: the full text being scanned.
        :return: tuple ``(result, model_predict_words)`` where ``result`` maps
            ip_name -> [{'prob': 1, 'type': ..., 'name': ip_word}] for words
            confirmed by the rules, and ``model_predict_words`` maps
            ip_word -> positions for words that still need the model.
        '''
        result = {}  # {'name1':[{'prob':1, 'type':'3', 'name':'一代宗师'}],  'name2':[{}]}
        model_predict_words = {}  # words that still need to go through the model
        try:
            # Apply the reverse (removal) rules first; keep only surviving words.
            names_list = name_position_data.keys()
            contain_name = remove_ip_rule(self.ip_words, name_position_data, text, names_list)
            for ip_word in contain_name:  # iterate over every surviving ip word
                ip_name = self.ip_words[ip_word][0]['ip_name']  # canonical ip_name for this word
                if ip_name in result:  # already accepted via a synonym
                    continue
                # Special case: '变形记' always goes straight to the model.
                if ip_word == '变形记':
                    model_predict_words[ip_word] = name_position_data[ip_word]
                    continue
                # Positive rules from basic_tools.
                basic_judge_result = predicate_ip_rule(self.ip_words, ip_word, text, name_position_data,
                                                       contain_name, quotes=True, three_related=True)
                if basic_judge_result:
                    result[ip_name] = load_result(prob=1, ip_type=ip_types[self.ip_type], name=ip_word)
                    continue
                # No rule fired: words with type_noise_ip == 0 are forwarded
                # to the model (a straight copy of the positions).
                if not self.ip_words[ip_word][0]['type_noise_ip']:
                    model_predict_words[ip_word] = name_position_data[ip_word]
        except Exception:  # narrowed from bare except so SystemExit/KeyboardInterrupt propagate
            error = "xiaoshuo_rule_filter error. traceback: %s" % traceback.format_exc()
            self.err_logger.error(error)

        return result, model_predict_words
Example #2
0
    def rule_filter(self, name_position_data, text):
        '''
        Rule-based filtering for sports-event (saishi) IP words.

        :param name_position_data: dict mapping each matched ip_word to its
            list of (start, end) positions in ``text``.
        :param text: the full text being scanned.
        :return: tuple ``(result, model_predict_words)`` where ``result`` maps
            ip_name -> [{'prob': 1, 'type': ..., 'name': ip_word}] and
            ``model_predict_words`` maps ip_word -> positions for words that
            still need the model.
        '''
        result = {}  # {'name1':[{'prob':1, 'type':'3', 'name':'一代宗师'}],  'name2':[{}]}
        model_predict_words = {}  # words that still need to go through the model
        try:
            # Apply the reverse (removal) rules first; keep only surviving words.
            names_list = name_position_data.keys()
            contain_name = remove_ip_rule(self.ip_words, name_position_data,
                                          text, names_list)

            for ip_word in contain_name:  # iterate over every surviving ip word
                # 'f1' is ambiguous, so it always goes to the model instead
                # (a straight copy of the positions).
                if ip_word == 'f1':
                    model_predict_words[ip_word] = name_position_data[ip_word]
                    continue
                ip_name = self.ip_words[ip_word][0]['ip_name']  # canonical ip_name
                if ip_name in result:
                    # A synonym of this ip_name was already accepted; keep the
                    # higher probability (always 1 here, so this is a no-op
                    # retained for behavioural parity).
                    result[ip_name][0]['prob'] = 1
                else:
                    # First time this ip_name is seen: accept it with prob 1.
                    result[ip_name] = load_result(
                        prob=1, ip_type=ip_types[self.ip_type], name=ip_word
                    )  # -> 'name': [{'prob': 1, 'type': '3', 'name': ...}]
                # A redundant trailing `continue` (the last statement of the
                # loop body) was removed here; behaviour is unchanged.
        except Exception:  # narrowed from bare except so SystemExit/KeyboardInterrupt propagate
            error = "saishi_rule_filter error. traceback: %s" % traceback.format_exc()
            self.err_logger.error(error)
        return result, model_predict_words
Example #3
0
    def rule_filter(self, name_position_data, text):
        '''
        Rule-based filtering for film/TV (yingshi) IP words.

        :param name_position_data: dict mapping each matched ip_word to its
            list of (start, end) positions in ``text``.
        :param text: the full text being scanned.
        :return: tuple ``(result, model_predict_words)`` where ``result`` maps
            ip_name -> [{'prob': 1, 'type': ..., 'name': ip_word}] and
            ``model_predict_words`` maps ip_word -> positions for words that
            still need the model.
        '''
        result = {}  # {'name1':[{'prob':1, 'type':'3', 'name':'一代宗师'}],  'name2':[{}]}
        model_predict_words = {}  # words that still need to go through the model
        try:
            # Apply the reverse (removal) rules first; keep only surviving words.
            names_list = name_position_data.keys()
            contain_name = remove_ip_rule(self.ip_words, name_position_data,
                                          text, names_list)

            for ip_word in contain_name:  # iterate over every surviving ip word
                ip_name = self.ip_words[ip_word][0]['ip_name']  # canonical ip_name
                if ip_name in result:  # already accepted via a synonym
                    continue
                # Positive rules from basic_tools.
                basic_judge_result = predicate_ip_rule(self.ip_words,
                                                       ip_word,
                                                       text,
                                                       name_position_data,
                                                       contain_name,
                                                       quotes=True,
                                                       three_related=True)
                if basic_judge_result:
                    result[ip_name] = load_result(
                        prob=1, ip_type=ip_types[self.ip_type], name=ip_word)
                    continue
                # No rule fired: forward to the model unless the word is
                # flagged as type_noise_ip (a straight copy of the positions).
                if not self.ip_words[ip_word][0]['type_noise_ip']:
                    model_predict_words[ip_word] = name_position_data[ip_word]
        except Exception:  # narrowed from bare except so SystemExit/KeyboardInterrupt propagate
            error = "yingshi_rule_filter error. traceback: %s" % traceback.format_exc()
            self.err_logger.error(error)
        # The original kept a `judge_exit` flag (intended for the 《ip》-style
        # quoted-title case) that was never set to True, making its
        # early-return branch unreachable; both the flag and the dead branch
        # have been removed without changing behaviour.
        return result, model_predict_words
Example #4
0
    def rule_filter(self, name_position_data, text, names_data, name_tag):
        '''
        Rule-based filtering for sports-star IP words.

        :param name_position_data: dict mapping each matched ip_word to its
            list of (start, end) positions in ``text``.
        :param text: the full text being scanned.
        :param names_data: dict mapping ip_word -> index selecting which entry
            of ``self.ip_words[ip_word]`` applies.
        :param name_tag: not referenced in this body; kept for interface
            compatibility with callers.
        :return: tuple ``(result, model_predict_words)``.
        '''
        result = {}  # {'name1':[{'prob':1, 'type':'3', 'name':'一代宗师'}],  'name2':[{}]}
        model_predict_words = {}  # words that still need to go through the model
        try:
            # Apply the reverse (removal) rules first; keep only surviving words.
            names_list = names_data.keys()
            contain_name = remove_ip_rule(self.ip_words, name_position_data,
                                          text, names_list)

            for ip_word in contain_name:
                i_n = names_data[ip_word]  # which ip_words entry applies to this word
                ip_name = self.ip_words[ip_word][i_n]['ip_name']  # canonical ip_name
                if ip_name in result:  # already accepted via a synonym
                    continue
                # Positive rules from basic_tools.
                basic_judge_result = predicate_ip_rule(self.ip_words,
                                                       ip_word,
                                                       text,
                                                       name_position_data,
                                                       contain_name,
                                                       three_related=True)
                if basic_judge_result:
                    result[ip_name] = load_result(
                        prob=1, ip_type=ip_types[self.ip_type], name=ip_word)
                    continue
                # Rule 3: disambiguate '汤普森' (Thompson) and '亚当斯' (Adams)
                # via co-occurring context words in the text.
                if ip_word in ['汤普森', '亚当斯']:
                    # BUG FIX: this flag was initialised to True, so the
                    # `continue` guards below always fired and these two names
                    # could never fall through to the model. The redundant
                    # `= True` assignments inside the match branches show the
                    # init was meant to be False.
                    judge_fdeterminer = False
                    if ip_word == '汤普森':
                        fdeterminer_words = [
                            '詹姆斯', '乐福', '韦德', '罗斯', '卡戴珊', '骑士', '特里斯坦', '斯坦',
                            'tt汤普森'
                        ]
                        for fdeterminer in fdeterminer_words:
                            if fdeterminer in text:
                                judge_fdeterminer = True
                                result['特里斯坦·特雷沃·詹姆斯·汤普森'] = load_result(
                                    prob=1,
                                    ip_type=ip_types[self.ip_type],
                                    name=ip_word)
                                break
                        if judge_fdeterminer:
                            continue
                    elif ip_word == '亚当斯':
                        fdeterminer_words = ['新疆', '外援']
                        for fdeterminer in fdeterminer_words:
                            if fdeterminer in text:
                                judge_fdeterminer = True
                                result['达柳斯·亚当斯'] = load_result(
                                    prob=1,
                                    ip_type=ip_types[self.ip_type],
                                    name=ip_word)
                                break
                        if judge_fdeterminer:
                            continue
                # No rule fired: forward to the model (a straight copy).
                model_predict_words[ip_word] = name_position_data[ip_word]
        except Exception:  # narrowed from bare except so SystemExit/KeyboardInterrupt propagate
            error = "sport_stars__rule_filter error. traceback: %s" % traceback.format_exc()
            self.err_logger.error(error)
        return result, model_predict_words
Example #5
0
    def rule_filter(self, name_position_data, text):
        '''
        Rule-based filtering for film/TV-star IP words.

        A surviving ip word is accepted (prob=1) when related vocabulary
        appears near its occurrences in ``text``; otherwise it is dropped.

        :param name_position_data: dict mapping each matched ip_word to its
            list of (start, end) positions in ``text``.
        :param text: the full text being scanned.
        :return: tuple ``(result, [])`` — unlike the sibling filters, the
            second element is always an empty list (nothing goes to a model).
        '''
        result = {
        }  # {'name1':[{'prob':1, 'type':'3', 'name':'一代宗师'}],  'name2':[{}]}
        try:
            # Apply the reverse (removal) rules first; keep only surviving words.
            names_list = name_position_data.keys()
            contain_name = remove_ip_rule(self.ip_words, name_position_data,
                                          text, names_list)

            # Decide for each surviving word whether it really is an IP.
            for ip_word in contain_name:  # iterate over every surviving ip word
                ip_name = self.ip_words[ip_word][0][
                    'ip_name']  # canonical ip_name for this word
                if ip_name in result:  # already accepted via a synonym
                    continue
                # Drop the word if noise markers ('化名' pseudonym, editor /
                # intern / reporter bylines, photo credits, observer) appear
                # within 24 chars of any occurrence.
                judge_noise_word = False
                judge_noise_break = False
                for position in name_position_data[ip_word]:
                    start = position[0] - 24 if (position[0] - 24) > 0 else 0
                    end = position[1] + 24 if (
                        position[1] + 24 < len(text)) else (len(text))
                    judge_scope = text[start:end]
                    for noise_word in ['化名', '编辑', '实习', '记者', '/摄', '观察员']:
                        if noise_word in judge_scope:
                            judge_noise_word = True
                            judge_noise_break = True
                            break
                    if judge_noise_break:
                        break
                if judge_noise_word:  # a noise marker was found: skip this word
                    continue
                # Positive rules from basic_tools.
                basic_judge_result = predicate_ip_rule(self.ip_words,
                                                       ip_word,
                                                       text,
                                                       name_position_data,
                                                       contain_name,
                                                       three_related=True)
                if basic_judge_result:
                    result[ip_name] = load_result(
                        prob=1, ip_type=ip_types[self.ip_type], name=ip_word)
                    continue
                # Rule 4: accept if related vocabulary (or another surviving
                # ip name) appears within 45 chars of any occurrence.
                judge_words = copy.deepcopy(star_part_text_related_words)
                other_names = copy.deepcopy(contain_name)
                other_names.remove(ip_word)
                for other_name in other_names:
                    if not self.ip_words[ip_word][0][
                            'noise_ip']:  # NOTE(review): tests ip_word's noise flag, which is loop-invariant here; the original comment ("add the other ips' names") suggests `other_name` may have been intended — confirm
                        judge_words.append(other_name)
                judge_part_break = False
                for position in name_position_data[ip_word]:
                    judge_break = False
                    start = position[0] - 45 if (position[0] - 45) > 0 else 0
                    end = position[1] + 45 if (
                        position[1] + 45 < len(text)) else (len(text))
                    judge_scope = text[start:end]
                    for word in judge_words:
                        if word in judge_scope:
                            result[ip_name] = load_result(
                                prob=1,
                                ip_type=ip_types[self.ip_type],
                                name=ip_word)
                            judge_break = True
                            judge_part_break = True
                            break
                    if judge_break:
                        break
                if judge_part_break:  # accepted by rule 4
                    continue
                # Rule 5: same scan with a tighter 24-char window and the
                # smaller "least related" word list.
                least_judge_words = star_least_text_related_words
                judge_least_break = False
                for position in name_position_data[ip_word]:
                    judge_break = False
                    start = position[0] - 24 if (position[0] - 24) > 0 else 0
                    end = position[1] + 24 if (
                        position[1] + 24 < len(text)) else (len(text))
                    judge_scope = text[start:end]
                    for word in least_judge_words:
                        if word in judge_scope:
                            result[ip_name] = load_result(
                                prob=1,
                                ip_type=ip_types[self.ip_type],
                                name=ip_word)
                            judge_break = True
                            judge_least_break = True
                            break
                    if judge_break:
                        break
                if judge_least_break:  # accepted by rule 5
                    continue
                # Rule 6 (the original labelled this "rule 5" again and said
                # "within 100 chars" although the window is 150): accept if
                # the star-ip index matches anything near an occurrence.
                for position in name_position_data[ip_word]:
                    start = position[0] - 150 if (position[0] - 150) > 0 else 0
                    end = position[1] + 150 if (
                        position[1] + 150 < len(text)) else (len(text))
                    judge_scope = text[start:end]
                    full_text_name_position = self.star_ip_index.query(
                        judge_scope)
                    if full_text_name_position:  # NOTE(review): original comment said "if NOT in range" but this branch fires when the index query DID match — the comment appears inverted
                        result[ip_name] = load_result(
                            prob=1,
                            ip_type=ip_types[self.ip_type],
                            name=ip_word)
                        break
        except:
            error = "stars_yingshi_rule_filter error. traceback: %s" % traceback.format_exc(
            )
            self.err_logger.error(error)
        return result, []