Example #1
import copy


def remove_ip_rule(ip_words, name_position_data, text, names_list):
    contain_name = copy.deepcopy(names_list)
    remove_ip_list = []  # names to drop: negative determiner, adjacent letters, or unsegmentable
    for ip_word in names_list:
        # Precondition rule: check negative (anti-correlation) determiners
        if ip_words[ip_word][0]['fdeterminer']:
            judge_fdeterminer = False
            fdeterminer_words = ip_words[ip_word][0]['fdeterminer'].split(
                '、')  # e.g. '非霍奇金、淋巴瘤' (non-Hodgkin, lymphoma)
            for fdeterminer in fdeterminer_words:
                if fdeterminer in text:  # e.g. 'NFL' alongside 非霍奇金 or 淋巴瘤 is not an IP word
                    judge_fdeterminer = True  # a negative determiner is present
                    break
            if judge_fdeterminer:  # negative determiner found: skip this ip_word
                remove_ip_list.append(ip_word)
                continue
        # Rule 0: drop occurrences with a letter/digit immediately before or after
        if ip_words[ip_word][0]['is_english_name'] == 1:  # English name
            remove_position_list = []
            for position in name_position_data[
                    ip_word]:  # [(position1), (position2)]
                start_pos = position[0]
                end_pos = position[1]
                if start_pos > 0 and text[
                        start_pos - 1].isalnum():  # letter/digit right before: not an IP here
                    remove_position_list.append(position)
                    continue
                if len(text) > end_pos:
                    if text[end_pos].isalnum():  # letter/digit right after: not an IP here
                        remove_position_list.append(position)
                        continue
            for remove_data in remove_position_list:  # drop these positions from name_position_data
                name_position_data[ip_word].remove(remove_data)
            if len(name_position_data[ip_word]) == 0:  # no positions left
                remove_ip_list.append(ip_word)
                continue
        # Rule 0-1: for Chinese IPs that must segment cleanly, drop occurrences
        # the segmenter does not produce as a whole token
        if ip_words[ip_word][0]['is_cut_word'] == 1:
            remove_position_list = []
            for position in name_position_data[ip_word]:
                start = max(position[0] - 60, 0)
                end = min(position[1] + 60, len(text))
                words_cut = norm_cut(text[start:end])
                word_list = []
                for word in words_cut:
                    word_list.append(word.encode('utf-8'))
                if ip_word not in word_list:
                    remove_position_list.append(position)
            for remove_data in remove_position_list:
                name_position_data[ip_word].remove(remove_data)
            if len(name_position_data[ip_word]) == 0:  # no positions left
                remove_ip_list.append(ip_word)
    for rm_ip in remove_ip_list:  # apply the removal rules
        contain_name.remove(rm_ip)
    return contain_name
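
A minimal, hypothetical usage sketch (the dict shapes are inferred from the field accesses above; with 'is_cut_word' left at 0, nothing here needs norm_cut):

ip_words = {'NFL': [{'fdeterminer': '', 'is_english_name': 1, 'is_cut_word': 0}]}
text = 'NFL is fun XNFL here'
name_position_data = {'NFL': [(0, 3), (12, 15)]}  # second span is glued to 'X'
names_list = ['NFL']

# Rule 0 drops the (12, 15) span (letter right before it); (0, 3) survives,
# so 'NFL' stays in the returned list.
print(remove_ip_rule(ip_words, name_position_data, text, names_list))  # ['NFL']
print(name_position_data)  # {'NFL': [(0, 3)]}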
Example #2
 def cut_word(self, sents):
     # Segment the sentence
     words = [i.encode('utf-8', 'ignore')
              for i in norm_cut(sents)]  # HMM=False
     # Replace synonyms in place
     for num, w in enumerate(words):
         if w in self.sysn:
             words[num] = self.sysn[w]
     return words
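
The synonym pass above is an index-tracked replace; the same effect with a comprehension, shown with hypothetical stand-in data (no norm_cut needed):

sysn = {'colour': 'color'}               # hypothetical synonym map
words = ['the', 'colour', 'red']
words = [sysn.get(w, w) for w in words]  # same result as the loop above
print(words)  # ['the', 'color', 'red']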
Example #3
def cut_input(input, posFlag):
    '''
    Cut an input string; return a utf-8 string.
    '''

    if posFlag:
        result = norm_seg(input)
        wordsList = []
        for w in result:
            wordsList.append(w.word + '_' + w.flag)
        words = " ".join(wordsList)
    else:
        words = " ".join(norm_cut(input))
    # return words.encode('utf-8')
    return words
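
A self-contained sketch of the two output shapes. norm_seg and norm_cut are stubbed here on the assumption that the real segmenter yields tokens with .word and .flag attributes, which is all the loop above relies on:

from collections import namedtuple

Tok = namedtuple('Tok', 'word flag')                       # stand-in token type
def norm_seg(s): return [Tok(w, 'n') for w in s.split()]   # stub segmenter
def norm_cut(s): return s.split()                          # stub segmenter

print(cut_input('hello world', True))   # 'hello_n world_n'
print(cut_input('hello world', False))  # 'hello world'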
Example #4
 def cut_word(self, sents):
     """
     Segment the sentence
     """
     words = [
         i.encode('utf-8', 'ignore') for i in norm_cut(sents)
         if i not in self.c_need_password
     ]  # HMM=False
     # Replace synonyms in place
     for num, w in enumerate(words):
         if w in self.sysn:
             words[num] = self.sysn[w]
     # Find the subject (disabled heuristic, kept for reference):
     # if len(words) >= 1:
     #     if self.keyword and (words[0] in self.filter_word or words[0] in c_need_subject):
     #         if len(words) <= 4:
     #             words.insert(0, self.keyword)
     return words
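
Compared with Example #2, this variant also filters out any token listed in self.c_need_password during segmentation, and the commented-out tail keeps a heuristic that would prepend self.keyword as a subject to short queries.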
Example #5
 def cut_word(self, sents):
     # Segment the sentence
     words = [i.encode('utf-8', 'ignore')
              for i in norm_cut(sents)]  # HMM=False
     return words