def remove_ip_rule(ip_words, name_position_data, text, names_list):
    """Filter IP-word candidates out of *names_list* using three rejection rules.

    Rules, applied per candidate in order:
      1. Negative determiner: if any '、'-separated word from the entry's
         'fdeterminer' field occurs in *text*, the candidate is rejected
         (e.g. seeing 非霍奇金/淋巴瘤 means "NFL" is not an IP word here).
      2. English names (is_english_name == 1): an occurrence immediately
         preceded or followed by a letter/digit is discarded; if no
         occurrence survives, the candidate is rejected.
      3. Names that must segment (is_cut_word == 1): an occurrence is kept
         only if the candidate appears as a whole token when the ±60-char
         window around it is segmented by norm_cut(); if no occurrence
         survives, the candidate is rejected.

    Args:
        ip_words: dict mapping name -> [attribute dict, ...]; the first
            dict provides 'fdeterminer', 'is_english_name', 'is_cut_word'.
        name_position_data: dict mapping name -> list of (start, end)
            spans in *text*; mutated in place by rules 2 and 3.
        text: full document text being scanned.
        names_list: candidate names detected in *text*.

    Returns:
        A new list (deep copy of *names_list*) with rejected names removed.
    """
    contain_name = copy.deepcopy(names_list)
    remove_ip_list = []  # names rejected by any rule below

    for ip_word in names_list:
        attrs = ip_words[ip_word][0]

        # Rule 1: any negative determiner present in text vetoes the word.
        if attrs['fdeterminer']:
            fdeterminer_words = attrs['fdeterminer'].split('、')
            if any(word in text for word in fdeterminer_words):
                remove_ip_list.append(ip_word)
                continue

        # Rule 2: an English name flanked by a letter/digit is a false hit.
        if attrs['is_english_name'] == 1:
            remove_position_list = []
            for position in name_position_data[ip_word]:
                start_pos, end_pos = position[0], position[1]
                # BUGFIX: check the bound BEFORE indexing. The original
                # evaluated text[start_pos - 1] first, so a match starting
                # at offset 0 silently read text[-1] (the last character).
                if start_pos > 0 and text[start_pos - 1].isalnum():
                    remove_position_list.append(position)
                    continue
                if end_pos < len(text) and text[end_pos].isalnum():
                    remove_position_list.append(position)
            for remove_data in remove_position_list:
                name_position_data[ip_word].remove(remove_data)
            if len(name_position_data[ip_word]) == 0:
                # No occurrence survived the boundary check.
                remove_ip_list.append(ip_word)
                continue

        # Rule 3: keep only occurrences that segment into a whole token.
        if attrs['is_cut_word'] == 1:
            remove_position_list = []
            for position in name_position_data[ip_word]:
                start = max(position[0] - 60, 0)
                end = min(position[1] + 60, len(text))
                word_list = [w.encode('utf-8')
                             for w in norm_cut(text[start:end])]
                if ip_word not in word_list:
                    remove_position_list.append(position)
            for remove_data in remove_position_list:
                name_position_data[ip_word].remove(remove_data)
            if len(name_position_data[ip_word]) == 0:
                remove_ip_list.append(ip_word)

    for rm_ip in remove_ip_list:
        contain_name.remove(rm_ip)
    return contain_name
def cut_word(self, sents):
    """Segment *sents* and map each token through the synonym table.

    Args:
        sents: text to segment with norm_cut() (HMM=False per original note).

    Returns:
        List of utf-8 encoded tokens, with any token found in
        ``self.sysn`` replaced by its synonym.
    """
    words = [w.encode('utf-8', 'ignore') for w in norm_cut(sents)]  # HMM=False
    # dict membership + dict.get replace the original manual index-counter
    # loop over `words` and the redundant `.keys()` membership test.
    return [self.sysn.get(w, w) for w in words]
def cut_input(input, posFlag):
    """Segment *input* and return the tokens joined by single spaces.

    Args:
        input: string to segment. (NOTE: the name shadows the builtin;
            kept unchanged for backward compatibility with keyword callers.)
        posFlag: when true, run POS tagging via norm_seg() and emit each
            token as ``word_flag``; otherwise plain norm_cut() tokens.

    Returns:
        Space-joined token string.
    """
    if posFlag:  # idiomatic truth test instead of `== True`
        # norm_seg yields pair objects exposing .word and .flag.
        return " ".join(w.word + '_' + w.flag for w in norm_seg(input))
    return " ".join(norm_cut(input))
def cut_word(self, sents):
    """Segment *sents*, dropping password-like tokens and applying synonyms.

    Args:
        sents: text to segment with norm_cut() (HMM=False per original note).

    Returns:
        List of utf-8 encoded tokens. Tokens in ``self.c_need_password``
        are filtered out BEFORE encoding (the membership test runs on the
        unencoded token), then tokens found in ``self.sysn`` are replaced
        by their synonym.
    """
    words = [
        w.encode('utf-8', 'ignore')
        for w in norm_cut(sents)
        if w not in self.c_need_password
    ]  # HMM=False
    # dict membership + dict.get replace the original manual index-counter
    # loop and `.keys()` test; the dead commented-out subject-insertion
    # experiment was removed.
    return [self.sysn.get(w, w) for w in words]
def cut_word(self, sents):
    """Segment *sents* with norm_cut() and return utf-8 encoded tokens."""
    encoded_tokens = []
    for token in norm_cut(sents):  # HMM=False
        encoded_tokens.append(token.encode('utf-8', 'ignore'))
    return encoded_tokens