def search_longest_mats(self, line):
    """Extract the single longest dictionary hit found in ``line``.

    The line is scanned left to right. At every position the remaining
    suffix is handed to ``self.longest_prefix`` (defined elsewhere; it is
    expected to return the matched prefix, or ``''`` when nothing matches).
    A hit replaces the current best only when it is strictly longer, so the
    earliest hit wins ties.

    Args:
        line: The text to scan.

    Returns:
        A ``KVObj`` describing the longest hit (``group``/``type_name``/
        ``vpat`` left as ``None`` and ``rank`` as ``-1``), or an empty list
        when no hit was found.
    """
    best_word, best_len = '', 0
    cursor = 0
    total = len(line)
    while cursor < total:
        candidate = self.longest_prefix(line[cursor:])
        if candidate != '' and len(candidate) > best_len:
            # New best: remember its span and jump past it.
            best_word = candidate
            best_len = len(candidate)
            start_pos = cursor
            end_pos = cursor + best_len
            cursor = end_pos
        else:
            # No hit, or a hit that is not longer: advance one character.
            cursor += 1

    if not best_word:
        return []

    # Share of the line covered by the hit.
    coverage = 1.0 * len(best_word) / len(line)
    return KVObj(group=None, rank=-1, type_name=None, vpat=None,
                 word=best_word, start_pos=start_pos, end_pos=end_pos,
                 ratio=coverage)
def search_all_mats(self, line):
    """Extract every non-overlapping dictionary hit found in ``line``.

    The line is scanned left to right. At every position the remaining
    suffix is handed to ``self.longest_prefix`` (defined elsewhere; it is
    expected to return the matched prefix, or ``''`` when nothing matches).
    After a hit the scan resumes right behind it; otherwise it advances by
    one character.

    Args:
        line: The text to scan.

    Returns:
        A list of ``KVObj`` items, one per hit in order of appearance, each
        with ``group``/``type_name``/``vpat`` set to ``None`` and ``rank``
        set to ``-1``. The list is empty when nothing matched.
    """
    # Phase 1: collect (word, start, end) spans.
    spans = []
    cursor = 0
    while cursor < len(line):
        found = self.longest_prefix(line[cursor:])
        if found != '':
            stop = cursor + len(found)
            spans.append((found, cursor, stop))
            cursor = stop
        else:
            cursor += 1

    # Phase 2: wrap each span; ratio is the share of the line it covers.
    return [
        KVObj(group=None, rank=-1, type_name=None, vpat=None,
              word=word, start_pos=begin, end_pos=stop,
              ratio=1.0 * len(word) / len(line))
        for word, begin, stop in spans
    ]
def _make_vpats(self, contain_vpats, vpats, dict_pats, long=True):
    """Turn raw template entries into ``KVObj`` pattern descriptors.

    Every entry is a ``name=value`` string. The name is split on ``_``: its
    first piece (with ``$`` removed) is the group, and its last piece (with
    ``rank`` removed) is parsed as the integer rank. Only the first ``=``
    separates name from value, so values may themselves contain ``=``.

    Args:
        contain_vpats: Contain/not-contain entries. The value is a
            comma-separated list; each item is passed to
            ``self._match_not_match`` and kept only when both returned
            parts are truthy.
        vpats: Regular-expression entries. The value is a comma-separated
            list. Consecutive entries with the same group are merged into
            one descriptor that keeps the rank of the first entry.
        dict_pats: Keyword-dictionary entries. The value is a
            comma-separated word list, optionally wrapped in ``[`` ``]``.
        long: Not used by this method; kept for interface compatibility.

    Returns:
        A tuple ``(kv_contain, kvpats, kvdicts)`` where ``kvdicts`` is the
        result of ``self._combine_pats`` applied to the dictionary
        descriptors.

    Raises:
        IndexError: If a contain or dictionary entry has no ``=``.
        ValueError: If a rank suffix is not an integer.
    """
    # --- contain / not-contain patterns ---------------------------------
    kv_contain = []
    for entry in contain_vpats:
        # Split on the first '=' only so patterns containing '=' survive.
        fields = entry.split('=', 1)
        kname = fields[0]
        raw_pats = fields[1].split(',')
        name_parts = kname.split('_')
        group = name_parts[0].replace('$', '')
        rank = int(name_parts[-1].replace('rank', ''))
        for raw_pat in raw_pats:
            norm_pat, wrong_pat = self._match_not_match(raw_pat)
            if norm_pat and wrong_pat:
                kv_contain.append(
                    KVObj(group=group, rank=rank, type_name='re_expr',
                          vpat=(norm_pat, wrong_pat)))

    # --- regular expressions --------------------------------------------
    kvpats = []
    vpats_num = len(vpats)
    group, rank = '', ''
    for entry in vpats:
        kname, _, value = entry.partition('=')
        pats = value.split(',')
        # Names look like "yes_rank1".
        name_parts = kname.split('_')
        entry_group = name_parts[0].replace('$', '')
        if entry_group != group:
            # A new group starts: flush the previous one, if any.
            if group:
                kvpats.append(
                    KVObj(group=group, rank=rank, type_name='regu_expr',
                          vpat=inner_pats))
            group = entry_group
            rank = int(name_parts[-1].replace('rank', ''))
            inner_pats = pats
        else:
            # Same group as the previous entry: accumulate its patterns.
            inner_pats.extend(pats)
    if vpats_num:
        # Flush the group that was still open when the input ended.
        kvpats.append(
            KVObj(group=group, rank=rank, type_name='regu_expr',
                  vpat=inner_pats))

    # --- keyword dictionaries -------------------------------------------
    kvdicts = []
    for entry in dict_pats:
        fields = entry.split('=', 1)
        kname = fields[0]
        words = fields[1].replace('[', '').replace(']', '').split(',')
        name_parts = kname.split('_')
        group = name_parts[0].replace('$', '')
        rank = int(name_parts[-1].replace('rank', ''))
        kvdicts.append(
            KVObj(group=group, rank=rank, type_name='dict', vpat=words))
    kvdicts = self._combine_pats(kvdicts)

    return kv_contain, kvpats, kvdicts
def parse(self, line, keywordVersion='', long=True):
    """Return the best match for ``line``, or ``None`` when nothing matches.

    ``self.parsed_pats[keywordVersion]`` is read as a pair of rank-group
    sequences; index 0 is used when ``long`` is true, index 1 otherwise.
    Each rank group holds three collections: contain/not-contain patterns,
    regular-expression patterns, and dictionary tries. Rank groups are
    tried in order and the first group yielding a regex or dictionary hit
    decides the result; when both hit, the one with the larger ``ratio``
    wins (the dictionary hit wins ties).

    Args:
        line: The text to match.
        keywordVersion: Key into ``self.parsed_pats``; must be non-empty.
        long: Selects which of the two rank-group sequences is used.

    Returns:
        The matching object (a ``KVObj`` for regex hits, or whatever the
        trie's ``search_longest_mats`` returned for dictionary hits, with
        ``group``/``rank``/``type`` filled in), or ``None``.

    Raises:
        ValueError: If ``keywordVersion`` is empty.
        KeyError: If ``keywordVersion`` is not in ``self.parsed_pats``.
    """
    if not keywordVersion:
        # Previously this path failed later with an UnboundLocalError.
        raise ValueError('keywordVersion is required to select a pattern set')

    sorted_group = self.parsed_pats[keywordVersion][0 if long else 1]
    line_len = len(line)
    match_mat, re_mat, dict_mat = None, None, None

    for rank_group in sorted_group:
        match_group = rank_group[0]  # contain / not-contain patterns
        re_group = rank_group[1]     # regular-expression patterns
        dict_group = rank_group[2]   # dictionary tries

        # Step 1: contain / not-contain. The first pattern of each pair
        # must match and the second must not.
        # NOTE(review): match_mat is computed here but never returned by
        # this method - confirm whether it was meant to take priority.
        for mat in match_group:
            for pat in mat.vpat:
                match_pat, not_match_pat = pat[0], pat[1]
                res_match = match_pat.search(line)
                res_not_match = not_match_pat.search(line)
                if res_match and not res_not_match:
                    start_pos, end_pos = res_match.span()
                    # An empty line would otherwise divide by zero.
                    ratio = (1.0 * (end_pos - start_pos) / line_len
                             if line_len else 0.0)
                    match_mat = KVObj(group=mat.group, rank=mat.rank,
                                      type_name=mat.type,
                                      word=res_match.group(),
                                      start_pos=start_pos, end_pos=end_pos,
                                      ratio=ratio)
                    break
            if match_mat:
                break

        # Step 2: regular expressions; each item is
        # (compiled pattern, original pattern text).
        for mat in re_group:
            for pat in mat.vpat:
                compiled_pat, origin_pat = pat[0], pat[1]
                res_mat = compiled_pat.search(line)
                if res_mat:
                    start_pos, end_pos = res_mat.span()
                    ratio = (1.0 * (end_pos - start_pos) / line_len
                             if line_len else 0.0)
                    re_mat = KVObj(group=mat.group, rank=mat.rank,
                                   type_name=mat.type, word=origin_pat,
                                   start_pos=start_pos, end_pos=end_pos,
                                   ratio=ratio)
                    break
            if re_mat:
                break

        # Step 3: dictionary lookup; the first trie with a hit wins.
        for mat in dict_group:
            dict_mat = mat.vpat.search_longest_mats(line)
            if dict_mat:
                dict_mat.group = mat.group
                dict_mat.rank = mat.rank
                dict_mat.type = mat.type
                break

        # Decide for this rank group: prefer the hit covering more of the
        # line when both kinds matched.
        if re_mat and dict_mat:
            return re_mat if re_mat.ratio > dict_mat.ratio else dict_mat
        if re_mat:
            return re_mat
        if dict_mat:
            return dict_mat

    return None