import re
from collections import defaultdict

import jieba
from jieba.analyse.tfidf import TFIDF
from jieba.posseg import POSTokenizer


def review_model_predict_entities(model_predict_entities):
    # Post-filter model-predicted entities: re-align each entity with the jieba
    # segmentation of its source sentence, strip a trailing percentage, and
    # drop implausible person ("人物") entities.
    word_tag_map = POSTokenizer().word_tag_tab
    idf_freq = TFIDF().idf_freq
    reviewed_entities = defaultdict(list)
    for ent_type, ent_and_sent_list in model_predict_entities.items():
        for ent, sent in ent_and_sent_list:
            start = sent.lower().find(ent)
            if start == -1:
                continue
            # 1-based character span of the entity inside the sentence.
            start += 1
            end = start + len(ent) - 1
            tokens = jieba.lcut(sent)
            offset = 0
            selected_tokens = []
            # Collect every token overlapping the [start, end] span, expanding
            # the entity to full word boundaries.
            for token in tokens:
                offset += len(token)
                if offset >= start:
                    selected_tokens.append(token)
                if offset >= end:
                    break
            fixed_entity = ''.join(selected_tokens)
            # Drop a trailing percentage such as "3.5%".
            fixed_entity = re.sub(r'\d*\.?\d+%$', '', fixed_entity)
            if ent_type == '人物':
                # Person names: reject over-long, single-character, purely
                # numeric entities, and common low-IDF verbs.
                if len(fixed_entity) >= 10:
                    continue
                if len(fixed_entity) <= 1:
                    continue
                if re.findall(r'^\d+$', fixed_entity):
                    continue
                if word_tag_map.get(fixed_entity, '') == 'v' \
                        and idf_freq[fixed_entity] < 7:
                    continue
            reviewed_entities[ent_type].append(fixed_entity)
    return reviewed_entities
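# Minimal usage sketch; the input shape (entity type -> list of
# (entity, source sentence) pairs) is inferred from the loops above, and the
# sample values are made up.
model_predict_entities = {
    '人物': [('张三', '张三出席了会议。'), ('12345', '编号12345已注册')],
}
# '张三' survives the review; the purely numeric '12345' is filtered out.
print(review_model_predict_entities(model_predict_entities))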
from jieba.posseg import POSTokenizer


class Jieba(object):
    def __init__(self, config):
        self.config = config
        self.dt = POSTokenizer()

    def load_dict(self):
        # The word dict is a TSV file whose first column is the word itself.
        self.dict = []
        try:
            with open(self.config.get("word_dict", None), "r") as frobj:
                for line in frobj:
                    content = line.strip().split("\t")[0]
                    self.dict.append(content)
        except Exception:
            raise ValueError("word piece dict does not exist")

    def load_model(self):
        # Register every dictionary word with the underlying tokenizer.
        for word in self.dict:
            self.dt.add_word(word)

    def build_word_id(self):
        self.word2id, self.id2word = {}, {}
        for index, word in enumerate(self.dict):
            self.word2id[word] = index
            self.id2word[index] = word

    def add_extra_word(self,
                       extra_lst=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]):
        # Prepend the special tokens so that they receive the smallest ids.
        extra_lst = extra_lst if extra_lst else self.config.get("extra_lst", [])
        if len(extra_lst) >= 1:
            for word in extra_lst:
                if word in self.dict:
                    self.dict.remove(word)
            self.dict = extra_lst + self.dict

    def train_model(self, train_config=None):
        config = train_config if train_config else self.config
        self.dict = []
        try:
            with open(config.get("word_dict", None)) as frobj:
                for line in frobj:
                    content = line.strip().split("\t")[0]
                    self.dict.append(content)
        except Exception:
            raise ValueError("word dict does not exist")

    def tokenize(self, text):
        # POSTokenizer.lcut yields (word, flag) pairs; keep only the word.
        tokenized_text = self.dt.lcut(text)
        return [list(word)[0] for word in tokenized_text]

    def convert_tokens_to_ids(self, text, unk="[UNK]"):
        tokenized_text = self.tokenize(text)
        token_id_lst = [
            self.word2id.get(word, self.word2id[unk])
            for word in tokenized_text
        ]
        return token_id_lst
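# Usage sketch for the Jieba wrapper above; the "word_dict.txt" path and its
# one-word-per-line TSV content are assumptions, not part of the original code.
jieba_wrapper = Jieba({"word_dict": "word_dict.txt"})
jieba_wrapper.load_dict()        # read the vocabulary file
jieba_wrapper.add_extra_word()   # prepend [PAD]/[UNK]/[CLS]/[SEP]/[MASK]
jieba_wrapper.build_word_id()    # build the word <-> id maps
jieba_wrapper.load_model()       # register every word with the POSTokenizer
ids = jieba_wrapper.convert_tokens_to_ids("自然语言处理很有趣")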
import re

from jieba.posseg import POSTokenizer


class Jieba_CHAR(object):
    def __init__(self, config):
        print("----------using naive cut tool---------")
        self.config = config
        self.dt = POSTokenizer()

    def load_vocab(self, vocab_lst=None):
        try:
            self.word2id = {}
            for index, word in enumerate(vocab_lst):
                # A high frequency keeps user-supplied words unsplit.
                self.dt.add_word(word, 1e5)
                self.word2id[word] = index
            print("==total vocab==", len(self.word2id))
        except Exception:
            print("==not included word list==")

    def tokenize(self, text):
        # Segment the whitespace-stripped text, then explode any token that
        # contains Chinese characters into single characters; purely
        # non-Chinese tokens (digits, latin words) are kept whole.
        out = []
        char_pattern = re.compile(u"[\u4e00-\u9fa5]+")
        word_list = list(self.dt.lcut("".join(text.split())))
        for word in word_list:
            word = list(word)  # pair(word, flag) -> [word, flag]
            char_cn = char_pattern.findall(word[0])
            if len(char_cn) >= 1:
                for item in word[0]:
                    if len(item) >= 1:
                        out.append(item)
            else:
                if len(word[0]) >= 1:
                    out.append(word[0])
        return out

    def convert_tokens_to_ids(self, token_lst, max_length):
        # Pad / truncate to max_length; unknown tokens map to <unk>.
        token_id_lst = [self.word2id["<pad>"] for _ in range(max_length)]
        for index, word in enumerate(token_lst[0:max_length]):
            if word in self.word2id:
                token_id_lst[index] = self.word2id[word]
            else:
                token_id_lst[index] = self.word2id["<unk>"]
        return token_id_lst

    def covert_tokens_to_char_ids(self, token_lst, max_length, char_len=5):
        # Per-token character ids, padded to char_len characters per token.
        char_id_lst = [[self.word2id["<pad>"] for _ in range(char_len)]
                       for _ in range(max_length)]
        for index, word in enumerate(token_lst[0:max_length]):
            for char_index, char in enumerate(word[0:char_len]):
                if char in self.word2id:
                    char_id_lst[index][char_index] = self.word2id[char]
                else:
                    char_id_lst[index][char_index] = self.word2id["<unk>"]
        return char_id_lst
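# Usage sketch for Jieba_CHAR; the toy vocabulary below (including the <pad>
# and <unk> entries the id-conversion methods rely on) is a made-up example.
char_tool = Jieba_CHAR(config={})
char_tool.load_vocab(["<pad>", "<unk>", "我", "喜", "欢", "nlp"])
tokens = char_tool.tokenize("我喜欢 nlp")   # e.g. ['我', '喜', '欢', 'nlp']
token_ids = char_tool.convert_tokens_to_ids(tokens, max_length=8)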
import codecs

from jieba.posseg import POSTokenizer


class jieba_api(object):
    def __init__(self):
        print("----------using jieba cut tool---------")

    def init_config(self, config):
        self.config = config
        self.dt = POSTokenizer()

    def build_tool(self):
        # Optional user dictionary: one "word<TAB>freq<TAB>pos-tag" per line.
        dict_path = self.config.get("user_dict", None)
        if dict_path is not None:
            with codecs.open(dict_path, "r", "utf-8") as frobj:
                lines = frobj.read().splitlines()
                for line in lines:
                    content = line.split("\t")
                    self.dt.add_word(content[0], int(content[1]), content[2])

    def cut(self, text):
        # POSTokenizer.cut yields (word, flag) pairs; keep the words only and
        # join the non-empty ones with spaces.
        words = [word for word, flag in self.dt.cut(text)]
        return " ".join([word for word in words if len(word) >= 1])
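# Usage sketch for jieba_api with no user dictionary configured; cut()
# returns the segmented text joined by single spaces.
seg = jieba_api()
seg.init_config({})
seg.build_tool()
print(seg.cut("今天天气不错"))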
import os

import jieba
from jieba.posseg import POSTokenizer

import config  # project-local settings module providing JIEBA_DICT_PATH


class TextTool:
    def __init__(self):
        # Build a private Tokenizer, load every *.txt user dictionary found
        # under config.JIEBA_DICT_PATH, then wrap it with a POS tokenizer.
        self.token = jieba.Tokenizer()
        dict_files = [
            x.path for x in os.scandir(config.JIEBA_DICT_PATH)
            if x.path.endswith("txt")
        ]
        for fp in dict_files:
            self.token.load_userdict(fp)
        self.pos_token = POSTokenizer(self.token)

    def lcut(self, query):
        return self.token.lcut(query)

    def pos_lcut(self, query):
        return self.pos_token.lcut(query)
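# Usage sketch for TextTool; it only needs config.JIEBA_DICT_PATH to point at
# a directory of *.txt user dictionaries.
tool = TextTool()
print(tool.lcut("自然语言处理"))      # plain segmentation
print(tool.pos_lcut("自然语言处理"))  # (word, POS-flag) pairs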
def import_jieba_posseg(dt=None):
    # Wrap an existing jieba Tokenizer (or the default one when dt is None)
    # with a part-of-speech tokenizer.
    from jieba.posseg import POSTokenizer
    dt_pos = POSTokenizer(tokenizer=dt)
    return dt_pos
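# Example call: reuse jieba's default tokenizer (jieba.dt) so user words added
# via jieba.add_word are also visible to the POS tagger.
import jieba

pos_dt = import_jieba_posseg(jieba.dt)
print(pos_dt.lcut("今天天气不错"))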
import jieba
from jieba.posseg import POSTokenizer


def jieba_wrap_init():
    # Lazily create a module-level POS tokenizer on top of jieba's default
    # tokenizer (jieba.dt).
    global posseg_tok
    posseg_tok = POSTokenizer(jieba.dt)
import codecs
import re

from jieba.posseg import POSTokenizer


class cut_tool_api(object):
    def __init__(self):
        print("----------using naive cut tool---------")

    def init_config(self, config):
        self.config = config
        self.dt = POSTokenizer()
        self.cut_flag = False
        self.word_type = []
        self.dt.add_word("<SEG>", 10000)

    def build_tool(self):
        # Load an optional user dictionary of "word<TAB>freq<TAB>tag" lines and
        # remember the set of tags seen. Note that cut_flag is never switched
        # to True here, so the tag/<target> branch of cut() only runs if the
        # caller enables it explicitly.
        self.word_type = []
        try:
            dict_path = self.config.get("user_dict", None)
            with codecs.open(dict_path, "r", "utf-8") as frobj:
                lines = frobj.read().splitlines()
                for line in lines:
                    content = line.split("\t")
                    try:
                        self.dt.add_word(content[0], int(content[1]), content[2])
                        self.word_type.append(content[2])
                    except Exception:
                        continue
                self.dt.add_word("$$", 100000, "<split_symbol>")
                print("====succeeded in loading dictionary====", dict_path)
                self.word_type = list(set(self.word_type))
                self.word_type = [
                    item for item in self.word_type if len(item) >= 1
                ]
                self.word_type.append("$$")
        except Exception:
            self.cut_flag = False

    def cut(self, text, target=None):
        # Default behaviour (cut_flag False): split Chinese words into single
        # characters and keep non-Chinese tokens whole. With cut_flag enabled,
        # words whose tag is in word_type are kept whole, or, when a target is
        # given, replaced by their tag ("<target>" for the target word itself).
        out = []
        char_pattern = re.compile(u"[\u4e00-\u9fa5]+")
        word_list = list(self.dt.lcut("".join(text.split())))
        for word in word_list:
            word = list(word)  # pair(word, flag) -> [word, flag]
            if len(word[0]) == 0:
                continue
            if self.cut_flag:
                if word[1] in self.word_type:
                    if target:
                        if word[0] == target:
                            out.append("<target>")
                        else:
                            out.append(word[1])
                    else:
                        out.append(word[0])
                else:
                    char_cn = char_pattern.findall(word[0])
                    if len(char_cn) >= 1:
                        for item in word[0]:
                            if len(item) >= 1:
                                out.append(item)
                    else:
                        if len(word[0]) >= 1:
                            out.append(word[0])
            else:
                char_cn = char_pattern.findall(word[0])
                if len(char_cn) >= 1:
                    for item in word[0]:
                        if len(item) >= 1:
                            out.append(item)
                else:
                    if len(word[0]) >= 1:
                        out.append(word[0])
        return " ".join(out)
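# Usage sketch for cut_tool_api. The user_dict.txt path and its
# "word<TAB>freq<TAB>tag" lines are made up, and cut_flag is switched on
# manually because build_tool never enables it by itself.
cut_tool = cut_tool_api()
cut_tool.init_config({"user_dict": "user_dict.txt"})
cut_tool.build_tool()
cut_tool.cut_flag = True
print(cut_tool.cut("我想买苹果手机", target="苹果手机"))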