Example 1
import re
from collections import defaultdict

import jieba
from jieba.posseg import POSTokenizer
from jieba.analyse.tfidf import TFIDF


def review_model_predict_entities(model_predict_entities):
    # jieba's built-in word -> POS-tag table and word -> IDF-weight table.
    word_tag_map = POSTokenizer().word_tag_tab
    idf_freq = TFIDF().idf_freq
    reviewed_entities = defaultdict(list)
    for ent_type, ent_and_sent_list in model_predict_entities.items():
        for ent, sent in ent_and_sent_list:
            # Locate the predicted entity in the sentence (the entity is
            # assumed to be lower-cased already); switch to a 1-based span.
            start = sent.lower().find(ent)
            if start == -1:
                continue
            start += 1
            end = start + len(ent) - 1
            # Re-segment the sentence and keep the tokens overlapping the span,
            # so the entity is snapped to jieba's word boundaries.
            tokens = jieba.lcut(sent)
            offset = 0
            selected_tokens = []
            for token in tokens:
                offset += len(token)
                if offset >= start:
                    selected_tokens.append(token)
                if offset >= end:
                    break

            fixed_entity = ''.join(selected_tokens)
            # Strip a trailing percentage (e.g. "3.5%") picked up while
            # snapping to word boundaries.
            fixed_entity = re.sub(r'\d*\.?\d+%$', '', fixed_entity)
            # Filter implausible results: over-long person ('人物') names,
            # single characters and purely numeric strings.
            if ent_type == '人物' and len(fixed_entity) >= 10:
                continue
            if len(fixed_entity) <= 1:
                continue
            if re.findall(r'^\d+$', fixed_entity):
                continue
            # Drop plain verbs with a low IDF weight (likely generic words);
            # words missing from the IDF table are treated as low-IDF.
            if (word_tag_map.get(fixed_entity, '') == 'v'
                    and idf_freq.get(fixed_entity, 0) < 7):
                continue
            reviewed_entities[ent_type].append(fixed_entity)
    return reviewed_entities
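
A minimal sketch of how the reviewer might be called. The input shape (entity type mapped to a list of (entity, sentence) pairs) is inferred from the loop above; the sample predictions below are made up.

predictions = {
    '人物': [('小明', '小明今天去了北京。')],
    '机构': [('北京大学', '他毕业于北京大学。')],
}
cleaned = review_model_predict_entities(predictions)
print(dict(cleaned))  # e.g. {'人物': ['小明'], '机构': ['北京大学']}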
Example 2
from jieba.posseg import POSTokenizer


class Jieba(object):
    def __init__(self, config):
        self.config = config
        # POS-aware tokenizer; user words are registered in load_model().
        self.dt = POSTokenizer()

    def load_dict(self):
        self.dict = []
        try:
            with open(self.config.get("word_dict", None), "r") as frobj:
                for line in frobj:
                    # The word is the first tab-separated field of each line.
                    content = line.strip().split("\t")[0]
                    self.dict.append(content)
        except (TypeError, OSError):
            raise ValueError("word dict file does not exist")

    def load_model(self):
        for word in self.dict:
            self.dt.add_word(word)

    def build_word_id(self):
        self.word2id, self.id2word = {}, {}
        for index, word in enumerate(self.dict):
            self.word2id[word] = index
            self.id2word[index] = word

    def add_extra_word(self,
                       extra_lst=("[PAD]", "[UNK]", "[CLS]", "[SEP]",
                                  "[MASK]")):
        # Prepend the special tokens so that they receive the lowest ids.
        extra_lst = list(extra_lst) if extra_lst else self.config.get(
            "extra_lst", [])
        if len(extra_lst) >= 1:
            for word in extra_lst:
                if word in self.dict:
                    self.dict.remove(word)
            self.dict = extra_lst + self.dict

    def train_model(self, train_config=None):
        config = train_config if train_config else self.config
        self.dict = []
        try:
            with open(config.get("word_dict", None)) as frobj:
                for line in frobj:
                    content = line.strip().split("\t")[0]
                    self.dict.append(content)
        except (TypeError, OSError):
            raise ValueError("word dict file does not exist")

    def tokenize(self, text):
        # POSTokenizer.lcut returns (word, flag) pairs; keep only the word.
        tokenized_text = self.dt.lcut(text)
        return [list(word)[0] for word in tokenized_text]

    def convert_tokens_to_ids(self, text, unk="[UNK]"):
        tokenized_text = self.tokenize(text)
        token_id_lst = [
            self.word2id.get(word, self.word2id[unk])
            for word in tokenized_text
        ]
        return token_id_lst
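
A possible call sequence, assuming a word_dict file with one tab-separated entry per line (word in the first column); the file name below is a stand-in.

jb = Jieba({"word_dict": "word_dict.txt"})  # hypothetical path
jb.load_dict()        # read the vocabulary file
jb.add_extra_word()   # prepend [PAD], [UNK], ... to the vocabulary
jb.build_word_id()    # build the word <-> id maps
jb.load_model()       # register every word with the POS tokenizer
ids = jb.convert_tokens_to_ids("今天天气不错")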
Example 3
    def __init__(self):
        self.token = jieba.Tokenizer()
        # Load every *.txt user dictionary found under config.JIEBA_DICT_PATH.
        file = [
            x.path for x in os.scandir(config.JIEBA_DICT_PATH)
            if x.path.endswith("txt")
        ]
        for fp in file:
            self.token.load_userdict(fp)
        # POS tagger that shares the customised tokenizer above.
        self.pos_token = POSTokenizer(self.token)
Example 4
import re

from jieba.posseg import POSTokenizer


class Jieba_CHAR(object):
    def __init__(self, config):
        print("----------using naive cut tool---------")
        self.config = config
        self.dt = POSTokenizer()

    def load_vocab(self, vocab_lst=None):
        try:
            self.word2id = {}
            for index, word in enumerate(vocab_lst):
                # Register each word with a high frequency so jieba keeps it whole.
                self.dt.add_word(word, 1e5)
                self.word2id[word] = index
            print("==total vocab==", len(self.word2id))
        except TypeError:
            print("==no vocab list provided==")

    def tokenize(self, text):
        out = []
        char_pattern = re.compile(u"[\u4e00-\u9fa5]+")
        # Strip whitespace, then POS-cut; each item is a (word, flag) pair.
        word_list = list(self.dt.lcut("".join(text.split())))
        for word in word_list:
            word = list(word)
            char_cn = char_pattern.findall(word[0])
            if len(char_cn) >= 1:
                # Chinese tokens are split into single characters.
                for item in word[0]:
                    if len(item) >= 1:
                        out.append(item)
            else:
                # Non-Chinese tokens (digits, latin words) are kept whole.
                if len(word[0]) >= 1:
                    out.append(word[0])
        return out

    def convert_tokens_to_ids(self, token_lst, max_length):
        # Pre-fill with <pad> ids, then overwrite with up to max_length token ids.
        token_id_lst = [self.word2id["<pad>"] for _ in range(max_length)]
        for index, word in enumerate(token_lst[0:max_length]):
            if word in self.word2id:
                token_id_lst[index] = self.word2id[word]
            else:
                token_id_lst[index] = self.word2id["<unk>"]
        return token_id_lst

    def covert_tokens_to_char_ids(self, token_lst, max_length, char_len=5):
        # Per-token character ids, padded/truncated to char_len characters each.
        char_id_lst = [[self.word2id["<pad>"] for _ in range(char_len)]
                       for _ in range(max_length)]
        for index, word in enumerate(token_lst[0:max_length]):
            for char_index, char in enumerate(word[0:char_len]):
                if char in self.word2id:
                    char_id_lst[index][char_index] = self.word2id[char]
                else:
                    char_id_lst[index][char_index] = self.word2id["<unk>"]
        return char_id_lst
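
A small usage sketch, assuming the vocabulary passed to load_vocab already contains the "<pad>" and "<unk>" entries the id-conversion methods look up; the toy vocabulary is made up.

tok = Jieba_CHAR(config={})
tok.load_vocab(["<pad>", "<unk>", "我", "爱", "北", "京"])
tokens = tok.tokenize("我爱北京Tiananmen")  # Chinese split per character, "Tiananmen" kept whole
ids = tok.convert_tokens_to_ids(tokens, max_length=8)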
Example 5
from jieba.posseg import POSTokenizer


class jieba_api(object):
    def __init__(self):
        print("----------using jieba cut tool---------")

    def init_config(self, config):
        self.config = config
        self.dt = POSTokenizer()

    def build_tool(self):
        dict_path = self.config.get("user_dict", None)
        if dict_path is not None:
            import codecs
            with codecs.open(dict_path, "r", "utf-8") as frobj:
                lines = frobj.read().splitlines()
                for line in lines:
                    # Each line is expected to be "word<TAB>freq<TAB>POS-tag".
                    content = line.split("\t")
                    self.dt.add_word(content[0], int(content[1]), content[2])

    def cut(self, text):
        # POSTokenizer.cut yields (word, flag) pairs; keep only the word part.
        words = [word for word, flag in self.dt.cut(text)]
        # print(words, " ".join([word for word in words if len(word) >= 1]))
        return " ".join([word for word in words if len(word) >= 1])
Example 6
import os

import jieba
from jieba.posseg import POSTokenizer

import config  # project-local module expected to define JIEBA_DICT_PATH


class TextTool:
    def __init__(self):
        self.token = jieba.Tokenizer()
        # Load every *.txt user dictionary found under JIEBA_DICT_PATH.
        file = [
            x.path for x in os.scandir(config.JIEBA_DICT_PATH)
            if x.path.endswith("txt")
        ]
        for fp in file:
            self.token.load_userdict(fp)
        # POS tagger that shares the customised tokenizer above.
        self.pos_token = POSTokenizer(self.token)

    def lcut(self, query):
        return self.token.lcut(query)

    def pos_lcut(self, query):
        return self.pos_token.lcut(query)
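
For instance (assuming the project-local config module and its dictionary directory are in place):

tool = TextTool()
print(tool.lcut("自然语言处理"))      # plain word list
print(tool.pos_lcut("自然语言处理"))  # (word, POS-tag) pairs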
Example 7
def import_jieba_posseg(dt=None):
    # Wrap an existing jieba Tokenizer (or the default one when dt is None)
    # with a POS tagger.
    from jieba.posseg import POSTokenizer
    dt_pos = POSTokenizer(tokenizer=dt)

    return dt_pos
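
For example, passing jieba's default tokenizer gives a POS tagger that shares its dictionary and any user-added words:

import jieba

pos_dt = import_jieba_posseg(jieba.dt)
print(pos_dt.lcut("今天天气不错"))  # e.g. [pair('今天', 't'), pair('天气', 'n'), pair('不错', 'a')]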
Example 8
    def __init__(self, config):
        self.config = config
        self.dt = POSTokenizer()
Example 9
def jieba_wrap_init():
    global posseg_tok
    # Module-level POS tokenizer that reuses jieba's default tokenizer (jieba.dt).
    posseg_tok = POSTokenizer(jieba.dt)
Example 10
	def __init__(self, config):
		print("----------using naive cut tool---------")
		self.config = config
		self.dt = POSTokenizer()
Example 11
    def init_config(self, config):
        self.config = config
        self.dt = POSTokenizer()
        self.cut_flag = False
        self.word_type = []
        # Register "<SEG>" with a high frequency so it is never split.
        self.dt.add_word("<SEG>", 10000)
Example 12
import re

from jieba.posseg import POSTokenizer


class cut_tool_api(object):
    def __init__(self):
        print("----------using naive cut tool---------")

    def init_config(self, config):
        self.config = config
        self.dt = POSTokenizer()
        self.cut_flag = False
        self.word_type = []
        # Register "<SEG>" with a high frequency so it is never split.
        self.dt.add_word("<SEG>", 10000)

    def build_tool(self):
        import codecs
        self.word_type = []
        try:
            dict_path = self.config.get("user_dict", None)
            with codecs.open(dict_path, "r", "utf-8") as frobj:
                lines = frobj.read().splitlines()
                for line in lines:
                    # Each line is expected to be "word<TAB>freq<TAB>POS-tag".
                    content = line.split("\t")
                    try:
                        self.dt.add_word(content[0], int(content[1]),
                                         content[2])
                        self.word_type.append(content[2])
                    except (IndexError, ValueError):
                        # Skip malformed dictionary lines.
                        continue
            # "$$" is registered as an explicit split marker.
            self.dt.add_word("$$", 100000, "<split_symbol>")
            print("====succeeded in loading dictionary====", dict_path)
            self.word_type = list(set(self.word_type))
            self.word_type = [
                item for item in self.word_type if len(item) >= 1
            ]
            self.word_type.append("$$")
        except (TypeError, OSError):
            # No usable dictionary: keep the plain per-character fallback.
            self.cut_flag = False

    def cut(self, text, target=None):
        out = []
        char_pattern = re.compile(u"[\u4e00-\u9fa5]+")
        # Strip whitespace, then POS-cut; each item is a (word, flag) pair.
        word_list = list(self.dt.lcut("".join(text.split())))
        for word in word_list:
            word = list(word)
            if len(word[0]) == 0:
                continue
            if self.cut_flag:
                # Words whose POS tag comes from the user dictionary are
                # replaced by their tag, or by "<target>" for the target word.
                if word[1] in self.word_type:
                    if target:
                        if word[0] == target:
                            out.append("<target>")
                        else:
                            out.append(word[1])
                    else:
                        out.append(word[0])
                else:
                    char_cn = char_pattern.findall(word[0])
                    if len(char_cn) >= 1:
                        for item in word[0]:
                            if len(item) >= 1:
                                out.append(item)
                    else:
                        if len(word[0]) >= 1:
                            out.append(word[0])
            else:
                # Default mode: split Chinese tokens into single characters
                # and keep non-Chinese tokens whole.
                char_cn = char_pattern.findall(word[0])
                if len(char_cn) >= 1:
                    for item in word[0]:
                        if len(item) >= 1:
                            out.append(item)
                else:
                    if len(word[0]) >= 1:
                        out.append(word[0])
        return " ".join(out)
Example 13
    def init_config(self, config):
        self.config = config
        self.dt = POSTokenizer()