def detect(sentence):
    maybe_error_indices = set()
    # text normalization
    sentence = uniform(sentence)
    # word segmentation
    tokens = tokenize(sentence)
    # add out-of-vocabulary words to the suspected-error indices
    for word, begin_idx, end_idx in tokens:
        if word not in PUNCTUATION_LIST and word not in word_freq.keys():
            for i in range(begin_idx, end_idx):
                maybe_error_indices.add(i)
    # detect suspected wrong characters with the language model
    ngram_avg_scores = []
    for n in [2, 3]:
        scores = []
        for i in range(len(sentence) - n + 1):
            word = sentence[i:i + n]
            score = get_ngram_score(list(word), mode=trigram_char)
            scores.append(score)
        # pad the scores so the sliding window covers every character position
        for _ in range(n - 1):
            scores.insert(0, scores[0])
            scores.append(scores[-1])
        avg_scores = [sum(scores[i:i + n]) / len(scores[i:i + n])
                      for i in range(len(sentence))]
        ngram_avg_scores.append(avg_scores)
    # average the concatenated n-gram scores per character
    sent_scores = list(np.average(np.array(ngram_avg_scores), axis=0))
    maybe_error_char_indices = _get_maybe_error_index(sent_scores)
    # merge char-level and word-level errors
    maybe_error_indices |= set(maybe_error_char_indices)
    return sorted(maybe_error_indices)
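# A self-contained toy run of the padding-and-averaging step above, so the
# index arithmetic is visible. toy_ngram_score is a hypothetical stand-in
# for get_ngram_score, assumed here for illustration only.
def toy_ngram_score(chars):
    return -float(len("".join(chars)))  # dummy log-probability

toy_sentence = "abcde"
toy_n = 2
toy_scores = [toy_ngram_score(list(toy_sentence[i:i + toy_n]))
              for i in range(len(toy_sentence) - toy_n + 1)]  # 4 window scores
for _ in range(toy_n - 1):  # pad front and back: 4 -> 6 scores
    toy_scores.insert(0, toy_scores[0])
    toy_scores.append(toy_scores[-1])
toy_avg = [sum(toy_scores[i:i + toy_n]) / len(toy_scores[i:i + toy_n])
           for i in range(len(toy_sentence))]
assert len(toy_avg) == len(toy_sentence)  # one averaged score per character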
def ccm_sort(self, sentence):
    """Reorder two ranked words joined by a connector, by dictionary rank."""
    # load the ordering dictionary
    name_model = self.load_ccm_word_freq_dict(self.name_sort_path)
    maybe_errors = []
    if not sentence.strip():
        return maybe_errors
    # text normalization
    sentence = uniform(sentence)
    # word segmentation
    tokens = self.tokenizer.tokenize(sentence)
    temp = None
    error_list = []
    correct_list = []
    new = []
    i = -1
    for word, begin_idx, end_idx in tokens:
        new.append(word)
        i += 1
        # a link word resets the rank comparison
        if word in LINK_WORD:
            temp = None
        if name_model.get(word):
            if not temp:
                temp = name_model.get(word)
                continue
            if temp > name_model.get(word):
                # the two ranked words are out of order: swap them
                tokens[i], tokens[i - 2] = tokens[i - 2], tokens[i]
                correct_list.append((tokens[i][0], i))
                correct_list.append((tokens[i - 2][0], i - 2))
                error_list.append((tokens[i][0], i))
    # rebuild the sentence with the corrected order
    for word, p in correct_list:
        new[p] = word
    correct = ''.join(new)
    # NOTE: `correct` holds the reordered sentence, but maybe_errors is never
    # populated here, so this return is always an empty list
    return sorted(maybe_errors, key=lambda k: k[1], reverse=False)
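# The ordering dictionary consumed by ccm_sort is not shown; its assumed
# shape (word -> rank, lower rank first), for illustration only:
name_model = {"张三": 1, "李四": 2}
# With these, the tokens of "李四、张三" trigger the swap branch: the earlier
# word ("李四", rank 2) outranks the later one ("张三", rank 1), so the
# rebuilt `correct` string becomes "张三、李四". A LINK_WORD token resets
# `temp`, so the comparison restarts after a connector.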
def detect(self, text):
    maybe_errors = []
    if not text.strip():
        return maybe_errors
    # lazy initialization
    self.check_detector_initialized()
    # normalize the encoding: utf-8 bytes to unicode
    text = convert_to_unicode(text)
    # text normalization
    text = uniform(text)
    # split the long text into short sentence blocks
    blocks = self.split_2_short_text(text)
    for blk, idx in blocks:
        maybe_errors += self.detect_short(blk, idx)
    return maybe_errors
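# split_2_short_text is not shown here; one plausible shape, assumed for
# illustration, splits on sentence-ending punctuation and keeps each
# block's start offset, matching the (blk, idx) tuples consumed above:
import re

re_delimiter = re.compile(r'[。!?;\n]')  # assumed delimiter set

def split_2_short_text(text):
    blocks = []
    start = 0
    for part in re_delimiter.split(text):
        if part:
            idx = text.index(part, start)
            blocks.append((part, idx))
            start = idx + len(part)
    return blocks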
def detect(sentence):
    maybe_error_indices = set()
    # text normalization
    sentence = uniform(sentence)
    # word segmentation
    tokens = tokenize(sentence)
    # add out-of-vocabulary words to the suspected-error indices
    for word, begin_idx, end_idx in tokens:
        # fixed: skip pure numeric/alphabetic tokens
        # (bare str.isalnum() would also match Han characters, so require ASCII)
        if word.isascii() and word.isalnum():
            continue
        # skip punctuation
        if word in PUNCTUATION_LIST:
            continue
        # skip words already in the frequency dictionary
        if word in word_freq.keys():
            continue
        for i in range(begin_idx, end_idx):
            maybe_error_indices.add(i)
    # detect suspected wrong characters with the language model
    ngram_avg_scores = []
    try:
        for n in [2, 3]:
            scores = []
            for i in range(len(sentence) - n + 1):
                word = sentence[i:i + n]
                score = get_ngram_score(list(word), mode=trigram_char)
                scores.append(score)
            # guard: a sentence shorter than n yields no window scores
            if not scores:
                continue
            # pad the scores so the sliding window covers every character position
            for _ in range(n - 1):
                scores.insert(0, scores[0])
                scores.append(scores[-1])
            avg_scores = [
                sum(scores[i:i + n]) / len(scores[i:i + n])
                for i in range(len(sentence))
            ]
            ngram_avg_scores.append(avg_scores)
        # average the concatenated n-gram scores per character
        sent_scores = list(np.average(np.array(ngram_avg_scores), axis=0))
        maybe_error_char_indices = _get_maybe_error_index(sent_scores)
        # merge char-level and word-level errors
        maybe_error_indices |= set(maybe_error_char_indices)
    except IndexError as ie:
        default_logger.warn("index error, sentence:" + sentence + str(ie))
    except Exception as e:
        default_logger.warn("detect error, sentence:" + sentence + str(e))
    return sorted(maybe_error_indices)
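# Why the `if not scores: continue` guard matters: for a sentence shorter
# than n the window loop produces nothing, and the padding step would then
# index an empty list.
short = "你好"
demo_n = 3
windows = [short[i:i + demo_n] for i in range(len(short) - demo_n + 1)]
print(windows)  # [] -> without the guard, scores[0] raises IndexError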
def detect(self, sentence):
    maybe_errors = []
    if not sentence.strip():
        return maybe_errors
    # lazy initialization
    self.check_detector_initialized()
    # text normalization
    sentence = uniform(sentence)
    # split the long sentence into short blocks
    blocks = re_han.split(sentence)
    start_idx = 0
    for blk in blocks:
        if not blk:
            continue
        # only Han-character blocks are checked; every block advances the offset
        if re_han.match(blk):
            maybe_errors += self._detect_short(blk, start_idx)
        start_idx += len(blk)
    return maybe_errors
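# re_han is not defined in this snippet; a common choice (an assumption
# here), in the style of jieba's tokenizer regex, keeps runs of Han
# characters, letters and digits together as one block:
import re

re_han = re.compile(r"([\u4E00-\u9FD5a-zA-Z0-9+#&]+)", re.U)
# Because the pattern has a capturing group, re_han.split() keeps the
# matched blocks, so start_idx stays aligned with the original sentence:
print(re_han.split("你好,世界!"))  # ['', '你好', ',', '世界', '!']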
def detect(sentence):
    maybe_error_indices = set()
    # text normalization
    sentence = uniform(sentence)
    # word segmentation
    tokens = tokenize(sentence)
    # add out-of-vocabulary words to the suspected-error indices
    for word, begin_idx, end_idx in tokens:
        if word not in PUNCTUATION_LIST and word not in word_freq.keys():
            for i in range(begin_idx, end_idx):
                maybe_error_indices.add(i)
    # detect suspected wrong characters with the language model
    ngram_avg_scores = []
    try:
        for n in [2, 3]:
            scores = []
            for i in range(len(sentence) - n + 1):
                word = sentence[i:i + n]
                score = get_ngram_score(list(word), mode=trigram_char)
                scores.append(score)
            # pad the scores so the sliding window covers every character position
            for _ in range(n - 1):
                scores.insert(0, scores[0])
                scores.append(scores[-1])
            avg_scores = [sum(scores[i:i + n]) / len(scores[i:i + n])
                          for i in range(len(sentence))]
            ngram_avg_scores.append(avg_scores)
        # average the concatenated n-gram scores per character
        sent_scores = list(np.average(np.array(ngram_avg_scores), axis=0))
        maybe_error_char_indices = _get_maybe_error_index(sent_scores)
        # merge char-level and word-level errors
        maybe_error_indices |= set(maybe_error_char_indices)
    except IndexError as ie:
        print("index error, sentence:", sentence, ie)
    except Exception as e:
        print("detect error, sentence:", sentence, e)
    return sorted(maybe_error_indices)
def name_job(self, sentence):
    """Check that job words preceding a person's name match the dictionary."""
    # load the person-name-to-job dictionary
    job_model = self.load_ccm_job_freq_dict(self.leader_job_path)
    maybe_errors = []
    if not sentence.strip():
        return maybe_errors
    # text normalization
    sentence = uniform(sentence)
    # word segmentation
    tokens = self.tokenizer.tokenize(sentence)
    error_list = []
    i = 0  # index of the current token
    j = 0  # start of the context window for the current name
    for word, begin_idx, end_idx in tokens:
        if job_model.get(word):
            # a name was found: i is now that name's token index
            a = job_model.get(word)
            front = a.get('1')
            temp_list = []
            # j is the window start, i its end
            for x in range(j, i):
                if self.leader_job_freq_dict.get(tokens[x][0]):
                    if tokens[x][0] not in front:
                        temp_list.append(tokens[x][0])
            if temp_list:
                error_list.append({word: temp_list})
            # the window start moves past the current name
            j = i + 1
        i += 1
    # return the job words that conflict with each detected name
    return error_list
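# The two dictionaries consumed by name_job are not shown; their assumed
# shapes, inferred from the lookups above, for illustration only:
job_model = {"张三": {"1": ["局长", "主任"]}}  # name -> jobs allowed before it
leader_job_freq_dict = {"局长": 1, "主任": 1, "科长": 1}  # known job words
# With these, the tokens of "科长张三" yield error_list == [{"张三": ["科长"]}]:
# "科长" is a known job word but not in the allowed list for "张三".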
def detect(self, sentence):
    """
    Detect suspected errors in the sentence: word, position and error type.
    :param sentence:
    :return: list[list], [error_word, begin_pos, end_pos, error_type]
    """
    maybe_errors = []
    if not sentence.strip():
        return maybe_errors
    # lazy initialization
    self.check_detector_initialized()
    # text normalization
    sentence = uniform(sentence)
    # word segmentation
    tokens = self.tokenizer.tokenize(sentence)
    # add custom confusion-set hits to the suspected errors
    for confuse in self.custom_confusion:
        idx = sentence.find(confuse)
        if idx > -1:
            maybe_err = [confuse, idx, idx + len(confuse), ErrorType.confusion]
            self._add_maybe_error_item(maybe_err, maybe_errors)
    if self.is_word_error_detect:
        # add out-of-vocabulary words to the suspected errors
        for word, begin_idx, end_idx in tokens:
            # pass filtered tokens
            if self.is_filter_token(word):
                continue
            # pass words in the frequency dictionary
            if word in self.word_freq:
                continue
            maybe_err = [word, begin_idx, end_idx, ErrorType.word]
            self._add_maybe_error_item(maybe_err, maybe_errors)
    if self.is_char_error_detect:
        # detect suspected wrong characters with the language model
        if self.enable_rnnlm:
            scores = self.char_scores(sentence)
            # collect the suspected wrong characters
            for i in self._get_maybe_error_index_by_rnnlm(scores):
                token = sentence[i]
                # pass filtered tokens
                if self.is_filter_token(token):
                    continue
                # token, begin_idx, end_idx, error_type
                maybe_err = [token, i, i + 1, ErrorType.char]
                self._add_maybe_error_item(maybe_err, maybe_errors)
        else:
            try:
                ngram_avg_scores = []
                for n in [2, 3]:
                    scores = []
                    for i in range(len(sentence) - n + 1):
                        word = sentence[i:i + n]
                        score = self.ngram_score(list(word))
                        scores.append(score)
                    if not scores:
                        continue
                    # pad the scores so the sliding window covers every position
                    for _ in range(n - 1):
                        scores.insert(0, scores[0])
                        scores.append(scores[-1])
                    avg_scores = [sum(scores[i:i + n]) / len(scores[i:i + n])
                                  for i in range(len(sentence))]
                    ngram_avg_scores.append(avg_scores)
                # average the concatenated n-gram scores per character
                sent_scores = list(np.average(np.array(ngram_avg_scores), axis=0))
                # collect the suspected wrong characters
                for i in self._get_maybe_error_index(sent_scores):
                    token = sentence[i]
                    # pass filtered tokens
                    if self.is_filter_token(token):
                        continue
                    # token, begin_idx, end_idx, error_type
                    maybe_err = [token, i, i + 1, ErrorType.char]
                    self._add_maybe_error_item(maybe_err, maybe_errors)
            except IndexError as ie:
                logger.warn("index error, sentence:" + sentence + str(ie))
            except Exception as e:
                logger.warn("detect error, sentence:" + sentence + str(e))
    return sorted(maybe_errors, key=lambda k: k[1], reverse=False)
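# _get_maybe_error_index is not shown. A minimal sketch of one way to
# implement it, assuming median-absolute-deviation (MAD) outlier detection;
# the ratio/threshold values are assumptions, not the confirmed
# implementation:
import numpy as np

def _get_maybe_error_index(scores, ratio=0.6745, threshold=2.0):
    """Return indices whose score is an outlier on the low side."""
    result = []
    scores = np.array(scores)
    if len(scores) < 2:
        return result
    median = np.median(scores)
    # MAD is a robust estimate of spread
    mad = np.median(np.abs(scores - median))
    if mad == 0:
        return result
    # modified z-score; low-probability outliers are suspected errors
    z = ratio * (scores - median) / mad
    return [i for i, v in enumerate(z) if v < -threshold]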
def detect(self, sentence):
    """
    Detect suspected errors in the sentence: word, position and error type.
    :param sentence:
    :return: [error_word, begin_pos, end_pos, error_type]
    """
    maybe_errors = []
    if not sentence.strip():
        return maybe_errors
    self.check_detector_initialized()
    # text normalization
    sentence = uniform(sentence)
    # word segmentation
    tokens = self.tokenizer.tokenize(sentence)
    # add custom confusion-set hits to the suspected errors
    for confuse in self.custom_confusion:
        idx = sentence.find(confuse)
        if idx > -1:
            maybe_err = [confuse, idx, idx + len(confuse),
                         error_type["confusion"]]
            self._add_maybe_error_item(maybe_err, maybe_errors)
    if self.is_word_error_detect:
        # add out-of-vocabulary words to the suspected errors
        for word, begin_idx, end_idx in tokens:
            # pass blank tokens
            if not word.strip():
                continue
            # pass punctuation
            if word in PUNCTUATION_LIST:
                continue
            # pass numbers
            if word.isdigit():
                continue
            # pass alphabetic strings
            if is_alphabet_string(word.lower()):
                continue
            # pass words in the frequency dictionary
            if word in self.word_freq:
                continue
            maybe_err = [word, begin_idx, end_idx, error_type["word"]]
            self._add_maybe_error_item(maybe_err, maybe_errors)
    if self.is_char_error_detect:
        # detect suspected wrong characters with the language model
        ngram_avg_scores = []
        try:
            for n in [2, 3]:
                scores = []
                for i in range(len(sentence) - n + 1):
                    word = sentence[i:i + n]
                    score = self.ngram_score(list(word))
                    scores.append(score)
                if not scores:
                    continue
                # pad the scores so the sliding window covers every position
                for _ in range(n - 1):
                    scores.insert(0, scores[0])
                    scores.append(scores[-1])
                avg_scores = [
                    sum(scores[i:i + n]) / len(scores[i:i + n])
                    for i in range(len(sentence))
                ]
                ngram_avg_scores.append(avg_scores)
            # average the concatenated n-gram scores per character
            sent_scores = list(
                np.average(np.array(ngram_avg_scores), axis=0))
            # collect the suspected wrong characters
            for i in self._get_maybe_error_index(sent_scores):
                maybe_err = [sentence[i], i, i + 1, error_type["char"]]
                self._add_maybe_error_item(maybe_err, maybe_errors)
        except IndexError as ie:
            logger.warn("index error, sentence:" + sentence + str(ie))
        except Exception as e:
            logger.warn("detect error, sentence:" + sentence + str(e))
    return sorted(maybe_errors, key=lambda k: k[1], reverse=False)
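# error_type is referenced but not defined in this variant; a minimal
# assumed mapping that would make the lookups above work (the actual
# values may differ):
error_type = {"confusion": "confusion", "word": "word", "char": "char"}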
def detect(sentence):
    # assumes module-level: import numpy as np; import jieba.posseg as pseg
    maybe_error_indices = set()
    sentence = uniform(sentence)
    tokens = tokenize(sentence)
    # add out-of-vocabulary words to the suspected-error indices
    for word, begin_idx, end_idx in tokens:
        if word not in PUNCTUATION_LIST and word not in word_freq.keys():
            for i in range(begin_idx, end_idx):
                maybe_error_indices.add(i)
    ngram_avg_scores = []
    try:
        for n in [1, 2, 3]:
            scores = []
            for i in range(len(sentence) - n + 1):
                word = sentence[i:i + n]
                score = get_ngram_score(list(word), mode=trigram_char)
                scores.append(score)
            # pad the scores so the sliding window covers every character position
            for _ in range(n - 1):
                scores.insert(0, scores[0])
                scores.append(scores[-1])
            avg_scores = [
                sum(scores[i:i + n]) / len(scores[i:i + n])
                for i in range(len(sentence))
            ]
            ngram_avg_scores.append(avg_scores)
        sent_scores = list(np.average(np.array(ngram_avg_scores), axis=0))
        maybe_error_char_indices = _get_maybe_error_index(sent_scores)
        maybe_error_indices |= set(maybe_error_char_indices)
    except IndexError as ie:
        print("index error, sentence:", sentence, ie)
    except Exception as e:
        print("detect error, sentence:", sentence, e)
    # drop indices that fall inside special nouns (names, places, orgs),
    # which the frequency dictionary is likely to miss
    seg = pseg.lcut(sentence)  # list of pair(w.word, w.flag)
    word = [w.word for w in seg]
    tag = [w.flag for w in seg]
    for i in range(len(tag)):
        if tag[i] in {'nz', 'nr', 'nt', 'ns'}:
            # a proper-noun tag right after an adverb stays suspect
            if i > 0 and tag[i - 1] == 'd':
                continue
            if len(word[i]) > 1:
                maybe_error_indices -= set(range(len(''.join(word[:i])),
                                                 len(''.join(word[:i + 1]))))
            elif i + 1 < len(tag) and tag[i + 1] in {'nz', 'nr', 'nt', 'ns'}:
                # a single char followed by another proper noun: clear both
                maybe_error_indices -= set(range(len(''.join(word[:i])),
                                                 len(''.join(word[:i + 2]))))
    return sorted(maybe_error_indices)
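# A quick look at the POS tags the filter above relies on (nr = person,
# ns = place, nt = organization, nz = other proper noun), using jieba's
# posseg module:
import jieba.posseg as pseg

for w in pseg.lcut("李小福是创新办主任"):
    print(w.word, w.flag)
# person names typically come back tagged 'nr', so their character spans
# are subtracted from maybe_error_indices.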