def get_ins_dels(incorr_line, correct_line):
    # Count the token insertions and deletions needed to turn incorr_line
    # into correct_line. Returns (ins, dels, rejected); a pair is rejected
    # (rejected=1) when any single insertion spans more than two tokens.
    ins = defaultdict(int)
    dels = defaultdict(int)
    rejected = 0
    incorr_tokens = custom_tokenize(incorr_line, wordpiece_tokenizer, mode="train")
    correct_tokens = custom_tokenize(correct_line, wordpiece_tokenizer, mode="train")
    diffs = seq2edits_utils.ndiff(incorr_tokens, correct_tokens)
    for item in diffs:
        if item[0] == "+":
            if len(item[2:].split()) > 2:
                return defaultdict(int), defaultdict(int), 1
            ins[item[2:]] += 1
        elif item[0] == "-":
            dels[item[2:]] += 1
    return ins, dels, 0
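# Example (illustrative sketch, not part of the original module): aggregating
# insertion/deletion counts over a small hypothetical list of (incorrect,
# correct) sentence pairs using get_ins_dels above. Assumes the module-level
# wordpiece_tokenizer, custom_tokenize and seq2edits_utils are already set up,
# and that defaultdict has been imported from collections.
def _example_count_edits(pairs):
    total_ins = defaultdict(int)
    total_dels = defaultdict(int)
    for incorr_line, correct_line in pairs:
        ins, dels, rejected = get_ins_dels(incorr_line, correct_line)
        if rejected:
            # the pair contained an insertion longer than two tokens; skip it
            continue
        for tok, cnt in ins.items():
            total_ins[tok] += cnt
        for tok, cnt in dels.items():
            total_dels[tok] += cnt
    return total_ins, total_dels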
def get_tuple(line):
    # Tokenize a single line and map the tokens to vocabulary ids.
    # Optionally runs the autocorrect-based spell checker first
    # (controlled by FLAGS.do_spell_check).
    if FLAGS.do_spell_check:
        line = line.strip().split()
        line = wordpiece_tokenizer.basic_tokenizer._run_spell_check(line)
        line = " ".join(line)
    tokens = custom_tokenize(line, wordpiece_tokenizer)
    token_ids = wordpiece_tokenizer.convert_tokens_to_ids(tokens)
    return tokens, token_ids
def seq2edits(incorr_line, correct_line):
    # Seq2Edits function (described in Section 2.2 of the paper).
    # Obtains edit ids from incorrect and correct tokens.
    # Input: incorrect line and correct line.
    # Output: incorr_tokens, correct_tokens, incorrect token ids, edit ids.

    # tokenize incorr_line and correct_line
    incorr_tokens = custom_tokenize(incorr_line, wordpiece_tokenizer, mode="train")
    correct_tokens = custom_tokenize(correct_line, wordpiece_tokenizer, mode="train")

    # generate diffs using the modified edit distance algorithm
    # (described in Appendix A.1 of the paper)
    diffs = seq2edits_utils.ndiff(incorr_tokens, correct_tokens)

    # align diffs to get edits
    edit_ids = diffs_to_edits(diffs)
    if not edit_ids:
        return None

    # get incorrect token ids
    incorr_tok_ids = wordpiece_tokenizer.convert_tokens_to_ids(incorr_tokens)

    return incorr_tokens, correct_tokens, incorr_tok_ids, edit_ids
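# Example (illustrative sketch): converting one hypothetical sentence pair
# into edit ids with seq2edits above, and dropping the pair when alignment
# fails (seq2edits returns None). The sample sentences are made up; the real
# pipeline reads pairs from the training files.
def _example_seq2edits(incorr_line="He go to school .",
                       correct_line="He goes to school ."):
    result = seq2edits(incorr_line, correct_line)
    if result is None:
        return None  # unalignable pair, skipped during preprocessing
    incorr_tokens, correct_tokens, incorr_tok_ids, edit_ids = result
    # pair each incorrect token id with its edit id (assuming the two
    # sequences are aligned position by position, as the output suggests)
    return list(zip(incorr_tok_ids, edit_ids))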
def get_tuple(wordpiece_tokenizer, line, do_spell_check):
    """Tokenize a line of text.

    Args:
        wordpiece_tokenizer: class, the tokenizer.
        line: string, input text.
        do_spell_check: boolean, whether to run the autocorrect spell check.
    Returns:
        tokens, token_ids
    """
    if do_spell_check:
        line = line.strip().split()
        line = wordpiece_tokenizer.basic_tokenizer._run_spell_check(line)
        line = " ".join(line)
    tokens = custom_tokenize(line, wordpiece_tokenizer)
    token_ids = wordpiece_tokenizer.convert_tokens_to_ids(tokens)
    return tokens, token_ids
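# Example (illustrative sketch): calling the parameterized get_tuple above on
# a single hypothetical sentence with spell checking disabled. The tokenizer
# object is assumed to be constructed elsewhere in the repo (e.g., from a
# wordpiece vocabulary file); the sample line is not from the training data.
def _example_get_tuple(tokenizer, line="She are happy ."):
    tokens, token_ids = get_tuple(tokenizer, line, do_spell_check=False)
    return tokens, token_ids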