from collections import defaultdict

import seq2edits_utils


def get_ins_dels(incorr_line, correct_line):
    # Collects insertion and deletion counts from the token-level diff of an
    # incorrect line and its corrected line.
    # output: (insertion counts, deletion counts, rejected flag)
    ins = defaultdict(int)
    dels = defaultdict(int)
    rejected = 0

    # tokenize incorr_line and correct_line
    incorr_tokens = custom_tokenize(incorr_line, wordpiece_tokenizer, mode="train")
    correct_tokens = custom_tokenize(correct_line, wordpiece_tokenizer, mode="train")

    # generate diffs between the incorrect and correct token sequences
    diffs = seq2edits_utils.ndiff(incorr_tokens, correct_tokens)

    for item in diffs:
        if item[0] == "+":
            # reject the sentence pair if an insertion spans more than two tokens
            if len(item[2:].split()) > 2:
                return defaultdict(int), defaultdict(int), 1
            ins[item[2:]] += 1
        elif item[0] == "-":
            dels[item[2:]] += 1
    return ins, dels, 0
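
# A minimal usage sketch (not part of the original pipeline): it shows how the
# per-sentence counts from get_ins_dels might be aggregated over a parallel
# corpus to surface frequent insertions and deletions. The function name
# build_edit_counts, the plain-text file layout, and the Counter-based
# aggregation are assumptions made here for illustration.
def build_edit_counts(incorr_file, correct_file):
    from collections import Counter

    ins_counts, del_counts = Counter(), Counter()
    num_rejected = 0
    with open(incorr_file) as f_inc, open(correct_file) as f_cor:
        for incorr_line, correct_line in zip(f_inc, f_cor):
            ins, dels, rejected = get_ins_dels(incorr_line.strip(), correct_line.strip())
            if rejected:
                # sentence pair was discarded (an insertion longer than two tokens)
                num_rejected += 1
                continue
            ins_counts.update(ins)
            del_counts.update(dels)
    return ins_counts, del_counts, num_rejected
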
def seq2edits(incorr_line, correct_line):
    # Seq2Edits function (Described in Section 2.2 of the paper)
    # obtains edit ids from incorrect and correct tokens
    # input: incorrect line and correct line
    # output: incorr_tokens, correct_tokens, incorr token ids, edit ids

    # tokenize incorr_line and correct_line
    incorr_tokens = custom_tokenize(incorr_line, wordpiece_tokenizer, mode="train")
    correct_tokens = custom_tokenize(correct_line, wordpiece_tokenizer, mode="train")

    # generate diffs using the modified edit distance algorithm
    # (Described in Appendix A.1 of the paper)
    diffs = seq2edits_utils.ndiff(incorr_tokens, correct_tokens)

    # align diffs to get edits
    edit_ids = diffs_to_edits(diffs)
    if not edit_ids:
        return None

    # get incorrect token ids
    incorr_tok_ids = wordpiece_tokenizer.convert_tokens_to_ids(incorr_tokens)

    return incorr_tokens, correct_tokens, incorr_tok_ids, edit_ids
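
# A minimal usage sketch (not part of the original code): it runs seq2edits over
# a tab-separated file of (incorrect, correct) sentence pairs and keeps the
# (incorrect token ids, edit ids) training pairs. The function name
# encode_parallel_file, the tab-separated format, and the skip-on-None handling
# are assumptions made here for illustration.
def encode_parallel_file(path):
    examples = []
    with open(path) as f:
        for line in f:
            incorr_line, correct_line = line.rstrip("\n").split("\t")
            result = seq2edits(incorr_line, correct_line)
            if result is None:
                # diffs_to_edits produced no edit ids; skip this pair
                continue
            incorr_tokens, correct_tokens, incorr_tok_ids, edit_ids = result
            examples.append((incorr_tok_ids, edit_ids))
    return examples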