Example #1
0
from collections import defaultdict

def get_ins_dels(incorr_line, correct_line):
    # Count the token insertions and deletions needed to turn incorr_line
    # into correct_line; pairs containing an insertion of more than two
    # tokens are rejected (third return value set to 1).
    # custom_tokenize, wordpiece_tokenizer and seq2edits_utils are
    # module-level dependencies of the surrounding repo.
    ins = defaultdict(int)
    dels = defaultdict(int)
    rejected = 0

    incorr_tokens = custom_tokenize(incorr_line, wordpiece_tokenizer, mode="train")
    correct_tokens = custom_tokenize(correct_line, wordpiece_tokenizer, mode="train")
    diffs = seq2edits_utils.ndiff(incorr_tokens, correct_tokens)

    for item in diffs:
        if item[0] == "+":
            if len(item[2:].split()) > 2:
                return defaultdict(int), defaultdict(int), 1
            ins[item[2:]] += 1
        elif item[0] == "-":
            dels[item[2:]] += 1

    return ins, dels, rejected
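
A minimal usage sketch for get_ins_dels, assuming the module-level dependencies above are in place and using two hypothetical parallel files, incorr.txt and correct.txt (one sentence per line), to accumulate corpus-wide insertion and deletion counts while skipping rejected pairs:

from collections import Counter

total_ins, total_dels, n_rejected = Counter(), Counter(), 0
# incorr.txt / correct.txt are hypothetical parallel files, one sentence per line
with open("incorr.txt") as f_inc, open("correct.txt") as f_cor:
    for incorr_line, correct_line in zip(f_inc, f_cor):
        ins, dels, rejected = get_ins_dels(incorr_line.strip(), correct_line.strip())
        if rejected:
            n_rejected += 1
            continue
        total_ins.update(ins)
        total_dels.update(dels)

print(f"rejected pairs: {n_rejected}")
print("most common insertions:", total_ins.most_common(10))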
Example #2
0
def seq2edits(incorr_line, correct_line):
    # Seq2Edits function (Described in Section 2.2 of the paper)
    # obtains edit ids from incorrect and correct tokens
    # input: incorrect line and correct line
    # output: incorr_tokens, correct_tokens, incorr_tok_ids, edit_ids

    # tokenize incorr_line and correct_line
    incorr_tokens = custom_tokenize(incorr_line,
                                    wordpiece_tokenizer,
                                    mode="train")
    correct_tokens = custom_tokenize(correct_line,
                                     wordpiece_tokenizer,
                                     mode="train")
    # generate diffs using modified edit distance algorithm
    # (Described in Appendix A.1 of the paper)
    diffs = seq2edits_utils.ndiff(incorr_tokens, correct_tokens)
    # align diffs to get edits
    edit_ids = diffs_to_edits(diffs)

    if not edit_ids:
        return None
    # get incorrect token ids
    incorr_tok_ids = wordpiece_tokenizer.convert_tokens_to_ids(incorr_tokens)
    return incorr_tokens, correct_tokens, incorr_tok_ids, edit_ids
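
A minimal usage sketch for seq2edits with an illustrative sentence pair; it assumes the same module-level dependencies and that edit_ids is aligned one entry per incorrect token, and it handles the None return for pairs whose diffs cannot be aligned:

# illustrative sentence pair, not taken from any dataset
incorr_line = "He go to school yesterday ."
correct_line = "He went to school yesterday ."

result = seq2edits(incorr_line, correct_line)
if result is None:
    print("pair skipped: diffs could not be aligned to edit ids")
else:
    incorr_tokens, correct_tokens, incorr_tok_ids, edit_ids = result
    # assumption: edit_ids has one entry per incorrect token position
    for tok, tok_id, edit in zip(incorr_tokens, incorr_tok_ids, edit_ids):
        print(f"{tok}\t{tok_id}\t{edit}")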