def getAutoAlignedEdits(orig, cor, spacy, args):
    """Automatically align an original and a corrected sentence into edits.

    Args:
        orig: The original sentence as a spacy object; it is iterated token
            by token and each token's ``.text`` is read.
        cor: The corrected sentence as a spacy object, used the same way.
        spacy: The spacy object to store in the module-level ``NLP`` global.
        args: Options object. ``args.lev`` selects plain Levenshtein
            alignment when truthy, otherwise the linguistically enhanced
            Damerau-Levenshtein substitution cost is used. ``args.merge``
            names the merge strategy: ``"rules"``, ``"all-split"``,
            ``"all-merge"`` or ``"all-equal"``.

    Returns:
        A list of edits, each of the form
        ``[orig_start, orig_end, cat, cor_str, cor_start, cor_end]``, where
        ``cat`` is always ``"NA"`` and ``cor_str`` is the space-joined
        corrected tokens in ``cor_toks[cor_start:cor_end]``.

    Raises:
        ValueError: If ``args.merge`` is not one of the supported strategies.
    """
    global NLP

    # Reject an unknown strategy before doing any work. Previously this case
    # fell through every branch and crashed later with an UnboundLocalError
    # on ``edits``, which hid the real cause from the caller.
    merge = args.merge
    if merge not in ("rules", "all-split", "all-merge", "all-equal"):
        raise ValueError(
            "Unknown merge strategy: {!r} (expected 'rules', 'all-split', "
            "'all-merge' or 'all-equal')".format(merge)
        )

    # Save the spacy object globally.
    NLP = spacy

    # Get a list of strings from the spacy objects.
    orig_toks = [tok.text for tok in orig]
    cor_toks = [tok.text for tok in cor]

    if args.lev:
        # Align using Levenshtein.
        alignments = DL.WagnerFischer(
            orig_toks, cor_toks, orig, cor,
            substitution=levSubstitution, transposition=levTransposition)
    else:
        # Otherwise, use linguistically enhanced Damerau-Levenshtein.
        alignments = DL.WagnerFischer(
            orig_toks, cor_toks, orig, cor, substitution=token_substitution)

    # Take the first alignment produced. There is usually only 1 best in DL
    # due to custom costs. True uses depth-first search.
    alignment = next(alignments.alignments(True))
    opcodes = get_opcodes(alignment)

    # Convert the alignment into edits; choose merge strategy.
    if merge == "rules":
        edits = get_edits(orig, cor, opcodes)
    elif merge == "all-split":
        edits = get_edits_split(opcodes)
    elif merge == "all-merge":
        edits = get_edits_group_all(opcodes)
    else:  # "all-equal" (guaranteed by the validation above)
        edits = get_edits_group_type(opcodes)

    proc_edits = []
    for edit in edits:
        orig_start, orig_end = edit[1], edit[2]
        cor_start, cor_end = edit[3], edit[4]
        cat = "NA"  # Auto edits do not have human types.
        cor_str = " ".join(cor_toks[cor_start:cor_end])
        proc_edits.append(
            [orig_start, orig_end, cat, cor_str, cor_start, cor_end])
    return proc_edits
def getAutoAlignedEdits(orig_str, cor_str, orig, cor, spacy, lev, merge_strategy):
    """Automatically align an original and a corrected sentence into edits.

    Args:
        orig_str: Original sentence passed as the first sequence to
            ``DL.WagnerFischer`` (presumably a list of token strings --
            confirm against the caller).
        cor_str: Corrected sentence passed as the second sequence to
            ``DL.WagnerFischer``; it is also sliced and space-joined to build
            each edit's correction text, so presumably a list of token
            strings -- confirm against the caller.
        orig: Original sentence object forwarded to the aligner and to the
            edit extraction helpers.
        cor: Corrected sentence object, forwarded the same way.
        spacy: The spacy object to store in the module-level ``NLP`` global.
        lev: When truthy, align using plain Levenshtein costs; otherwise use
            the linguistically enhanced Damerau-Levenshtein substitution cost.
        merge_strategy: One of ``"rules"``, ``"all-split"``, ``"all-merge"``
            or ``"all-equal"``.

    Returns:
        A list of edits, each of the form
        ``[orig_start, orig_end, cat, cor_tok, cor_start, cor_end]``, where
        ``cat`` is always ``"NA"`` and ``cor_tok`` is the space-joined slice
        ``cor_str[cor_start:cor_end]``.

    Raises:
        ValueError: If ``merge_strategy`` is not a supported strategy.
    """
    global NLP

    # Reject an unknown strategy before doing any work. Previously this case
    # fell through every branch and crashed later with an UnboundLocalError
    # on ``edits``, which hid the real cause from the caller.
    if merge_strategy not in ("rules", "all-split", "all-merge", "all-equal"):
        raise ValueError(
            "Unknown merge strategy: {!r} (expected 'rules', 'all-split', "
            "'all-merge' or 'all-equal')".format(merge_strategy)
        )

    NLP = spacy

    if lev:
        # Align using Levenshtein.
        alignments = DL.WagnerFischer(
            orig_str, cor_str, orig, cor,
            substitution=levSubstitution, transposition=levTransposition)
    else:
        # Otherwise, use linguistically enhanced Damerau-Levenshtein.
        alignments = DL.WagnerFischer(
            orig_str, cor_str, orig, cor, substitution=token_substitution)

    # Get the first best alignment. For DL, there will likely be only 1 due
    # to custom costs.
    alignment = next(alignments.alignments())
    opcodes = get_opcodes(alignment)

    # Convert the alignment into edits; choose merge strategy.
    if merge_strategy == "rules":
        edits = get_edits(orig, cor, opcodes)
    elif merge_strategy == "all-split":
        edits = get_edits_split(orig, cor, opcodes)
    elif merge_strategy == "all-merge":
        edits = get_edits_group_all(orig, cor, opcodes)
    else:  # "all-equal" (guaranteed by the validation above)
        edits = get_edits_group_type(orig, cor, opcodes)

    proc_edits = []
    for edit in edits:
        orig_start, orig_end = edit[1], edit[2]
        cor_start, cor_end = edit[3], edit[4]
        cat = "NA"  # Auto edits do not have human types.
        cor_tok = " ".join(cor_str[cor_start:cor_end])
        proc_edits.append(
            [orig_start, orig_end, cat, cor_tok, cor_start, cor_end])
    return proc_edits
def char_cost(A, B):
    """Return the alignment cost between A and B, normalised by length.

    A ``DL.WagnerFischer`` aligner is built for the two inputs, its first
    alignment is taken, and the aligner's ``cost`` is divided by the number
    of steps in that alignment.

    Args:
        A: First sequence to align (presumably a string, given the function
            name -- confirm against callers).
        B: Second sequence to align.

    Returns:
        The aligner's cost divided by the alignment length, as a float.
    """
    aligner = DL.WagnerFischer(A, B)
    # True uses depth-first search.
    first_alignment = next(aligner.alignments(True))
    step_count = float(len(first_alignment))
    return aligner.cost / step_count