コード例 #1
0
ファイル: align_text.py プロジェクト: isikus/errant_manip
def getAutoAlignedEdits(orig, cor, spacy, args):
	"""Align the original and corrected tokens and return the resulting edits.

	Args:
		orig: Iterable of tokens of the original text; each token must
			expose a ``.text`` attribute (spacy objects, per the caller).
		cor: Iterable of tokens of the corrected text, same shape as ``orig``.
		spacy: Object stored in the module-level ``NLP`` global as a side
			effect of this call.
		args: Options object. ``args.lev`` selects the plain Levenshtein
			cost functions when truthy; ``args.merge`` names the merge
			strategy: "rules", "all-split", "all-merge" or "all-equal".

	Returns:
		A list with one entry per edit, each of the form
		``[orig_start, orig_end, "NA", cor_str, cor_start, cor_end]`` where
		``cor_str`` is the space-joined corrected tokens in
		``cor_toks[cor_start:cor_end]``.

	Raises:
		ValueError: If ``args.merge`` is not one of the known strategies.
	"""
	# Save the spacy object globally.
	global NLP
	NLP = spacy
	# Get a list of strings from the spacy objects.
	orig_toks = [tok.text for tok in orig]
	cor_toks = [tok.text for tok in cor]
	if args.lev:
		# Align using Levenshtein.
		alignments = DL.WagnerFischer(
			orig_toks, cor_toks, orig, cor,
			substitution=levSubstitution, transposition=levTransposition)
	else:
		# Otherwise, use linguistically enhanced Damerau-Levenshtein.
		alignments = DL.WagnerFischer(
			orig_toks, cor_toks, orig, cor, substitution=token_substitution)
	# Take the first alignment produced; True requests depth-first search.
	alignment = next(alignments.alignments(True))
	# Convert the alignment into edits; choose merge strategy.
	merge = args.merge
	if merge == "rules":
		edits = get_edits(orig, cor, get_opcodes(alignment))
	elif merge == "all-split":
		edits = get_edits_split(get_opcodes(alignment))
	elif merge == "all-merge":
		edits = get_edits_group_all(get_opcodes(alignment))
	elif merge == "all-equal":
		edits = get_edits_group_type(get_opcodes(alignment))
	else:
		# Previously this fell through and crashed later with an
		# UnboundLocalError on `edits`; fail early with a clear message.
		raise ValueError("Unknown merge strategy: {!r}".format(merge))
	# Auto edits do not have human types, so the category is always "NA".
	return [
		[edit[1], edit[2], "NA", " ".join(cor_toks[edit[3]:edit[4]]), edit[3], edit[4]]
		for edit in edits
	]
コード例 #2
0
ファイル: align_text.py プロジェクト: sethips/edit-extraction
def getAutoAlignedEdits(orig_str, cor_str, orig, cor, spacy, lev,
                        merge_strategy):
    """Align the original and corrected text and return the resulting edits.

    Args:
        orig_str: Sequence passed to the aligner as the original side
            (presumably the token strings of ``orig`` -- confirm with caller).
        cor_str: Sequence of strings for the corrected side; slices of it are
            joined with single spaces to form each edit's correction text.
        orig: Original-side object forwarded to the aligner and to
            the edit-extraction helpers.
        cor: Corrected-side counterpart of ``orig``.
        spacy: Object stored in the module-level ``NLP`` global as a side
            effect of this call.
        lev: When truthy, align with the plain Levenshtein cost functions.
        merge_strategy: One of "rules", "all-split", "all-merge" or
            "all-equal".

    Returns:
        A list with one entry per edit, each of the form
        ``[orig_start, orig_end, "NA", cor_tok, cor_start, cor_end]``.

    Raises:
        ValueError: If ``merge_strategy`` is not one of the known strategies.
    """
    global NLP
    NLP = spacy
    if lev:
        # Align using Levenshtein.
        alignments = DL.WagnerFischer(
            orig_str, cor_str, orig, cor,
            substitution=levSubstitution, transposition=levTransposition)
    else:
        # Otherwise, use linguistically enhanced Damerau-Levenshtein.
        alignments = DL.WagnerFischer(
            orig_str, cor_str, orig, cor, substitution=token_substitution)
    # Take the first alignment the aligner yields.
    alignment = next(alignments.alignments())
    # Convert the alignment into edits; choose merge strategy.
    if merge_strategy == "rules":
        edits = get_edits(orig, cor, get_opcodes(alignment))
    elif merge_strategy == "all-split":
        edits = get_edits_split(orig, cor, get_opcodes(alignment))
    elif merge_strategy == "all-merge":
        edits = get_edits_group_all(orig, cor, get_opcodes(alignment))
    elif merge_strategy == "all-equal":
        edits = get_edits_group_type(orig, cor, get_opcodes(alignment))
    else:
        # Previously this fell through and crashed later with an
        # UnboundLocalError on `edits`; fail early with a clear message.
        raise ValueError(
            "Unknown merge strategy: {!r}".format(merge_strategy))
    # Auto edits do not have human types, so the category is always "NA".
    return [
        [edit[1], edit[2], "NA", " ".join(cor_str[edit[3]:edit[4]]),
         edit[3], edit[4]]
        for edit in edits
    ]
コード例 #3
0
ファイル: align_text.py プロジェクト: adsglass/errant
def char_cost(A, B):
	"""Return the alignment cost of A versus B divided by the alignment length.

	The aligner is run with its default cost functions; the first alignment
	it yields (requested with depth-first search) supplies the length.
	"""
	aligner = DL.WagnerFischer(A, B)
	# True uses depth-first search.
	first_path = next(aligner.alignments(True))
	total_cost = aligner.cost
	path_length = float(len(first_path))
	return total_cost / path_length