def label_edits(pairs, args):
    annotator = errant.load("en")
    labels = []
    # Process each (orig, cor) pair
    for orig, cors in tqdm(pairs):
        label = []
        # Get the original and corrected text
        orig = orig.strip()
        cors = [cors]
        # Skip the line if orig is empty
        if not orig:
            continue
        # Parse orig with spacy
        orig = annotator.parse(orig, args.tok)
        # Loop through the corrected texts
        for cor_id, cor in enumerate(cors):
            cor = cor.strip()
            # If the texts are the same, record a noop edit type
            if orig.text.strip() == cor:
                label.append(noop_edit(cor_id).split('|||')[1])
            # Otherwise, do extra processing
            else:
                # Parse cor with spacy
                cor = annotator.parse(cor, args.tok)
                # Align the texts and extract and classify the edits
                edits = annotator.annotate(orig, cor, args.lev, args.merge)
                # Record the error type of each edit
                for edit in edits:
                    label.append(edit.to_m2(cor_id).split('|||')[1])
        # Save the labels collected for this line
        labels.append(label)
    return labels
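# label_edits relies on a noop_edit() helper that is not shown here. A minimal sketch,
# assuming the standard ERRANT M2 noop line format, could be:
def noop_edit(id=0):
    # Produces a line like "A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||<coder id>"
    return "A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||" + str(id)
# Splitting such an M2 edit line on '|||' and taking index 1 yields just the error type
# (e.g. 'noop' or 'R:VERB:SVA'), which is what label_edits stores as a label.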
def main():
    # Parse command line args
    args = parse_args()
    print("Loading resources...")
    # Load Errant
    annotator = errant.load("en")
    # Open output m2 file
    out_m2 = open(args.out, "w")
    print("Processing parallel files...")
    # Read the original and corrected files; ExitStack keeps both open (Python 3.3+)
    # See https://tinyurl.com/y4cj4gth
    with ExitStack() as stack:
        orig_lines = stack.enter_context(open(args.orig, encoding='utf-8')).readlines()
        cor_lines = stack.enter_context(open(args.cor[0], encoding='utf-8')).readlines()
        pairs = list(zip(orig_lines, cor_lines))
        # Split the pairs into one batch per process and label them in parallel
        batch_size = len(orig_lines) // args.n_procs
        splits = split(pairs, batch_size)
        partial_func = partial(label_edits, args=args)
        with Pool(args.n_procs) as pool:
            results = pool.map(partial_func, splits)
        labeled = merge(results)
        # Write the comma-separated labels for each line
        for label in tqdm(labeled):
            out_m2.write(','.join(label) + '\n')
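# The split() and merge() helpers used above are not shown. A minimal sketch, under the
# assumption that split() chunks the (orig, cor) pairs into batches of batch_size and
# merge() flattens the per-process results back into one list of labels:
def split(pairs, batch_size):
    # Return consecutive batches of at most batch_size pairs
    return [pairs[i:i + batch_size] for i in range(0, len(pairs), batch_size)]

def merge(results):
    # Flatten the per-process lists of labels into a single list
    return [label for batch in results for label in batch]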
def main():
    # Parse command line args
    args = parse_args()
    print("Loading resources...")
    # Load Errant for the requested language
    if args.lang == "English":
        annotator = errant.load("en")
    elif args.lang == "Russian":
        annotator = errant.load("ru")
    else:
        raise ValueError("Unsupported language: " + args.lang)
    print("Processing parallel files...")
    # Process an arbitrary number of files line by line simultaneously. Python 3.3+
    # See https://tinyurl.com/y4cj4gth . Also opens the output m2 file.
    with ExitStack() as stack, open(args.out, "w") as out_m2:
        in_files = [stack.enter_context(open(i)) for i in [args.orig] + args.cor]
        # Process each line of all input files
        for line in zip(*in_files):
            # Get the original and all the corrected texts
            orig = line[0].strip()
            cors = line[1:]
            # Skip the line if orig is empty
            if not orig:
                continue
            # Parse orig with spacy
            orig = annotator.parse(orig, args.tok)
            # Write orig to the output m2 file
            out_m2.write(" ".join(["S"] + [token.text for token in orig]) + "\n")
            # Loop through the corrected texts
            for cor_id, cor in enumerate(cors):
                cor = cor.strip()
                # If the texts are the same, write a noop edit
                if orig.text.strip() == cor:
                    out_m2.write(noop_edit(cor_id) + "\n")
                # Otherwise, do extra processing
                else:
                    # Parse cor with spacy
                    cor = annotator.parse(cor, args.tok)
                    # Align the texts and extract and classify the edits
                    edits = annotator.annotate(orig, cor, args.lev, args.merge)
                    # Loop through the edits
                    for edit in edits:
                        # Write the edit to the output m2 file
                        out_m2.write(edit.to_m2(cor_id) + "\n")
            # Write a newline when we have processed all corrections for each line
            out_m2.write("\n")
def get_action(s1, s2):
    # Map the first ERRANT edit between s1 and s2 to a coarse action name,
    # based on the R: (replacement), M: (missing) and U: (unnecessary) type prefixes.
    annotator = errant.load('en')
    orig = annotator.parse(s1)
    cor = annotator.parse(s2)
    edits = annotator.annotate(orig, cor)
    for e in edits:
        if 'R:' in e.type:
            return 'replace'
        elif 'M:' in e.type:
            return 'insert'
        elif 'U:' in e.type:
            return 'remove'
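# Example use of get_action (a sketch; assumes an English ERRANT model is installed and
# follows the M:/U:/R: type prefixes above):
# get_action('This are a sentence .', 'This is a sentence .')   # -> 'replace'
# get_action('This is sentence .', 'This is a sentence .')      # -> 'insert'
# get_action('This is a a sentence .', 'This is a sentence .')  # -> 'remove'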
def main():
    # Parse command line args
    args = parse_args()
    print("Loading resources...")
    # Load Errant based on the language
    annotator = errant.load(args.lang)
    # Open output m2 file
    out_m2 = open(args.out, "w", encoding='utf-8')
    print("Processing parallel files...")
    # Process an arbitrary number of files line by line simultaneously. Python 3.3+
    # See https://tinyurl.com/y4cj4gth
    with ExitStack() as stack:
        in_files = [stack.enter_context(open(i)) for i in [args.orig] + args.cor]
        # Process each line of all input files
        for line in zip(*in_files):
            # Get the original and all the corrected texts
            orig = line[0].strip()
            cors = line[1:]
            # Skip the line if orig is empty
            if not orig:
                continue
            # Parse orig to get a spacy Doc of tokens
            orig = annotator.parse(orig, args.tok)
            # Write orig to the output m2 file
            out_m2.write(" ".join(["S"] + [token.text for token in orig]) + "\n")
            # Loop through the corrected texts
            for cor_id, cor in enumerate(cors):
                cor = cor.strip()
                # If the texts are the same, write a noop edit.
                # Compare the tokenised original (' '.join of token texts) rather than
                # orig.text.strip(), so the comparison matches the tokenised correction.
                if ' '.join(o.text for o in orig) == cor:
                    out_m2.write(noop_edit(cor_id) + "\n")
                # Otherwise, do extra processing
                else:
                    # Parse cor to get a spacy Doc of tokens
                    cor = annotator.parse(cor, args.tok)
                    # Align the texts and extract and classify the edits
                    edits = annotator.annotate(orig, cor, args.lev, args.merge)
                    # Loop through the edits
                    for edit in edits:
                        # Write the edit to the output m2 file
                        out_m2.write(edit.to_m2(cor_id) + "\n")
            # Write a newline when we have processed all corrections for each line
            out_m2.write("\n")
def get_category(s1, s2):
    # Map the first ERRANT edit between s1 and s2 to a coarse error category.
    # Assumes module-level `punc`, `ing` and `get_diff()` helpers are defined elsewhere.
    annotator = errant.load('en')
    orig = annotator.parse(s1)
    cor = annotator.parse(s2)
    edits = annotator.annotate(orig, cor)
    for e in edits:
        if 'DET' in e.type:
            return 'Articles'
        elif 'PREP' in e.type or 'PART' in e.type:
            return 'Preposition'
        elif 'PUNCT' in e.type or get_diff(e.o_str, e.c_str)[0] in punc:
            return 'Punctuation'
        elif 'VERB' in e.type or set(ing).issubset(set(get_diff(e.o_str, e.c_str))):
            if 'SVA' in e.type:
                return 'Subject Verb Agreement'
            else:
                return 'Verb Form'
        elif 'NOUN' in e.type or 'ADJ' in e.type or 'MORPH' in e.type \
                or 'SPELL' in e.type or 'ORTH' in e.type:
            return 'Word Form'
        else:
            return 'Other'
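# Example use of get_category (a sketch; the exact behaviour of some branches depends on
# the undefined punc/ing/get_diff helpers, but DET and VERB:SVA edits map directly):
# get_category('This is sentence .', 'This is a sentence .')  # M:DET -> 'Articles'
# get_category('He go to school .', 'He goes to school .')    # R:VERB:SVA -> 'Subject Verb Agreement'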
def get_explanation(s1, s2):
    annotator = errant.load('en')
    orig = annotator.parse(s1)
    cor = annotator.parse(s2)
    edits = annotator.annotate(orig, cor)
    error = get_category(s1, s2)
    if error == 'Verb Form':
        for e in edits:
            if 'TENSE' in e.type:
                if 'R:' in e.type:
                    return f"Verb tense error, should replace '{e.o_str}' with '{e.c_str}'."
                elif 'M:' in e.type:
                    return f"Verb tense error, should insert '{e.c_str}'."
                else:
                    return f"Verb tense error, should remove '{e.o_str}'."
            elif 'FORM' in e.type:
                if 'R:' in e.type:
                    return f"Verb form error, should replace '{e.o_str}' with '{e.c_str}'."
                elif 'M:' in e.type:
                    return f"Verb form error, should insert '{e.c_str}'."
                else:
                    return f"Verb form error, should remove '{e.o_str}'."
            elif set(ing).issubset(set(get_diff(e.o_str, e.c_str))):
                if 'R:' in e.type:
                    return f"Present continuous tense, should replace '{e.o_str}' with '{e.c_str}'."
                elif 'M:' in e.type:
                    return f"Present continuous tense, should insert '{e.c_str}'."
                else:
                    return f"Present continuous tense, should remove '{e.o_str}'."
            else:
                if 'R:' in e.type:
                    return f"Other verb error, should replace '{e.o_str}' with '{e.c_str}'."
                elif 'M:' in e.type:
                    return f"Other verb error, should insert '{e.c_str}'."
                else:
                    return f"Other verb error, should remove '{e.o_str}'."
    elif error == 'Word Form':
        for e in edits:
            if 'NUM' in e.type:
                if 'R:' in e.type:
                    return f"Noun number error, should replace '{e.o_str}' with '{e.c_str}'."
                elif 'M:' in e.type:
                    return f"Noun number error, should insert '{e.c_str}'."
                else:
                    return f"Noun number error, should remove '{e.o_str}'."
            elif 'ADJ' in e.type:
                if 'R:' in e.type:
                    return f"Adjective error, should replace '{e.o_str}' with '{e.c_str}'."
                elif 'M:' in e.type:
                    return f"Adjective error, should insert '{e.c_str}'."
                else:
                    return f"Adjective error, should remove '{e.o_str}'."
            elif 'MORPH' in e.type:
                if 'R:' in e.type:
                    return f"Morphology error, should replace '{e.o_str}' with '{e.c_str}'."
                elif 'M:' in e.type:
                    return f"Morphology error, should insert '{e.c_str}'."
                else:
                    return f"Morphology error, should remove '{e.o_str}'."
            elif 'ORTH' in e.type:
                if 'R:' in e.type:
                    return f"Orthography error, should replace '{e.o_str}' with '{e.c_str}'."
                elif 'M:' in e.type:
                    return f"Orthography error, should insert '{e.c_str}'."
                else:
                    return f"Orthography error, should remove '{e.o_str}'."
            else:
                if 'R:' in e.type:
                    return f"Other word form error, should replace '{e.o_str}' with '{e.c_str}'."
                elif 'M:' in e.type:
                    return f"Other word form error, should insert '{e.c_str}'."
                else:
                    return f"Other word form error, should remove '{e.o_str}'."
    elif error == 'Punctuation':
        for e in edits:
            if 'M:' in e.type:
                return f"Consider adding the punctuation '{e.c_str}' to your sentence."
            if 'R:' in e.type:
                return f"Consider changing the punctuation to '{e.c_str}'."
            if 'U:' in e.type:
                # Unnecessary edits have an empty correction, so report the original token
                return f"Please remove the unnecessary punctuation '{e.o_str}'."
    elif error == 'Subject Verb Agreement':
        for e in edits:
            return f"Please check the subject-verb agreement and choose the appropriate form of the verb '{e.c_str}'."
    elif error == 'Articles':
        for e in edits:
            if 'R:' in e.type:
                return f"Consider the article '{e.c_str}' in front of countable or singular nouns referring to people or things that have not already been mentioned."
            elif 'M:' in e.type:
                return f"The article '{e.c_str}' is required before countable or singular nouns referring to people or things that have not already been mentioned."
            elif 'U:' in e.type:
                return 'No article required'
    elif error == 'Preposition':
        for e in edits:
            if 'R:' in e.type:
                return f"Consider '{e.c_str}' to be the proper preposition."
            elif 'M:' in e.type:
                return f"You need the preposition '{e.c_str}' before a noun or pronoun to show place, position, time or method."
            elif 'U:' in e.type:
                return f"You don't need the preposition '{e.o_str}' here; consider removing it."
    else:
        return 'Others'
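# Example use of get_explanation (a sketch; the exact wording comes from the templates
# above and the category returned by get_category):
# get_explanation('This is sentence .', 'This is a sentence .')
#   -> "The article 'a' is required before countable or singular nouns referring to
#       people or things that have not already been mentioned."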
def main():
    # Parse command line args
    args = parse_args()
    print("Loading resources...")
    # Load Errant
    annotator = errant.load("en")
    # Open output M2 file
    out_m2 = open(args.out, "w")
    print("Processing M2 file...")
    # Open the m2 file and split it into text+edit blocks
    m2 = open(args.m2_file).read().strip().split("\n\n")
    # Loop through the blocks
    for m2_block in m2:
        m2_block = m2_block.strip().split("\n")
        # Write the original text to the output M2 file
        out_m2.write(m2_block[0] + "\n")
        # Parse orig with spacy
        orig = annotator.parse(m2_block[0][2:])
        # Simplify the edits and sort by coder id
        edit_dict = simplify_edits(m2_block[1:])
        # Loop through coder ids
        for id, raw_edits in sorted(edit_dict.items()):
            # If the first edit is a noop
            if raw_edits[0][2] == "noop":
                # Write the noop and continue
                out_m2.write(noop_edit(id) + "\n")
                continue
            # Apply the edits to generate the corrected text
            # Also redefine the edits as orig and cor token offsets
            cor, gold_edits = get_cor_and_edits(m2_block[0][2:], raw_edits)
            # Parse cor with spacy
            cor = annotator.parse(cor)
            # Save detection edits here for auto
            det_edits = []
            # Loop through the gold edits
            for gold_edit in gold_edits:
                # Do not minimise detection edits
                if gold_edit[-2] in {"Um", "UNK"}:
                    edit = annotator.import_edit(orig, cor, gold_edit[:-1],
                                                 min=False, old_cat=args.old_cats)
                    # Overwrite the pseudo correction and set it in the edit
                    edit.c_toks = annotator.parse(gold_edit[-1])
                    # Save the edit for auto
                    det_edits.append(edit)
                    # Write the edit for gold
                    if args.gold:
                        # Write the edit
                        out_m2.write(edit.to_m2(id) + "\n")
                # Gold annotation
                elif args.gold:
                    edit = annotator.import_edit(orig, cor, gold_edit[:-1],
                                                 not args.no_min, args.old_cats)
                    # Write the edit
                    out_m2.write(edit.to_m2(id) + "\n")
            # Auto annotations
            if args.auto:
                # Auto edits
                edits = annotator.annotate(orig, cor, args.lev, args.merge)
                # Combine detection and auto edits and sort by orig offsets
                edits = sorted(det_edits + edits, key=lambda e: (e.o_start, e.o_end))
                # Write the edits to the output M2 file
                for edit in edits:
                    out_m2.write(edit.to_m2(id) + "\n")
        # Write a newline when there are no more edits
        out_m2.write("\n")
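# For reference, an M2 block as read from args.m2_file consists of one "S ..." line with
# the tokenised original sentence, followed by one "A ..." line per edit; a sketch:
# S This are gramamtical sentence .
# A 1 2|||R:VERB:SVA|||is|||REQUIRED|||-NONE-|||0
# A 2 2|||M:DET|||a|||REQUIRED|||-NONE-|||0
# A 2 3|||R:SPELL|||grammatical|||REQUIRED|||-NONE-|||0
# Blocks are separated by a blank line, which is why the file is split on "\n\n" above.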
import errant

# annotator = errant.load('en')
# orig = annotator.parse('This are gramamtical sentence .')
# cor = annotator.parse('This is a grammatical sentence .')
# edit = [1, 2, 1, 2, 'SVA']  # are -> is
# edit = annotator.import_edit(orig, cor, edit)
# print(edit.to_m2())

annotator = errant.load('en')
orig = annotator.parse('This are gramamtical sentence .')
cor = annotator.parse('This is a grammatical sentence .')
alignment = annotator.align(orig, cor)
edits = annotator.merge(alignment)
for e in edits:
    e = annotator.classify(e)
    print(e)
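# The align/merge/classify calls above are the step-by-step equivalent of
# annotator.annotate(orig, cor). For this example the printed edits should look roughly
# like the following (exact repr may differ by ERRANT version):
# Orig: [1, 2, 'are'], Cor: [1, 2, 'is'], Type: 'R:VERB:SVA'
# Orig: [2, 2, ''], Cor: [2, 3, 'a'], Type: 'M:DET'
# Orig: [2, 3, 'gramamtical'], Cor: [3, 4, 'grammatical'], Type: 'R:SPELL'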
def main():
    # Parse command line args
    args = parse_args()
    print("Loading resources...")
    # Load Tokenizer and other resources
    nlp = spacy.load("en")
    # Load Errant
    annotator = errant.load("en", nlp)
    # Punctuation normalisation dictionary
    norm_dict = {
        "’": "'", "´": "'", "‘": "'", "′": "'", "`": "'",
        '“': '"', '”': '"', '˝': '"', '¨': '"', '„': '"', '『': '"', '』': '"',
        '–': '-', '—': '-', '―': '-', '¬': '-',
        '、': ',', ',': ',', ':': ':', ';': ';', '?': '?', '!': '!',
        'ِ': ' ', '\u200b': ' '
    }
    norm_dict = {ord(k): v for k, v in norm_dict.items()}
    # Open output M2 file
    out_m2 = open(args.out, "w")
    print("Preprocessing files...")
    # Open the file
    with open(args.json_file) as data:
        # Process each line
        for line in data:
            # Load the JSON line
            line = json.loads(line)
            # Normalise certain punctuation in the text
            text = line["text"].translate(norm_dict)
            # Store the sentences and edits for all annotators here
            coder_dict = {}
            # Loop through the annotator ids and their edits
            for coder, edits in line["edits"]:
                # Add the coder to the coder_dict if needed
                if coder not in coder_dict:
                    coder_dict[coder] = []
                # Split the essay into paras and update and normalise the char edits
                para_info = get_paras(text, edits, norm_dict)
                # Loop through the paras and edits
                for orig_para, para_edits in para_info:
                    # Remove unnecessary whitespace from para and update char edits
                    orig_para, para_edits = clean_para(orig_para, para_edits)
                    if not orig_para:
                        continue  # Ignore empty paras
                    # Convert character edits to token edits based on spacy tokenisation
                    orig_para = nlp(orig_para)
                    para_edits = get_token_edits(orig_para, para_edits, nlp)
                    # Split the paragraph into sentences, if needed, and update tok edits
                    sents = get_sents(orig_para, para_edits, sent_tokenised=True)
                    # Save the sents in the coder_dict
                    coder_dict[coder].extend(sents)
            # Document level M2 file. Merge the text as a single long string
            if args.docs:
                coder_dict = doc_m2(coder_dict)
            # Get the sorted coder ids
            coder_ids = sorted(coder_dict.keys())
            # Loop through the sentences for the first coder
            for sent_id, sent in enumerate(coder_dict[0]):
                # Write the original sentence to the output M2 file
                out_m2.write("S " + " ".join(sent["orig"]) + "\n")
                # Annotate the original sentence with spacy
                orig = annotator.parse(" ".join(sent["orig"]))
                # Loop through the coders
                for id in coder_ids:
                    # Annotate the corrected sentence with spacy and get the gold edits
                    cor = annotator.parse(" ".join(coder_dict[id][sent_id]["cor"]))
                    gold_edits = coder_dict[id][sent_id]["edits"]
                    # Gold edits
                    if args.gold:
                        # Make sure edits are ordered by orig start and end offsets.
                        gold_edits = sorted(gold_edits, key=itemgetter(0))  # Start
                        gold_edits = sorted(gold_edits, key=itemgetter(1))  # End
                        proc_edits = []
                        # Loop through the gold edits.
                        for gold_edit in gold_edits:
                            # Format the edit for errant import
                            gold_edit = gold_edit[:2] + gold_edit[-2:] + [gold_edit[2]]
                            # Detection edits (never minimised)
                            if gold_edit[-1] == "D":
                                gold_edit = annotator.import_edit(
                                    orig, cor, gold_edit, min=False, old_cat=args.old_cats)
                            # Correction edits
                            else:
                                gold_edit = annotator.import_edit(
                                    orig, cor, gold_edit, not args.no_min, args.old_cats)
                            # Ignore edits that have been minimised to nothing
                            if gold_edit.o_start == gold_edit.o_end and \
                                    not gold_edit.c_str:
                                continue
                            # Save the edit in proc edits
                            proc_edits.append(gold_edit)
                        # If there are no edits, write an explicit noop edit.
                        if not proc_edits:
                            out_m2.write(noop_edit(id) + "\n")
                        # Write the edits to the output M2 file
                        for edit in proc_edits:
                            out_m2.write(edit.to_m2(id) + "\n")
                    # Auto edits
                    elif args.auto:
                        auto_edits = annotator.annotate(orig, cor, args.lev, args.merge)
                        # If there are no edits, write an explicit noop edit.
                        if not auto_edits:
                            out_m2.write(noop_edit(id) + "\n")
                        # Write the edits to the output M2 file
                        for edit in auto_edits:
                            out_m2.write(edit.to_m2(id) + "\n")
                # Write new line after each sentence when we reach last coder.
                out_m2.write("\n")
import logging

import requests
import spacy
import tornado.web
from time import time

from gector.gec_model import GecBERTModel
from utils.helpers import (add_sents_idx, add_tokens_idx, token_level_edits,
                           forward_merge_corrections, backward_merge_corrections)
from copy import deepcopy
import pprint
import errant

logging.basicConfig(
    format='%(levelname)s: [%(asctime)s][%(filename)s:%(lineno)d] %(message)s',
    level=logging.INFO)

nlp = spacy.load("en")
annotator = errant.load(lang='en', nlp=nlp)
model = GecBERTModel(
    vocab_path="./data/output_vocabulary",
    model_paths=["./pretrain/roberta_1_gector.th"],
    # model_paths=["./pretrain/bert_0_gector.th", "./pretrain/roberta_1_gector.th", "./pretrain/xlnet_0_gector.th"],
    model_name="roberta",
    is_ensemble=False,
    iterations=3,
)

DEFAULT_CONFIG = {
    'iterations': 3,
    'min_probability': 0.5,
    'min_error_probability': 0.7,
    'case_sensitive': True,