import argparse
import json
import os
from collections import defaultdict
from contextlib import ExitStack
from operator import itemgetter

import spacy
from nltk.stem import LancasterStemmer

# Local helper modules (assumed to sit alongside these scripts).
import align_text
import cat_rules
import toolbox


def main(args):
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading resources...")
    # Load Tokenizer and other resources
    nlp = spacy.load("en")
    # Lancaster Stemmer
    stemmer = LancasterStemmer()
    # GB English word list (inc -ise and -ize)
    gb_spell = toolbox.loadDictionary(basename + "/resources/en_GB-large.txt")
    # Part of speech map file
    tag_map = toolbox.loadTagMap(basename + "/resources/en-ptb_map")
    # Setup output m2 file
    out_m2 = open(args.out, "w")

    # ExitStack lets us process an arbitrary number of files line by line simultaneously.
    # See https://stackoverflow.com/questions/24108769/how-to-read-and-process-multiple-files-simultaneously-in-python
    print("Processing files...")
    with ExitStack() as stack:
        in_files = [stack.enter_context(open(i)) for i in [args.orig] + args.cor]
        # Process each line of all input files.
        for line_id, line in enumerate(zip(*in_files)):
            orig_sent = line[0].strip()
            cor_sents = line[1:]
            # If orig sent is empty, skip the line
            if not orig_sent:
                continue
            # Write the original sentence to the output m2 file.
            out_m2.write("S " + orig_sent + "\n")
            # Markup the original sentence with spacy (assume tokenized)
            proc_orig = toolbox.applySpacy(orig_sent.split(), nlp)
            # Loop through the corrected sentences
            for cor_id, cor_sent in enumerate(cor_sents):
                cor_sent = cor_sent.strip()
                # Identical sentences have no edits, so just write noop.
                if orig_sent == cor_sent:
                    out_m2.write("A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||" + str(cor_id) + "\n")
                # Otherwise, do extra processing.
                else:
                    # Markup the corrected sentence with spacy (assume tokenized)
                    proc_cor = toolbox.applySpacy(cor_sent.split(), nlp)
                    # Auto align the parallel sentences and extract the edits.
                    auto_edits = align_text.getAutoAlignedEdits(proc_orig, proc_cor, nlp, args)
                    # Loop through the edits.
                    for auto_edit in auto_edits:
                        # Give each edit an automatic error type.
                        cat = cat_rules.autoTypeEdit(auto_edit, proc_orig, proc_cor,
                                                     gb_spell, tag_map, nlp, stemmer)
                        auto_edit[2] = cat
                        # Write the edit to the output m2 file.
                        out_m2.write(toolbox.formatEdit(auto_edit, cor_id) + "\n")
            # Write a newline when we have processed all corrections for a given sentence.
            out_m2.write("\n")
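# A minimal sketch of the argparse setup the main() above expects. Only
# args.orig, args.cor (a list) and args.out are read directly; the flag names
# and help strings here are assumptions, and align_text.getAutoAlignedEdits
# may read further alignment options from args that are not shown.
def parse_args_sketch():
    parser = argparse.ArgumentParser()
    parser.add_argument("-orig", required=True,
                        help="Original text file; one tokenized sentence per line.")
    parser.add_argument("-cor", nargs="+", required=True,
                        help="One or more corrected text files, parallel to -orig.")
    parser.add_argument("-out", required=True,
                        help="Output m2 file.")
    return parser.parse_args()

# Usage would then be: main(parse_args_sketch())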
def main(args):
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading resources...")
    # Load Tokenizer and other resources
    nlp = spacy.load("en")
    # Lancaster Stemmer
    stemmer = LancasterStemmer()
    # GB English word list (inc -ise and -ize)
    gb_spell = toolbox.loadDictionary(basename + "/resources/en_GB-large.txt")
    # Part of speech map file
    tag_map = toolbox.loadTagMap(basename + "/resources/en-ptb_map")
    # Setup output m2 file
    out_m2 = open(args.out, "w")

    print("Processing files...")
    # Open the original and corrected text files.
    with open(args.orig) as orig, open(args.cor) as cor:
        # Process each pre-aligned sentence pair.
        for orig_sent, cor_sent in zip(orig, cor):
            # Write the original sentence to the output m2 file.
            out_m2.write("S " + orig_sent)
            # Identical sentences have no edits, so just write noop.
            if orig_sent.strip() == cor_sent.strip():
                out_m2.write("A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0\n")
            # Otherwise, do extra processing.
            else:
                # Markup the parallel sentences with spacy (assume tokenized)
                proc_orig = toolbox.applySpacy(orig_sent.strip().split(), nlp)
                proc_cor = toolbox.applySpacy(cor_sent.strip().split(), nlp)
                # Auto align the parallel sentences and extract the edits.
                auto_edits = align_text.getAutoAlignedEdits(proc_orig, proc_cor, nlp, args)
                # Loop through the edits.
                for auto_edit in auto_edits:
                    # Give each edit an automatic error type.
                    cat = cat_rules.autoTypeEdit(auto_edit, proc_orig, proc_cor,
                                                 gb_spell, tag_map, nlp, stemmer)
                    auto_edit[2] = cat
                    # Write the edit to the output m2 file.
                    out_m2.write(toolbox.formatEdit(auto_edit) + "\n")
            # Write a newline when there are no more edits.
            out_m2.write("\n")
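# For reference, a single sentence in the m2 output produced above looks like
# this (the sentence and edit shown are illustrative, not from real data):
#
#   S This are a sentence .
#   A 1 2|||R:VERB:SVA|||is|||REQUIRED|||-NONE-|||0
#
# "S" lines hold the tokenized original sentence, each "A" line holds one edit
# (token span, error type, correction, and a trailing annotator ID), and a
# blank line terminates the block.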
def main(args):
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading resources...")
    # Load Tokenizer and other resources
    nlp = spacy.load("en")
    # Lancaster Stemmer
    stemmer = LancasterStemmer()
    # GB English word list (inc -ise and -ize)
    gb_spell = toolbox.loadDictionary(basename + "/resources/en_GB-large.txt")
    # Part of speech map file
    tag_map = toolbox.loadTagMap(basename + "/resources/en-ptb_map")
    # Setup output m2 file
    out_m2 = open(args.out, "w")

    print("Processing files...")
    # Dict mapping each source sentence to its edit lines, collected across
    # multiple annotators of the same source sentence.
    src_dict = defaultdict(list)
    # Dict counting how many times each source sentence has been seen; the
    # count doubles as the annotator ID written to the m2 file. Counting per
    # sentence keeps the ID correct even when a duplicate source sentence does
    # not immediately follow its first occurrence.
    src_seen = defaultdict(int)
    # Open the original and corrected text files.
    with open(args.orig) as orig, open(args.cor) as cor:
        # Process each pre-aligned sentence pair.
        for orig_sent, cor_sent in zip(orig, cor):
            src_sent = "S " + orig_sent
            # Annotator ID: 0 the first time this source sentence occurs, 1 the next, etc.
            coder_id = src_seen[src_sent]
            src_seen[src_sent] += 1
            # Identical sentences have no edits, so just store a noop.
            if orig_sent.strip() == cor_sent.strip():
                src_dict[src_sent].append(
                    "A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||" + str(coder_id) + "\n")
            # Otherwise, do extra processing.
            else:
                # Markup the parallel sentences with spacy (assume tokenized)
                proc_orig = toolbox.applySpacy(orig_sent.strip().split(), nlp)
                proc_cor = toolbox.applySpacy(cor_sent.strip().split(), nlp)
                # Auto align the parallel sentences and extract the edits.
                auto_edits = align_text.getAutoAlignedEdits(proc_orig, proc_cor, nlp, args)
                # Loop through the edits.
                for auto_edit in auto_edits:
                    # Give each edit an automatic error type.
                    cat = cat_rules.autoTypeEdit(auto_edit, proc_orig, proc_cor,
                                                 gb_spell, tag_map, nlp, stemmer)
                    auto_edit[2] = cat
                    # Replace the default annotator ID at the end of the formatted
                    # edit with this annotator's ID, then store the edit under its
                    # source sentence.
                    edit_to_be_written = toolbox.formatEdit(auto_edit)
                    src_dict[src_sent].append(edit_to_be_written[:-1] + str(coder_id) + "\n")
    # Finally write the source sentences (keys) and edits (values) to the m2 file.
    for source_sent, edits in src_dict.items():
        out_m2.write(source_sent)
        for edit in edits:
            out_m2.write(edit)
        out_m2.write("\n")
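# With this variant, an input that repeats a source sentence, e.g.
#
#   orig: This are a sentence .    cor (1st annotator): This is a sentence .
#   orig: This are a sentence .    cor (2nd annotator): These are sentences .
#
# yields a single "S" block whose edit lines carry annotator IDs 0 and 1
# respectively, instead of two separate blocks. (Example sentences are
# illustrative only.)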
def main(args):
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading resources...")
    # Punctuation normalisation dictionary
    norm_dict = {"’": "'", "´": "'", "‘": "'", "′": "'", "`": "'",
                 '“': '"', '”': '"', '˝': '"', '¨': '"', '„': '"', '『': '"', '』': '"',
                 '–': '-', '—': '-', '―': '-', '¬': '-',
                 '、': ',', ',': ',', ':': ':', ';': ';', '?': '?', '!': '!',
                 'ِ': ' ', '\u200b': ' '}
    norm_dict = {ord(k): v for k, v in norm_dict.items()}
    # Load Tokenizer and other resources
    nlp = spacy.load("en")
    # Lancaster Stemmer
    stemmer = LancasterStemmer()
    # GB English word list (inc -ise and -ize)
    gb_spell = toolbox.loadDictionary(basename + "/resources/en_GB-large.txt")
    # Part of speech map file
    tag_map = toolbox.loadTagMap(basename + "/resources/en-ptb_map")
    # Setup output m2 file
    out_m2 = open(args.out, "w")

    print("Preprocessing files...")
    # Open the file
    with open(args.json_file) as data:
        # Process each line
        for line in data:
            # Load the JSON line
            line = json.loads(line)
            # Normalise certain punctuation in the text
            text = line["text"].translate(norm_dict)
            # Store the sentences and edits for all annotators here
            coder_dict = {}
            # Loop through the annotator ids and their edits
            for coder, edits in line["edits"]:
                # Add the coder to the coder_dict if needed
                if coder not in coder_dict:
                    coder_dict[coder] = []
                # Split the essay into paragraphs and update and normalise the char edits
                para_info = getParas(text, edits, norm_dict)
                # Loop through the paragraphs and edits
                for orig_para, para_edits in para_info:
                    # Remove unnecessary whitespace from para and update char edits
                    orig_para, para_edits = cleanPara(orig_para, para_edits)
                    # Ignore empty paras
                    if not orig_para:
                        continue
                    # Annotate orig_para with spacy
                    orig_para = nlp(orig_para)
                    # Convert character edits to token edits
                    para_edits = getTokenEdits(orig_para, para_edits, nlp)
                    # Split the paragraph into sentences and update tok edits
                    sents = getSents(orig_para, para_edits)
                    # Save the sents in the coder_dict
                    coder_dict[coder].extend(sents)
            # Get the sorted coder ids
            coder_ids = sorted(coder_dict.keys())
            # Loop through the sentences for the first coder
            for sent_id, sent in enumerate(coder_dict[0]):
                # Write the original sentence to the output M2 file
                out_m2.write("S " + " ".join(sent["orig"]) + "\n")
                # Annotate the original sentence with spacy
                orig_sent = toolbox.applySpacy(sent["orig"], nlp)
                # Loop through the coders
                for coder in coder_ids:
                    # Annotate the corrected sentence with spacy and get the gold edits
                    cor_sent = toolbox.applySpacy(coder_dict[coder][sent_id]["cor"], nlp)
                    gold_edits = coder_dict[coder][sent_id]["edits"]
                    # Gold edits
                    if args.gold:
                        # Make sure edits are ordered in terms of start, then end offsets.
                        gold_edits = sorted(gold_edits, key=itemgetter(0))  # Sort by start offset
                        gold_edits = sorted(gold_edits, key=itemgetter(1))  # Sort by end offset
                        min_edits = []
                        # Loop through the gold edits.
                        for gold_edit in gold_edits:
                            # Minimise correction (not detection D) edits: e.g. [has eaten -> eaten] = [has -> ε]
                            if gold_edit[2] == "C":
                                gold_edit = toolbox.minimiseEdit(gold_edit, orig_sent, cor_sent)
                            # Classify and save non-empty edits
                            if gold_edit:
                                cat = cat_rules.autoTypeEdit(gold_edit, orig_sent, cor_sent,
                                                             gb_spell, tag_map, nlp, stemmer)
                                gold_edit[2] = cat
                                min_edits.append(gold_edit)
                        # If there are no minimised edits, write an explicit empty edit
                        if not min_edits:
                            out_m2.write("A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||" + str(coder) + "\n")
                        # Otherwise loop through the edits and write them to the output m2 file.
                        for edit in min_edits:
                            out_m2.write(toolbox.formatEdit(edit, coder) + "\n")
                    # Auto edits
                    elif args.auto:
                        # Auto align the parallel sentences and extract the edits.
                        auto_edits = align_text.getAutoAlignedEdits(orig_sent, cor_sent, nlp, args)
                        # If there are no edits, write an explicit noop edit.
                        if not auto_edits:
                            out_m2.write("A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||" + str(coder) + "\n")
                        # Loop through the edits.
                        for auto_edit in auto_edits:
                            # Give each edit an automatic error type.
                            cat = cat_rules.autoTypeEdit(auto_edit, orig_sent, cor_sent,
                                                         gb_spell, tag_map, nlp, stemmer)
                            auto_edit[2] = cat
                            # Write the edit to the output m2 file.
                            out_m2.write(toolbox.formatEdit(auto_edit, coder) + "\n")
                # Write a newline after each sentence when we reach the last coder.
                out_m2.write("\n")
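# This main() assumes a JSON-lines input file. Based only on the fields
# accessed above, each line presumably looks something like the following
# (the values are illustrative; the exact edit representation depends on the
# getParas/cleanPara/getTokenEdits/getSents helpers, which are defined
# elsewhere):
#
#   {"text": "This are a sentence.",
#    "edits": [[0, [[5, 8, "C", "is"]]],
#              [1, [[5, 8, "D", null]]]]}
#
# i.e. "edits" is a list of [coder_id, edits] pairs, where each edit carries
# start/end character offsets followed by a type flag, "C" for a correction
# or "D" for a detection.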
def main(args):
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading resources...")
    # Load Tokenizer and other resources
    nlp = spacy.load("en")
    # Lancaster Stemmer
    stemmer = LancasterStemmer()
    # GB English word list (inc -ise and -ize)
    gb_spell = toolbox.loadDictionary(basename + "/resources/en_GB-large.txt")
    # Part of speech map file
    tag_map = toolbox.loadTagMap(basename + "/resources/en-ptb_map")
    # Setup output m2 file
    out_m2 = open(args.out, "w")

    print("Processing files...")
    # Open the m2 file and split into sentence+edit chunks.
    m2_file = open(args.m2).read().strip().split("\n\n")
    for info in m2_file:
        # Get the original and corrected sentence + edits for each annotator.
        orig_sent, coder_dict = toolbox.processM2(info)
        # Write the orig_sent to the output m2 file.
        out_m2.write("S " + " ".join(orig_sent) + "\n")
        # Markup the original sentence with spacy (assume tokenized)
        proc_orig = toolbox.applySpacy(orig_sent, nlp)
        # Loop through the annotators
        for coder, coder_info in sorted(coder_dict.items()):
            cor_sent = coder_info[0]
            gold_edits = coder_info[1]
            # Markup the corrected sentence with spacy (assume tokenized)
            proc_cor = toolbox.applySpacy(cor_sent, nlp)
            # Gold edits
            if args.gold:
                # Loop through the gold edits.
                for gold_edit in gold_edits:
                    # Write noop edits to the output m2 file.
                    if gold_edit[2] == "noop":
                        out_m2.write(toolbox.formatEdit(gold_edit, coder) + "\n")
                        continue
                    # Minimise the edit; e.g. [has eaten -> was eaten] = [has -> was]
                    if not args.max_edits:
                        gold_edit = toolbox.minimiseEdit(gold_edit, proc_orig, proc_cor)
                        # If minimised to nothing, the edit disappears.
                        if not gold_edit:
                            continue
                    # Give the edit an automatic error type.
                    if not args.old_cats:
                        cat = cat_rules.autoTypeEdit(gold_edit, proc_orig, proc_cor,
                                                     gb_spell, tag_map, nlp, stemmer)
                        gold_edit[2] = cat
                    # Write the edit to the output m2 file.
                    out_m2.write(toolbox.formatEdit(gold_edit, coder) + "\n")
            # Auto edits
            elif args.auto:
                # Auto align the parallel sentences and extract the edits.
                auto_edits = align_text.getAutoAlignedEdits(proc_orig, proc_cor, nlp, args)
                # If there are no edits, write an explicit noop edit.
                if not auto_edits:
                    out_m2.write("A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||" + str(coder) + "\n")
                # Loop through the edits.
                for auto_edit in auto_edits:
                    # Give each edit an automatic error type.
                    cat = cat_rules.autoTypeEdit(auto_edit, proc_orig, proc_cor,
                                                 gb_spell, tag_map, nlp, stemmer)
                    auto_edit[2] = cat
                    # Write the edit to the output m2 file.
                    out_m2.write(toolbox.formatEdit(auto_edit, coder) + "\n")
        # Write a newline when there are no more coders.
        out_m2.write("\n")
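# For reference, the input chunks split on "\n\n" above are standard m2 blocks
# with one "S" line and one "A" line per edit, e.g. (illustrative):
#
#   S This are a sentence .
#   A 1 2|||SVA|||is|||REQUIRED|||-NONE-|||0
#   A 1 2|||R:VERB:SVA|||is|||REQUIRED|||-NONE-|||1
#
# toolbox.processM2 presumably recovers the original tokens and, per annotator
# ID, the corrected tokens plus gold edits from such a block, which this
# main() then re-minimises and/or re-categorises.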