def main(args):
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading resources...")
    # Load Tokenizer and other resources
    nlp = spacy.load("en")
    # Lancaster Stemmer
    stemmer = LancasterStemmer()
    # GB English word list (inc -ise and -ize)
    gb_spell = toolbox.loadDictionary(basename + "/resources/en_GB-large.txt")
    # Part of speech map file
    tag_map = toolbox.loadTagMap(basename + "/resources/en-ptb_map")
    # Setup output m2 file
    out_m2 = open(args.out, "w")
    # ExitStack lets us process an arbitrary number of files line by line simultaneously.
    # See https://stackoverflow.com/questions/24108769/how-to-read-and-process-multiple-files-simultaneously-in-python
    print("Processing files...")
    with ExitStack() as stack:
        in_files = [stack.enter_context(open(i)) for i in [args.orig] + args.cor]
        # Process each line of all input files.
        for line_id, line in enumerate(zip(*in_files)):
            orig_sent = line[0].strip()
            cor_sents = line[1:]
            # If orig sent is empty, skip the line
            if not orig_sent:
                continue
            # Write the original sentence to the output m2 file.
            out_m2.write("S " + orig_sent + "\n")
            # Markup the original sentence with spacy (assume tokenized)
            proc_orig = toolbox.applySpacy(orig_sent.split(), nlp)
            # Loop through the corrected sentences
            for cor_id, cor_sent in enumerate(cor_sents):
                cor_sent = cor_sent.strip()
                # Identical sentences have no edits, so just write noop.
                if orig_sent == cor_sent:
                    out_m2.write("A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||" + str(cor_id) + "\n")
                # Otherwise, do extra processing.
                else:
                    # Markup the corrected sentence with spacy (assume tokenized)
                    proc_cor = toolbox.applySpacy(cor_sent.split(), nlp)
                    # Auto align the parallel sentences and extract the edits.
                    auto_edits = align_text.getAutoAlignedEdits(proc_orig, proc_cor, nlp, args)
                    # Loop through the edits.
                    for auto_edit in auto_edits:
                        # Give each edit an automatic error type.
                        cat = cat_rules.autoTypeEdit(auto_edit, proc_orig, proc_cor, gb_spell, tag_map, nlp, stemmer)
                        auto_edit[2] = cat
                        # Write the edit to the output m2 file.
                        out_m2.write(toolbox.formatEdit(auto_edit, cor_id) + "\n")
            # Write a newline when we have processed all corrections for a given sentence.
            out_m2.write("\n")
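# --- Usage sketch (not part of the original script) ---------------------------------
# A minimal argparse example showing how the `args` namespace consumed by the
# multi-correction main() above might be built: args.orig is a single file path,
# args.cor a list of corrected files, args.out the output M2 path. The flag names,
# help strings and alignment options below are assumptions for illustration only.
def parse_args_sketch():
    import argparse
    parser = argparse.ArgumentParser(description="Convert parallel original and corrected text files to M2 format.")
    parser.add_argument("-orig", required=True, help="Path to the original (source) text file.")
    parser.add_argument("-cor", nargs="+", required=True, help="Path(s) to one or more corrected text files.")
    parser.add_argument("-out", required=True, help="Path to the output M2 file.")
    parser.add_argument("-lev", action="store_true", help="Align using standard Levenshtein (assumed flag).")
    parser.add_argument("-merge", choices=["rules", "all-split", "all-merge", "all-equal"], default="rules",
                        help="Edit merging strategy (assumed flag).")
    return parser.parse_args()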
def _generate_m2(orig_sent, cor_sent):
    ignore_count = 0
    out_m2_str = ''
    # Process one pre-aligned sentence pair.
    try:
        # Check sentence length:
        if len(orig_sent.strip().split()) < 3:
            raise Exception('Source sentence is too short.')
        if len(cor_sent.strip().split()) < 3:
            raise Exception('Target sentence is too short.')
        # Detokenize sents if they're pre-tokenized. Otherwise the result will be wrong.
        if args.is_tokenized_orig:
            orig_sent = detokenizer.detokenize(orig_sent.strip().split(), return_str=True)
        if args.is_tokenized_cor:
            cor_sent = detokenizer.detokenize(cor_sent.strip().split(), return_str=True)
        # Markup the parallel sentences with spacy (assume tokenized)
        proc_orig = toolbox.applySpacy(orig_sent.strip(), nlp)
        proc_cor = toolbox.applySpacy(cor_sent.strip(), nlp)
        # Write the original sentence to the output m2 file.
        out_m2_str += "S " + toolbox.formatProcSent(proc_orig, feature_delimiter=args.feature_delimiter) + "\n"
        out_m2_str += "T " + toolbox.formatProcSent(proc_cor, feature_delimiter=args.feature_delimiter) + "\n"
        # out_m2.write("S " + toolbox.formatProcSent(proc_orig, feature_delimiter=args.feature_delimiter) + "\n")
        # out_m2.write("T " + toolbox.formatProcSent(proc_cor, feature_delimiter=args.feature_delimiter) + "\n")
        # Identical sentences have no edits, so just write noop.
        if orig_sent.strip() == cor_sent.strip():
            out_m2_str += "A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0\n"
            # out_m2.write("A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0\n")
        # Otherwise, do extra processing.
        else:
            # Auto align the parallel sentences and extract the edits.
            auto_edits = align_text.getAutoAlignedEdits(proc_orig, proc_cor, nlp, args)
            # Loop through the edits.
            for auto_edit in auto_edits:
                # Give each edit an automatic error type.
                cat = cat_rules.autoTypeEdit(auto_edit, proc_orig, proc_cor, gb_spell, tag_map, nlp, stemmer)
                auto_edit[2] = cat
                # Write the edit to the output m2 file.
                out_m2_str += toolbox.formatEdit(auto_edit) + "\n"
                # out_m2.write(toolbox.formatEdit(auto_edit)+"\n")
        # Write a newline when there are no more edits.
        out_m2_str += "\n"
        # out_m2.write("\n")
    except KeyboardInterrupt:
        sys.exit(1)
    except:
        ignore_count += 1
        print('\nIgnore example:')
        print('- Source: ', orig_sent)
        print('- Target: ', cor_sent)
        print()
    return out_m2_str, ignore_count
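# --- Driver sketch (not shown in the original) ---------------------------------------
# Illustrates how _generate_m2() above might be driven over a pair of parallel files.
# It assumes the surrounding script defines the module-level resources the helper
# relies on (nlp, args, detokenizer, gb_spell, tag_map, stemmer, toolbox, etc.);
# the function name and file-path parameters below are hypothetical.
def run_generate_m2_sketch(orig_path, cor_path, out_path):
    total_ignored = 0
    with open(orig_path) as orig, open(cor_path) as cor, open(out_path, "w") as out_m2:
        # Stream pre-aligned sentence pairs and accumulate the ignore count.
        for orig_sent, cor_sent in zip(orig, cor):
            m2_str, ignored = _generate_m2(orig_sent, cor_sent)
            out_m2.write(m2_str)
            total_ignored += ignored
    print("Ignored examples:", total_ignored)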
def main(args):
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading resources...")
    # Load Tokenizer and other resources
    nlp = spacy.load("en")
    # Setup output m2 file based on corrected file name.
    m2_out = open(args.out if args.out.endswith(".m2") else args.out + ".m2", "w")
    print("Processing files...")
    with io.open(args.orig, encoding='utf-8') as orig, io.open(args.cor, encoding='utf-8') as cor:
        # Process each pre-aligned sentence pair.
        for orig_sent, cor_sent in zip(orig, cor):
            # Get the raw text.
            orig_sent = orig_sent.strip()
            cor_sent = cor_sent.strip()
            # Ignore empty sentences
            if not orig_sent and not cor_sent:
                continue
            # If args.tok, we also need to tokenise the text.
            if args.tok:
                orig_sent = nlp(orig_sent, tag=True, parse=True, entity=False)
                cor_sent = nlp(cor_sent, tag=True, parse=True, entity=False)
            # Otherwise, assume it is tokenized and then process.
            else:
                orig_sent = nlp.tokenizer.tokens_from_list(orig_sent.split())
                cor_sent = nlp.tokenizer.tokens_from_list(cor_sent.split())
                nlp.tagger(orig_sent)
                nlp.tagger(cor_sent)
                nlp.parser(orig_sent)
                nlp.parser(cor_sent)
            # Get a list of string toks for each.
            orig_toks = [tok.orth_ for tok in orig_sent]
            cor_toks = [tok.orth_ for tok in cor_sent]
            # Auto align the sentence and extract the edits.
            auto_edits = align_text.getAutoAlignedEdits(orig_toks, cor_toks, orig_sent, cor_sent, nlp, args.lev, args.merge)
            # Write orig_toks to output.
            m2_out.write("S " + " ".join(orig_toks) + "\n")
            # If there are no edits, write an explicit dummy edit.
            if not auto_edits:
                m2_out.write("A -1 -1|||noop||||||REQUIRED|||-NONE-|||0\n")
            # Write the auto edits to the file.
            for auto_edit in auto_edits:
                # Write the edit to output.
                m2_out.write(formatEdit(auto_edit) + "\n")
            # Write new line after each sentence.
            m2_out.write("\n")
def main(args):
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading resources...")
    # Load Tokenizer and other resources
    nlp = spacy.load("en")
    # Lancaster Stemmer
    stemmer = LancasterStemmer()
    # GB English word list (inc -ise and -ize)
    gb_spell = toolbox.loadDictionary(basename + "/resources/en_GB-large.txt")
    # Part of speech map file
    tag_map = toolbox.loadTagMap(basename + "/resources/en-ptb_map")
    # Setup output m2 file
    out_m2 = open(args.out, "w")
    print("Processing files...")
    # Open the original and corrected text files.
    with open(args.orig) as orig, open(args.cor) as cor:
        # Process each pre-aligned sentence pair.
        for orig_sent, cor_sent in zip(orig, cor):
            # Write the original sentence to the output m2 file.
            out_m2.write("S " + orig_sent)
            # Identical sentences have no edits, so just write noop.
            if orig_sent.strip() == cor_sent.strip():
                out_m2.write("A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0\n")
            # Otherwise, do extra processing.
            else:
                # Markup the parallel sentences with spacy (assume tokenized)
                proc_orig = toolbox.applySpacy(orig_sent.strip().split(), nlp)
                proc_cor = toolbox.applySpacy(cor_sent.strip().split(), nlp)
                # Auto align the parallel sentences and extract the edits.
                auto_edits = align_text.getAutoAlignedEdits(proc_orig, proc_cor, nlp, args)
                # Loop through the edits.
                for auto_edit in auto_edits:
                    # Give each edit an automatic error type.
                    cat = cat_rules.autoTypeEdit(auto_edit, proc_orig, proc_cor, gb_spell, tag_map, nlp, stemmer)
                    auto_edit[2] = cat
                    # Write the edit to the output m2 file.
                    out_m2.write(toolbox.formatEdit(auto_edit) + "\n")
            # Write a newline when there are no more edits.
            out_m2.write("\n")
def main(args):
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading resources...")
    # Load Tokenizer and other resources
    nlp = spacy.load("en")
    # Lancaster Stemmer
    stemmer = LancasterStemmer()
    # GB English word list (inc -ise and -ize)
    gb_spell = toolbox.loadDictionary(basename+"/resources/en_GB-large.txt")
    # Part of speech map file
    tag_map = toolbox.loadTagMap(basename+"/resources/en-ptb_map")
    # Setup output m2 file
    out_m2 = open(args.out, "w")
    print("Processing files...")
    # Open the m2 file and split into sentence+edit chunks.
    m2_file = open(args.m2).read().strip().split("\n\n")
    for info in m2_file:
        # Get the original and corrected sentence + edits for each annotator.
        orig_sent, coder_dict = toolbox.processM2(info)
        # Write the orig_sent to the output m2 file.
        out_m2.write("S "+" ".join(orig_sent)+"\n")
        # Only process sentences with edits.
        if coder_dict:
            # Save marked up original sentence here, if required.
            proc_orig = ""
            # Loop through the annotators
            for coder, coder_info in sorted(coder_dict.items()):
                cor_sent = coder_info[0]
                gold_edits = coder_info[1]
                # If there is only 1 edit and it is noop, just write it.
                if gold_edits[0][2] == "noop":
                    out_m2.write(toolbox.formatEdit(gold_edits[0], coder)+"\n")
                    continue
                # Markup the orig and cor sentence with spacy (assume tokenized)
                # Orig is marked up only once for the first coder that needs it.
                proc_orig = toolbox.applySpacy(orig_sent, nlp) if not proc_orig else proc_orig
                proc_cor = toolbox.applySpacy(cor_sent, nlp)
                # Loop through gold edits.
                for gold_edit in gold_edits:
                    # Um and UNK edits (uncorrected errors) are always preserved.
                    if gold_edit[2] in {"Um", "UNK"}:
                        # Um should get changed to UNK unless using old categories.
                        if gold_edit[2] == "Um" and not args.old_cats:
                            gold_edit[2] = "UNK"
                        out_m2.write(toolbox.formatEdit(gold_edit, coder)+"\n")
                    # Gold edits
                    elif args.gold:
                        # Minimise the edit; e.g. [has eaten -> was eaten] = [has -> was]
                        if not args.max_edits:
                            gold_edit = toolbox.minimiseEdit(gold_edit, proc_orig, proc_cor)
                            # If minimised to nothing, the edit disappears.
                            if not gold_edit:
                                continue
                        # Give the edit an automatic error type.
                        if not args.old_cats:
                            cat = cat_rules.autoTypeEdit(gold_edit, proc_orig, proc_cor, gb_spell, tag_map, nlp, stemmer)
                            gold_edit[2] = cat
                        # Write the edit to the output m2 file.
                        out_m2.write(toolbox.formatEdit(gold_edit, coder)+"\n")
                # Auto edits
                if args.auto:
                    # Auto align the parallel sentences and extract the edits.
                    auto_edits = align_text.getAutoAlignedEdits(proc_orig, proc_cor, args)
                    # Loop through the edits.
                    for auto_edit in auto_edits:
                        # Give each edit an automatic error type.
                        cat = cat_rules.autoTypeEdit(auto_edit, proc_orig, proc_cor, gb_spell, tag_map, nlp, stemmer)
                        auto_edit[2] = cat
                        # Write the edit to the output m2 file.
                        out_m2.write(toolbox.formatEdit(auto_edit, coder)+"\n")
        # Write a newline when there are no more coders.
        out_m2.write("\n")
def main(args):
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading SpaCy...")
    # Load Tokenizer and other resources
    print("Note: disable unnecessary pipelines: ner, textcat")
    nlp = spacy.load("en_core_web_lg", disable=['ner', 'textcat'])
    # Lancaster Stemmer
    stemmer = LancasterStemmer()
    # Moses Detokenizer
    detokenizer = MosesDetokenizer()
    # GB English word list (inc -ise and -ize)
    gb_spell = toolbox.loadDictionary(basename+"/resources/en_GB-large.txt")
    # Part of speech map file
    tag_map = toolbox.loadTagMap(basename+"/resources/en-ptb_map")
    # Setup output m2 file
    out_m2 = open(args.out, "w")
    # Count examples that could not be processed
    missing_count = 0
    print("Processing files...")
    # Open the original and corrected text files.
    with open(args.orig) as orig, open(args.cor) as cor:
        # Process each pre-aligned sentence pair.
        for orig_sent, cor_sent in tqdm(zip(orig, cor)):
            try:
                # Check sentence length:
                if len(orig_sent.strip().split()) < 3:
                    raise Exception('Source sentence is too short.')
                if len(cor_sent.strip().split()) < 3:
                    raise Exception('Target sentence is too short.')
                # Detokenize sents if they're pre-tokenized. Otherwise the result will be wrong.
                if args.is_tokenized_orig:
                    orig_sent = detokenizer.detokenize(orig_sent.strip().split(), return_str=True)
                if args.is_tokenized_cor:
                    cor_sent = detokenizer.detokenize(cor_sent.strip().split(), return_str=True)
                # Markup the parallel sentences with spacy (assume tokenized)
                proc_orig = toolbox.applySpacy(orig_sent.strip(), nlp)
                proc_cor = toolbox.applySpacy(cor_sent.strip(), nlp)
                # Write the original sentence to the output m2 file.
                out_m2.write("S " + toolbox.formatProcSent(proc_orig, feature_delimiter=args.feature_delimiter) + "\n")
                out_m2.write("T " + toolbox.formatProcSent(proc_cor, feature_delimiter=args.feature_delimiter) + "\n")
                # Identical sentences have no edits, so just write noop.
                if orig_sent.strip() == cor_sent.strip():
                    out_m2.write("A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0\n")
                # Otherwise, do extra processing.
                else:
                    # Auto align the parallel sentences and extract the edits.
                    auto_edits = align_text.getAutoAlignedEdits(proc_orig, proc_cor, nlp, args)
                    # Loop through the edits.
                    for auto_edit in auto_edits:
                        # Give each edit an automatic error type.
                        cat = cat_rules.autoTypeEdit(auto_edit, proc_orig, proc_cor, gb_spell, tag_map, nlp, stemmer)
                        auto_edit[2] = cat
                        # Write the edit to the output m2 file.
                        out_m2.write(toolbox.formatEdit(auto_edit)+"\n")
                # Write a newline when there are no more edits.
                out_m2.write("\n")
            except KeyboardInterrupt:
                sys.exit(1)
            except:
                missing_count += 1
                print('\nMissing count:', missing_count)
                print('- Source: ', orig_sent)
                print('- Target: ', cor_sent)
                print()
                continue
def main(args):
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading resources...")
    # Load Tokenizer and other resources
    nlp = spacy.load("en")
    # Lancaster Stemmer
    stemmer = LancasterStemmer()
    # GB English word list (inc -ise and -ize)
    gb_spell = toolbox.loadDictionary(basename + "/resources/en_GB-large.txt")
    # Part of speech map file
    tag_map = toolbox.loadTagMap(basename + "/resources/en-ptb_map")
    # Setup output m2 file
    out_m2 = open(args.out, "w")
    print("Processing files...")
    # Dict to store source sentences as keys and edits as values,
    # including multiple annotators for the same source sentence.
    src_dict = defaultdict(list)
    src_line_present = False
    # Open the original and corrected text files.
    with open(args.orig) as orig, open(args.cor) as cor:
        # Process each pre-aligned sentence pair.
        for orig_sent, cor_sent in zip(orig, cor):
            # Write the original sentence to the output m2 file.
            #out_m2.write("S "+orig_sent)
            src_sent = "S " + orig_sent
            if src_sent not in src_dict.keys():
                src_dict[src_sent] = []
                # Boolean variable to check if source sentence already present in dictionary
                src_line_present = False
                src_line = src_sent
                # Variable to store how many times source line is already present.
                # src_lp_count also keeps track of annotator IDs to be written to m2 file.
                src_lp_count = 0
            else:
                src_line_present = True
                src_line = src_sent
                src_lp_count += 1
            # Identical sentences have no edits, so just write noop.
            if orig_sent.strip() == cor_sent.strip():
                #out_m2.write("A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0\n")
                src_dict[src_sent].append("A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0\n")
            # Otherwise, do extra processing.
            else:
                # Markup the parallel sentences with spacy (assume tokenized)
                proc_orig = toolbox.applySpacy(orig_sent.strip().split(), nlp)
                proc_cor = toolbox.applySpacy(cor_sent.strip().split(), nlp)
                # Auto align the parallel sentences and extract the edits.
                auto_edits = align_text.getAutoAlignedEdits(proc_orig, proc_cor, nlp, args)
                # Loop through the edits.
                for auto_edit in auto_edits:
                    # Give each edit an automatic error type.
                    cat = cat_rules.autoTypeEdit(auto_edit, proc_orig, proc_cor, gb_spell, tag_map, nlp, stemmer)
                    auto_edit[2] = cat
                    # Write the edit to the output m2 file.
                    edit_to_be_written = toolbox.formatEdit(auto_edit)
                    if not src_line_present:
                        src_dict[src_line].append(edit_to_be_written + "\n")
                    else:
                        src_dict[src_line].append(edit_to_be_written[:-1] + str(src_lp_count) + "\n")
    # Finally write the source sentences (keys) and edits (values) to the m2 file
    for source_sent in src_dict.copy():
        out_m2.write(source_sent)
        for edit in src_dict[source_sent]:
            out_m2.write(edit)
        out_m2.write('\n')
def main(args):
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading resources...")
    # Punctuation normalisation dictionary
    norm_dict = {"’": "'", "´": "'", "‘": "'", "′": "'", "`": "'",
                 '“': '"', '”': '"', '˝': '"', '¨': '"', '„': '"', '『': '"', '』': '"',
                 '–': '-', '—': '-', '―': '-', '¬': '-',
                 '、': ',', ',': ',', ':': ':', ';': ';', '?': '?', '!': '!',
                 'ِ': ' ', '\u200b': ' '}
    norm_dict = {ord(k): v for k, v in norm_dict.items()}
    # Load Tokenizer and other resources
    nlp = spacy.load("en")
    # Lancaster Stemmer
    stemmer = LancasterStemmer()
    # GB English word list (inc -ise and -ize)
    gb_spell = toolbox.loadDictionary(basename+"/resources/en_GB-large.txt")
    # Part of speech map file
    tag_map = toolbox.loadTagMap(basename+"/resources/en-ptb_map")
    # Setup output m2 file
    out_m2 = open(args.out, "w")
    print("Preprocessing files...")
    # Open the file
    with open(args.json_file) as data:
        # Process each line
        for line in data:
            # Load the JSON line
            line = json.loads(line)
            # Normalise certain punctuation in the text
            text = line["text"].translate(norm_dict)
            # Store the sentences and edits for all annotators here
            coder_dict = {}
            # Loop through the annotator ids and their edits
            for coder, edits in line["edits"]:
                # Add the coder to the coder_dict if needed
                if coder not in coder_dict:
                    coder_dict[coder] = []
                # Split the essay into paragraphs and update and normalise the char edits
                para_info = getParas(text, edits, norm_dict)
                # Loop through the paragraphs and edits
                for orig_para, para_edits in para_info:
                    # Remove unnecessary whitespace from para and update char edits
                    orig_para, para_edits = cleanPara(orig_para, para_edits)
                    if not orig_para:
                        continue  # Ignore empty paras
                    # Annotate orig_para with spacy
                    orig_para = nlp(orig_para)
                    # Convert character edits to token edits
                    para_edits = getTokenEdits(orig_para, para_edits, nlp)
                    # Split the paragraph into sentences and update tok edits
                    sents = getSents(orig_para, para_edits)
                    # Save the sents in the coder_dict
                    coder_dict[coder].extend(sents)
            # Get the sorted coder ids
            coder_ids = sorted(coder_dict.keys())
            # Loop through the sentences for the first coder
            for sent_id, sent in enumerate(coder_dict[0]):
                # Write the original sentence to the output M2 file
                out_m2.write("S "+" ".join(sent["orig"])+"\n")
                # Annotate the original sentence with spacy
                orig_sent = toolbox.applySpacy(sent["orig"], nlp)
                # Loop through the coders
                for coder in coder_ids:
                    # Annotate the corrected sentence with spacy and get the gold edits
                    cor_sent = toolbox.applySpacy(coder_dict[coder][sent_id]["cor"], nlp)
                    gold_edits = coder_dict[coder][sent_id]["edits"]
                    # Gold edits
                    if args.gold:
                        # Make sure edits are ordered in terms of start, then end offsets.
                        gold_edits = sorted(gold_edits, key=itemgetter(0))  # Sort by start offset
                        gold_edits = sorted(gold_edits, key=itemgetter(1))  # Sort by end offset
                        min_edits = []
                        # Loop through the gold edits.
                        for gold_edit in gold_edits:
                            # Minimise correction (not detection D) edits: e.g. [has eaten -> eaten] = [has -> ε]
                            if gold_edit[2] == "C":
                                gold_edit = toolbox.minimiseEdit(gold_edit, orig_sent, cor_sent)
                            # Classify and save non-empty edits
                            if gold_edit:
                                cat = cat_rules.autoTypeEdit(gold_edit, orig_sent, cor_sent, gb_spell, tag_map, nlp, stemmer)
                                gold_edit[2] = cat
                                min_edits.append(gold_edit)
                        # If there are no minimised edits, write an explicit empty edit
                        if not min_edits:
                            out_m2.write("A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||"+str(coder)+"\n")
                        # Otherwise loop through the edits and write them to the output m2 file.
                        for edit in min_edits:
                            out_m2.write(toolbox.formatEdit(edit, coder)+"\n")
                    # Auto edits
                    elif args.auto:
                        # Auto align the parallel sentences and extract the edits.
                        auto_edits = align_text.getAutoAlignedEdits(orig_sent, cor_sent, nlp, args)
                        # If there are no edits, write an explicit noop edit.
                        if not auto_edits:
                            out_m2.write("A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||"+str(coder)+"\n")
                        # Loop through the edits.
                        for auto_edit in auto_edits:
                            # Give each edit an automatic error type.
                            cat = cat_rules.autoTypeEdit(auto_edit, orig_sent, cor_sent, gb_spell, tag_map, nlp, stemmer)
                            auto_edit[2] = cat
                            # Write the edit to the output m2 file.
                            out_m2.write(toolbox.formatEdit(auto_edit, coder)+"\n")
                # Write new line after each sentence when we reach last coder.
                out_m2.write("\n")
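# --- Input format sketch (assumed, for reference only) -------------------------------
# The shape of one JSON line consumed by main() above, as inferred from the loop over
# line["text"] and line["edits"]: "edits" holds, per annotator id, a list of
# character-level edits that getParas()/getTokenEdits() later convert to token edits.
# The field contents below are illustrative, not taken from real data.
#
#   {"text": "This are a sentence .",
#    "edits": [[0, [[5, 8, "is"]]],
#              [1, [[5, 8, "is"]]]]}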
def main(args):
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading resources...")
    # Load Tokenizer and other resources
    nlp = spacy.load("en")
    # Lancaster Stemmer
    stemmer = LancasterStemmer()
    # GB English word list (inc -ise and -ize)
    gb_spell = toolbox.loadDictionary(basename+"/resources/en_GB-large.txt")
    # Part of speech map file
    tag_map = toolbox.loadTagMap(basename+"/resources/en-ptb_map")
    # Setup output m2 file
    out_m2 = open(args.out, "w")
    print("Processing files...")
    # Open the m2 file and split into sentence+edit chunks.
    m2_file = open(args.m2).read().strip().split("\n\n")
    for info in m2_file:
        # Get the original and corrected sentence + edits for each annotator.
        orig_sent, coder_dict = toolbox.processM2(info)
        # Write the orig_sent to the output m2 file.
        out_m2.write("S "+" ".join(orig_sent)+"\n")
        # Markup the original sentence with spacy (assume tokenized)
        proc_orig = toolbox.applySpacy(orig_sent, nlp)
        # Loop through the annotators
        for coder, coder_info in sorted(coder_dict.items()):
            cor_sent = coder_info[0]
            gold_edits = coder_info[1]
            # Markup the corrected sentence with spacy (assume tokenized)
            proc_cor = toolbox.applySpacy(cor_sent, nlp)
            # Gold edits
            if args.gold:
                # Loop through the gold edits.
                for gold_edit in gold_edits:
                    # Write noop edits to the output m2 file.
                    if gold_edit[2] == "noop":
                        out_m2.write(toolbox.formatEdit(gold_edit, coder)+"\n")
                        continue
                    # Minimise the edit; e.g. [has eaten -> was eaten] = [has -> was]
                    if not args.max_edits:
                        gold_edit = toolbox.minimiseEdit(gold_edit, proc_orig, proc_cor)
                        # If minimised to nothing, the edit disappears.
                        if not gold_edit:
                            continue
                    # Give the edit an automatic error type.
                    if not args.old_cats:
                        cat = cat_rules.autoTypeEdit(gold_edit, proc_orig, proc_cor, gb_spell, tag_map, nlp, stemmer)
                        gold_edit[2] = cat
                    # Write the edit to the output m2 file.
                    out_m2.write(toolbox.formatEdit(gold_edit, coder)+"\n")
            # Auto edits
            elif args.auto:
                # Auto align the parallel sentences and extract the edits.
                auto_edits = align_text.getAutoAlignedEdits(proc_orig, proc_cor, nlp, args)
                # If there are no edits, write an explicit noop edit.
                if not auto_edits:
                    out_m2.write("A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||"+str(coder)+"\n")
                # Loop through the edits.
                for auto_edit in auto_edits:
                    # Give each edit an automatic error type.
                    cat = cat_rules.autoTypeEdit(auto_edit, proc_orig, proc_cor, gb_spell, tag_map, nlp, stemmer)
                    auto_edit[2] = cat
                    # Write the edit to the output m2 file.
                    out_m2.write(toolbox.formatEdit(auto_edit, coder)+"\n")
        # Write a newline when there are no more coders.
        out_m2.write("\n")
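# --- M2 chunk sketch (illustrative only) ----------------------------------------------
# One sentence+edit block of the kind toolbox.processM2() parses above; blocks are
# separated by blank lines. The sentence, edit span and error type below are made up
# for illustration (the "A" field layout mirrors the noop lines written in this file):
#
#   S This are a sentence .
#   A 1 2|||R:VERB:SVA|||is|||REQUIRED|||-NONE-|||0
#   A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||1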
def get_preprocess_text():
    if request.method == 'POST':
        in_str = request.form.get("sen_input")
        input_sen = in_str  # "If your genetic results indicate that you have gene changes associated with an increased risk of heart disease , it does not mean that you definitely will develop heart disease ."
        words = input_sen.split()
        totals = []
        candidate_words = 10
        delset = string.punctuation
        # Collect up to `candidate_words` spelling suggestions per input word.
        for each_word in words:
            if each_word in delset:
                totals.append([each_word])
                continue
            if spell_checker.check(each_word):
                #totals.append([each_word])
                totals.append(spell_checker.suggest(each_word)[:candidate_words])
            else:
                totals.append(spell_checker.suggest(each_word)[:candidate_words])
        print(totals)
        # Build every candidate sentence from the per-word suggestion lists.
        cur = []
        prev = [""]
        for i in range(len(totals)):
            for item in prev:
                for j in range(len(totals[i])):
                    cur.append((item + ' ' + totals[i][j]).strip())
            prev = cur
            cur = []
        outputs, ori_scores = model_predict(prev, models, generator, align_dict,
                                            max_positions, args, use_cuda, task,
                                            src_dict, tgt_dict)
        # Rescore each candidate with a weighted combination of the model, edit-distance,
        # language-model and word-piece scores.
        score_dict = dict()
        for ind, output in enumerate(outputs):
            s0 = ori_scores[ind]
            s1 = [float(item) for item in ed.get_score(input_sen, output).split()]
            s2 = float(lm.get_score(input_sen, output))
            s3 = float(wp.get_score(input_sen, output))
            final_score = (s0 * weights[0] + s1[0] * weights[1] + s1[1] * weights[2]
                           + s1[2] * weights[3] + s2 * weights[4] + s3 * weights[5])
            score_dict[ind] = final_score
            print(s0, s1[0], s1[1], s1[2], s2, s3)
        sorted_indices = sorted(score_dict, key=score_dict.get, reverse=True)
        out_type = []
        for ind in sorted_indices:
            proc_orig = toolbox.applySpacy(input_sen.split(), nlp)
            output_type = '\n'
            cor_sent = outputs[ind]
            cor_sent = cor_sent.strip()
            # Identical sentences have no edits, so just write noop.
            if input_sen == cor_sent:
                output_type += "A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0" + "\n"
            # Otherwise, do extra processing.
            else:
                # Markup the corrected sentence with spacy (assume tokenized)
                proc_cor = toolbox.applySpacy(cor_sent.strip().split(), nlp)
                # Auto align the parallel sentences and extract the edits.
                auto_edits = align_text.getAutoAlignedEdits(proc_orig, proc_cor, nlp, True, 'rules')
                # Loop through the edits.
                for auto_edit in auto_edits:
                    # Give each edit an automatic error type.
                    cat = cat_rules.autoTypeEdit(auto_edit, proc_orig, proc_cor, gb_spell, tag_map, nlp, stemmer)
                    auto_edit[2] = cat
                    # Write the edit to the output m2 file.
                    output_type += toolbox.formatEdit(auto_edit, 0) + "\n"
            out_type.append(output_type)
            print(outputs[ind])
        couplet_res = outputs[sorted_indices[0]] + out_type[0]
        sys.stdout.flush()
        return render_template('show.html', sen_input=input_sen, sen_res=couplet_res)
    else:
        return render_template('index.html')