Beispiel #1
0
def main(args):
    """Convert one original file plus N corrected files into an M2 file.

    ``args.orig`` and every path in ``args.cor`` are read in lockstep, one
    line per sentence; sentences are split on whitespace (assumed to be
    tokenised already). Each corrected sentence is aligned against the
    original, every extracted edit is given an error type, and the result
    is written to ``args.out``. The position of a corrected file within
    ``args.cor`` is used as the annotator id of its edits.

    Lines whose original sentence is empty are skipped entirely. Reading
    stops at the end of the shortest input file (``zip`` semantics).

    Args:
        args: Parsed command-line namespace. This function reads ``out``,
            ``orig`` and ``cor`` (a list of paths); the whole namespace is
            also forwarded to ``align_text.getAutoAlignedEdits``.
    """
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading resources...")
    # Load Tokenizer and other resources
    nlp = spacy.load("en")
    # Lancaster Stemmer
    stemmer = LancasterStemmer()
    # GB English word list (inc -ise and -ize)
    gb_spell = toolbox.loadDictionary(basename + "/resources/en_GB-large.txt")
    # Part of speech map file
    tag_map = toolbox.loadTagMap(basename + "/resources/en-ptb_map")

    # ExitStack lets us process an arbitrary number of files line by line
    # simultaneously, and guarantees that every file registered on it
    # (including the output file) is flushed and closed, even on error.
    with ExitStack() as stack:
        # Setup output m2 file (opened first, as before, so an unwritable
        # output path fails before any input is touched).
        out_m2 = stack.enter_context(open(args.out, "w"))
        print("Processing files...")
        in_files = [
            stack.enter_context(open(path))
            for path in [args.orig] + args.cor
        ]
        # Process each line of all input files.
        for line in zip(*in_files):
            orig_sent = line[0].strip()
            # If orig sent is empty, skip the line
            if not orig_sent:
                continue
            # Write the original sentence to the output m2 file.
            out_m2.write("S " + orig_sent + "\n")
            # Markup the original sentence with spacy (assume tokenized)
            proc_orig = toolbox.applySpacy(orig_sent.split(), nlp)
            # Loop through the corrected sentences
            for cor_id, cor_sent in enumerate(line[1:]):
                cor_sent = cor_sent.strip()
                # Identical sentences have no edits, so just write noop.
                if orig_sent == cor_sent:
                    out_m2.write(
                        "A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||" +
                        str(cor_id) + "\n")
                    continue
                # Markup the corrected sentence with spacy (assume tokenized)
                proc_cor = toolbox.applySpacy(cor_sent.split(), nlp)
                # Auto align the parallel sentences and extract the edits.
                auto_edits = align_text.getAutoAlignedEdits(
                    proc_orig, proc_cor, nlp, args)
                # NOTE(review): if the sentences differ only in internal
                # whitespace, auto_edits may be empty and no "A" line is
                # written for this annotator - confirm whether a noop is
                # wanted in that case.
                for auto_edit in auto_edits:
                    # Give each edit an automatic error type.
                    auto_edit[2] = cat_rules.autoTypeEdit(
                        auto_edit, proc_orig, proc_cor, gb_spell, tag_map,
                        nlp, stemmer)
                    # Write the edit to the output m2 file.
                    out_m2.write(
                        toolbox.formatEdit(auto_edit, cor_id) + "\n")
            # Write a newline when we have processed all corrections for a
            # given sentence.
            out_m2.write("\n")
def main(args):
    """Convert a pair of parallel text files into a single-annotator M2 file.

    ``args.orig`` and ``args.cor`` are read in lockstep, one sentence per
    line; sentences are split on whitespace (assumed to be tokenised
    already). For every pair an "S" line is written, followed either by a
    noop edit (when the stripped sentences are identical) or by one "A"
    line per automatically extracted and classified edit, then a blank
    line. Reading stops at the end of the shorter file (``zip`` semantics).

    Args:
        args: Parsed command-line namespace. This function reads ``out``,
            ``orig`` and ``cor``; the whole namespace is also forwarded to
            ``align_text.getAutoAlignedEdits``.
    """
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading resources...")
    # Load Tokenizer and other resources
    nlp = spacy.load("en")
    # Lancaster Stemmer
    stemmer = LancasterStemmer()
    # GB English word list (inc -ise and -ize)
    gb_spell = toolbox.loadDictionary(basename + "/resources/en_GB-large.txt")
    # Part of speech map file
    tag_map = toolbox.loadTagMap(basename + "/resources/en-ptb_map")

    # Setup output m2 file; the context manager guarantees it is flushed
    # and closed even if processing raises.
    with open(args.out, "w") as out_m2:
        print("Processing files...")
        # Open the original and corrected text files.
        with open(args.orig) as orig, open(args.cor) as cor:
            # Process each pre-aligned sentence pair.
            for orig_line, cor_line in zip(orig, cor):
                orig_sent = orig_line.strip()
                cor_sent = cor_line.strip()
                # Write the original sentence to the output m2 file. The
                # newline is normalised so that a final line without a
                # trailing "\n" does not fuse with the following "A" line.
                out_m2.write("S " + orig_line.rstrip("\n") + "\n")
                # Identical sentences have no edits, so just write noop.
                if orig_sent == cor_sent:
                    out_m2.write(
                        "A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0\n")
                # Otherwise, do extra processing.
                else:
                    # Markup the parallel sentences with spacy
                    # (assume tokenized)
                    proc_orig = toolbox.applySpacy(orig_sent.split(), nlp)
                    proc_cor = toolbox.applySpacy(cor_sent.split(), nlp)
                    # Auto align the parallel sentences and extract the
                    # edits.
                    auto_edits = align_text.getAutoAlignedEdits(
                        proc_orig, proc_cor, nlp, args)
                    for auto_edit in auto_edits:
                        # Give each edit an automatic error type.
                        auto_edit[2] = cat_rules.autoTypeEdit(
                            auto_edit, proc_orig, proc_cor, gb_spell,
                            tag_map, nlp, stemmer)
                        # Write the edit to the output m2 file.
                        out_m2.write(toolbox.formatEdit(auto_edit) + "\n")
                # Write a newline when there are no more edits.
                out_m2.write("\n")
Beispiel #3
0
def main(args):
    """Convert parallel text files to M2, merging repeated source sentences.

    ``args.orig`` and ``args.cor`` are read in lockstep, one sentence per
    line. When the same original line occurs more than once, each
    occurrence is treated as a further annotator of that sentence: its
    edits are grouped under a single "S" line and tagged with an annotator
    id equal to the number of earlier occurrences of that source line
    (0 for the first, 1 for the second, ...). Source sentences are written
    to ``args.out`` in order of first appearance, after all input has been
    processed.

    Args:
        args: Parsed command-line namespace. This function reads ``out``,
            ``orig`` and ``cor``; the whole namespace is also forwarded to
            ``align_text.getAutoAlignedEdits``.
    """
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading resources...")
    # Load Tokenizer and other resources
    nlp = spacy.load("en")
    # Lancaster Stemmer
    stemmer = LancasterStemmer()
    # GB English word list (inc -ise and -ize)
    gb_spell = toolbox.loadDictionary(basename + "/resources/en_GB-large.txt")
    # Part of speech map file
    tag_map = toolbox.loadTagMap(basename + "/resources/en-ptb_map")

    # Source "S" line -> list of formatted edit lines from all annotators.
    # Plain dicts keep insertion order, so output follows first appearance.
    src_dict = {}
    # Source "S" line -> how many times it has been seen so far. Tracked
    # per sentence so that non-contiguous repeats (A, A, B, A) still get
    # distinct annotator ids.
    seen_counts = {}

    # Setup output m2 file; the context manager guarantees it is flushed
    # and closed even if processing raises.
    with open(args.out, "w") as out_m2:
        print("Processing files...")
        # Open the original and corrected text files.
        with open(args.orig) as orig, open(args.cor) as cor:
            # Process each pre-aligned sentence pair.
            for orig_line, cor_line in zip(orig, cor):
                # Normalise the newline so that a final line without a
                # trailing "\n" is still recognised as a repeat and does
                # not fuse with the following "A" line on output.
                src_sent = "S " + orig_line.rstrip("\n") + "\n"
                annotator_id = seen_counts.get(src_sent, 0)
                seen_counts[src_sent] = annotator_id + 1
                edits = src_dict.setdefault(src_sent, [])

                orig_sent = orig_line.strip()
                cor_sent = cor_line.strip()
                # Identical sentences have no edits, so just record a noop
                # for this annotator.
                if orig_sent == cor_sent:
                    edits.append(
                        "A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||" +
                        str(annotator_id) + "\n")
                    continue
                # Markup the parallel sentences with spacy (assume tokenized)
                proc_orig = toolbox.applySpacy(orig_sent.split(), nlp)
                proc_cor = toolbox.applySpacy(cor_sent.split(), nlp)
                # Auto align the parallel sentences and extract the edits.
                auto_edits = align_text.getAutoAlignedEdits(
                    proc_orig, proc_cor, nlp, args)
                for auto_edit in auto_edits:
                    # Give each edit an automatic error type.
                    auto_edit[2] = cat_rules.autoTypeEdit(
                        auto_edit, proc_orig, proc_cor, gb_spell, tag_map,
                        nlp, stemmer)
                    formatted = toolbox.formatEdit(auto_edit)
                    if annotator_id:
                        # Presumably formatEdit ends the line with the
                        # default annotator id "0" (the original code made
                        # the same assumption): swap that last character
                        # for this annotator's id.
                        formatted = formatted[:-1] + str(annotator_id)
                    edits.append(formatted + "\n")

        # Finally write the source sentences (keys) and edits (values) to
        # the m2 file.
        for source_sent, edits in src_dict.items():
            out_m2.write(source_sent)
            out_m2.writelines(edits)
            out_m2.write("\n")
def main(args):
    """Convert a JSON-lines file of annotated texts into an M2 file.

    Each line of ``args.json_file`` is parsed as a JSON object with a
    ``"text"`` string and an ``"edits"`` sequence of ``(coder, edits)``
    pairs. The text is punctuation-normalised, split into paragraphs and
    sentences via the module's helper functions, and for every sentence an
    "S" line plus the edits of each coder are written to ``args.out``.

    The sentences of coder ``0`` define the "S" lines; every other coder
    is indexed with the same sentence position.

    Args:
        args: Parsed command-line namespace. This function reads ``out``,
            ``json_file``, ``gold`` and ``auto``; the whole namespace is
            also forwarded to ``align_text.getAutoAlignedEdits``. If
            neither ``gold`` nor ``auto`` is truthy, only "S" lines and
            blank lines are written.

    Raises:
        KeyError: If an input line has no coder with id ``0``.
    """
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading resources...")
    # Punctuation normalisation dictionary
    # NOTE(review): the ',', ':', ';', '?' and '!' entries map a character
    # to itself as written here; they may originally have been full-width
    # forms lost in an encoding round-trip - confirm against upstream.
    norm_dict = {"’": "'",
                 "´": "'",
                 "‘": "'",
                 "′": "'",
                 "`": "'",
                 '“': '"',
                 '”': '"',
                 '˝': '"',
                 '¨': '"',
                 '„': '"',
                 '『': '"',
                 '』': '"',
                 '–': '-',
                 '—': '-',
                 '―': '-',
                 '¬': '-',
                 '、': ',',
                 ',': ',',
                 ':': ':',
                 ';': ';',
                 '?': '?',
                 '!': '!',
                 'ِ': ' ',
                 '\u200b': ' '}
    # str.translate expects code points as keys.
    norm_dict = {ord(k): v for k, v in norm_dict.items()}
    # Load Tokenizer and other resources
    nlp = spacy.load("en")
    # Lancaster Stemmer
    stemmer = LancasterStemmer()
    # GB English word list (inc -ise and -ize)
    gb_spell = toolbox.loadDictionary(basename + "/resources/en_GB-large.txt")
    # Part of speech map file
    tag_map = toolbox.loadTagMap(basename + "/resources/en-ptb_map")

    # Setup output m2 file; the context manager guarantees it is flushed
    # and closed even if processing raises.
    with open(args.out, "w") as out_m2:
        print("Preprocessing files...")
        # Open the file
        with open(args.json_file) as data:
            # Process each line
            for line in data:
                # Load the JSON line
                line = json.loads(line)
                # Normalise certain punctuation in the text
                text = line["text"].translate(norm_dict)
                # Store the sentences and edits for all annotators here
                coder_dict = {}
                # Loop through the annotator ids and their edits
                for coder, edits in line["edits"]:
                    # Add the coder to the coder_dict if needed
                    coder_sents = coder_dict.setdefault(coder, [])
                    # Split the essay into paragraphs and update and
                    # normalise the char edits
                    para_info = getParas(text, edits, norm_dict)
                    # Loop through the paragraphs and edits
                    for orig_para, para_edits in para_info:
                        # Remove unnecessary whitespace from para and
                        # update char edits
                        orig_para, para_edits = cleanPara(orig_para, para_edits)
                        # Ignore empty paras
                        if not orig_para:
                            continue
                        # Annotate orig_para with spacy
                        orig_para = nlp(orig_para)
                        # Convert character edits to token edits
                        para_edits = getTokenEdits(orig_para, para_edits, nlp)
                        # Split the paragraph into sentences and update
                        # tok edits
                        sents = getSents(orig_para, para_edits)
                        # Save the sents in the coder_dict
                        coder_sents.extend(sents)
                # Get the sorted coder ids
                coder_ids = sorted(coder_dict.keys())
                # Loop through the sentences for the first coder
                for sent_id, sent in enumerate(coder_dict[0]):
                    # Write the original sentence to the output M2 file
                    out_m2.write("S " + " ".join(sent["orig"]) + "\n")
                    # Annotate the original sentence with spacy
                    orig_sent = toolbox.applySpacy(sent["orig"], nlp)
                    # Loop through the coders
                    for coder in coder_ids:
                        # NOTE(review): assumes every coder produced the
                        # same number of sentences as coder 0 - confirm.
                        coder_sent = coder_dict[coder][sent_id]
                        # Annotate the corrected sentence with spacy and
                        # get the gold edits
                        cor_sent = toolbox.applySpacy(coder_sent["cor"], nlp)
                        gold_edits = coder_sent["edits"]
                        # Gold edits
                        if args.gold:
                            # Two stable sorts: the result is ordered by
                            # end offset, with ties broken by start offset.
                            gold_edits = sorted(gold_edits, key=itemgetter(0))
                            gold_edits = sorted(gold_edits, key=itemgetter(1))
                            min_edits = []
                            # Loop through the gold edits.
                            for gold_edit in gold_edits:
                                # Minimise correction (not detection D)
                                # edits: e.g. [has eaten -> eaten] = [has -> ε]
                                if gold_edit[2] == "C":
                                    gold_edit = toolbox.minimiseEdit(
                                        gold_edit, orig_sent, cor_sent)
                                # Classify and save non-empty edits
                                if gold_edit:
                                    gold_edit[2] = cat_rules.autoTypeEdit(
                                        gold_edit, orig_sent, cor_sent,
                                        gb_spell, tag_map, nlp, stemmer)
                                    min_edits.append(gold_edit)
                            # If there are no minimised edits, write an
                            # explicit empty edit
                            if not min_edits:
                                out_m2.write(
                                    "A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||"
                                    + str(coder) + "\n")
                            # Otherwise loop through the edits and write
                            # them to the output m2 file.
                            for edit in min_edits:
                                out_m2.write(
                                    toolbox.formatEdit(edit, coder) + "\n")
                        # Auto edits
                        elif args.auto:
                            # Auto align the parallel sentences and
                            # extract the edits.
                            auto_edits = align_text.getAutoAlignedEdits(
                                orig_sent, cor_sent, nlp, args)
                            # If there are no edits, write an explicit
                            # noop edit.
                            if not auto_edits:
                                out_m2.write(
                                    "A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||"
                                    + str(coder) + "\n")
                            # Loop through the edits.
                            for auto_edit in auto_edits:
                                # Give each edit an automatic error type.
                                auto_edit[2] = cat_rules.autoTypeEdit(
                                    auto_edit, orig_sent, cor_sent,
                                    gb_spell, tag_map, nlp, stemmer)
                                # Write the edit to the output m2 file.
                                out_m2.write(
                                    toolbox.formatEdit(auto_edit, coder) + "\n")
                    # Write new line after each sentence when we reach
                    # last coder.
                    out_m2.write("\n")
Beispiel #5
0
def main(args):
    """Re-process an existing M2 file and write a new M2 file.

    ``args.m2`` is read in full and split on blank lines into one chunk
    per sentence. ``toolbox.processM2`` turns each chunk into the original
    token list and a mapping ``coder -> (corrected tokens, gold edits)``.
    For every coder (in sorted order) the edits are either taken from the
    gold annotation (``args.gold``) or re-extracted automatically
    (``args.auto``), then written to ``args.out``.

    Args:
        args: Parsed command-line namespace. This function reads ``out``,
            ``m2``, ``gold`` and ``auto``, plus ``max_edits`` (skip edit
            minimisation) and ``old_cats`` (keep the existing error types)
            in gold mode; the whole namespace is also forwarded to
            ``align_text.getAutoAlignedEdits``. If neither ``gold`` nor
            ``auto`` is truthy, only "S" lines and blank lines are written.
    """
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading resources...")
    # Load Tokenizer and other resources
    nlp = spacy.load("en")
    # Lancaster Stemmer
    stemmer = LancasterStemmer()
    # GB English word list (inc -ise and -ize)
    gb_spell = toolbox.loadDictionary(basename + "/resources/en_GB-large.txt")
    # Part of speech map file
    tag_map = toolbox.loadTagMap(basename + "/resources/en-ptb_map")

    # Setup output m2 file; the context manager guarantees it is flushed
    # and closed even if processing raises.
    with open(args.out, "w") as out_m2:
        print("Processing files...")
        # Open the m2 file and split into sentence+edit chunks. The input
        # handle is closed as soon as its contents are in memory.
        with open(args.m2) as m2_in:
            m2_chunks = m2_in.read().strip().split("\n\n")
        for info in m2_chunks:
            # Get the original and corrected sentence + edits for each
            # annotator.
            orig_sent, coder_dict = toolbox.processM2(info)
            # Write the orig_sent to the output m2 file.
            out_m2.write("S " + " ".join(orig_sent) + "\n")
            # Markup the original sentence with spacy (assume tokenized)
            proc_orig = toolbox.applySpacy(orig_sent, nlp)
            # Loop through the annotators
            for coder, coder_info in sorted(coder_dict.items()):
                cor_sent, gold_edits = coder_info[0], coder_info[1]
                # Markup the corrected sentence with spacy (assume tokenized)
                proc_cor = toolbox.applySpacy(cor_sent, nlp)
                # Gold edits
                if args.gold:
                    for gold_edit in gold_edits:
                        # Write noop edits to the output m2 file unchanged.
                        if gold_edit[2] == "noop":
                            out_m2.write(
                                toolbox.formatEdit(gold_edit, coder) + "\n")
                            continue
                        # Minimise the edit;
                        # e.g. [has eaten -> was eaten] = [has -> was]
                        if not args.max_edits:
                            gold_edit = toolbox.minimiseEdit(
                                gold_edit, proc_orig, proc_cor)
                            # If minimised to nothing, the edit disappears.
                            if not gold_edit:
                                continue
                        # Give the edit an automatic error type.
                        if not args.old_cats:
                            gold_edit[2] = cat_rules.autoTypeEdit(
                                gold_edit, proc_orig, proc_cor, gb_spell,
                                tag_map, nlp, stemmer)
                        # Write the edit to the output m2 file.
                        out_m2.write(
                            toolbox.formatEdit(gold_edit, coder) + "\n")
                # Auto edits
                elif args.auto:
                    # Auto align the parallel sentences and extract the
                    # edits.
                    auto_edits = align_text.getAutoAlignedEdits(
                        proc_orig, proc_cor, nlp, args)
                    # If there are no edits, write an explicit noop edit.
                    if not auto_edits:
                        out_m2.write(
                            "A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||"
                            + str(coder) + "\n")
                    for auto_edit in auto_edits:
                        # Give each edit an automatic error type.
                        auto_edit[2] = cat_rules.autoTypeEdit(
                            auto_edit, proc_orig, proc_cor, gb_spell,
                            tag_map, nlp, stemmer)
                        # Write the edit to the output m2 file.
                        out_m2.write(
                            toolbox.formatEdit(auto_edit, coder) + "\n")
            # Write a newline when there are no more coders.
            out_m2.write("\n")