def _char_level_name(input_file, ext):
    '''Return input_file with '.char' inserted in front of its trailing ext.

    Only the suffix is rewritten; an occurrence of ext earlier in the path
    (e.g. in a directory name) is left untouched.'''
    return input_file[:len(input_file) - len(ext)] + '.char' + ext


def _sed_char_level(input_file, out_f):
    '''Write a char-level copy of input_file to out_f by calling sed.

    The sed script first turns every space into "+" and then puts a space
    after every character, line by line.'''
    import shlex
    # Raw string so the backslashes reach the shell exactly as written
    # (no invalid-escape warning). Both paths are shell-quoted so that
    # spaces or metacharacters in file names cannot break or alter the command.
    os_call = r'sed -e "s/\ /+/g" -e "s/./&\ /g" < {0} > {1}'.format(
        shlex.quote(input_file), shlex.quote(out_f))
    os.system(os_call)


def char_level_file(input_file, amr_ext, sent_ext, pos, super_chars, coreference):
    '''Given an input file, put it in char-level format and write output.

    The output file name is the input name with '.char' inserted before the
    matching extension. Files ending in amr_ext are treated as AMR files,
    files ending in sent_ext as sentence files; any other file is ignored.

    input_file  -- path of the file to convert
    amr_ext     -- extension that marks an AMR file
    sent_ext    -- extension that marks a sentence file
    pos         -- if true, sentence files go through process_pos_tagged()
    super_chars -- if true, AMR files go through get_amr_lines() and
                   get_fixed_lines() instead of plain sed
    coreference -- passed on to get_fixed_lines() (super_chars AMR files only)'''
    if input_file.endswith(amr_ext):
        # File ends with AMR extension, do AMR char-level processing
        out_f = _char_level_name(input_file, amr_ext)
        if super_chars:
            # Super characters get a different treatment
            print('AMR file, super characters')
            amr_lines = get_amr_lines(input_file)
            fixed_lines = get_fixed_lines(amr_lines, coreference)
            write_to_file(fixed_lines, out_f)
        else:
            # If there are no super characters we can just process with sed
            print('AMR file, no super characters')
            _sed_char_level(input_file, out_f)
    elif input_file.endswith(sent_ext):
        # File ends with sent ext, do sentence processing
        out_f = _char_level_name(input_file, sent_ext)
        if pos:
            # POS-tagged files get a different treatment
            print('Sentence file, POS-tagged')
            lines = process_pos_tagged(input_file)
            write_to_file(lines, out_f)
        else:
            # Not POS-tagged, so we can just use sed
            print('Sentence file, not POS-tagged')
            _sed_char_level(input_file, out_f)
def restore_variables(input_file, filtered_amrs):
    '''Restore the removed variables for the pruned file.

    The variable-less AMRs are written to input_file + '.pruned_temp', the
    restore script is run on that file with input_file + '.pruned' as its
    output, and the temp file is removed afterwards.

    input_file    -- base path used to build the temp and output file names
    filtered_amrs -- variable-less AMRs, handed to write_to_file()

    Note: restoreAMR/restore_amr.py is a relative path, so it is resolved
    against the current working directory.'''
    import shlex
    temp_file = input_file + '.pruned_temp'
    pruned_file = input_file + '.pruned'
    # Write variable-less AMRs to file
    write_to_file(filtered_amrs, temp_file)
    try:
        # Then restore the AMR; paths are shell-quoted so that spaces or
        # metacharacters in file names cannot break or alter the command
        os.system('python3 restoreAMR/restore_amr.py -f {0} -o {1}'.format(
            shlex.quote(temp_file), shlex.quote(pruned_file)))
    finally:
        # Remove temp file again, without going through the shell
        if os.path.exists(temp_file):
            os.remove(temp_file)
def create_coref_indexing(input_file, output_ext, keep_wiki):
    '''Go from full AMR to one-line AMR with coreference indexed.

    The result is written to input_file + output_ext.

    input_file -- path of the AMR file to read
    output_ext -- extension appended to input_file for the output file
    keep_wiki  -- if true, the file is read as-is (lines right-stripped);
                  otherwise it is first passed through delete_wiki()'''
    if keep_wiki:
        # Read the raw lines; the context manager closes the file again
        with open(input_file, 'r') as in_f:
            amr_file_no_wiki = [x.rstrip() for x in in_f]
    else:
        # Remove all Wiki instances
        amr_file_no_wiki = delete_wiki(input_file)
    # Put everything on a single line, sent_file is empty
    single_amrs, _ = single_line_convert(amr_file_no_wiki, '')
    # Add the coreference index we want
    repl_amrs = coreference_index(single_amrs)
    # Write output to file
    write_to_file(repl_amrs, input_file + output_ext)
def var_free_amrs(input_file, out_ext, keep_wiki):
    '''Create variable-free one-line AMRs and write them to file.

    The result is written to input_file + out_ext.

    input_file -- path of the AMR file to read
    out_ext    -- extension appended to input_file for the output file
    keep_wiki  -- if true, the file is read as-is (lines right-stripped);
                  otherwise it is first passed through delete_wiki()'''
    if keep_wiki:
        # Read the raw lines; the context manager closes the file again
        with open(input_file, 'r') as in_f:
            amr_no_wiki = [x.rstrip() for x in in_f]
    else:
        # Delete wiki links
        amr_no_wiki = delete_wiki(input_file)
    # Remove all variables by duplicating coreference nodes
    del_amrs = delete_amr_variables(amr_no_wiki)
    # Put AMR on single line (sent_file is empty) and write output
    single_amrs, _ = single_line_convert(del_amrs, '')
    write_to_file(single_amrs, input_file + out_ext)
def create_output(input_file, old_amrs, new_amrs, sent_amrs, double, amr_ext):
    '''Write all output files - the no-var AMR is kept as well.

    File names come from get_filenames(input_file, amr_ext). When double is
    set, an extra pair of files is written that holds the original AMRs
    followed by the permuted AMRs, with the sentences repeated to match.'''
    filenames = get_filenames(input_file, amr_ext)
    permuted_path, no_var_path, sent_path, double_sent_path, double_amr_path = filenames

    # The three files that are always written
    write_to_file(old_amrs, no_var_path)
    write_to_file(new_amrs, permuted_path)
    write_to_file(sent_amrs, sent_path)

    if not double:
        return

    # Keep BOTH the original AMR and the best-permuted AMR; the sentences
    # are doubled so the two files stay aligned line by line
    doubled_amrs = old_amrs + new_amrs
    write_to_file(doubled_amrs, double_amr_path)
    doubled_sents = sent_amrs + sent_amrs
    write_to_file(doubled_sents, double_sent_path)
for idx in range(len(var_list) - 1): for y in range(idx + 1, len(var_list)): # Match - we see a concept (var-value) we already saw before if var_list[idx][1] == var_list[y][1]: replace_item = var_list[y][0] + ' / ' + var_list[y][1] # The part that needs to be replaced should be present if replace_item in line: # Do the actual replacing here, e.g. replace :ARG1 (var / value) by :ARG refvar new_line_replaced = re.sub( r'\({0} / [^\(]*?\)'.format(var_list[y][0]), ' ' + var_list[idx][0], new_line) # Only do replacing if resulting AMR is valid if new_line_replaced != new_line and valid_amr( new_line_replaced): new_line = new_line_replaced # Perhaps fix some weird tokenization issues new_line = new_line.replace('_ (', '_(').replace(') "', ')"') coref_amrs.append(new_line.strip()) # Sanity check assert len(coref_amrs) == indx + 1 return coref_amrs if __name__ == '__main__': args = create_arg_parser() # Do main processing here coref_amrs = process_file(args.input_file) # Write results to output file write_to_file(coref_amrs, args.input_file + args.output_ext)
try: # Variable coming up if tokenized_line[count+3] == '/': amr_string.append('\n' + num_tabs * '\t' + part) # Variable coming, add newline here elif variable_match(tokenized_line[count+1]): amr_string.append('\n' + num_tabs * '\t' + part) else: amr_string.append(part) except: amr_string.append(part) else: amr_string.append(part) original_line = reverse_tokenize(" ".join(amr_string)) original_line = original_line.replace('_ (', '_(').replace(') "', ')"') fixed_amrs.append(original_line + '\n\n') return fixed_amrs if __name__ == "__main__": args = create_arg_parser() fixed_amrs = reformat_amr(args.input_file) # Check if AMRs are valid, error if they're not if args.valid: for amr in fixed_amrs: if not valid_amr(amr): raise ValueError(amr) write_to_file(fixed_amrs, args.input_file + args.extension, extra_newline=True)
# Extra step to make sure digits are not added to arguments line = add_space_when_digit(line) # Restore variables here, also fix problems afterwards if there are any line = convert(line) # The digit problem might reoccur again here line = add_space_when_digit(line) # We did some hacky rewrites to make sure convert() didn't mess anything up # restore them in this step (polarity +, polite, etc) line = restore_rewrites(line) # Finally restore the coreference if args.coreference == 'index': # Replace the 'coref-' nodes with the reference line = add_coref(line) elif args.coreference == 'abs': # Replace absolute paths with reference here line = replace_absolute_paths(line, ref_dict) # Save the final line restored_lines.append(" ".join(line.strip().split())) # Print detailed results for the coreference methods if args.print_stats: print_coref_stats(args.coreference, replace_types, index_dict) # Write final output to file write_to_file(restored_lines, args.output_file)