def write_freqt(docgraph, output_filepath, include_pos=False):
    """
    Convert a docgraph into a FREQT input file (one sentence per line).

    Parameters
    ----------
    docgraph : document graph
        the graph to be converted. Its ``sentences`` attribute is iterated
        and each sentence is serialized with ``docgraph2freqt``.
    output_filepath : str
        path of the file to be written (UTF-8 encoded). A missing parent
        directory is created first.
    include_pos : bool
        passed through to ``docgraph2freqt``
    """
    # abspath is needed to handle bare file names: os.path.dirname('out.txt')
    # is '', which is not an existing directory and is presumably not a path
    # that create_dir can create.
    path_to_file = os.path.dirname(os.path.abspath(output_filepath))
    if not os.path.isdir(path_to_file):
        create_dir(path_to_file)

    with codecs.open(output_filepath, 'w', 'utf-8') as output_file:
        for sentence in docgraph.sentences:
            freqt_line = docgraph2freqt(docgraph, sentence,
                                        include_pos=include_pos)
            output_file.write(freqt_line + '\n')
def write_freqt(docgraph, output_filepath, include_pos=False):
    """
    Convert a docgraph into a FREQT input file (one sentence per line).

    Parameters
    ----------
    docgraph : document graph
        the graph to be converted. Its ``sentences`` attribute is iterated
        and each sentence is serialized with ``docgraph2freqt``.
    output_filepath : str
        path of the file to be written (UTF-8 encoded). A missing parent
        directory is created first.
    include_pos : bool
        passed through to ``docgraph2freqt``
    """
    # abspath is needed to handle bare file names: os.path.dirname('out.txt')
    # is '', which is not an existing directory and is presumably not a path
    # that create_dir can create.
    path_to_file = os.path.dirname(os.path.abspath(output_filepath))
    if not os.path.isdir(path_to_file):
        create_dir(path_to_file)

    with codecs.open(output_filepath, 'w', 'utf-8') as output_file:
        for sentence in docgraph.sentences:
            freqt_line = docgraph2freqt(docgraph, sentence,
                                        include_pos=include_pos)
            output_file.write(freqt_line + '\n')
def write_exb(docgraph, output_file):
    """
    Convert a DiscourseDocumentGraph into an Exmaralda ``*.exb`` file and
    write it to the given file (or file path).

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        the document graph to be converted
    output_file : str or file
        either the path of the file to be written (a missing parent
        directory is created first) or an already opened file object

    Raises
    ------
    AssertionError
        if ``output_file`` is neither a ``str`` nor a ``file`` object
    """
    exmaralda_file = ExmaraldaFile(docgraph)

    assert isinstance(output_file, (str, file))
    if isinstance(output_file, str):
        # abspath is needed to handle bare file names, for which
        # os.path.dirname() alone would return ''.
        path_to_file = os.path.dirname(os.path.abspath(output_file))
        if not os.path.isdir(path_to_file):
            create_dir(path_to_file)
        exmaralda_file.write(output_file)
    else:  # output_file is a file object
        # __str__() is called directly, so its return value is handed to
        # the file object unchanged (str() might coerce it).
        output_file.write(exmaralda_file.__str__())
def write_brat(pocores, output_dir):
    """
    Write a document and its annotations as a set of brat files.

    Four UTF-8 encoded files are written into ``output_dir``:
    ``<document name>.txt``, ``annotation.conf``, ``visual.conf`` and
    ``<document name>.ann``.

    Parameters
    ----------
    pocores : object with a ``document`` attribute
        source of the document text, both configuration files and the
        annotations (presumably a ``Pocores`` instance -- confirm against
        the callers)
    output_dir : str
        directory to write to; it is passed to ``create_dir`` first
    """
    create_dir(output_dir)
    doc_name = os.path.basename(pocores.document.name)

    # (file name, content producer) pairs in the order they are written.
    # The producers are only called once their file has been opened.
    brat_files = (
        (doc_name + '.txt', lambda: dg.get_text(pocores.document)),
        ('annotation.conf', lambda: create_annotation_conf(pocores)),
        ('visual.conf', lambda: create_visual_conf(pocores)),
        (doc_name + '.ann', lambda: brat_output(pocores)),
    )

    for file_name, produce_content in brat_files:
        target_path = os.path.join(output_dir, file_name)
        with codecs.open(target_path, 'wb', encoding='utf-8') as brat_file:
            brat_file.write(produce_content())
def write_conll(docgraph, output_file, coreference_layer=None,
                markable_layer=None):
    """
    Convert a DiscourseDocumentGraph into a tab-separated CoNLL 2009 file
    and write it to the given file (or file path).

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        the document graph to be converted
    output_file : str or file
        either the path of the file to be written (a missing parent
        directory is created first) or an already opened file object
    coreference_layer : str or None
        passed through to ``Conll2009File``
    markable_layer : str or None
        passed through to ``Conll2009File``; defaults to the namespace of
        the document graph followed by ``':markable'``

    Raises
    ------
    AssertionError
        if ``output_file`` is neither a ``str`` nor a ``file`` object
    """
    if markable_layer is None:
        markable_layer = docgraph.ns+':markable'
    conll_file = Conll2009File(docgraph,
                               coreference_layer=coreference_layer,
                               markable_layer=markable_layer)

    assert isinstance(output_file, (str, file))
    if isinstance(output_file, str):
        # abspath is needed to handle bare file names, for which
        # os.path.dirname() alone would return ''.
        path_to_file = os.path.dirname(os.path.abspath(output_file))
        if not os.path.isdir(path_to_file):
            create_dir(path_to_file)
        conll_file.write(output_file)
    else:  # output_file is a file object
        # __str__() is called directly, so its return value is handed to
        # the file object unchanged (str() might coerce it).
        output_file.write(conll_file.__str__())
def write_paula(docgraph, output_root_dir, human_readable=False):
    """
    Export a DiscourseDocumentGraph as a set of PAULA XML files that
    represent the same document.

    The files are written into a subdirectory of ``output_root_dir`` that
    is named after the PAULA document. Each entry of the document's
    ``files`` mapping becomes one ``<paula id>.xml`` file.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        the document graph to be converted
    output_root_dir : str
        directory in which the document's subdirectory is created
    human_readable : bool
        passed through to ``PaulaDocument``

    Raises
    ------
    AssertionError
        if ``output_root_dir`` is not a ``str`` (e.g. a file object)
    """
    paula_document = PaulaDocument(docgraph, human_readable=human_readable)

    error_msg = ("Please specify an output directory.\nPaula documents consist"
                 " of multiple files, so we can't just pipe them to STDOUT.")
    assert isinstance(output_root_dir, str), error_msg

    document_dir = os.path.join(output_root_dir, paula_document.name)
    dir_exists = os.path.isdir(document_dir)
    if not dir_exists:
        create_dir(document_dir)

    for paula_id in paula_document.files:
        xml_path = os.path.join(document_dir, paula_id+'.xml')
        with open(xml_path, 'w') as outfile:
            xml_tree = paula_document.files[paula_id]
            dtd = paula_document.file2dtd[paula_id]
            outfile.write(paula_etree_to_string(xml_tree, dtd))
def merging_cli(debug=False):
    """
    Simple commandline interface of the merging module.

    This function is called when you use the ``discoursegraphs`` application
    directly on the command line. It parses ``sys.argv``, merges all given
    input files into one ``DiscourseDocumentGraph`` and writes that graph
    in the requested output format (to STDOUT, if no output file is given).

    Parameters
    ----------
    debug : bool
        if True, print a short message after the output was written

    Raises
    ------
    AssertionError
        if an input file given on the command line doesn't exist
    ValueError
        if the requested output format is not supported
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('-t', '--tiger-file',
                        help='TigerXML (syntax) file to be merged')
    parser.add_argument('-r', '--rst-file',
                        help='RS3 (rhetorical structure) file to be merged')
    parser.add_argument('-a', '--anaphoricity-file',
                        help='anaphoricity file to be merged')
    parser.add_argument('-c', '--conano-file',
                        help='conano file to be merged')
    parser.add_argument('-m', '--mmax-file',
                        help='MMAX2 file to be merged')
    parser.add_argument(
        '-o', '--output-format', default='dot',
        help=('output format: brackets, brat, dot, pickle, geoff, gexf, graphml, '
              'neo4j, exmaralda, conll, paula, no-output'))
    parser.add_argument('output_file', nargs='?', default=sys.stdout)

    args = parser.parse_args(sys.argv[1:])

    # every input file (including the MMAX2 file) is checked up front
    for filepath in (args.tiger_file, args.rst_file, args.anaphoricity_file,
                     args.conano_file, args.mmax_file):
        if filepath:  # if it was specified on the command line
            assert os.path.isfile(filepath), \
                "File '{}' doesn't exist".format(filepath)

    # create an empty document graph. merge it with other graphs later on.
    discourse_docgraph = DiscourseDocumentGraph()

    if args.tiger_file:
        from discoursegraphs.readwrite.tiger import TigerDocumentGraph
        tiger_docgraph = TigerDocumentGraph(args.tiger_file)
        discourse_docgraph.merge_graphs(tiger_docgraph)

    if args.rst_file:
        rst_graph = dg.read_rs3(args.rst_file)
        discourse_docgraph.merge_graphs(rst_graph)

    if args.anaphoricity_file:
        from discoursegraphs.readwrite import AnaphoraDocumentGraph
        anaphora_graph = AnaphoraDocumentGraph(args.anaphoricity_file)
        discourse_docgraph.merge_graphs(anaphora_graph)
        # the anaphora doc graph only contains trivial edges from its root
        # node.
        try:
            discourse_docgraph.remove_node('anaphoricity:root_node')
        except networkx.NetworkXError:  # ignore if the node doesn't exist
            pass

    if args.conano_file:
        from discoursegraphs.readwrite import ConanoDocumentGraph
        conano_graph = ConanoDocumentGraph(args.conano_file)
        discourse_docgraph.merge_graphs(conano_graph)

    if args.mmax_file:
        from discoursegraphs.readwrite import MMAXDocumentGraph
        mmax_graph = MMAXDocumentGraph(args.mmax_file)
        discourse_docgraph.merge_graphs(mmax_graph)

    if isinstance(args.output_file, str):  # if we're not piping to stdout ...
        # we need abspath to handle files in the current directory
        path_to_output_file = \
            os.path.dirname(os.path.abspath(args.output_file))
        if not os.path.isdir(path_to_output_file):
            create_dir(path_to_output_file)

    if args.output_format == 'dot':
        write_dot(discourse_docgraph, args.output_file)
    elif args.output_format == 'brat':
        dg.write_brat(discourse_docgraph, args.output_file)
    elif args.output_format == 'brackets':
        dg.write_brackets(discourse_docgraph, args.output_file)
    elif args.output_format == 'pickle':
        import cPickle as pickle
        if isinstance(args.output_file, str):
            with open(args.output_file, 'wb') as pickle_file:
                pickle.dump(discourse_docgraph, pickle_file)
        else:
            # without an output file, args.output_file is sys.stdout, which
            # can't be passed to open(). dump to the stream directly.
            pickle.dump(discourse_docgraph, args.output_file)
    elif args.output_format in ('geoff', 'neo4j'):
        from discoursegraphs.readwrite.neo4j import write_geoff
        write_geoff(discourse_docgraph, args.output_file)
        print('')  # this is just cosmetic for stdout
    elif args.output_format == 'gexf':
        dg.write_gexf(discourse_docgraph, args.output_file)
    elif args.output_format == 'graphml':
        dg.write_graphml(discourse_docgraph, args.output_file)
    elif args.output_format == 'exmaralda':
        from discoursegraphs.readwrite.exmaralda import write_exb
        write_exb(discourse_docgraph, args.output_file)
    elif args.output_format == 'conll':
        from discoursegraphs.readwrite.conll import write_conll
        write_conll(discourse_docgraph, args.output_file)
    elif args.output_format == 'paula':
        from discoursegraphs.readwrite.paulaxml.paula import write_paula
        write_paula(discourse_docgraph, args.output_file)
    elif args.output_format == 'no-output':
        pass  # just testing if the merging works
    else:
        raise ValueError(
            "Unsupported output format: {}".format(args.output_format))

    if debug:
        # NOTE: only the Tiger file is named, whatever else was merged.
        # a single pre-joined argument keeps the output identical no matter
        # if print is a statement or a function.
        print(' '.join(("Merged successfully: ", str(args.tiger_file))))
def merging_cli(debug=False):
    """
    Simple commandline interface of the merging module.

    This function is called when you use the ``discoursegraphs`` application
    directly on the command line. It parses ``sys.argv``, merges all given
    input files into one ``DiscourseDocumentGraph`` and writes that graph
    in the requested output format (to STDOUT, if no output file is given).

    Parameters
    ----------
    debug : bool
        if True, print a short message after the output was written

    Raises
    ------
    AssertionError
        if an input file given on the command line doesn't exist
    ValueError
        if the requested output format is not supported
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('-t', '--tiger-file',
                        help='TigerXML (syntax) file to be merged')
    parser.add_argument('-r', '--rst-file',
                        help='RS3 (rhetorical structure) file to be merged')
    parser.add_argument('-a', '--anaphoricity-file',
                        help='anaphoricity file to be merged')
    parser.add_argument('-c', '--conano-file',
                        help='conano file to be merged')
    parser.add_argument('-m', '--mmax-file',
                        help='MMAX2 file to be merged')
    parser.add_argument(
        '-o', '--output-format', default='dot',
        help=('output format: brackets, brat, dot, pickle, geoff, gexf, graphml, '
              'neo4j, exmaralda, conll, paula, no-output'))
    parser.add_argument('output_file', nargs='?', default=sys.stdout)

    args = parser.parse_args(sys.argv[1:])

    # every input file (including the MMAX2 file) is checked up front
    for filepath in (args.tiger_file, args.rst_file, args.anaphoricity_file,
                     args.conano_file, args.mmax_file):
        if filepath:  # if it was specified on the command line
            assert os.path.isfile(filepath), \
                "File '{}' doesn't exist".format(filepath)

    # create an empty document graph. merge it with other graphs later on.
    discourse_docgraph = DiscourseDocumentGraph()

    if args.tiger_file:
        from discoursegraphs.readwrite.tiger import TigerDocumentGraph
        tiger_docgraph = TigerDocumentGraph(args.tiger_file)
        discourse_docgraph.merge_graphs(tiger_docgraph)

    if args.rst_file:
        rst_graph = dg.read_rs3(args.rst_file)
        discourse_docgraph.merge_graphs(rst_graph)

    if args.anaphoricity_file:
        from discoursegraphs.readwrite import AnaphoraDocumentGraph
        anaphora_graph = AnaphoraDocumentGraph(args.anaphoricity_file)
        discourse_docgraph.merge_graphs(anaphora_graph)
        # the anaphora doc graph only contains trivial edges from its root
        # node.
        try:
            discourse_docgraph.remove_node('anaphoricity:root_node')
        except networkx.NetworkXError:  # ignore if the node doesn't exist
            pass

    if args.conano_file:
        from discoursegraphs.readwrite import ConanoDocumentGraph
        conano_graph = ConanoDocumentGraph(args.conano_file)
        discourse_docgraph.merge_graphs(conano_graph)

    if args.mmax_file:
        from discoursegraphs.readwrite import MMAXDocumentGraph
        mmax_graph = MMAXDocumentGraph(args.mmax_file)
        discourse_docgraph.merge_graphs(mmax_graph)

    if isinstance(args.output_file, str):  # if we're not piping to stdout ...
        # we need abspath to handle files in the current directory
        path_to_output_file = \
            os.path.dirname(os.path.abspath(args.output_file))
        if not os.path.isdir(path_to_output_file):
            create_dir(path_to_output_file)

    if args.output_format == 'dot':
        write_dot(discourse_docgraph, args.output_file)
    elif args.output_format == 'brat':
        dg.write_brat(discourse_docgraph, args.output_file)
    elif args.output_format == 'brackets':
        dg.write_brackets(discourse_docgraph, args.output_file)
    elif args.output_format == 'pickle':
        import cPickle as pickle
        if isinstance(args.output_file, str):
            with open(args.output_file, 'wb') as pickle_file:
                pickle.dump(discourse_docgraph, pickle_file)
        else:
            # without an output file, args.output_file is sys.stdout, which
            # can't be passed to open(). dump to the stream directly.
            pickle.dump(discourse_docgraph, args.output_file)
    elif args.output_format in ('geoff', 'neo4j'):
        from discoursegraphs.readwrite.neo4j import write_geoff
        write_geoff(discourse_docgraph, args.output_file)
        print('')  # this is just cosmetic for stdout
    elif args.output_format == 'gexf':
        dg.write_gexf(discourse_docgraph, args.output_file)
    elif args.output_format == 'graphml':
        dg.write_graphml(discourse_docgraph, args.output_file)
    elif args.output_format == 'exmaralda':
        from discoursegraphs.readwrite.exmaralda import write_exb
        write_exb(discourse_docgraph, args.output_file)
    elif args.output_format == 'conll':
        from discoursegraphs.readwrite.conll import write_conll
        write_conll(discourse_docgraph, args.output_file)
    elif args.output_format == 'paula':
        from discoursegraphs.readwrite.paulaxml.paula import write_paula
        write_paula(discourse_docgraph, args.output_file)
    elif args.output_format == 'no-output':
        pass  # just testing if the merging works
    else:
        raise ValueError(
            "Unsupported output format: {}".format(args.output_format))

    if debug:
        # NOTE: only the Tiger file is named, whatever else was merged.
        # a single pre-joined argument keeps the output identical no matter
        # if print is a statement or a function.
        print(' '.join(("Merged successfully: ", str(args.tiger_file))))
def run_pocores(input_file, input_format, output_dest=None,
                output_format='bracketed', weights=WEIGHTS,
                max_sent_dist=MAX_SENT_DIST, debug=False, eval_file=None):
    """
    Run the pocores coreference system on a mate-parsed, CoNLL-formatted
    input file.

    Parameters
    ----------
    input_file : str or file
        the CoNLL file to be read; passed on to ``dg.read_conll``
    input_format : str
        CoNLL format of the input file: '2009' or '2010'
    output_dest : str, file or None
        for 'bracketed' and 'xml' output: an open file object or the path
        of the file to be written (None means STDOUT). For 'brat' output:
        the path of the output directory.
    output_format : str
        'bracketed', 'brat' or 'xml'
    weights
        passed on to ``Pocores.resolve_anaphora``
    max_sent_dist
        passed on to ``Pocores.resolve_anaphora``
    debug : bool
        passed on to ``Pocores.resolve_anaphora``; if True, a coreference
        report is printed after the output was written
    eval_file
        evaluation isn't implemented, yet. Any true value raises a
        ``NotImplementedError`` (after the output was written).

    Returns
    -------
    pocores : Pocores
        the ``Pocores`` instance that was used to resolve the anaphora

    Raises
    ------
    AssertionError
        if ``input_format`` or ``output_format`` is not supported
    NotImplementedError
        if ``eval_file`` is given
    """
    assert input_format in ('2009', '2010')
    assert output_format in ('bracketed', 'brat', 'xml')

    # the two supported CoNLL formats only differ in the lemma column used
    lemma_attr = 'plemma' if input_format == '2009' else 'lemma'
    docgraph = dg.read_conll(input_file, conll_format=input_format,
                             deprel_attr='pdeprel', feat_attr='pfeat',
                             head_attr='phead', lemma_attr=lemma_attr,
                             pos_attr='ppos')

    pocores = Pocores(docgraph)
    pocores.resolve_anaphora(weights, max_sent_dist, debug=debug)
    pocores.add_coreference_chains_to_docgraph()

    if output_format == 'brat':
        if output_dest is None or isinstance(output_dest, file):
            # brat output consists of several files, so neither a stream
            # nor a missing destination can be used
            sys.stderr.write('For brat output specify an output folder.\n')
            sys.exit(1)
        # output_dest will be treated as a directory
        write_brat(pocores, output_dest)
    else:
        if output_format == 'bracketed':
            output_text = output_with_brackets(pocores)
        else:  # 'xml'
            output_text = make_xml(pocores)
        if output_dest is None:  # no destination given: write to STDOUT
            output_dest = sys.stdout
        _write_pocores_text(output_dest, output_text)

    if debug:
        print_coreference_report(pocores)

    if eval_file:
        # TODO: implement proper scorer.pl-based evaluation
        # there's some useful code in the /var/local/git/Depot/coreference.git
        # repo on hebe
        raise NotImplementedError

    return pocores


def _write_pocores_text(output_dest, text):
    """Write ``text`` to an open file object or to a UTF-8 file at a path."""
    if isinstance(output_dest, file):
        output_dest.write(text)
        return

    # abspath is needed to handle bare file names, for which the directory
    # part of the path would otherwise be ''.
    path_to_dir = os.path.dirname(os.path.abspath(output_dest))
    if not os.path.isdir(path_to_dir):
        create_dir(path_to_dir)
    with codecs.open(output_dest, 'w', 'utf-8') as output_file:
        output_file.write(text)