def read_corenlp_doc(filename, verbose=True):
    """Read a CoreNLP XML file and build a ``Document`` from it.

    The file is opened through ``smart_file_handler`` and parsed with an
    ``etree.XMLParser`` whose target is a ``CoreNLPTarget``; the parse result
    is unpacked as a ``(sents, corefs)`` pair and handed to
    ``Document.construct``.

    Args:
        filename: Path of the CoreNLP output file. The document name is the
            base name of this path with its last extension removed.
        verbose: When true, log the file being read at INFO level.

    Returns:
        The object returned by ``Document.construct``.
    """
    if verbose:
        log.info('Reading CoreNLP document from {}'.format(filename))

    input_xml = smart_file_handler(filename)
    try:
        xml_parser = etree.XMLParser(target=CoreNLPTarget())
        sents, corefs = etree.parse(input_xml, xml_parser)
        doc_name = splitext(basename(filename))[0]
        doc = Document.construct(doc_name, sents, corefs)
    finally:
        # Release the handle even when parsing or construction fails;
        # previously an exception here left the file open.
        input_xml.close()
    return doc
def read_doc_from_ontonotes(coref_doc, name_doc, verbose=True):
    """Build a ``Document`` from a pair of OntoNotes annotation documents.

    The shared document id is the part of ``document_id`` before the first
    ``'@'``. Sentences are read from the ``.depparse`` file for that id under
    ``ontonotes_annotations_source``, coreference data from ``coref_doc``, and
    every item yielded by ``read_name_doc(name_doc)`` is added to the
    resulting document via ``add_name_entity_to_doc``.

    Args:
        coref_doc: Coreference document; must expose ``document_id``.
        name_doc: Name document; must expose ``document_id`` with the same
            prefix before ``'@'`` as ``coref_doc``.
        verbose: When true, log the document id at INFO level.

    Returns:
        The object returned by ``Document.construct``, after the name
        entities have been added to it.

    Raises:
        AssertionError: If the two documents have different id prefixes.
    """
    base_id = coref_doc.document_id.split('@', 1)[0]
    assert base_id == name_doc.document_id.split('@', 1)[0], (
        '{} and {} do not have the same document_id'.format(
            coref_doc, name_doc))

    if verbose:
        log.info('Reading ontonotes document {}'.format(base_id))

    # Dependency parses live next to the annotations, keyed by document id.
    depparse_path = join(ontonotes_annotations_source, base_id + '.depparse')
    sentences = read_conll_depparse(depparse_path)
    coref_chains = read_coref_doc(coref_doc)

    # Only the last path component of the id is used as the document name.
    short_name = base_id.rsplit('/', 1)[-1]
    document = Document.construct(short_name, sentences, coref_chains)

    for entity in read_name_doc(name_doc):
        add_name_entity_to_doc(document, entity)

    return document