def parse(self, file_path):
    """Parse a TempEval-3 annotated file into a Document object.

    It reads the <TEXT> node of the XML file, runs its plain text
    through the Stanford CoreNLP pipeline, and packs the resulting
    sentences, words and the gold annotations into a Document, which
    is our internal representation.

    :param file_path: path to a TempEval-3 annotated XML file.
    :return: a populated Document instance.
    :raises AssertionError: if file_path is not an existing file.
    """
    assert os.path.isfile(file_path), 'File path does not exist!'
    logging.info('Document {}: parsing...'.format(
        os.path.relpath(file_path)))
    xml = etree.parse(file_path)
    text_node = xml.findall(".//TEXT")[0]
    # Serialize twice: plain text (fed to CoreNLP) and raw XML (used
    # only to measure how much trailing tail text follows </TEXT>).
    text_string = etree.tostring(text_node, method='text', encoding='utf8')
    text_xml = etree.tostring(text_node, method='xml', encoding='utf8')
    text_string = unicode(text_string, 'UTF-8')
    text_xml = unicode(text_xml, 'UTF-8')
    # Any tail text after the closing </TEXT> tag is also included in
    # the method='text' serialization, so trim the same amount from
    # the right of text_string.
    right_chars = len(text_xml.split('</TEXT>')[1])
    # BUG FIX: when nothing follows </TEXT>, right_chars is 0 and the
    # original unconditional text_string[:-0] slice would erase the
    # ENTIRE text. Only slice when there is something to trim.
    if right_chars:
        text_string = text_string[:-right_chars]
    # StanfordParser strips the text internally, so remember how many
    # leading whitespace characters it will drop in order to re-align
    # the token character offsets afterwards.
    left_chars = len(text_string) - len(text_string.lstrip())
    with Mute_stderr():
        stanford_tree = CORENLP.parse(text_string)
    document = Document(file_path)
    document.text_offset = left_chars
    document.file_path = os.path.abspath(file_path)
    document.doc_id = os.path.basename(file_path)
    document.sec_times = self.get_dct(file_path)
    # Document creation time comes from the section times; dct_text is
    # its compact form with the dashes removed (e.g. 2014-01-02 -> 20140102).
    document.dct = document.sec_times.admission_date
    document.dct_text = document.dct.replace('-', '')
    document.title = os.path.basename(file_path)
    document.text = text_string
    document._coref = stanford_tree.get('coref', [])
    for num_sen, stanford_sentence in \
            enumerate(stanford_tree['sentences']):
        collp_deps = stanford_sentence.get('collapsed_dependencies', None)
        basic_deps = stanford_sentence.get('basic_dependencies', None)
        parsetree = stanford_sentence.get('parsetree', u'')
        sentence_text = stanford_sentence.get('text', u'')
        sentence = Sentence(id_sentence=num_sen,
                            basic_dependencies=basic_deps,
                            collapsed_dependencies=collp_deps,
                            parsetree=parsetree,
                            text=sentence_text)
        for num_word, (word_form, attr) in \
                enumerate(stanford_sentence['words']):
            # Shift CoreNLP offsets back into the un-stripped text.
            offset_begin = int(attr['CharacterOffsetBegin']) - left_chars
            offset_end = int(attr['CharacterOffsetEnd']) - left_chars
            word = Word(word_form=word_form,
                        char_offset_begin=offset_begin,
                        char_offset_end=offset_end,
                        lemma=attr['Lemma'],
                        named_entity_tag=attr['NamedEntityTag'],
                        part_of_speech=attr['PartOfSpeech'],
                        id_token=num_word,
                        id_sentence=num_sen)
            sentence.words.append(word)
        document.sentences.append(sentence)
    document.gold_annotations = self._get_annotations(xml, document)
    document.store_gold_annotations()
    document.complete_structure()
    logging.info('Document {}: parsed.'.format(os.path.relpath(file_path)))
    return document
def parse(self, file_path):
    """Parse a TempEval-3 annotated file and return a Document.

    The <TEXT> node of the XML file is extracted, its plain text is
    analyzed with the Stanford CoreNLP pipeline, and the sentences,
    words and gold annotations are packed into a Document object (our
    internal representation).

    :param file_path: path to a TempEval-3 annotated XML file.
    :return: a populated Document instance.
    :raises AssertionError: if file_path is not an existing file.
    """
    assert os.path.isfile(file_path), 'File path does not exist!'
    logging.info('Document {}: parsing...'.format(
        os.path.relpath(file_path)))
    xml = etree.parse(file_path)
    text_node = xml.findall(".//TEXT")[0]
    # Plain-text serialization is what CoreNLP consumes; the XML
    # serialization is only used to measure the tail after </TEXT>.
    text_string = etree.tostring(text_node, method='text', encoding='utf8')
    text_xml = etree.tostring(text_node, method='xml', encoding='utf8')
    text_string = unicode(text_string, 'UTF-8')
    text_xml = unicode(text_xml, 'UTF-8')
    # Tail text after </TEXT> is also present in the plain-text
    # serialization; trim that many characters from the right.
    right_chars = len(text_xml.split('</TEXT>')[1])
    # BUG FIX: guard against right_chars == 0 — the unconditional
    # text_string[:-0] would return '' and wipe the whole text.
    if right_chars:
        text_string = text_string[:-right_chars]
    # StanfordParser strips internally the text, so record the number
    # of stripped leading characters to re-align token offsets.
    left_chars = len(text_string) - len(text_string.lstrip())
    with Mute_stderr():
        stanford_tree = CORENLP.parse(text_string)
    document = Document(file_path)
    document.text_offset = left_chars
    document.file_path = os.path.abspath(file_path)
    document.doc_id = os.path.basename(file_path)
    document.sec_times = self.get_dct(file_path)
    # dct_text is the compact (dash-free) form of the admission date.
    document.dct = document.sec_times.admission_date
    document.dct_text = document.dct.replace('-', '')
    document.title = os.path.basename(file_path)
    document.text = text_string
    document._coref = stanford_tree.get('coref', [])
    for num_sen, stanford_sentence in \
            enumerate(stanford_tree['sentences']):
        collp_deps = stanford_sentence.get('collapsed_dependencies', None)
        basic_deps = stanford_sentence.get('basic_dependencies', None)
        parsetree = stanford_sentence.get('parsetree', u'')
        sentence_text = stanford_sentence.get('text', u'')
        sentence = Sentence(id_sentence=num_sen,
                            basic_dependencies=basic_deps,
                            collapsed_dependencies=collp_deps,
                            parsetree=parsetree,
                            text=sentence_text)
        for num_word, (word_form, attr) in \
                enumerate(stanford_sentence['words']):
            # Re-align CoreNLP offsets with the un-stripped text.
            offset_begin = int(attr['CharacterOffsetBegin']) - left_chars
            offset_end = int(attr['CharacterOffsetEnd']) - left_chars
            word = Word(word_form=word_form,
                        char_offset_begin=offset_begin,
                        char_offset_end=offset_end,
                        lemma=attr['Lemma'],
                        named_entity_tag=attr['NamedEntityTag'],
                        part_of_speech=attr['PartOfSpeech'],
                        id_token=num_word,
                        id_sentence=num_sen)
            sentence.words.append(word)
        document.sentences.append(sentence)
    document.gold_annotations = self._get_annotations(xml, document)
    document.store_gold_annotations()
    document.complete_structure()
    logging.info('Document {}: parsed.'.format(os.path.relpath(file_path)))
    return document