Exemple #1
0
    def parse(self, file_path):
        """Parse a TempEval-3 annotated file into a Document object.

        The first TEXT element found in file_path is serialised to plain
        text, analysed with CORENLP and packed sentence by sentence and word
        by word into a Document, which is our internal representation. The
        gold annotations are then read from the same XML tree.

        Args:
            file_path: path of the annotated XML file to parse.

        Returns:
            The populated Document.

        Raises:
            AssertionError: if file_path is not an existing file.
            IndexError: if no TEXT element is found in the file.
        """
        assert os.path.isfile(file_path), 'File path does not exist!'
        logging.info('Document {}: parsing...'.format(
            os.path.relpath(file_path)))
        xml = etree.parse(file_path)
        text_node = xml.findall(".//TEXT")[0]
        text_string = etree.tostring(text_node, method='text', encoding='utf8')
        text_xml = etree.tostring(text_node, method='xml', encoding='utf8')
        text_string = unicode(text_string, 'UTF-8')
        text_xml = unicode(text_xml, 'UTF-8')

        # Whatever the XML serialisation carries after the closing tag is not
        # part of the TEXT content: cut the same number of characters from
        # the end of the plain-text serialisation.
        right_chars = len(text_xml.split('</TEXT>')[1])
        # Only trim when there is something to trim: text_string[:-0] is the
        # empty string and would throw away the whole document text.
        if right_chars:
            text_string = text_string[:-right_chars]

        # StanfordParser strips internally the text :(
        # Count the leading whitespace so token offsets can be shifted by it.
        left_chars = len(text_string) - len(text_string.lstrip())
        with Mute_stderr():
            stanford_tree = CORENLP.parse(text_string)

        document = Document(file_path)
        document.text_offset = left_chars
        document.file_path = os.path.abspath(file_path)
        document.doc_id = os.path.basename(file_path)
        document.sec_times = self.get_dct(file_path)
        document.dct = document.sec_times.admission_date
        # Same date with the dashes removed.
        document.dct_text = document.dct.replace('-', '')
        document.title = os.path.basename(file_path)
        document.text = text_string
        document._coref = stanford_tree.get('coref', [])

        for num_sen, stanford_sentence in enumerate(
                stanford_tree['sentences']):
            collp_deps = stanford_sentence.get('collapsed_dependencies', None)
            basic_deps = stanford_sentence.get('basic_dependencies', None)
            parsetree = stanford_sentence.get('parsetree', u'')
            sentence_text = stanford_sentence.get('text', u'')

            sentence = Sentence(id_sentence=num_sen,
                                basic_dependencies=basic_deps,
                                collapsed_dependencies=collp_deps,
                                parsetree=parsetree,
                                text=sentence_text)
            # Each entry of 'words' is a (word form, attribute dict) pair.
            for num_word, (word_form, attr) in enumerate(
                    stanford_sentence['words']):
                offset_begin = int(attr['CharacterOffsetBegin']) - left_chars
                offset_end = int(attr['CharacterOffsetEnd']) - left_chars
                word = Word(word_form=word_form,
                            char_offset_begin=offset_begin,
                            char_offset_end=offset_end,
                            lemma=attr['Lemma'],
                            named_entity_tag=attr['NamedEntityTag'],
                            part_of_speech=attr['PartOfSpeech'],
                            id_token=num_word,
                            id_sentence=num_sen)
                sentence.words.append(word)
            document.sentences.append(sentence)

        document.gold_annotations = self._get_annotations(
            xml, document)
        document.store_gold_annotations()
        document.complete_structure()

        logging.info('Document {}: parsed.'.format(os.path.relpath(file_path)))
        return document
Exemple #2
0
    def parse(self, file_path):
        """Parse a TempEval-3 annotated file into a Document object.

        The first TEXT element found in file_path is serialised to plain
        text, analysed with CORENLP and packed sentence by sentence and word
        by word into a Document, which is our internal representation. The
        gold annotations are then read from the same XML tree.

        Args:
            file_path: path of the annotated XML file to parse.

        Returns:
            The populated Document.

        Raises:
            AssertionError: if file_path is not an existing file.
            IndexError: if no TEXT element is found in the file.
        """
        assert os.path.isfile(file_path), 'File path does not exist!'
        logging.info('Document {}: parsing...'.format(
            os.path.relpath(file_path)))
        xml = etree.parse(file_path)
        text_node = xml.findall(".//TEXT")[0]
        text_string = etree.tostring(text_node, method='text', encoding='utf8')
        text_xml = etree.tostring(text_node, method='xml', encoding='utf8')
        text_string = unicode(text_string, 'UTF-8')
        text_xml = unicode(text_xml, 'UTF-8')

        # Whatever the XML serialisation carries after the closing tag is not
        # part of the TEXT content: cut the same number of characters from
        # the end of the plain-text serialisation.
        right_chars = len(text_xml.split('</TEXT>')[1])
        # Only trim when there is something to trim: text_string[:-0] is the
        # empty string and would throw away the whole document text.
        if right_chars:
            text_string = text_string[:-right_chars]

        # StanfordParser strips internally the text :(
        # Count the leading whitespace so token offsets can be shifted by it.
        left_chars = len(text_string) - len(text_string.lstrip())
        with Mute_stderr():
            stanford_tree = CORENLP.parse(text_string)

        document = Document(file_path)
        document.text_offset = left_chars
        document.file_path = os.path.abspath(file_path)
        document.doc_id = os.path.basename(file_path)
        document.sec_times = self.get_dct(file_path)
        document.dct = document.sec_times.admission_date
        # Same date with the dashes removed.
        document.dct_text = document.dct.replace('-', '')
        document.title = os.path.basename(file_path)
        document.text = text_string
        document._coref = stanford_tree.get('coref', [])

        for num_sen, stanford_sentence in enumerate(
                stanford_tree['sentences']):
            collp_deps = stanford_sentence.get('collapsed_dependencies', None)
            basic_deps = stanford_sentence.get('basic_dependencies', None)
            parsetree = stanford_sentence.get('parsetree', u'')
            sentence_text = stanford_sentence.get('text', u'')

            sentence = Sentence(id_sentence=num_sen,
                                basic_dependencies=basic_deps,
                                collapsed_dependencies=collp_deps,
                                parsetree=parsetree,
                                text=sentence_text)
            # Each entry of 'words' is a (word form, attribute dict) pair.
            for num_word, (word_form, attr) in enumerate(
                    stanford_sentence['words']):
                offset_begin = int(attr['CharacterOffsetBegin']) - left_chars
                offset_end = int(attr['CharacterOffsetEnd']) - left_chars
                word = Word(word_form=word_form,
                            char_offset_begin=offset_begin,
                            char_offset_end=offset_end,
                            lemma=attr['Lemma'],
                            named_entity_tag=attr['NamedEntityTag'],
                            part_of_speech=attr['PartOfSpeech'],
                            id_token=num_word,
                            id_sentence=num_sen)
                sentence.words.append(word)
            document.sentences.append(sentence)

        document.gold_annotations = self._get_annotations(xml, document)
        document.store_gold_annotations()
        document.complete_structure()

        logging.info('Document {}: parsed.'.format(os.path.relpath(file_path)))
        return document