Example #1
    def segment_permutation(self, doc, canonical_doc):
        """Copy parses, tokens and EDU segmentation from canonical_doc
        onto doc, whose sentences are a permutation of canonical_doc's."""
        assert len(doc.sentences) == len(canonical_doc.sentences)

        doc.edu_word_segmentation = []
        doc.cuts = []
        doc.edus = []

        # For each permuted sentence, find the first not-yet-used
        # canonical sentence with identical raw text.
        sentence_order = []
        for sent in doc.sentences:
            index = 0
            while index < len(canonical_doc.sentences):
                if (canonical_doc.sentences[index].raw_text == sent.raw_text
                        and index not in sentence_order):
                    break
                index += 1
            sentence_order.append(index)

        # The mapping must be a permutation of the canonical indices.
        assert sorted(sentence_order) == list(range(len(doc.sentences)))

        for (i, index) in enumerate(sentence_order):
            sentence = doc.sentences[i]
            # Reuse the canonical sentence's parse trees and tokens.
            sentence.set_unlexicalized_tree(
                canonical_doc.sentences[index].unlexicalized_parse_tree)
            sentence.set_lexicalized_tree(
                canonical_doc.sentences[index].parse_tree)

            sentence.tokens = []
            for te in canonical_doc.sentences[index].tokens:
                token = Token(te.word, te.id, sentence)
                token.lemma = te.lemma
                sentence.add_token(token)

            # Re-base the sentence's EDU span onto the permuted document.
            (canonical_start_edu,
             canonical_end_edu) = canonical_doc.cuts[index]
            edus = canonical_doc.edus[canonical_start_edu:canonical_end_edu]
            sentence.start_edu = len(doc.edus)
            sentence.end_edu = len(doc.edus) + len(edus)
            doc.cuts.append((sentence.start_edu, sentence.end_edu))
            doc.edus.extend(edus)
            doc.edu_word_segmentation.append(
                canonical_doc.edu_word_segmentation[index])

        doc.start_edu = 0
        doc.end_edu = len(doc.edus)
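
The order-recovery loop above is the core of segment_permutation: for each sentence of the permuted document, take the first not-yet-used canonical index with identical raw text, so repeated sentences still map to distinct indices. A minimal standalone sketch of that step, with made-up strings in place of the project's Sentence objects:

canonical = ["A", "B", "A", "C"]
permuted = ["A", "C", "A", "B"]

order = []
for item in permuted:
    # First unused canonical index whose text matches; the
    # "i not in order" check disambiguates duplicate items.
    index = next(i for i, text in enumerate(canonical)
                 if text == item and i not in order)
    order.append(index)

print(order)  # [0, 3, 2, 1]
assert sorted(order) == list(range(len(permuted)))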
    def process_single_sentence(self, doc, raw_text, end_of_para):
        """Parse one raw sentence, build its tokens and trees, and
        append it to doc."""
        # Paragraph-final sentences are marked <P>, all others <s>.
        sentence = Sentence(len(doc.sentences),
                            raw_text + ('<P>' if end_of_para else '<s>'),
                            doc)
        parse_tree_str, deps_str = self.parse_single_sentence(raw_text)

        # A leaf is any run of non-parenthesis characters preceded by
        # whitespace, so tokens may contain characters the default leaf
        # pattern would split on.
        parse = LexicalizedTree.parse(parse_tree_str,
                                      leaf_pattern=r'(?<=\s)[^\)\(]+')
        sentence.set_unlexicalized_tree(parse)

        # Token ids are 1-based.
        for (token_id, word) in enumerate(parse.leaves()):
            sentence.add_token(Token(word, token_id + 1, sentence))

        # Attach head information and build the lexicalized tree.
        heads = self.get_heads(sentence, deps_str.split('\n'))
        sentence.heads = heads
        sentence.set_lexicalized_tree(
            prep_utils.create_lexicalized_tree(parse, heads))

        doc.add_sentence(sentence)
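
LexicalizedTree is project-specific, but the leaf_pattern argument also exists on NLTK's stock Tree.fromstring, which can be used to see what the pattern does. A minimal sketch with a hypothetical parse string (the real one comes from parse_single_sentence), assuming only that nltk is installed:

from nltk import Tree

# Hypothetical Penn-style parse standing in for the parser output.
parse_tree_str = "(ROOT (S (NP (DT The) (NN cat)) (VP (VBZ sleeps))))"
tree = Tree.fromstring(parse_tree_str, leaf_pattern=r'(?<=\s)[^\)\(]+')
print(tree.leaves())  # ['The', 'cat', 'sleeps']

Each leaf match starts right after whitespace and runs to the next bracket, so leaves may contain characters (slashes, dots, even internal spaces) that the default whitespace-delimited pattern would split.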