def segment_permutation(self, doc, canonical_doc):
    """Copy parse trees and EDU segmentation from canonical_doc onto doc,
    whose sentences are a permutation of canonical_doc's sentences."""
    assert len(doc.sentences) == len(canonical_doc.sentences)

    doc.edu_word_segmentation = []
    doc.cuts = []
    doc.edus = []

    # Match each permuted sentence to an unused canonical sentence by raw
    # text, building the permutation mapping.
    sentence_order = []
    for sent in doc.sentences:
        index = 0
        while index < len(canonical_doc.sentences):
            if (canonical_doc.sentences[index].raw_text == sent.raw_text
                    and index not in sentence_order):
                break
            index += 1
        sentence_order.append(index)

    # Every canonical sentence must be matched exactly once.
    assert sorted(sentence_order) == list(range(len(doc.sentences)))

    for (i, index) in enumerate(sentence_order):
        sentence = doc.sentences[i]

        # Reuse the canonical sentence's parse trees and tokens.
        sentence.set_unlexicalized_tree(
            canonical_doc.sentences[index].unlexicalized_parse_tree)
        sentence.set_lexicalized_tree(
            canonical_doc.sentences[index].parse_tree)
        sentence.tokens = []
        for te in canonical_doc.sentences[index].tokens:
            token = Token(te.word, te.id, sentence)
            token.lemma = te.lemma
            sentence.add_token(token)

        # Carry over the sentence's EDUs, re-indexed into the permuted doc.
        (canonical_start_edu, canonical_end_edu) = canonical_doc.cuts[index]
        edus = canonical_doc.edus[canonical_start_edu:canonical_end_edu]
        sentence.start_edu = len(doc.edus)
        sentence.end_edu = len(doc.edus) + len(edus)
        doc.cuts.append((len(doc.edus), len(doc.edus) + len(edus)))
        doc.edus.extend(edus)
        doc.edu_word_segmentation.append(
            canonical_doc.edu_word_segmentation[index])

    doc.start_edu = 0
    doc.end_edu = len(doc.edus)
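# A minimal usage sketch for segment_permutation. The `preprocesser`
# instance, file names, and the `preprocess_document` helper are
# hypothetical; the canonical document must already carry parses, `cuts`,
# `edus`, and `edu_word_segmentation`:
#
#     canonical_doc = preprocesser.preprocess_document('wsj_0601.out')
#     permuted_doc = preprocesser.preprocess_document('wsj_0601.shuffled')
#     preprocesser.segment_permutation(permuted_doc, canonical_doc)
#     # Each permuted sentence now reuses its canonical parse and EDUs,
#     # re-indexed to the permuted sentence order:
#     assert len(permuted_doc.edus) == len(canonical_doc.edus)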
def process_single_sentence(self, doc, raw_text, end_of_para):
    """Parse one raw sentence, attach its tokens and parse trees,
    and append it to the document."""
    # Mark sentence boundaries with <s>, paragraph boundaries with <P>.
    sentence = Sentence(len(doc.sentences),
                        raw_text + ('<P>' if end_of_para else '<s>'),
                        doc)

    # Constituency parse and dependencies from the external parser.
    parse_tree_str, deps_str = self.parse_single_sentence(raw_text)
    parse = LexicalizedTree.parse(parse_tree_str,
                                  leaf_pattern=r'(?<=\s)[^\)\(]+')
    sentence.set_unlexicalized_tree(parse)

    # One Token per parse-tree leaf; token ids are 1-based.
    for (token_id, word) in enumerate(parse.leaves()):
        token = Token(word, token_id + 1, sentence)
        sentence.add_token(token)

    # Lexicalize the tree using dependency-derived head information.
    heads = self.get_heads(sentence, deps_str.split('\n'))
    sentence.heads = heads
    sentence.set_lexicalized_tree(
        prep_utils.create_lexicalized_tree(parse, heads))

    doc.add_sentence(sentence)
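# A minimal usage sketch for process_single_sentence. The bare `Document()`
# constructor call is an assumption about this codebase's API:
#
#     doc = Document()
#     preprocesser.process_single_sentence(doc, 'The cat sat on the mat.', False)
#     preprocesser.process_single_sentence(doc, 'It purred.', True)  # ends paragraph
#     sent = doc.sentences[0]
#     print(sent.raw_text)     # 'The cat sat on the mat.<s>'
#     print(len(sent.tokens))  # one Token per parse-tree leaf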