def create_bundle(self):
    """Create a new bundle and add it at the end of the document."""
    self._highest_bundle_id += 1
    bundle = Bundle(document=self, bundle_id=str(self._highest_bundle_id))
    self.bundles.append(bundle)
    bundle.number = len(self.bundles)
    return bundle
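# Usage sketch, not part of the listing above: assuming the method belongs to
# udapi.core.document.Document and _highest_bundle_id starts at 0, bundle ids
# are assigned as sequential strings and bundle.number records the 1-based
# position within doc.bundles.
from udapi.core.document import Document

doc = Document()
first = doc.create_bundle()
second = doc.create_bundle()
assert (first.bundle_id, second.bundle_id) == ('1', '2')
assert second.number == 2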
def process_document(self, doc):
    """Tokenize, tag and/or parse each relevant tree, resegmenting if requested."""
    tok, tag, par = self.tokenize, self.tag, self.parse
    old_bundles = doc.bundles
    new_bundles = []
    for bundle in old_bundles:
        # Keep the original bundle; bundles created by resegmentation are
        # inserted right after it.
        new_bundles.append(bundle)
        for tree in bundle:
            if self._should_process_tree(tree):
                if tok:
                    new_trees = self.tool.tokenize_tag_parse_tree(
                        tree, resegment=self.resegment, tag=self.tag, parse=self.parse)
                    if self.resegment and len(new_trees) > 1:
                        # The tool split the original sentence. Rename the
                        # original bundle to '<id>-1' and put each following
                        # sentence into a new bundle '<id>-2', '<id>-3', ...
                        orig_bundle_id = bundle.bundle_id
                        bundle.bundle_id = orig_bundle_id + '-1'
                        # The stored text covered all sentences, so it is no
                        # longer valid for the first tree alone.
                        tree.text = None
                        for i, new_tree in enumerate(new_trees[1:], 2):
                            new_bundle = Bundle(
                                document=doc,
                                bundle_id=orig_bundle_id + '-' + str(i))
                            new_tree.zone = tree.zone
                            new_bundle.add_tree(new_tree)
                            new_bundles.append(new_bundle)
                elif tag and par:
                    self.tool.tag_parse_tree(tree)
                else:
                    raise ValueError(
                        "Unimplemented tokenize=%s tag=%s parse=%s"
                        % (tok, tag, par))
    doc.bundles = new_bundles
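# Worked example of the resegmentation id scheme above (plain Python, no Udapi
# imports needed; the method itself assumes Bundle is available, e.g. from
# udapi.core.bundle): a bundle 'b3' split into three sentences keeps the first
# sentence under 'b3-1' and gains new bundles 'b3-2' and 'b3-3'.
orig_bundle_id = 'b3'
new_tree_count = 3
ids = [orig_bundle_id + '-1'] + [
    orig_bundle_id + '-' + str(i) for i in range(2, new_tree_count + 1)]
assert ids == ['b3-1', 'b3-2', 'b3-3']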
def process_document(self, doc):
    """Split the text of each relevant tree into sentences, one bundle each."""
    old_bundles = doc.bundles
    new_bundles = []
    for bundle in old_bundles:
        new_bundles.append(bundle)
        for tree in bundle:
            if self._should_process_tree(tree):
                if tree.children:
                    raise ValueError(
                        "Segmenting already tokenized text is not supported.")
                sentences = self.segment_string(tree.text)
                if len(sentences) > 1:
                    # Rename the original bundle to '<id>-1', keep the first
                    # sentence in it and move each following sentence into a
                    # new bundle '<id>-2', '<id>-3', ...
                    orig_bundle_id = bundle.bundle_id
                    bundle.bundle_id = orig_bundle_id + '-1'
                    tree.text = sentences[0]
                    for i, sentence in enumerate(sentences[1:], 2):
                        new_bundle = Bundle(
                            document=doc,
                            bundle_id=orig_bundle_id + '-' + str(i))
                        new_bundle.create_tree(tree.zone).text = sentence
                        new_bundles.append(new_bundle)
    doc.bundles = new_bundles
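# Usage sketch, not part of the listing above: the method relies on a
# segment_string() supplied by the concrete segmenter; the naive splitter
# below is only a stand-in showing the expected contract (text in, list of
# sentence strings out).
import re

def segment_string(text):
    # Split after sentence-final punctuation followed by whitespace.
    return [s for s in re.split(r'(?<=[.!?])\s+', text.strip()) if s]

assert segment_string('First one. Second one!') == ['First one.', 'Second one!']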