def setUp(self):
    """Create two messages that differ only in their first Fact field and wrap them in one node."""
    # Every Fact argument after the first is the same placeholder string for both facts.
    shared_fields = (
        "corpus_type",
        "timestamp_from",
        "timestamp_to",
        "timestamp_type",
        "analysis_type",
        "result_key",
        "result_value",
        "outlierness",
    )

    self.fact1 = Fact("corpus1", *shared_fields)
    self.message1 = Message(self.fact1, 0.1, 0.2, 0.3)

    self.fact2 = Fact("corpus2", *shared_fields)
    self.message2 = Message(self.fact2, 0.1, 0.2, 0.3)

    node_children = [self.message1, self.message2]
    self.document_plan_node = DocumentPlanNode(node_children, Relation.ELABORATION)
def _recurse(
    self,
    registry: Registry,
    random: Generator,
    language: str,
    this: DocumentPlanNode,
    previous_entities: DefaultDict[str, None],
    encountered: Set[str],
) -> Tuple[Set[str], DefaultDict[str, None]]:
    """
    Walk the document plan depth-first, in child order, and tag named-entity slots.

    For every slot whose value is an entity, ``name_type`` is set to
    ``"pronoun"`` (same entity as the previous one of that type), ``"short"``
    (already encountered earlier) or ``"full"`` (first encounter), after which
    ``resolve_surface_form`` is called and ``entity_type`` is recorded on the slot.

    Returns the (possibly updated) ``encountered`` set and ``previous_entities``
    mapping so that the caller can thread them through sibling subtrees.
    """
    if not isinstance(this, Slot):
        # Inner nodes just forward the bookkeeping through their children;
        # anything that is neither a Slot nor a DocumentPlanNode is left alone.
        if isinstance(this, DocumentPlanNode):
            log.debug("Visiting non-leaf '{}'".format(this))
            for node in this.children:
                encountered, previous_entities = self._recurse(
                    registry, random, language, node, previous_entities, encountered
                )
        return encountered, previous_entities

    if not self.is_entity(this.value):
        log.debug("Visited leaf non-NE leaf node {}".format(this.value))
        return encountered, previous_entities

    log.debug("Visiting NE leaf {}".format(this.value))
    entity_type, entity = self.parse_entity(this.value)

    # Decide how explicitly the entity has to be named at this point.
    if previous_entities[entity_type] == entity:
        log.debug("Same as previous entity")
        this.attributes["name_type"] = "pronoun"
    elif entity in encountered:
        log.debug(
            "Different entity than previous, but has been previously encountered"
        )
        this.attributes["name_type"] = "short"
    else:
        log.debug("First time encountering this entity")
        this.attributes["name_type"] = "full"
        encountered.add(entity)
        log.debug(
            "Added entity to encountered, all encountered: {}".format(encountered)
        )

    self.resolve_surface_form(
        registry, random, language, this, entity, entity_type
    )
    log.debug("Resolved entity name")

    this.attributes["entity_type"] = entity_type
    # Remember this entity as the latest one seen for its type.
    previous_entities[entity_type] = entity
    return encountered, previous_entities
def run(
    self,
    registry: Registry,
    random: Generator,
    language: str,
    document_plan: DocumentPlanNode,
) -> Tuple[DocumentPlanNode]:
    """
    Aggregate the document plan in place and return it as a 1-tuple.

    When debug logging is enabled the tree is printed both before and after
    aggregation. ``random`` is accepted but not used by this method.
    """

    def print_tree_if_debugging() -> None:
        if log.isEnabledFor(logging.DEBUG):
            document_plan.print_tree()

    print_tree_if_debugging()

    log.debug("Aggregating")
    self._aggregate(registry, language, document_plan)

    print_tree_if_debugging()

    return (document_plan,)
def run(
    self,
    registry: Registry,
    random: Generator,
    language: str,
    document_plan: DocumentPlanNode,
) -> Tuple[DocumentPlanNode]:
    """
    Run this pipeline component.

    Starts the recursive traversal at the root of ``document_plan`` with
    ``None`` as the final ``_recurse`` argument, and returns the plan wrapped
    in a 1-tuple.
    """
    log.info("Realizing dates")

    output = (document_plan,)
    self._recurse(registry, random, language, document_plan, None)

    debugging = log.isEnabledFor(logging.DEBUG)
    if debugging:
        document_plan.print_tree()

    return output
def run(
    self, registry: Registry, random: Generator, language: str, scored_messages: List[Message]
) -> Tuple[DocumentPlanNode, List[Message]]:
    """
    Build the body document plan: a SEQUENCE root whose children are paragraphs.

    Each paragraph is a SEQUENCE node holding one nucleus message followed by
    the satellites selected for it. Returns the plan together with the
    original, unmodified ``scored_messages`` list.
    """
    log.debug("Creating body document plan")

    # The root is a plain sequence of paragraph nodes.
    document_plan = DocumentPlanNode(children=[], relation=Relation.SEQUENCE)

    # Work on a shallow copy so the caller's list is left untouched.
    remaining = scored_messages[:]
    selected_nuclei: List[Message] = []

    # NOTE(review): the only exit is the return below; termination therefore
    # relies on select_next_nucleus eventually producing a below-threshold
    # score -- TODO confirm its behaviour once `remaining` is empty.
    while True:
        nucleus: Message
        nucleus_score: float
        nucleus, nucleus_score = self.select_next_nucleus(remaining, selected_nuclei)

        too_weak = (
            nucleus_score < self.new_paragraph_absolute_threshold
            or nucleus_score < self.new_paragraph_relative_threshold(selected_nuclei)
        )
        # A weak nucleus ends the plan only if at least one paragraph exists;
        # with no nucleus selected so far, it is used regardless of its score.
        if too_weak and selected_nuclei:
            return (document_plan, scored_messages)

        selected_nuclei.append(nucleus)

        # A message may appear in the plan only once: drop the nucleus ...
        remaining = [m for m in remaining if m != nucleus]

        satellites: List[Message] = self.select_satellites_for_nucleus(nucleus, remaining)

        # ... and likewise every satellite that was just claimed.
        remaining = [m for m in remaining if m not in satellites]

        paragraph = DocumentPlanNode([nucleus] + satellites, Relation.SEQUENCE)
        document_plan.children.append(paragraph)
def run(self, registry: Registry, random: Generator, language: str,
        scored_messages) -> Tuple[DocumentPlanNode, List[Message]]:
    """
    Run this pipeline component.

    Produces a SEQUENCE root with a single SEQUENCE child that contains only
    the message picked by ``select_next_nucleus``. The second element of the
    returned tuple is the ``scored_messages`` object itself (not a copy).
    """
    log.debug("Creating headline document plan")

    # The root is a sequence, even though it will only ever get one child here.
    root = DocumentPlanNode(children=[], relation=Relation.SEQUENCE)

    # Only the chosen message is needed; the second tuple element is ignored.
    headline_message = self.select_next_nucleus(scored_messages, [])[0]

    headline_node = DocumentPlanNode(
        children=[headline_message], relation=Relation.SEQUENCE
    )
    root.children.append(headline_node)

    return root, scored_messages
def run(
    self,
    registry: Registry,
    random: Generator,
    language: str,
    document_plan: DocumentPlanNode,
) -> Tuple[DocumentPlanNode]:
    """
    Run this pipeline component.

    A trailing ``-head`` on ``language`` is stripped before the traversal
    starts. The traversal begins with no previous entity for any type and an
    empty set of encountered entities.
    """
    log.info("Running NER")

    head_suffix = "-head"
    if language.endswith(head_suffix):
        language = language[: -len(head_suffix)]
        log.debug(
            "Language had suffix '-head', removing. Result: {}".format(language)
        )

    # Unknown entity types map to None, i.e. "no previous entity of this type".
    no_previous_entities = defaultdict(lambda: None)
    nothing_encountered = set()
    self._recurse(
        registry, random, language, document_plan, no_previous_entities, nothing_encountered
    )

    if log.isEnabledFor(logging.DEBUG):
        document_plan.print_tree()

    return (document_plan,)
def run(
    self,
    registry: Registry,
    random: Generator,
    language: str,
    document_plan: DocumentPlanNode,
    all_messages: List[Message],
) -> Tuple[DocumentPlanNode]:
    """
    Run this pipeline component.

    Looks up the templates registered under ``"templates"`` for ``language``,
    builds a ``TemplateMessageChecker`` over them and ``all_messages``, and
    hands both to ``_recurse``. Returns the document plan as a 1-tuple.
    """
    debugging = log.isEnabledFor(logging.DEBUG)
    if debugging:
        document_plan.print_tree()

    language_templates = registry.get("templates")[language]
    checker = TemplateMessageChecker(language_templates, all_messages)

    template_count = len(language_templates)
    log.info("Selecting templates from {} templates".format(template_count))

    self._recurse(random, language, document_plan, all_messages, checker)

    return (document_plan,)
class TestDocumentPlanNode(TestCase):
    """Checks construction, string conversion and tree printing of DocumentPlanNode."""

    def setUp(self):
        # Every Fact argument after the first is the same placeholder string for both facts.
        shared_fields = (
            "corpus_type",
            "timestamp_from",
            "timestamp_to",
            "timestamp_type",
            "analysis_type",
            "result_key",
            "result_value",
            "outlierness",
        )

        self.fact1 = Fact("corpus1", *shared_fields)
        self.message1 = Message(self.fact1, 0.1, 0.2, 0.3)

        self.fact2 = Fact("corpus2", *shared_fields)
        self.message2 = Message(self.fact2, 0.1, 0.2, 0.3)

        node_children = [self.message1, self.message2]
        self.document_plan_node = DocumentPlanNode(node_children, Relation.ELABORATION)

    def test_document_plan_node_creation_sets_values(self):
        node = self.document_plan_node
        expected_children = [self.message1, self.message2]

        self.assertListEqual(node.children, expected_children)
        self.assertEqual(node.relation, Relation.ELABORATION)
        # The node's string form is expected to be the name of its relation.
        self.assertEqual(str(node), "ELABORATION")

    def test_document_plan_node_print_tree_does_not_crash(self):
        # Smoke test only: passes as long as no exception is raised.
        self.document_plan_node.print_tree()
def run(
    self,
    registry: Registry,
    random: Generator,
    language: str,
    document_plan: DocumentPlanNode,
) -> Tuple[DocumentPlanNode]:
    """
    Run this pipeline component.

    A trailing ``-head`` on ``language`` is stripped first. If no realizer is
    registered in ``self.language_realizers`` for the resulting language, a
    warning is logged and the plan is returned untouched; otherwise the plan
    is processed by ``_recurse``. The plan is always returned as a 1-tuple.
    """
    log.info("Running Morphological Realizer")

    head_suffix = "-head"
    if language.endswith(head_suffix):
        language = language[: -len(head_suffix)]
        log.debug(
            "Language had suffix '-head', removing. Result: {}".format(language)
        )

    output = (document_plan,)

    if language in self.language_realizers:
        self._recurse(language, document_plan)
        if log.isEnabledFor(logging.DEBUG):
            document_plan.print_tree()
    else:
        log.warning("No morphological realizer for language {}".format(language))

    return output
def _recurse(self, this: DocumentPlanNode, language: str) -> bool:
    """
    Realize the slots of every Message below ``this``, editing templates in place.

    Non-Message nodes delegate to all of their children. For a Message, each
    Slot child is passed to ``_realize_slot`` and replaced by the components
    that call returns.

    Returns True if at least one slot was replaced by something other than
    itself, anywhere in the visited subtree.
    """
    if not isinstance(this, Message):
        log.debug("Visiting '{}'".format(this))
        # Build the full list before calling any(): feeding any() a generator
        # would stop at the first child that reports a modification and leave
        # the remaining siblings unvisited in this pass.
        child_results = [self._recurse(child, language) for child in this.children]
        return any(child_results)

    log.debug("Visiting {}".format(this))
    any_modified = False

    # Iterate by index because the child list is edited during the loop:
    # one slot may be replaced by zero, one or several components.
    idx = 0
    while idx < len(this.children):
        child = this.children[idx]
        log.debug("Visiting child {}".format(child))

        if not isinstance(child, Slot):
            idx += 1
            continue

        modified_components = self._realize_slot(language, child)
        if modified_components != [child]:
            any_modified = True

        # Splice the replacement in and jump past it, so freshly inserted
        # components are not re-processed within the same pass.
        this.children[idx : idx + 1] = modified_components
        idx += len(modified_components)

    return any_modified
def _recurse(
    self,
    registry: Registry,
    random: Generator,
    language: str,
    this: DocumentPlanNode,
    previous_entity: Optional[str],
) -> Optional[str]:
    """
    Traverse the DocumentPlan tree recursively in-order and realize TIME slots.

    A slot whose value is a string tag of the form ``[TIME:<type>:...]`` with
    type ``month``, ``year`` or ``between_years`` is replaced, in place, by
    one new slot per whitespace-separated token of its realization. All other
    children are left untouched.

    :param previous_entity: tag of the most recently realized TIME slot, or
        None if there is none yet; forwarded to the ``_realize_*`` helpers.
    :return: the tag of the last TIME slot realized in this subtree, or the
        incoming ``previous_entity`` if none was realized.
    """
    # Iterate by index: a slot can be replaced by several slots mid-loop.
    idx = 0
    while idx < len(this.children):
        child = this.children[idx]

        if isinstance(child, Slot):
            value = child.value
            # startswith/endswith instead of value[0]/value[-1] so that an
            # empty string is treated as a non-tag rather than raising IndexError.
            if not (
                isinstance(value, str)
                and value.startswith("[")
                and value.endswith("]")
            ):
                log.debug("Visited non-tag leaf node {}".format(value))
                idx += 1
                continue

            segments = value[1:-1].split(":")
            if segments[0] != "TIME":
                log.debug("Visited non-TIME leaf node {}".format(value))
                idx += 1
                continue

            # A bare "[TIME]" tag has no type segment; handle it as unrealizable
            # below instead of failing on segments[1].
            timestamp_type = segments[1] if len(segments) > 1 else None

            if timestamp_type == "month":
                new_value = self._realize_month(child, previous_entity)
            elif timestamp_type == "year":
                new_value = self._realize_year(child, previous_entity)
            elif timestamp_type == "between_years":
                new_value = self._realize_between_years(child, previous_entity)
            else:
                log.error(
                    "Visited TIME leaf node {} but couldn't realize it!".format(value)
                )
                # Must actually advance: a bare `idx + 1` expression would
                # leave idx unchanged and loop on this child forever.
                idx += 1
                continue

            # Several alternative realizations may be offered; pick one.
            if isinstance(new_value, list):
                new_value = random.choice(new_value)

            original_value = value
            new_components = []
            for component_idx, realization_token in enumerate(new_value.split()):
                new_slot = child.copy(include_fact=True)

                # By default, copy copies the attributes too. In case attach_attributes was set,
                # we need to explicitly reset the attributes for all those slots NOT explicitly mentioned
                if (
                    self.attach_attributes
                    and timestamp_type in self.attach_attributes
                    and component_idx not in self.attach_attributes[timestamp_type]
                ):
                    new_slot.attributes = {}

                # An ugly hack that ensures the lambda correctly binds to the value of realization_token at this
                # time. Without this, all the lambdas bind to the final value of the realization_token variable, ie.
                # the final value at the end of the loop. See https://stackoverflow.com/a/10452819
                new_slot.value = lambda f, realization_token=realization_token: realization_token
                new_components.append(new_slot)

            # Replace the single tag slot with its token slots and skip past them.
            this.children[idx:idx + 1] = new_components
            idx += len(new_components)

            log.debug(
                "Visited TIME leaf node {} and realized it as {}".format(
                    original_value, new_value
                )
            )
            previous_entity = original_value

        elif isinstance(child, DocumentPlanNode):
            log.debug("Visiting non-leaf '{}'".format(child))
            previous_entity = self._recurse(
                registry, random, language, child, previous_entity
            )
            idx += 1

        else:
            # Neither DocumentPlan nor Slot, must be f.ex. Literal -> skip.
            idx += 1

    return previous_entity