Esempio n. 1
0
    def setUp(self):
        """Create two facts, one message per fact, and a node holding both messages."""
        # The two facts differ only in their first positional argument; the
        # remaining arguments are the same placeholder strings for both.
        shared_fact_args = (
            "corpus_type",
            "timestamp_from",
            "timestamp_to",
            "timestamp_type",
            "analysis_type",
            "result_key",
            "result_value",
            "outlierness",
        )
        # Both messages receive the same trailing numeric arguments.
        shared_message_args = (0.1, 0.2, 0.3)

        self.fact1 = Fact("corpus1", *shared_fact_args)
        self.message1 = Message(self.fact1, *shared_message_args)

        self.fact2 = Fact("corpus2", *shared_fact_args)
        self.message2 = Message(self.fact2, *shared_message_args)

        node_children = [self.message1, self.message2]
        self.document_plan_node = DocumentPlanNode(node_children,
                                                   Relation.ELABORATION)
    def _recurse(
        self,
        registry: Registry,
        random: Generator,
        language: str,
        this: DocumentPlanNode,
        previous_entities: DefaultDict[str, None],
        encountered: Set[str],
    ) -> Tuple[Set[str], DefaultDict[str, None]]:
        """
        Walk the document plan depth-first and tag each named-entity slot.

        For a slot whose value ``self.is_entity`` accepts, the slot's
        ``name_type`` attribute is set to:

        * ``"pronoun"`` when the entity equals the one stored in
          ``previous_entities`` for the same entity type,
        * ``"short"`` when the entity is already in ``encountered``,
        * ``"full"`` otherwise (the entity is then added to ``encountered``).

        ``self.resolve_surface_form`` is then called for the slot, its
        ``entity_type`` attribute is set, and ``previous_entities`` is updated.

        :param this: node currently being visited (a slot, a plan node, or
            anything else, which is passed over untouched)
        :param previous_entities: most recent entity seen per entity type
        :param encountered: entities seen so far
        :return: the (possibly updated) ``encountered`` and ``previous_entities``
        """
        if not isinstance(this, Slot):
            if isinstance(this, DocumentPlanNode):
                log.debug("Visiting non-leaf '{}'".format(this))
                # Thread the bookkeeping state through the children in order.
                for node in this.children:
                    encountered, previous_entities = self._recurse(
                        registry, random, language, node, previous_entities,
                        encountered)
            # Neither a slot nor a plan node: nothing to do for this node.
            return encountered, previous_entities

        if not self.is_entity(this.value):
            log.debug("Visited leaf non-NE leaf node {}".format(this.value))
            return encountered, previous_entities

        log.debug("Visiting NE leaf {}".format(this.value))
        entity_type, entity = self.parse_entity(this.value)

        # Decide which form of the name to use, together with the debug note
        # describing why that form was chosen.
        if previous_entities[entity_type] == entity:
            note, name_type = "Same as previous entity", "pronoun"
        elif entity in encountered:
            note, name_type = (
                "Different entity than previous, but has been previously encountered",
                "short",
            )
        else:
            note, name_type = "First time encountering this entity", "full"

        log.debug(note)
        this.attributes["name_type"] = name_type

        if name_type == "full":
            encountered.add(entity)
            log.debug("Added entity to encountered, all encountered: {}".format(
                encountered))

        self.resolve_surface_form(registry, random, language, this, entity,
                                  entity_type)
        log.debug("Resolved entity name")

        this.attributes["entity_type"] = entity_type
        previous_entities[entity_type] = entity

        return encountered, previous_entities
Esempio n. 3
0
    def run(self, registry: Registry, random: Generator, language: str,
            document_plan: DocumentPlanNode) -> Tuple[DocumentPlanNode]:
        """
        Aggregate the document plan in place and return it as a 1-tuple.

        When DEBUG logging is enabled the tree is printed both before and
        after aggregation.
        """

        def print_tree_if_debugging() -> None:
            # The level is re-checked on every call, as the tree is printed
            # once before and once after the aggregation step.
            if log.isEnabledFor(logging.DEBUG):
                document_plan.print_tree()

        print_tree_if_debugging()

        log.debug("Aggregating")
        self._aggregate(registry, language, document_plan)

        print_tree_if_debugging()

        return (document_plan,)
    def run(self, registry: Registry, random: Generator, language: str,
            document_plan: DocumentPlanNode) -> Tuple[DocumentPlanNode]:
        """
        Run this pipeline component.

        Delegates to ``self._recurse`` starting at the root of the plan with
        no previous entity, optionally prints the resulting tree when DEBUG
        logging is enabled, and returns the plan wrapped in a 1-tuple.
        """
        log.info("Realizing dates")

        # Nothing has been visited yet when the traversal starts at the root.
        initial_previous_entity = None
        self._recurse(registry, random, language, document_plan,
                      initial_previous_entity)

        debug_enabled = log.isEnabledFor(logging.DEBUG)
        if debug_enabled:
            document_plan.print_tree()

        result = (document_plan,)
        return result
Esempio n. 5
0
    def run(
        self, registry: Registry, random: Generator, language: str,
        scored_messages: List[Message]
    ) -> Tuple[DocumentPlanNode, List[Message]]:
        """
        Build the body document plan from the scored messages.

        Repeatedly asks ``self.select_next_nucleus`` for a nucleus and its
        score, then ``self.select_satellites_for_nucleus`` for satellites, and
        appends one SEQUENCE node per nucleus (nucleus first, satellites after)
        under a SEQUENCE root. A message is removed from the candidate pool as
        soon as it is used.

        The loop only ends once at least one nucleus has been selected and the
        next nucleus scores below either the absolute or the relative
        new-paragraph threshold.

        :return: the document plan root and the unmodified ``scored_messages``
        """
        log.debug("Creating body document plan")

        # The root holds one child node per nucleus.
        root = DocumentPlanNode(children=[], relation=Relation.SEQUENCE)

        # Work on a copy so the caller's list is left untouched.
        candidates = list(scored_messages)
        chosen_nuclei: List[Message] = []

        while True:
            nucleus, nucleus_score = self.select_next_nucleus(
                candidates, chosen_nuclei)

            # The relative threshold is only consulted when the absolute one
            # did not already reject the nucleus.
            below_threshold = (
                nucleus_score < self.new_paragraph_absolute_threshold
                or nucleus_score <
                self.new_paragraph_relative_threshold(chosen_nuclei))
            if below_threshold and chosen_nuclei:
                return (root, scored_messages)

            chosen_nuclei.append(nucleus)

            # A message may appear in the plan only once: drop the nucleus ...
            candidates = [
                message for message in candidates if message != nucleus
            ]

            satellites: List[Message] = self.select_satellites_for_nucleus(
                nucleus, candidates)

            # ... and then drop every satellite that was just picked.
            candidates = [
                message for message in candidates
                if message not in satellites
            ]

            paragraph_messages = [nucleus] + satellites
            paragraph = DocumentPlanNode(paragraph_messages, Relation.SEQUENCE)
            root.children.append(paragraph)
Esempio n. 6
0
    def run(self, registry: Registry, random: Generator, language: str,
            scored_messages) -> Tuple[DocumentPlanNode, List[Message]]:
        """
        Run this pipeline component.

        Picks a single headline message through ``self.select_next_nucleus``
        (called with an empty list of already-selected nuclei) and wraps it
        in a SEQUENCE node placed under a SEQUENCE root.

        :return: the document plan root and ``scored_messages`` as received
        """
        log.debug("Creating headline document plan")

        # Only the selected message is needed; the accompanying score is ignored.
        headline_message, _ = self.select_next_nucleus(scored_messages, [])
        headline_node = DocumentPlanNode(children=[headline_message],
                                         relation=Relation.SEQUENCE)

        # The root contains a sequence of children; here exactly one.
        root = DocumentPlanNode(children=[], relation=Relation.SEQUENCE)
        root.children.append(headline_node)

        return root, scored_messages
    def run(self, registry: Registry, random: Generator, language: str,
            document_plan: DocumentPlanNode) -> Tuple[DocumentPlanNode]:
        """
        Run this pipeline component.

        Strips a trailing ``-head`` from the language code, then walks the
        plan with ``self._recurse`` starting from empty bookkeeping state: a
        mapping that yields ``None`` for unseen keys, and an empty set.

        :return: the document plan wrapped in a 1-tuple
        """
        log.info("Running NER")

        head_suffix = "-head"
        if language.endswith(head_suffix):
            language = language[:-len(head_suffix)]
            log.debug(
                "Language had suffix '-head', removing. Result: {}".format(
                    language))

        no_previous_entities = defaultdict(lambda: None)
        nothing_encountered = set()
        self._recurse(registry, random, language, document_plan,
                      no_previous_entities, nothing_encountered)

        if log.isEnabledFor(logging.DEBUG):
            document_plan.print_tree()

        return (document_plan,)
    def run(
        self,
        registry: Registry,
        random: Generator,
        language: str,
        document_plan: DocumentPlanNode,
        all_messages: List[Message],
    ) -> Tuple[DocumentPlanNode]:
        """
        Run this pipeline component.

        Looks up the templates registered for ``language``, builds a
        ``TemplateMessageChecker`` from them and ``all_messages``, and hands
        both to ``self._recurse`` together with the document plan.

        :return: the document plan wrapped in a 1-tuple
        """
        debug_enabled = log.isEnabledFor(logging.DEBUG)
        if debug_enabled:
            document_plan.print_tree()

        # The registry entry "templates" is indexed by language.
        templates_by_language = registry.get("templates")
        language_templates = templates_by_language[language]

        checker = TemplateMessageChecker(language_templates, all_messages)
        template_count = len(language_templates)
        log.info("Selecting templates from {} templates".format(template_count))
        self._recurse(random, language, document_plan, all_messages, checker)

        return (document_plan,)
Esempio n. 9
0
class TestDocumentPlanNode(TestCase):
    """Checks construction, string form and tree printing of DocumentPlanNode."""

    def setUp(self):
        """Create two facts, a message for each, and a node containing both messages."""
        # Every Fact argument except the first is identical for both facts.
        trailing_fact_args = (
            "corpus_type",
            "timestamp_from",
            "timestamp_to",
            "timestamp_type",
            "analysis_type",
            "result_key",
            "result_value",
            "outlierness",
        )
        trailing_message_args = (0.1, 0.2, 0.3)

        self.fact1 = Fact("corpus1", *trailing_fact_args)
        self.message1 = Message(self.fact1, *trailing_message_args)

        self.fact2 = Fact("corpus2", *trailing_fact_args)
        self.message2 = Message(self.fact2, *trailing_message_args)

        self.document_plan_node = DocumentPlanNode(
            [self.message1, self.message2], Relation.ELABORATION)

    def test_document_plan_node_creation_sets_values(self):
        """The node exposes the children and relation it was built with."""
        node = self.document_plan_node
        expected_children = [self.message1, self.message2]

        self.assertListEqual(node.children, expected_children)
        self.assertEqual(node.relation, Relation.ELABORATION)
        self.assertEqual(str(node), "ELABORATION")

    def test_document_plan_node_print_tree_does_not_crash(self):
        """Printing the tree must complete without raising."""
        node = self.document_plan_node
        node.print_tree()
    def run(self, registry: Registry, random: Generator, language: str,
            document_plan: DocumentPlanNode) -> Tuple[DocumentPlanNode]:
        """
        Run this pipeline component.

        Strips a trailing ``-head`` from the language code. If
        ``self.language_realizers`` has an entry for the resulting language,
        the plan is processed through ``self._recurse``; otherwise a warning
        is logged and the plan is returned untouched.

        :return: the document plan wrapped in a 1-tuple
        """
        log.info("Running Morphological Realizer")

        head_suffix = "-head"
        if language.endswith(head_suffix):
            language = language[:-len(head_suffix)]
            log.debug(
                "Language had suffix '-head', removing. Result: {}".format(
                    language))

        if language in self.language_realizers:
            self._recurse(language, document_plan)

            if log.isEnabledFor(logging.DEBUG):
                document_plan.print_tree()
        else:
            log.warning(
                "No morphological realizer for language {}".format(language))

        return (document_plan,)
Esempio n. 11
0
 def _recurse(self, this: DocumentPlanNode, language: str) -> bool:
     """
     Realize the slots of every message found under ``this``.

     For a node that is not a ``Message``, all children are visited
     recursively. For a ``Message``, each child that is a ``Slot`` is passed
     to ``self._realize_slot`` and replaced in place by the components that
     call returns.

     :param this: node to process
     :param language: language code forwarded to ``self._realize_slot``
     :return: True if, anywhere under ``this``, ``_realize_slot`` returned
         something other than a one-element list holding the original slot
     """
     if not isinstance(this, Message):
         log.debug("Visiting '{}'".format(this))
         # Collect the results in a list before calling any(): any() stops
         # consuming a generator at the first True, which would leave every
         # later sibling unvisited (and its slots unrealized) in this pass.
         child_results = [self._recurse(child, language) for child in this.children]
         return any(child_results)

     log.debug("Visiting {}".format(this))
     any_modified = False
     # Iterate by index because the list of children may be edited, grown or
     # shrunk while it is being traversed.
     idx = 0
     while idx < len(this.children):
         child = this.children[idx]
         log.debug("Visiting child {}".format(child))
         if not isinstance(child, Slot):
             idx += 1
             continue
         modified_components = self._realize_slot(language, child)
         if modified_components != [child]:
             any_modified = True
         # Replace the slot with its realization and skip past everything
         # that was just inserted.
         this.children[idx : idx + 1] = modified_components
         idx += len(modified_components)
     return any_modified
    def _recurse(
        self,
        registry: Registry,
        random: Generator,
        language: str,
        this: DocumentPlanNode,
        previous_entity: Optional[str],
    ) -> Optional[str]:
        """
        Realize every ``[TIME:<type>...]`` slot found under ``this``.

        Children of ``this`` are visited in order. A ``Slot`` whose value is a
        string of the form ``[TIME:<type>...]`` is handed to the matching
        ``_realize_month`` / ``_realize_year`` / ``_realize_between_years``
        helper. If the helper returns a list, one entry is picked with
        ``random.choice``. The realization is split on whitespace and the
        slot is replaced in place by one copied slot per token. Nested
        ``DocumentPlanNode`` children are processed recursively; anything
        else is skipped.

        :param registry: forwarded unchanged to recursive calls
        :param random: source of randomness used to pick among alternatives
        :param language: forwarded unchanged to recursive calls
        :param this: node whose children are processed
        :param previous_entity: value of the most recently realized TIME slot,
            or None if there has been none yet
        :return: the value of the last TIME slot realized under ``this``, or
            the incoming ``previous_entity`` if none was realized
        """
        idx = 0
        while idx < len(this.children):
            child = this.children[idx]

            if not isinstance(child, Slot):
                if isinstance(child, DocumentPlanNode):
                    log.debug("Visiting non-leaf '{}'".format(child))
                    previous_entity = self._recurse(registry, random, language,
                                                    child, previous_entity)
                # Neither DocumentPlanNode nor Slot (e.g. a Literal) -> skip.
                idx += 1
                continue

            value = child.value
            # startswith/endswith are used instead of indexing so that an
            # empty string is treated as a non-tag value rather than raising
            # IndexError.
            is_tag = (isinstance(value, str) and value.startswith("[")
                      and value.endswith("]"))
            if not is_tag:
                log.debug("Visited non-tag leaf node {}".format(value))
                idx += 1
                continue

            segments = value[1:-1].split(":")
            if segments[0] != "TIME":
                log.debug("Visited non-TIME leaf node {}".format(value))
                idx += 1
                continue

            # A bare "[TIME]" tag carries no type segment; None sends it to
            # the "couldn't realize" branch below instead of raising.
            timestamp_type = segments[1] if len(segments) > 1 else None
            if timestamp_type == "month":
                new_value = self._realize_month(child, previous_entity)
            elif timestamp_type == "year":
                new_value = self._realize_year(child, previous_entity)
            elif timestamp_type == "between_years":
                new_value = self._realize_between_years(
                    child, previous_entity)
            else:
                log.error(
                    "Visited TIME leaf node {} but couldn't realize it!".
                    format(value))
                # Must advance the index here: the original `idx + 1` was a
                # no-op expression, so this branch looped forever.
                idx += 1
                continue

            if isinstance(new_value, list):
                new_value = random.choice(new_value)

            original_value = child.value
            new_components = []
            for component_idx, realization_token in enumerate(
                    new_value.split()):
                new_slot = child.copy(include_fact=True)

                # copy() carries the attributes over as well. When
                # attach_attributes lists token positions for this timestamp
                # type, attributes are cleared on every token NOT listed.
                if (self.attach_attributes
                        and timestamp_type in self.attach_attributes
                        and component_idx
                        not in self.attach_attributes[timestamp_type]):
                    new_slot.attributes = {}

                # Bind the current token through a default argument; a plain
                # closure would see only the final value of the loop variable.
                # See https://stackoverflow.com/a/10452819
                new_slot.value = lambda f, realization_token=realization_token: realization_token
                new_components.append(new_slot)

            # Swap the single tag slot for its token slots and step over them.
            this.children[idx:idx + 1] = new_components
            idx += len(new_components)
            log.debug(
                "Visited TIME leaf node {} and realized it as {}".format(
                    original_value, new_value))
            previous_entity = original_value
        return previous_entity