Ejemplo n.º 1
0
    def get_order(self, sentence: AMSentence) -> Iterable[Decision]:
        """
        Linearise the tree of ``sentence`` into a sequence of decisions.

        Every node contributes one decision that selects it, followed by the
        decisions of its children (children whose label is "IGNORE" are left
        out), followed by one pop decision.
        The supertag, lexical label and term type of a node are attached to the
        decision selecting the first child visited below it or, when
        ``tree.children`` is empty, to the node's own pop decision.

        :param sentence: the sentence whose tree is traversed.
        :return: the decisions for the complete tree, in order.
        :raises ValueError: if ``self.children_order`` is neither "LR" nor "IO".
        """
        root = Tree.from_am_sentence(sentence)
        term_types = get_term_types(sentence)

        def _construct_seq(tree: Tree, is_first_child: bool,
                           parent_type: Tuple[str, str], parent_lex_label: str,
                           parent_term_type: AMType) -> List[Decision]:
            own_position = tree.node[0]
            entry = tree.node[1]

            # Split the relevant children by their side relative to this node.
            relevant = [
                kid for kid in tree.children if kid.node[1].label != "IGNORE"
            ]
            to_left = [kid for kid in relevant if kid.node[0] < own_position]
            to_right = [
                kid for kid in relevant if not kid.node[0] < own_position
            ]

            # Only the first child carries the information about its parent.
            if is_first_child:
                opening = Decision(own_position, False, entry.label,
                                   parent_type, parent_lex_label,
                                   parent_term_type)
            else:
                opening = Decision(own_position, False, entry.label, ("", ""),
                                   "")

            if self.children_order == "LR":
                ordered = to_left + to_right
            elif self.children_order == "IO":
                # inside-out: nearest left child first, then the right children
                ordered = to_left[::-1] + to_right
            else:
                raise ValueError("Unknown children order: " +
                                 self.children_order)

            decisions = [opening]
            for index, kid in enumerate(ordered):
                decisions.extend(
                    _construct_seq(kid, index == 0,
                                   (entry.fragment, entry.typ),
                                   entry.lexlabel,
                                   term_types[own_position - 1]))

            pop_position = 0 if self.pop_with_0 else own_position
            if len(tree.children) > 0:
                closing = Decision(pop_position, True, "", ("", ""), "")
            else:
                # No children at all, so no first child announced the type of
                # this node; announce it with the pop decision instead.
                closing = Decision(pop_position, True, "",
                                   (entry.fragment, entry.typ),
                                   entry.lexlabel,
                                   term_types[own_position - 1])
            decisions.append(closing)
            return decisions

        return _construct_seq(root, False, ("", ""), "", AMType.parse_str("_"))
Ejemplo n.º 2
0
def typ2i(additional_lexicon: AdditionalLexicon) -> Dict[AMType, int]:
    """
    Map every parseable entry of the "term_types" sublexicon to its number.

    Entries whose string makes ``AMType.parse_str`` raise
    ``NonAMTypeException`` are silently left out.

    :param additional_lexicon: lexicon providing ``sublexica["term_types"]``.
    :return: dictionary from parsed type to the number paired with it.
    """
    type_ids: Dict[AMType, int] = {}
    for type_string, type_id in additional_lexicon.sublexica["term_types"]:
        try:
            parsed = AMType.parse_str(type_string)
        except NonAMTypeException:
            # not an AM type, nothing to record
            continue
        type_ids[parsed] = type_id
    return type_ids
 def normalize_types(self) -> "AMSentence":
     """
     Build a sentence whose types are written in normalised string form.

     The type of every word is parsed with ``AMType.parse_str`` and turned
     back into a string; all other fields of the word and the attributes of
     the sentence are passed on as they are.

     :return: a new AMSentence.
     """
     normalized = []
     for word in self.words:
         normalized.append(
             Entry(word.token, word.replacement, word.lemma, word.pos_tag,
                   word.ner_tag, word.fragment, word.lexlabel,
                   str(AMType.parse_str(word.typ)), word.head, word.label,
                   word.aligned, word.range))
     return AMSentence(normalized, self.attributes)
Ejemplo n.º 4
0
def typ2supertag(lexicon: AdditionalLexicon) -> Dict[AMType, Set[int]]:
    """
    Group the numbers of the constants in the lexicon by their type.

    The type part of every entry in ``sublexica["constants"]`` is obtained
    with ``AMSentence.split_supertag`` and parsed. Entries whose type cannot
    be parsed are reported on stdout and left out.

    :param lexicon: lexicon providing ``sublexica["constants"]``.
    :return: for every parsed type, the set of numbers of constants having it.
    """
    ids_by_type: Dict[AMType, Set[int]] = {}

    for constant, constant_id in lexicon.sublexica["constants"]:
        _fragment, type_string = AMSentence.split_supertag(constant)
        try:
            parsed = AMType.parse_str(type_string)
        except NonAMTypeException:
            print("Skipping type", type_string)
            continue
        ids_by_type.setdefault(parsed, set()).add(constant_id)
    return ids_by_type
Ejemplo n.º 5
0
    def initial_state(self, sentence: AMSentence,
                      decoder_state: Any) -> ParsingState:
        """
        Create the state from which parsing of ``sentence`` starts.

        The stack holds only position 0 and nothing has been seen yet. Every
        word starts with placeholder values: head 0, edge label "IGNORE",
        lexical label "_", constant ("_", "_"), the lexical type parsed from
        "_", an empty set of term types and no apply set.

        :param sentence: the sentence to parse.
        :param decoder_state: handed to the state as is.
        :return: a new LTFState.
        """
        n_words = len(sentence)

        # Immutable placeholders can simply be repeated.
        heads = [0] * n_words
        labels = ["IGNORE"] * n_words
        lex_labels = ["_"] * n_words
        supertags = [("_", "_")] * n_words
        applysets_todo = [None] * n_words

        # These get a separate object per word instead of a shared reference.
        lexical_types = [AMType.parse_str("_") for _ in range(n_words)]
        term_types = [set() for _ in range(n_words)]

        # One (initially empty) child list per position, position 0 included.
        children = {position: [] for position in range(n_words + 1)}

        stack = [0]
        seen = set()

        return LTFState(decoder_state, 0, 0.0, sentence,
                        self.additional_lexicon, heads, children, labels,
                        supertags, lex_labels, stack, seen, lexical_types,
                        term_types, applysets_todo, n_words, False)
Ejemplo n.º 6
0
    def initial_state(self, sentence: AMSentence,
                      decoder_state: Any) -> ParsingState:
        """
        Create the state from which parsing of ``sentence`` starts.

        The stack holds only position 0, the substack is empty and nothing has
        been seen yet. Every word starts with placeholder values: head 0, edge
        label "IGNORE", lexical label "_", constant ("_", "_"), the lexical
        type parsed from "_", and neither term types nor a collected apply set.

        :param sentence: the sentence to parse.
        :param decoder_state: handed to the state as is.
        :return: a new LTLState.
        """
        n_words = len(sentence)

        # Immutable placeholders can simply be repeated.
        heads = [0] * n_words
        labels = ["IGNORE"] * n_words
        lex_labels = ["_"] * n_words
        constants = [("_", "_")] * n_words
        term_types = [None] * n_words
        applysets_collected = [None] * n_words

        # A separate type object per word instead of a shared reference.
        lexical_types = [AMType.parse_str("_") for _ in range(n_words)]

        # One (initially empty) child list per position, position 0 included.
        children = {position: [] for position in range(n_words + 1)}

        stack = [0]
        substack = []
        seen = set()

        return LTLState(decoder_state, 0, 0.0, sentence,
                        self.additional_lexicon, heads, children, labels,
                        constants, lex_labels, stack, seen, substack,
                        lexical_types, term_types, applysets_collected,
                        n_words, False, [0 for _ in sentence])
Ejemplo n.º 7
0
    def __init__(self, children_order: str, pop_with_0: bool,
                 additional_lexicon: AdditionalLexicon,
                 reverse_push_actions: bool = False,
                 enable_assert : bool = False):
        """
        Set up the transition system and precompute its lookup tensors.

        Select children_order : "LR" (left to right) or "IO" (inside-out, recommended by Ma et al.)
        reverse_push_actions means that the order of push actions is the opposite order in which the children of
        the node are recursively visited.

        Besides storing the arguments, the constructor
          * collects all sources occurring in the edge labels and adds an "APP_<source>" edge label
            to the lexicon for every source that has none yet,
          * indexes all lexical types occurring in the constants of the lexicon,
          * builds boolean/integer tables relating constants, lexical types, edge labels, term types
            and sources, and stores them as torch tensors on ``self``.

        :param children_order: passed on to the superclass.
        :param pop_with_0: passed on to the superclass.
        :param additional_lexicon: lexicon with the sublexica "edge_labels" and "constants";
            its "edge_labels" sublexicon is extended in place.
        :param reverse_push_actions: passed on to the superclass.
        :param enable_assert: stored as ``self.enable_assert``.
        """
        super().__init__(children_order, pop_with_0, additional_lexicon, reverse_push_actions)
        self.enable_assert = enable_assert

        # Sources are the second "_"-separated component of the edge labels (e.g. "APP_s" -> "s").
        self.i2source = sorted({label.split("_")[1] for label, _ in self.additional_lexicon.sublexica["edge_labels"] if "_" in label})
        self.source2i = {s: i for i, s in enumerate(self.i2source)}
        len_sources = len(self.i2source)

        # Make sure there is an APP_ edge label for every source.
        self.additional_apps = ["APP_" + source for source in self.i2source if not self.additional_lexicon.contains("edge_labels", "APP_" + source)]
        self.additional_lexicon.sublexica["edge_labels"].add(self.additional_apps)
        len_labels = self.additional_lexicon.vocab_size("edge_labels")

        all_lex_types = {AMSentence.split_supertag(lextyp)[1] for lextyp, _ in self.additional_lexicon.sublexica["constants"] if "--TYPE--" in lextyp}
        self.i2lextyp = sorted(all_lex_types)
        self.lextyp2i : Dict[AMType, int] = { AMType.parse_str(l) : i for i, l in enumerate(self.i2lextyp)}
        len_lex_typ = len(self.i2lextyp)

        # np.bool / np.long were deprecated aliases that newer NumPy releases removed;
        # np.bool_ and np.int64 are available in all versions and map to torch.bool / torch.long.
        lexical2constant = np.zeros((len_lex_typ, self.additional_lexicon.vocab_size("constants")), dtype=np.bool_) #shape (lexical type, constant)
        constant2lexical = np.zeros(self.additional_lexicon.vocab_size("constants"), dtype=np.int64)

        get_term_types = np.zeros((len_lex_typ, len_labels, len_lex_typ), dtype=np.bool_) #shape (parent lexical type, incoming label, term type)
        applyset_term_types = np.zeros((len_lex_typ, len_lex_typ, len_sources), dtype=np.bool_) # shape (TERM TYPE, lexical type, source)
        apply_reachable_term_types = np.zeros((len_lex_typ, len_lex_typ), dtype=np.bool_) # shape (TERM type, lexical type)

        # NOTE(review): self.mod_cache is used below but not created here; presumably the superclass
        # constructor provides it - confirm.

        for constant,constant_id in self.additional_lexicon.sublexica["constants"]:
            lex_type = AMType.parse_str(AMSentence.split_supertag(constant)[1])
            lexical2constant[self.lextyp2i[lex_type], constant_id] = 1
            constant2lexical[constant_id] = self.lextyp2i[lex_type]

        # For every term type t2: which lexical types t1 can reach it by applications, and with which apply set?
        apply_reachable_from : Dict[AMType, Set[Tuple[AMType, frozenset]]] = dict()
        for t1 in self.lextyp2i:
            if t1.is_bot:
                continue
            for t2 in self.lextyp2i:
                if t2.is_bot:
                    continue
                applyset = t1.get_apply_set(t2)
                if applyset is not None:
                    if t2 not in apply_reachable_from:
                        apply_reachable_from[t2] = set()
                    apply_reachable_from[t2].add((t1, frozenset(applyset)))

        # Every non-bottom type is marked as reachable from itself. This does not depend on the
        # parent type, so it is done once here instead of once per parent.
        for lex_type, lex_type_id in self.lextyp2i.items():
            if lex_type.is_bot:
                continue
            apply_reachable_term_types[lex_type_id, lex_type_id] = True

        root_id = self.additional_lexicon.get_id("edge_labels", "ROOT")
        for parent_lex_typ, parent_id in self.lextyp2i.items():
            # ROOT
            # root requires empty term type, thus all sources must be removed
            get_term_types[parent_id, root_id, parent_id] = True

            # MOD
            for source, t in self.mod_cache.get_modifiers(parent_lex_typ):
                if self.additional_lexicon.contains("edge_labels", "MOD_"+source):
                    label_id = self.additional_lexicon.get_id("edge_labels", "MOD_"+source)

                    get_term_types[parent_id, label_id, self.lextyp2i[t]] = True

                    for possible_lexical_type, applyset in apply_reachable_from[t]:
                        current_typ_id = self.lextyp2i[possible_lexical_type]

                        apply_reachable_term_types[self.lextyp2i[t], current_typ_id] = True
                        for apply_source in applyset:
                            source_id = self.source2i[apply_source]
                            applyset_term_types[self.lextyp2i[t], current_typ_id, source_id] = 1


            # APP
            for source in parent_lex_typ.nodes():
                req = parent_lex_typ.get_request(source)
                label_id = self.additional_lexicon.get_id("edge_labels", "APP_"+source)

                get_term_types[parent_id, label_id, self.lextyp2i[req]] = True

                for possible_lexical_type, applyset in apply_reachable_from[req]:
                    current_typ_id = self.lextyp2i[possible_lexical_type]

                    apply_reachable_term_types[self.lextyp2i[req], current_typ_id] = True
                    for apply_source in applyset:
                        source_id = self.source2i[apply_source]
                        applyset_term_types[self.lextyp2i[req], current_typ_id, source_id] = 1


        self.lexical2constant = torch.from_numpy(lexical2constant)
        self.constant2lexical = torch.from_numpy(constant2lexical)

        self.app_source2label_id = torch.zeros((len_sources, len_labels), dtype=torch.bool) # maps a source id to the respective (APP) label id
        self.mod_tensor = torch.zeros(len_labels, dtype=torch.bool) #which label ids are MOD_ edge labels?
        self.label_id2appsource = torch.zeros(len_labels, dtype=torch.long)-1 # -1 for labels that are not APP_ labels
        self.applyset_term_types = torch.from_numpy(applyset_term_types) # shape (TERM TYPE, lexical type, source); is the given source in the apply set from the lexical type to the term type?
        self.get_term_types = torch.from_numpy(get_term_types) #shape (parent lexical type, incoming label, term type)
        self.apply_reachable_term_types = torch.from_numpy(apply_reachable_term_types) # shape (TERM type, lexical type)

        for label, label_id in self.additional_lexicon.sublexica["edge_labels"]:
            if label.startswith("MOD_"):
                self.mod_tensor[label_id] = True

        for source, source_id in self.source2i.items():
            label_id = self.additional_lexicon.get_id("edge_labels", "APP_"+source)
            self.label_id2appsource[label_id] = source_id
            self.app_source2label_id[source_id, label_id] = True
Ejemplo n.º 8
0
    def step(self,
             state: LTLState,
             decision: Decision,
             in_place: bool = False) -> ParsingState:
        """
        Apply one decision to a parsing state and return the resulting state.

        If the stack is empty, only the active node (set to 0), the step
        counter and the score change. Otherwise the decision is either a pop
        or a push:

        * Pop: the top of the stack is removed. Unless it is position 0, the
          supertag, lexical type and lexical label from the decision are stored
          for it, and the term types of its children are derived from their
          edge labels. Afterwards the substack is moved onto the stack.
        * Push: ``decision.position`` becomes a child of the top of the stack
          with edge label ``decision.label`` and is appended to the substack.

        :param state: the state to advance.
        :param decision: the decision to apply.
        :param in_place: if True ``state`` itself is modified, otherwise the
            result of ``state.copy()`` is modified.
        :return: the modified state.
        :raises ValueError: if, on a pop, a child's edge label starts with
            neither "APP_" nor "MOD_".
        """
        if in_place:
            copy = state
        else:
            copy = state.copy()
        copy.step += 1

        if state.stack:
            position = decision.position

            # Pop if the position was selected before (or position 0 is used as
            # the pop signal, depending on pop_with_0).
            if (position in copy.seen and not self.pop_with_0) or \
                    (position == 0 and self.pop_with_0) or copy.step == 2: #second step is always pop
                tos = copy.stack.pop()

                if not self.pop_with_0:
                    assert tos == position

                if tos != 0:
                    # Positions are 1-based, the per-word lists are 0-based.
                    copy.constants[tos - 1] = decision.supertag
                    lexical_type_tos = self.read_cache.parse_str(
                        decision.supertag[1])
                    copy.lexical_types[tos - 1] = lexical_type_tos
                    copy.lex_labels[tos - 1] = decision.lexlabel

                    #now determine term types of children
                    for child_id in state.children[tos]:  # 1-based children
                        child_id -= 1  # 0-based children
                        label = copy.edge_labels[child_id]

                        copy.applysets_collected[child_id] = set()

                        if label.startswith("APP_"):
                            # get request at source
                            source = label.split("_")[1]
                            req = lexical_type_tos.get_request(source)
                            copy.term_types[child_id] = {req}

                        elif label.startswith("MOD_"):
                            # every modifier type for this source is possible
                            source = label.split("_")[1]
                            copy.term_types[child_id] = set(
                                self.mod_cache.get_modifiers_with_source(
                                    lexical_type_tos, source))
                        else:
                            raise ValueError(
                                "Somehow the invalid edge label " + label +
                                " was produced")

                # The children collected on the substack are visited next.
                if self.reverse_push_actions:
                    copy.stack.extend(copy.substack)
                else:
                    copy.stack.extend(reversed(copy.substack))
                copy.substack = []
            else:
                # Push: attach position to the current top of the stack.
                tos = copy.stack[-1]
                copy.heads[position - 1] = tos

                assert position <= len(copy.sentence)
                copy.children[tos].append(position)  # 1-based

                copy.edge_labels[position - 1] = decision.label

                if decision.label.startswith("APP_"):
                    source = decision.label.split("_")[1]
                    copy.applysets_collected[copy.active_node - 1].add(source)
                    # Smallest number of sources that still have to be filled
                    # over all candidate lexical types.
                    smallest_apply_set = np.inf
                    # NOTE(review): the length below is read from `state`, not
                    # `copy`; whether it already counts the source added above
                    # depends on in_place and on what state.copy() copies -
                    # confirm this is intended.
                    for term_typ in copy.term_types[tos - 1]:
                        for lexical_type, apply_set in self.candidate_lex_types.get_candidates_with_apply_set(
                                term_typ, copy.applysets_collected[tos - 1],
                                copy.words_left +
                                len(state.applysets_collected[tos - 1])):

                            rest_of_apply_set = apply_set - copy.applysets_collected[
                                tos - 1]
                            smallest_apply_set = min(smallest_apply_set,
                                                     len(rest_of_apply_set))

                    # At least one candidate must have been found.
                    assert smallest_apply_set < np.inf
                    copy.sources_still_to_fill[tos - 1] = smallest_apply_set

                elif decision.label == "ROOT" and not copy.root_determined:
                    # The first ROOT edge fixes the term type of the root to "()".
                    copy.term_types[position - 1] = {AMType.parse_str("()")}
                    copy.applysets_collected[position - 1] = set()
                    copy.root_determined = True

                copy.words_left -= 1

                # push onto stack
                copy.substack.append(position)

            copy.seen.add(position)
            # The active node is the new top of the stack (0 if it is empty).
            if copy.stack:
                copy.active_node = copy.stack[-1]
            else:
                copy.active_node = 0
        else:
            copy.active_node = 0

        copy.score = copy.score + decision.score
        return copy
Ejemplo n.º 9
0
    # NOTE(review): this indented part belongs to an enclosing block that is not
    # visible here; `changed` presumably tells it whether another pass is needed.
    changed = False
    # Collect the request type at every node of every known type.
    all_requests = set()
    for typ in all_types:
        for node in typ.nodes():
            all_requests.add(typ.get_request(node))

    # Requested types without any constant get an invented one.
    for unknown_type in all_requests - all_types:
        print("Invented", invent_supertag(unknown_type))
        supertags[unknown_type] = {invent_supertag(unknown_type)}
        invented += 1
        changed = True
    all_types.update(all_requests)

# Mod sources: make sure the simple type "(source)" exists for every mod source.
print("Identified the following mod sources", mod_sources)
simple_mod_types = {AMType.parse_str(f"({source})") for source in mod_sources}
for unknown_type in simple_mod_types - all_types:
    print("Invented", invent_supertag(unknown_type))
    supertags[unknown_type] = {invent_supertag(unknown_type)}
    invented += 1

all_types.update(simple_mod_types)

print("Invented", invented, "constants in total.")

# Write the constants to <output>/constants.txt, one per line, ordered by the
# string form of their type and then by the constant itself.
with open(os.path.join(args.output, "constants.txt"),"w") as f:
    for lex_type in sorted(supertags.keys(), key=lambda typ: str(typ)):
        for constant in sorted(supertags[lex_type]):
            f.write(constant)
            f.write("\n")
Ejemplo n.º 10
0
    def step(self,
             state: LTFState,
             decision: Decision,
             in_place: bool = False) -> ParsingState:
        """
        Apply one decision to a parsing state and return the resulting state.

        If the stack is empty only the active node (set to 0), the root flag
        and the score change. Otherwise:

        * If the active node still has the placeholder constant ("_", "_"),
          the supertag, lexical label, lexical type, term type and apply set
          from the decision are recorded for it first.
        * The decision then either pops the stack, or attaches
          ``decision.position`` to the top of the stack with edge label
          ``decision.label``, determines the possible term types of the new
          node and pushes it.

        :param state: the state to advance.
        :param decision: the decision to apply.
        :param in_place: if True ``state`` itself is modified, otherwise the
            result of ``state.copy()`` is modified.
        :return: the modified state.
        :raises ValueError: if a node is attached with an edge label that is
            not an APP_ label, a MOD_ label or the first "ROOT".
        """
        copy = state if in_place else state.copy()

        if not state.stack:
            copy.active_node = 0
        else:
            active = state.active_node
            if copy.constants[active - 1] == ("_", "_") and active != 0:
                # first time that the active node has become active.
                slot = active - 1  # positions are 1-based, lists 0-based
                copy.constants[slot] = decision.supertag
                copy.lex_labels[slot] = decision.lexlabel

                # Determine apply set which we have to fulfill.
                lex_type = self.read_cache.parse_str(decision.supertag[1])
                copy.lexical_types[slot] = lex_type
                term_type = decision.termtyp
                applyset = lex_type.get_apply_set(term_type)
                assert term_type in copy.term_types[slot]

                copy.term_types[slot] = {term_type}
                copy.applysets_todo[slot] = applyset

            position = decision.position
            if position == 0 and self.pop_with_0:
                copy.stack.pop()
            elif not self.pop_with_0 and position in copy.seen:
                popped = copy.stack.pop()
                assert popped == position
            else:
                head = copy.stack[-1]
                copy.heads[position - 1] = head
                copy.words_left -= 1  # one word gets attached.

                copy.children[head].append(position)  # 1-based

                label = decision.label
                copy.edge_labels[position - 1] = label

                head_lexical_type = copy.lexical_types[head - 1]
                if label.startswith("APP_"):
                    source = label.split("_")[1]
                    # remove obligation to fill source.
                    copy.applysets_todo[head - 1].remove(source)

                    copy.term_types[position - 1] = {
                        head_lexical_type.get_request(source)
                    }
                elif label.startswith("MOD_"):
                    source = label.split("_")[1]
                    #TODO speed improvement?
                    modifiers = self.mod_cache.get_modifiers_with_source(
                        head_lexical_type, source)
                    copy.term_types[position - 1] = set(modifiers)
                elif label == "ROOT" and not copy.root_determined:
                    copy.term_types[position - 1] = {AMType.parse_str("()")}
                else:
                    raise ValueError("Edge label " + decision.label +
                                     " not allowed here.")

                # push onto stack
                copy.stack.append(position)

            copy.seen.add(position)
            copy.active_node = copy.stack[-1] if copy.stack else 0

        copy.root_determined = True
        copy.score = copy.score + decision.score
        return copy