def get_order(self, sentence: AMSentence) -> Iterable[Decision]:
    t = Tree.from_am_sentence(sentence)
    term_types = get_term_types(sentence)

    def _construct_seq(tree: Tree, is_first_child: bool, parent_type: Tuple[str, str],
                       parent_lex_label: str, parent_term_type: AMType) -> List[Decision]:
        own_position = tree.node[0]
        to_left = []
        to_right = []
        for child in tree.children:
            if child.node[1].label == "IGNORE":
                continue
            if child.node[0] < own_position:
                to_left.append(child)
            else:
                to_right.append(child)

        if is_first_child:
            beginning = [Decision(own_position, False, tree.node[1].label, parent_type,
                                  parent_lex_label, parent_term_type)]
        else:
            beginning = [Decision(own_position, False, tree.node[1].label, ("", ""), "")]

        if self.children_order == "LR":
            children = to_left + to_right
        elif self.children_order == "IO":
            children = list(reversed(to_left)) + to_right
        else:
            raise ValueError("Unknown children order: " + self.children_order)

        ret = beginning
        for i, child in enumerate(children):
            ret.extend(_construct_seq(child, i == 0,
                                      (tree.node[1].fragment, tree.node[1].typ),
                                      tree.node[1].lexlabel,
                                      term_types[own_position - 1]))

        last_position = 0 if self.pop_with_0 else own_position
        if len(tree.children) == 0:
            # This subtree has no children, thus also no first child at which we would
            # determine the type of the parent. Let's determine the type now.
            last_decision = Decision(last_position, True, "",
                                     (tree.node[1].fragment, tree.node[1].typ),
                                     tree.node[1].lexlabel,
                                     term_types[own_position - 1])
        else:
            last_decision = Decision(last_position, True, "", ("", ""), "")

        ret.append(last_decision)
        return ret

    return _construct_seq(t, False, ("", ""), "", AMType.parse_str("_"))
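
# Illustrative sketch (not parser code): the two children_order settings above
# differ only in how a head's left dependents are visited. ToyTree below is a
# hypothetical stand-in for the real Tree class; only token positions matter.
from typing import List

class ToyTree:
    def __init__(self, position: int, children: List["ToyTree"] = ()):
        self.position = position
        self.children = list(children)

def linearize(tree: ToyTree, order: str) -> List[int]:
    to_left = [c for c in tree.children if c.position < tree.position]
    to_right = [c for c in tree.children if c.position >= tree.position]
    if order == "LR":    # strictly left to right
        children = to_left + to_right
    elif order == "IO":  # inside-out: closest left dependents first
        children = list(reversed(to_left)) + to_right
    else:
        raise ValueError("Unknown children order: " + order)
    out = [tree.position]
    for child in children:
        out.extend(linearize(child, order))
    return out

# Head at position 3 with dependents at positions 1, 2 and 4:
head = ToyTree(3, [ToyTree(1), ToyTree(2), ToyTree(4)])
assert linearize(head, "LR") == [3, 1, 2, 4]
assert linearize(head, "IO") == [3, 2, 1, 4]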
def typ2i(additional_lexicon: AdditionalLexicon) -> Dict[AMType, int]:
    _typ2i: Dict[AMType, int] = dict()
    for typ, i in additional_lexicon.sublexica["term_types"]:
        try:
            _typ2i[AMType.parse_str(typ)] = i
        except NonAMTypeException:
            pass  # skip entries that are not valid AM types
    return _typ2i
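
# Minimal sketch of the skip-on-parse-failure pattern above, with a made-up
# list standing in for the (string, id) pairs a sublexicon yields. Assumes
# AMType and NonAMTypeException from this codebase are in scope.
fake_term_types = [("()", 0), ("(s)", 1), ("not a type", 2)]
demo_typ2i = {}
for typ_str, idx in fake_term_types:
    try:
        demo_typ2i[AMType.parse_str(typ_str)] = idx
    except NonAMTypeException:
        pass  # invalid entries are silently skipped, as in typ2i above
# demo_typ2i now maps the AM types "()" and "(s)" to ids 0 and 1 only.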
def normalize_types(self) -> "AMSentence":
    """
    Parse the types and convert them back to strings to normalize them.
    :return: a new AMSentence with normalized type strings
    """
    return AMSentence([Entry(word.token, word.replacement, word.lemma, word.pos_tag,
                             word.ner_tag, word.fragment, word.lexlabel,
                             str(AMType.parse_str(word.typ)),
                             word.head, word.label, word.aligned, word.range)
                       for word in self.words],
                      self.attributes)
def typ2supertag(lexicon: AdditionalLexicon) -> Dict[AMType, Set[int]]:
    _typ2supertag: Dict[AMType, Set[int]] = dict()  # which supertags have the given type?
    for supertag, i in lexicon.sublexica["constants"]:
        _, typ = AMSentence.split_supertag(supertag)
        try:
            typ = AMType.parse_str(typ)
            if typ not in _typ2supertag:
                _typ2supertag[typ] = set()
            _typ2supertag[typ].add(i)
        except NonAMTypeException:
            print("Skipping type", typ)
    return _typ2supertag
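
# The grouping above can also be written with collections.defaultdict; this
# behavior-equivalent sketch uses made-up constants with the "--TYPE--"
# separator that AMSentence.split_supertag expects.
from collections import defaultdict

demo_by_type = defaultdict(set)
for supertag, idx in [("(g / go)--TYPE--()", 0), ("(s / sleep)--TYPE--()", 1)]:
    typ_str = supertag.split("--TYPE--")[1]
    demo_by_type[typ_str].add(idx)
assert demo_by_type["()"] == {0, 1}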
def initial_state(self, sentence: AMSentence, decoder_state: Any) -> ParsingState:
    stack = [0]
    seen = set()
    heads = [0 for _ in range(len(sentence))]
    children = {i: [] for i in range(len(sentence) + 1)}
    labels = ["IGNORE" for _ in range(len(sentence))]
    lex_labels = ["_" for _ in range(len(sentence))]
    supertags = [("_", "_") for _ in range(len(sentence))]
    lexical_types = [AMType.parse_str("_") for _ in range(len(sentence))]
    term_types = [set() for _ in range(len(sentence))]
    applysets_todo = [None for _ in range(len(sentence))]

    return LTFState(decoder_state, 0, 0.0, sentence, self.additional_lexicon, heads,
                    children, labels, supertags, lex_labels, stack, seen,
                    lexical_types, term_types, applysets_todo, len(sentence), False)
def initial_state(self, sentence: AMSentence, decoder_state: Any) -> ParsingState:
    stack = [0]
    seen = set()
    substack = []
    heads = [0 for _ in range(len(sentence))]
    children = {i: [] for i in range(len(sentence) + 1)}
    labels = ["IGNORE" for _ in range(len(sentence))]
    lex_labels = ["_" for _ in range(len(sentence))]
    constants = [("_", "_") for _ in range(len(sentence))]
    lexical_types = [AMType.parse_str("_") for _ in range(len(sentence))]
    term_types = [None for _ in range(len(sentence))]
    applysets_collected = [None for _ in range(len(sentence))]

    return LTLState(decoder_state, 0, 0.0, sentence, self.additional_lexicon, heads,
                    children, labels, constants, lex_labels, stack, seen, substack,
                    lexical_types, term_types, applysets_collected, len(sentence),
                    False, [0 for _ in sentence])
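
# Note the differences between the two initial states above: the LTF variant
# starts term_types as empty sets and tracks applysets_todo (source
# obligations still to discharge), while the LTL variant additionally keeps a
# substack for delayed pushes and applysets_collected (sources gathered so
# far); the trailing [0 for _ in sentence] argument plausibly initializes the
# per-token sources_still_to_fill counter used in the LTL step function below.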
def __init__(self, children_order: str, pop_with_0: bool, additional_lexicon: AdditionalLexicon,
             reverse_push_actions: bool = False, enable_assert: bool = False):
    """
    Select children_order: "LR" (left to right) or "IO" (inside-out, recommended by Ma et al.).
    reverse_push_actions means that push actions happen in the opposite order to the one
    in which the children of the node are recursively visited.
    """
    super().__init__(children_order, pop_with_0, additional_lexicon, reverse_push_actions)
    self.enable_assert = enable_assert

    self.i2source = sorted({label.split("_")[1] for label, _ in self.additional_lexicon.sublexica["edge_labels"] if "_" in label})
    self.source2i = {s: i for i, s in enumerate(self.i2source)}
    len_sources = len(self.i2source)

    self.additional_apps = ["APP_" + source for source in self.i2source
                            if not self.additional_lexicon.contains("edge_labels", "APP_" + source)]
    self.additional_lexicon.sublexica["edge_labels"].add(self.additional_apps)
    len_labels = self.additional_lexicon.vocab_size("edge_labels")

    all_lex_types = {AMSentence.split_supertag(lextyp)[1] for lextyp, _ in self.additional_lexicon.sublexica["constants"] if "--TYPE--" in lextyp}
    self.i2lextyp = sorted(all_lex_types)
    self.lextyp2i: Dict[AMType, int] = {AMType.parse_str(l): i for i, l in enumerate(self.i2lextyp)}
    len_lex_typ = len(self.i2lextyp)

    # np.bool and np.long were removed in recent NumPy versions; use bool / np.int64 instead.
    lexical2constant = np.zeros((len_lex_typ, self.additional_lexicon.vocab_size("constants")), dtype=bool)  # shape (lexical type, constant)
    constant2lexical = np.zeros(self.additional_lexicon.vocab_size("constants"), dtype=np.int64)
    get_term_types = np.zeros((len_lex_typ, len_labels, len_lex_typ), dtype=bool)  # shape (parent lexical type, incoming label, term type)
    applyset_term_types = np.zeros((len_lex_typ, len_lex_typ, len_sources), dtype=bool)  # shape (term type, lexical type, source)
    apply_reachable_term_types = np.zeros((len_lex_typ, len_lex_typ), dtype=bool)  # shape (term type, lexical type)

    # mod_cache is used below, so it must be constructed here.
    self.mod_cache = ModCache([AMType.parse_str(t) for t in all_lex_types])

    for constant, constant_id in self.additional_lexicon.sublexica["constants"]:
        lex_type = AMType.parse_str(AMSentence.split_supertag(constant)[1])
        lexical2constant[self.lextyp2i[lex_type], constant_id] = 1
        constant2lexical[constant_id] = self.lextyp2i[lex_type]

    apply_reachable_from: Dict[AMType, Set[Tuple[AMType, frozenset]]] = dict()
    for t1 in self.lextyp2i.keys():
        if t1.is_bot:
            continue
        for t2 in self.lextyp2i.keys():
            if t2.is_bot:
                continue
            applyset = t1.get_apply_set(t2)
            if applyset is not None:
                if t2 not in apply_reachable_from:
                    apply_reachable_from[t2] = set()
                apply_reachable_from[t2].add((t1, frozenset(applyset)))

    root_id = self.additional_lexicon.get_id("edge_labels", "ROOT")
    for parent_lex_typ, parent_id in self.lextyp2i.items():
        # ROOT
        # root requires empty term type, thus all sources must be removed
        get_term_types[parent_id, root_id, parent_id] = True

        for current_lex_type in self.lextyp2i.keys():
            if current_lex_type.is_bot:
                continue
            current_typ_id = self.lextyp2i[current_lex_type]
            apply_reachable_term_types[current_typ_id, current_typ_id] = True

        # MOD
        for source, t in self.mod_cache.get_modifiers(parent_lex_typ):
            smallest_apply_set: Dict[Tuple[int, str], Set[str]] = dict()
            if self.additional_lexicon.contains("edge_labels", "MOD_" + source):
                label_id = self.additional_lexicon.get_id("edge_labels", "MOD_" + source)
                get_term_types[parent_id, label_id, self.lextyp2i[t]] = True

                for possible_lexical_type, applyset in apply_reachable_from[t]:
                    current_typ_id = self.lextyp2i[possible_lexical_type]
                    apply_reachable_term_types[self.lextyp2i[t], current_typ_id] = True
                    for source in applyset:
                        source_id = self.source2i[source]
                        applyset_term_types[self.lextyp2i[t], current_typ_id, source_id] = 1

        # APP
        for source in parent_lex_typ.nodes():
            req = parent_lex_typ.get_request(source)
            label_id = self.additional_lexicon.get_id("edge_labels", "APP_" + source)
            get_term_types[parent_id, label_id, self.lextyp2i[req]] = True

            for possible_lexical_type, applyset in apply_reachable_from[req]:
                current_typ_id = self.lextyp2i[possible_lexical_type]
                apply_reachable_term_types[self.lextyp2i[req], current_typ_id] = True
                for source in applyset:
                    source_id = self.source2i[source]
                    applyset_term_types[self.lextyp2i[req], current_typ_id, source_id] = 1

    self.lexical2constant = torch.from_numpy(lexical2constant)
    self.constant2lexical = torch.from_numpy(constant2lexical)

    self.app_source2label_id = torch.zeros((len_sources, len_labels), dtype=torch.bool)  # maps a source id to the respective (APP) label id
    self.mod_tensor = torch.zeros(len_labels, dtype=torch.bool)  # which label ids are MOD_ edge labels?
    self.label_id2appsource = torch.zeros(len_labels, dtype=torch.long) - 1

    self.applyset_term_types = torch.from_numpy(applyset_term_types)  # shape (term type, lexical type, source); is the given source in the apply set from the lexical type to the term type?
    self.get_term_types = torch.from_numpy(get_term_types)  # shape (parent lexical type, incoming label, term type)
    self.apply_reachable_term_types = torch.from_numpy(apply_reachable_term_types)  # shape (term type, lexical type)

    for label, label_id in self.additional_lexicon.sublexica["edge_labels"]:
        if label.startswith("MOD_"):
            self.mod_tensor[label_id] = True

    for source, source_id in self.source2i.items():
        label_id = self.additional_lexicon.get_id("edge_labels", "APP_" + source)
        self.label_id2appsource[label_id] = source_id
        self.app_source2label_id[source_id, label_id] = True
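
# Minimal, self-contained sketch of how a boolean lookup table like
# get_term_types is queried at decision time; the shape (2 lexical types,
# 3 edge labels, 2 term types) and the entries are made up for illustration.
import torch

demo_table = torch.zeros((2, 3, 2), dtype=torch.bool)
demo_table[0, 1, 1] = True            # parent lexical type 0 + incoming label 1 licenses term type 1
allowed_term_types = demo_table[0, 1]  # boolean mask over term type ids
assert allowed_term_types.nonzero(as_tuple=True)[0].tolist() == [1]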
def step(self, state: LTLState, decision: Decision, in_place: bool = False) -> ParsingState:
    if in_place:
        copy = state
    else:
        copy = state.copy()
    copy.step += 1

    if state.stack:
        position = decision.position

        if (position in copy.seen and not self.pop_with_0) or \
                (position == 0 and self.pop_with_0) or copy.step == 2:  # second step is always pop
            tos = copy.stack.pop()

            if not self.pop_with_0:
                assert tos == position

            if tos != 0:
                copy.constants[tos - 1] = decision.supertag
                lexical_type_tos = self.read_cache.parse_str(decision.supertag[1])
                copy.lexical_types[tos - 1] = lexical_type_tos
                copy.lex_labels[tos - 1] = decision.lexlabel

                # now determine term types of children
                for child_id in state.children[tos]:  # 1-based children
                    child_id -= 1  # 0-based children
                    label = copy.edge_labels[child_id]
                    copy.applysets_collected[child_id] = set()

                    if label.startswith("APP_"):
                        # get request at source
                        source = label.split("_")[1]
                        req = lexical_type_tos.get_request(source)
                        copy.term_types[child_id] = {req}
                    elif label.startswith("MOD_"):
                        source = label.split("_")[1]
                        copy.term_types[child_id] = set(self.mod_cache.get_modifiers_with_source(lexical_type_tos, source))
                    else:
                        raise ValueError("Somehow the invalid edge label " + label + " was produced")

            if self.reverse_push_actions:
                copy.stack.extend(copy.substack)
            else:
                copy.stack.extend(reversed(copy.substack))
            copy.substack = []
        else:
            tos = copy.stack[-1]
            copy.heads[position - 1] = tos

            assert position <= len(copy.sentence)

            copy.children[tos].append(position)  # 1-based
            copy.edge_labels[position - 1] = decision.label

            if decision.label.startswith("APP_"):
                source = decision.label.split("_")[1]
                copy.applysets_collected[copy.active_node - 1].add(source)

                smallest_apply_set = np.inf
                for term_typ in copy.term_types[tos - 1]:
                    for lexical_type, apply_set in self.candidate_lex_types.get_candidates_with_apply_set(
                            term_typ, copy.applysets_collected[tos - 1],
                            copy.words_left + len(state.applysets_collected[tos - 1])):
                        rest_of_apply_set = apply_set - copy.applysets_collected[tos - 1]
                        smallest_apply_set = min(smallest_apply_set, len(rest_of_apply_set))

                assert smallest_apply_set < np.inf
                copy.sources_still_to_fill[tos - 1] = smallest_apply_set

            elif decision.label == "ROOT" and not copy.root_determined:
                copy.term_types[position - 1] = {AMType.parse_str("()")}
                copy.applysets_collected[position - 1] = set()
                copy.root_determined = True

            copy.words_left -= 1

            # push onto stack
            copy.substack.append(position)
            copy.seen.add(position)

        if copy.stack:
            copy.active_node = copy.stack[-1]
        else:
            copy.active_node = 0
    else:
        copy.active_node = 0

    copy.score = copy.score + decision.score
    return copy
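
# Illustrative helper (not parser code) isolating the two pop conventions the
# step functions branch on: with pop_with_0, popping is signalled by choosing
# position 0; otherwise by re-choosing a position that was already seen. (The
# LTL step above additionally forces a pop at step 2.)
def is_pop(position: int, seen: set, pop_with_0: bool) -> bool:
    return position == 0 if pop_with_0 else position in seen

assert is_pop(0, set(), pop_with_0=True)
assert is_pop(4, {4}, pop_with_0=False)
assert not is_pop(4, set(), pop_with_0=False)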
changed = False

all_requests = set()
for typ in all_types:
    for node in typ.nodes():
        all_requests.add(typ.get_request(node))

for unknown_type in all_requests - all_types:
    new_constant = invent_supertag(unknown_type)
    print("Invented", new_constant)
    supertags[unknown_type] = {new_constant}
    invented += 1
    changed = True

all_types.update(all_requests)

# Mod sources
print("Identified the following mod sources", mod_sources)
simple_mod_types = {AMType.parse_str(f"({source})") for source in mod_sources}
for unknown_type in simple_mod_types - all_types:
    new_constant = invent_supertag(unknown_type)
    print("Invented", new_constant)
    supertags[unknown_type] = {new_constant}
    invented += 1
all_types.update(simple_mod_types)

print("Invented", invented, "constants in total.")

# Write constants and types to files:
with open(os.path.join(args.output, "constants.txt"), "w") as f:
    for lex_type in sorted(supertags.keys(), key=lambda typ: str(typ)):
        for constant in sorted(supertags[lex_type]):
            f.write(constant)
            f.write("\n")
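
# Self-contained sketch of the request closure being computed above, over
# plain strings instead of AM types: keep adding requested types until
# nothing new appears. (The surrounding script presumably re-runs the block
# above while `changed` is True.)
requests = {"a": {"b"}, "b": {"c"}, "c": set()}
known = {"a"}
changed_demo = True
while changed_demo:
    new = set().union(*(requests[t] for t in known)) - known
    changed_demo = bool(new)
    known |= new
assert known == {"a", "b", "c"}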
def step(self, state: LTFState, decision: Decision, in_place: bool = False) -> ParsingState:
    if in_place:
        copy = state
    else:
        copy = state.copy()

    if state.stack:
        if copy.constants[state.active_node - 1] == ("_", "_") and state.active_node != 0:
            # first time that state.active_node has become active
            copy.constants[state.active_node - 1] = decision.supertag
            copy.lex_labels[state.active_node - 1] = decision.lexlabel

            # Determine apply set which we have to fulfill.
            lex_type = self.read_cache.parse_str(decision.supertag[1])
            copy.lexical_types[state.active_node - 1] = lex_type
            term_type = decision.termtyp
            applyset = lex_type.get_apply_set(term_type)

            assert term_type in copy.term_types[state.active_node - 1]

            copy.term_types[state.active_node - 1] = {term_type}
            copy.applysets_todo[state.active_node - 1] = applyset

        if decision.position == 0 and self.pop_with_0:
            copy.stack.pop()
        elif not self.pop_with_0 and decision.position in copy.seen:
            popped = copy.stack.pop()
            assert popped == decision.position
        else:
            copy.heads[decision.position - 1] = copy.stack[-1]
            copy.words_left -= 1  # one word gets attached
            copy.children[copy.stack[-1]].append(decision.position)  # 1-based
            copy.edge_labels[decision.position - 1] = decision.label

            tos_lexical_type = copy.lexical_types[copy.stack[-1] - 1]

            if decision.label.startswith("APP_"):
                source = decision.label.split("_")[1]
                copy.applysets_todo[copy.stack[-1] - 1].remove(source)  # remove obligation to fill source
                copy.term_types[decision.position - 1] = {tos_lexical_type.get_request(source)}
            elif decision.label.startswith("MOD_"):
                source = decision.label.split("_")[1]
                copy.term_types[decision.position - 1] = set(self.mod_cache.get_modifiers_with_source(tos_lexical_type, source))  # TODO speed improvement?
            elif decision.label == "ROOT" and not copy.root_determined:
                copy.term_types[decision.position - 1] = {AMType.parse_str("()")}
            else:
                raise ValueError("Edge label " + decision.label + " not allowed here.")

            # push onto stack
            copy.stack.append(decision.position)
            copy.seen.add(decision.position)

        if not copy.stack:
            copy.active_node = 0
        else:
            copy.active_node = copy.stack[-1]
    else:
        copy.active_node = 0

    copy.root_determined = True
    copy.score = copy.score + decision.score
    return copy
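
# Contrast between the two step functions: in the LTF variant above, a node's
# constant and lexical type are fixed the first time it becomes active, and
# applysets_todo records the sources that still must be filled; in the LTL
# variant further up, the constant is only chosen when the node is popped,
# with applysets_collected growing as APP edges are drawn in the meantime.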