import argparse
import sys

from nltk.parse import FeatureChartParser


def main(argv):
    """Loads grammar files from command-line args, then parses lines from
    standard input."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(dest='grammars', nargs='+',
                            help='Grammar file path(s)')
    arg_parser.add_argument('--draw', dest='draw', action='store_true',
                            help='Draw trees')
    arg_parser.add_argument('-v', '--verbose', dest='verbose',
                            action='store_true', help='Be verbose')
    args = arg_parser.parse_args(argv)

    grammar = load_grammars(args.grammars)
    parser = FeatureChartParser(grammar, trace=args.verbose,
                                trace_chart_width=80)

    # Iterating over stdin reads a fresh line on each pass, so skipping
    # comments and blank lines with `continue` cannot loop forever.
    for line in sys.stdin:
        if line.startswith('#'):  # skip comment lines
            continue
        tokens = line.lower().strip().split()
        if not tokens:  # skip blank lines
            continue
        trees = list(parser.parse(tokens))
        print('*** {} ***'.format(tokens))
        if trees:
            for tree in trees:
                print(tree.pformat(margin=80))
                if args.draw:
                    tree.draw()
                # print(TreePrettyPrinter(tree).text())
        else:
            print('Could not parse {}'.format(tokens))
        print()
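`load_grammars` is referenced above but not defined in this snippet. A minimal sketch, assuming it simply concatenates the named NLTK feature-grammar (`.fcfg`) files into a single grammar; the helper is hypothetical and the real implementation may differ:

```python
from nltk.grammar import FeatureGrammar


def load_grammars(paths):
    # Hypothetical helper: read each grammar file and join the contents,
    # so rules and lexicon entries can be split across files.
    rules = []
    for grammar_path in paths:
        with open(grammar_path) as grammar_file:
            rules.append(grammar_file.read())
    return FeatureGrammar.fromstring('\n'.join(rules))
```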
# Test sentences, one per line; several are deliberately ungrammatical to
# exercise the agreement and subcategorization checks.
text = """\
whom does Homer serve salad
all the morning trains from Edinburgh to London leave before 10
most flights that serve breakfast leave at 9
some flights leave before 8
these people who live in the house are friendly
Lisa claims that Bart always leaves before 8
what airlines fly from Edinburgh to London
Bart laugh
when do Homer drinks milk
Bart laughs the kitchen
does the trains leave
Lisa likes drink milk
Lisa and Bart likes drinking milk
the morning flights from Edinburgh leave milk
many flights that serves breakfast leave after 10
Bart laughs in the kitchen
Bart serves
milk are healthy
"""

# Assumes `uparser`, the FeatureChartParser built from the grammar below,
# is in scope.
sents = text.splitlines()
for sent in sents:
    parses = uparser.parse(sent.split())
    print(sent + ":")
    for tree in parses:
        print(tree)
    print("")
from nltk.grammar import FeatureGrammar
from nltk.parse import FeatureChartParser


def unification_grammar():
    ugrammar = FeatureGrammar.fromstring("""\
    ################### RULES #################
    S -> NP[NUM=?n] VP[NUM=?n]
    S -> PREP_P S
    S -> Wh_P AUX[NUM=?n] NP[NUM=?n] VP

    NP[NUM=?n] -> ProperNoun[NUM=?n]
    NP[NUM=?n] -> N[NUM=?n] | ADJ_P NP[NUM=?n] | DET[NUM=?n] NP[NUM=?n] | N[NUM=?n] PREP_P | ADJ_P
    NP[NUM=?n] -> ProperNoun[NUM=?n] GER_P | GER_P
    NP[NUM=pl] -> NP[NUM=?n] CC NP[NUM=?n]

    VP[SUBCAT=?rest, NUM=?n] -> V[NUM=?n, SUBCAT=?rest] | VP[NUM=?n, TENSE=?t, SUBCAT=[HEAD=?arg, TAIL=?rest]] ARG[CAT=?arg]
    VP[SUBCAT=?rest, NUM=?n] -> ADV_P V[NUM=?n, SUBCAT=?rest] | V[NUM=?n, SUBCAT=?rest] ADV_P
    VP[SUBCAT=?rest, NUM=?n] -> MOD_P VP[TENSE=?t, SUBCAT=[HEAD=?arg, TAIL=?rest]] ARG[CAT=?arg]
    VP[SUBCAT=?rest, NUM=?n] -> VTB[NUM=?n, SUBCAT=[HEAD=?arg, TAIL=?rest]] ARG[CAT=?arg]
    VP[SUBCAT=?rest, NUM=?n] -> VTB VP[SUBCAT=?rest]

    GER_P -> GER NP
    ADJ_P -> ADJ | ADJ ADJ_P
    ADV_P -> ADV | ADV ADV_P
    PREP_P -> PREP NP | PREP S
    MOD_P -> MOD AUX[NUM=pl] | MOD ADV AUX[NUM=pl]
    Wh_P -> Wh | Wh ARG[CAT=?arg]

    ARG[CAT=np] -> NP
    ARG[CAT=pp] -> PREP_P
    ARG[CAT=s] -> S

    ################# Lexicons #################
    ################## NOUN ###################
    ###########################################
    ProperNoun[NUM=sg] -> 'Homer' | 'Bart' | 'Lisa'
    N[NUM=sg] -> 'milk' | 'salad' | 'midnight' | 'kitchen' | 'table'
    N[NUM=pl] -> 'shoes' | 'tables'

    ################# VERB ####################
    ###########################################
    ############### PRESENT ###################
    #########----- Intransitive -----##########
    V[TENSE=pres, NUM=sg, SUBCAT=nil] -> 'laughs' | 'smiles' | 'walks' | 'serves' | 'drinks'
    V[TENSE=pres, NUM=pl, SUBCAT=nil] -> 'laugh' | 'smile' | 'walk' | 'serve' | 'drink'
    #########----- Transitive ------###########
    V[TENSE=pres, NUM=sg, SUBCAT=[HEAD=s,TAIL=nil]] -> 'thinks' | 'believes'
    V[TENSE=pres, NUM=pl, SUBCAT=[HEAD=s,TAIL=nil]] -> 'think' | 'believe'
    V[TENSE=pres, NUM=sg, SUBCAT=[HEAD=np,TAIL=nil]] -> 'serves' | 'drinks' | 'wears' | 'likes'
    V[TENSE=pres, NUM=pl, SUBCAT=[HEAD=np,TAIL=nil]] -> 'serve' | 'drink' | 'wear' | 'like'
    V[TENSE=pres, NUM=sg, SUBCAT=[HEAD=pp,TAIL=nil]] -> 'walks' | 'teaches'
    V[TENSE=pres, NUM=pl, SUBCAT=[HEAD=pp,TAIL=nil]] -> 'walk' | 'teach'
    ######### primary & secondary ########
    V[TENSE=pres, NUM=sg, SUBCAT=[HEAD=np, TAIL=[HEAD=np,TAIL=nil]]] -> 'serves'
    V[TENSE=pres, NUM=pl, SUBCAT=[HEAD=np, TAIL=[HEAD=np,TAIL=nil]]] -> 'serve'
    V[TENSE=pres, NUM=pl, SUBCAT=[HEAD=s, TAIL=[HEAD=np,TAIL=nil]]] -> 'think' | 'believe'
    ################# PAST ####################
    #########----- Intransitive -----##########
    V[TENSE=past, SUBCAT=nil] -> 'laughed' | 'smiled' | 'walked'
    #########----- Transitive ------###########
    V[TENSE=past, SUBCAT=[HEAD=np,TAIL=nil]] -> 'drank' | 'wore' | 'served'
    V[TENSE=pastpart, SUBCAT=[HEAD=np,TAIL=nil]] -> 'drunk' | 'worn' | 'served' | 'seen'
    ########### PRESENT CONTINUOUS ############
    V[TENSE=prescon, FORM=prespart, SUBCAT=[HEAD=np,TAIL=nil]] -> 'drinking' | 'wearing'
    V[TENSE=prescon, FORM=prespart, SUBCAT=[HEAD=pp,TAIL=nil]] -> 'drinking'

    ################ Determiner ###############
    DET[NUM=sg] -> 'a' | 'the' | 'that'
    DET[NUM=pl] -> 'the' | 'these' | 'those'
    ############### Conjunction ###############
    CC -> 'and'
    ################## Modal ##################
    MOD -> 'may'
    ################# Gerund ##################
    GER -> 'drinking'
    ########### Adverb & Adjective ############
    ADJ -> 'blue' | 'healthy' | 'green' | 'same'
    ADV -> 'always' | 'never' | 'not' | 'yesterday'
    ############### Preposition ###############
    PREP -> 'in' | 'before' | 'when' | 'on'

    AUX[NUM=sg] -> 'does' | 'has'
    AUX[NUM=pl] -> 'do' | 'have'
    VTB[NUM=sg] -> 'is'
    VTB[NUM=pl] -> 'are'
    Wh -> 'when' | 'what' | 'where' | 'whom'
    """)

    uparser = FeatureChartParser(ugrammar)
    # `text_extended` is assumed to be defined elsewhere, in the same
    # one-sentence-per-line format as `text` above.
    sents = text_extended.splitlines()
    for sent in sents:
        parses = uparser.parse(sent.split())
        print(sent)
        for tree in parses:
            print(tree)
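The SUBCAT feature threads a cons-style argument list through the VP rules: each application of `VP[SUBCAT=?rest] -> VP[SUBCAT=[HEAD=?arg, TAIL=?rest]] ARG[CAT=?arg]` pops one argument off the list, so a ditransitive verb bottoms out at `SUBCAT=nil` after consuming two NPs. A small sketch exercising just that mechanism (assuming `ugrammar` is in scope, rather than local to `unification_grammar()` as written):

```python
from nltk.parse import FeatureChartParser

demo_parser = FeatureChartParser(ugrammar)

# 'serves' carries SUBCAT=[HEAD=np, TAIL=[HEAD=np, TAIL=nil]], so parsing
# "Homer serves Bart milk" should consume both NP arguments in turn.
for tree in demo_parser.parse('Homer serves Bart milk'.split()):
    print(tree)
```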
import logging
import time
from collections import OrderedDict, defaultdict

from nltk.grammar import FeatureGrammar
from nltk.parse import FeatureChartParser

# Assumed to be defined elsewhere in this module/package: path,
# get_parser_config, generate_grammar, MAX_PARSE_TIME, ParserTimeout,
# Span, TYPE_FEATURE, START_SYMBOLS, and _EntityNode.

logger = logging.getLogger(__name__)


class Parser:
    """A language parser used to extract relations between entities in a
    given query and group related entities together.

    The parser uses a context-free grammar based on a configuration to
    generate candidate entity groupings. Heuristics are then used to rank
    and select a grouping.

    This rule-based parser will be helpful in many situations, but if you
    have a sufficiently sophisticated entity hierarchy, you may benefit
    from using a statistical approach.

    Attributes:
        config (dict): The parser config.
    """

    def __init__(
        self,
        resource_loader=None,
        config=None,
        allow_relaxed=True,
        domain=None,
        intent=None,
    ):
        """Initializes the parser.

        Args:
            resource_loader (ResourceLoader): An object which can load
                resources for the parser.
            config (dict, optional): The configuration for the parser. If
                none is provided, the app config will be loaded.
        """
        if not resource_loader and not config:
            raise ValueError(
                "Parser requires either a configuration or a resource loader")
        app_path = resource_loader.app_path if resource_loader else None
        try:
            entity_types = path.get_entity_types(app_path) + ["unk"]
        except TypeError:
            entity_types = {"unk"}
        self._resource_loader = resource_loader
        self.config = get_parser_config(app_path, config, domain, intent) or {}

        # Collect every entity type mentioned in the config, whether as a
        # head or as a dependent.
        configured_entities = set()
        for entity_type, entity_config in self.config.items():
            configured_entities.add(entity_type)
            configured_entities.update(entity_config.keys())
        self._configured_entities = configured_entities

        rules = generate_grammar(self.config, entity_types)
        self._grammar = FeatureGrammar.fromstring(rules)
        self._parser = FeatureChartParser(self._grammar)
        if allow_relaxed:
            relaxed_rules = generate_grammar(self.config, entity_types,
                                             relaxed=True)
            self._relaxed_grammar = FeatureGrammar.fromstring(relaxed_rules)
            self._relaxed_parser = FeatureChartParser(self._relaxed_grammar)
        else:
            self._relaxed_grammar = None
            self._relaxed_parser = None

    def parse_entities(
        self,
        query,
        entities,
        all_candidates=False,
        handle_timeout=True,
        timeout=MAX_PARSE_TIME,
    ):
        """Determines groupings of entities for the given query.

        Args:
            query (Query): The query being parsed.
            entities (list[QueryEntity]): The entities to find groupings for.
            all_candidates (bool, optional): Whether to return all the
                entity candidates.
            handle_timeout (bool, optional): False if an exception should
                be raised when parsing times out. Defaults to True.
            timeout (float, optional): The amount of time to wait for
                parsing to complete. Defaults to MAX_PARSE_TIME. If None
                is passed, parsing will never time out.

        Returns:
            (tuple[QueryEntity]): An updated version of the entities
                collection passed in, with parent and children attributes
                set appropriately.
        """
        if not self._configured_entities:
            return entities

        if not handle_timeout:
            return self._parse(query, entities,
                               all_candidates=all_candidates, timeout=timeout)

        try:
            return self._parse(query, entities,
                               all_candidates=all_candidates, timeout=timeout)
        except ParserTimeout:
            logger.warning("Parser timed out parsing query %r", query.text)
            return entities

    def _parse(self, query, entities, all_candidates, timeout):
        entity_type_count = defaultdict(int)
        entity_dict = {}
        tokens = []  # tokens to be parsed

        # generate sentential form (assumes entities are sorted)
        for entity in entities:
            entity_type = entity.entity.type
            role_type = entity.entity.role
            if role_type:
                # Append role type to entity type with "--" separator
                entity_with_role_type = entity_type + "--" + role_type
                if entity_with_role_type in self._configured_entities:
                    entity_type = entity_with_role_type
            if entity_type not in self._configured_entities:
                entity_type = "unk"
            entity_id = "{}{}".format(entity_type,
                                      entity_type_count[entity_type])
            entity_type_count[entity_type] += 1
            entity_dict[entity_id] = entity
            tokens.append(entity_id)

        logger.debug("Parsing sentential form: %r", " ".join(tokens))
        start_time = time.time()
        parses = []
        for parse in self._parser.parse(tokens):
            parses.append(parse)
            if timeout is not None and (time.time() - start_time) > timeout:
                raise ParserTimeout("Parsing took too long")

        # Fall back to the relaxed grammar if the strict one found nothing.
        if not parses and self._relaxed_parser:
            for parse in self._relaxed_parser.parse(tokens):
                parses.append(parse)
                if timeout is not None and (time.time() - start_time) > timeout:
                    raise ParserTimeout("Parsing took too long")

        if not parses:
            if all_candidates:
                return []
            return entities

        ranked_parses = self._rank_parses(query, entity_dict, parses,
                                          timeout, start_time)
        if all_candidates:
            return ranked_parses

        # if we still have more than one, choose the first
        entities = self._get_flat_entities(ranked_parses[0], entities,
                                           entity_dict)
        return tuple(sorted(entities, key=lambda e: e.span.start))

    def _rank_parses(self, query, entity_dict, parses, timeout,
                     start_time=None):
        start_time = start_time or time.time()
        resolved = OrderedDict()
        for parse in parses:
            if timeout is not None and time.time() - start_time > timeout:
                raise ParserTimeout("Parsing took too long")
            resolved[self._resolve_parse(parse)] = None
        filtered = (p for p in resolved.keys())

        # Prefer parses with fewer groups
        parses = sorted(filtered, key=len)
        filtered = (p for p in parses if len(p) <= len(parses[0]))

        # Prefer parses with minimal distance from dependents to heads
        parses = sorted(
            filtered,
            key=lambda p: self._parse_distance(p, query, entity_dict))
        min_parse_dist = self._parse_distance(parses[0], query, entity_dict)
        filtered = (
            p for p in parses
            if self._parse_distance(p, query, entity_dict) <= min_parse_dist)

        # TODO: apply precedence
        return list(filtered)

    def _parse_distance(self, parse, query, entity_dict):
        total_link_distance = 0
        stack = list(parse)
        while stack:
            node = stack.pop()
            head = entity_dict[node.id]
            for dep in node.dependents or set():
                if dep.dependents:
                    stack.append(dep)
                    continue
                child = entity_dict[dep.id]
                if child.token_span.start > head.token_span.start:
                    intra_entity_span = Span(head.token_span.end,
                                             child.token_span.start)
                else:
                    intra_entity_span = Span(child.token_span.end,
                                             head.token_span.start)
                # Linking words between a head and its dependent make the
                # attachment cheaper; any other intervening token adds to
                # the distance.
                link_distance = 0
                for token in intra_entity_span.slice(query.text.split(" ")):
                    if token in self.config[node.type][dep.type]["linking_words"]:
                        link_distance -= 0.5
                    else:
                        link_distance += 1
                total_link_distance += link_distance
        return total_link_distance

    @staticmethod
    def _get_flat_entities(parse, entities, entity_dict):
        stack = [g.to_query_entity(entity_dict) for g in parse]
        new_dict = {}
        while stack:
            entity = stack.pop()
            new_dict[(entity.entity.type, entity.span.start)] = entity
            for child in entity.children or ():
                stack.append(child)
        return [new_dict.get((e.entity.type, e.span.start), e)
                for e in entities]

    @classmethod
    def _resolve_parse(cls, node):
        groups = set()
        for child in node:
            child_symbol = child.label()[TYPE_FEATURE]
            if child_symbol in START_SYMBOLS:
                groups.update(cls._resolve_parse(child))
            else:
                group = cls._resolve_group(child).freeze()
                groups.add(group)
        return frozenset(groups)

    @classmethod
    def _resolve_group(cls, node):
        symbol = node.label()[TYPE_FEATURE]
        if not symbol[0].isupper():
            # this node is a generic entity of type {symbol};
            # its child is the terminal
            return _EntityNode(symbol, node[0], None)

        # if the first char is capitalized, this is a group!
        group_type = symbol.lower()
        dependents = set()
        for child in node:
            child_symbol = child.label()[TYPE_FEATURE]
            if child_symbol == symbol:
                # this is the ancestor of this group
                group = cls._resolve_group(child)
            elif child_symbol == group_type:
                # this is the root ancestor of this group
                group = cls._resolve_group(child)
                group = _EntityNode(group.type, group.id, set())
            else:
                dependents.add(cls._resolve_group(child).freeze())
        group.dependents.update(dependents)
        return group
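A minimal usage sketch. The config shape (head entity type → dependent entity type → settings such as `linking_words`) follows how `__init__` and `_parse_distance` read it, but the `dish`/`option` types and the surrounding `query`/`entities` objects are purely illustrative:

```python
# Illustrative parser config: "option" entities may attach to a "dish"
# head, and "with" is treated as a linking word when ranking parses.
config = {
    "dish": {
        "option": {"linking_words": {"with"}},
    }
}

parser = Parser(config=config)

# `query` and `entities` are assumed to come from upstream entity
# recognition. The returned tuple contains the same entities with their
# parent/children attributes populated according to the best parse.
grouped = parser.parse_entities(query, entities)
```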