Example #1
    def nonTerminal(self):
        # collect every nonterminal appearing on either side of a production
        for prod in self.grammar:
            if is_nonterminal(prod.lhs()):
                self.non_terminal.add(prod.lhs())
                for y in prod.rhs():
                    if is_nonterminal(y):
                        self.non_terminal.add(y)
Example #2
def rule_adds_atom(p):
    # number of atoms the production adds; used for ring-size counting
    atoms = ['c', 'n', 'o', 's', 'f', 'cl', 'br', 'i']
    if any(x.lower() in atoms for x in p.rhs() if is_terminal(x)) or \
            any('valence' in x._symbol for x in p.rhs() if is_nonterminal(x)):
        return 1
    elif any('segment' in x._symbol for x in p.rhs() if is_nonterminal(x)):
        return 2
    else:
        return 0
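A toy check (the 'atom' nonterminal and its expansion are made up for illustration; assumes the NLTK helpers used above are in scope):

from nltk.grammar import Nonterminal, Production

p = Production(Nonterminal('atom'), ['C'])
print(rule_adds_atom(p))  # 1: the terminal 'C' names an atom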
Example #3
def is_cnf(production):
    rhs = production.rhs()

    if len(rhs) == 1:
        return grammar.is_terminal(rhs[0])
    elif len(rhs) == 2:
        return (grammar.is_nonterminal(rhs[0])
                and grammar.is_nonterminal(rhs[1]))
    else:
        return False
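A quick usage sketch (assuming `grammar` above refers to the nltk.grammar module):

from nltk import CFG

toy = CFG.fromstring("""
S -> NP VP
NP -> 'john'
VP -> V NP
V -> 'saw'
""")
print([is_cnf(p) for p in toy.productions()])  # [True, True, True, True]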
Example #4
def _generate_one(grammar, item, depth, maxlen):
    if depth > 0 and maxlen > 0:
        if is_nonterminal(item):
            for prod in grammar.productions(lhs=item):
                for frag in _generate_all(grammar, prod.rhs(), depth - 1,
                                          maxlen):
                    yield frag
        else:
            yield [item]

    if depth > 0 and maxlen == 0 and is_nonterminal(item):
        # has empty production
        if any(prod.rhs() == () for prod in grammar.productions(lhs=item)):
            yield []
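_generate_all is not shown in this example; a minimal companion consistent with the calls above might look like this (a sketch, not the original):

def _generate_all(grammar, items, depth, maxlen):
    # expand the first item, then recurse on the rest with the leftover budget
    if items:
        for frag1 in _generate_one(grammar, items[0], depth, maxlen):
            for frag2 in _generate_all(grammar, items[1:], depth,
                                       maxlen - len(frag1)):
                yield frag1 + frag2
    else:
        yield []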
Example #5
    def process_one_action(self, this_S, a):
        if a is not None:
            # 1. Apply the expansion from last prod rule
            this_rule = self.grammar.GCFG.productions()[a]
            # find the token to apply the expansion to
            for this_index, old_token in enumerate(this_S):
                if is_nonterminal(old_token['token']):
                    break
            if this_rule.lhs() != Nonterminal('Nothing'):
                new_tokens = apply_rule(this_S, this_index, this_rule, self.grammar, self.checks)
                # do the replacement
                if self.checks:
                    this_S[this_index]['children'] = new_tokens
                this_S = this_S[:this_index] + new_tokens + this_S[this_index + 1:]

        # 2. generate masks for next prod rule
        # find the index of the next token to expand: the first nonterminal in the sequence
        for this_index, this_token in enumerate(this_S):
            if is_nonterminal(this_token['token']):
                break
        else:
            # no nonterminals left: the sequence is fully expanded
            this_token = {'token': nltk.grammar.Nonterminal('Nothing')}

        # get the formal grammar mask
        self.grammar_mask = self.get_grammar_mask(this_token)

        if this_token['token'] == nltk.grammar.Nonterminal('Nothing'):
            # we only get here once the sequence is fully expanded
            return this_S, self.grammar_mask


        # get the terminal distance mask
        if self.do_terminal_mask:
            term_distance = sum(x['term_dist'] for x in this_S)
            steps_left = self.MAX_LEN - self.t - 1
            self.terminal_mask = np.zeros_like(self.grammar_mask)
            rule_dist = self.term_dist_calc.rule_d_term_dist(this_token)
            new_term_dist = rule_dist + term_distance
            self.terminal_mask[new_term_dist < steps_left - 1] = 1
        else:
            self.terminal_mask = np.ones_like(self.grammar_mask)

        # if we're expanding a ring numeric token
        self.ring_mask = self.get_ring_mask(this_token, this_S, this_index)

        mask = self.grammar_mask * self.terminal_mask * self.ring_mask

        if self.checks:
            assert not all(x == 0 for x in mask)
        return this_S, mask
Example #6
def is_unit(production):
    return (len(production.rhs()) == 1
            and grammar.is_nonterminal(production.rhs()[0]))
Example #7
def remove_unitary_productions(cfg_grammar):
    """
    Remove unit productions (A -> B, where B is a nonterminal) by lifting
    all of B's productions up to A.

    Unfortunately, this is recursive, because the rewriting itself can
    create new unit productions.
    Note: this does NOT detect cycles.
    """
    unary = None
    productions = cfg_grammar.productions()
    for production in productions:
        if len(production) == 1:
            # identify the first unit production
            if is_nonterminal(production.rhs()[0]):
                unary = production
                break

    if not unary:
        # base case: no unit productions left
        return cfg_grammar
    else:
        # get all productions of B, so we can make them all productions of A
        b_prods = cfg_grammar.productions(lhs=unary.rhs()[0])
        b_rhses = [b_prod.rhs() for b_prod in b_prods]

        existing_productions = [prod for prod in productions if prod != unary]
        new_productions = [Production(unary.lhs(), b_rhs) for b_rhs in b_rhses]

        new_grammar = CFG(cfg_grammar.start(), existing_productions + new_productions)
        return remove_unitary_productions(new_grammar)
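A small demonstration (a sketch; assumes CFG, Production, and is_nonterminal are imported as the function requires):

from nltk import CFG

g = CFG.fromstring("""
S -> A
A -> 'a' A | 'a'
""")
print(remove_unitary_productions(g).productions())
# [A -> 'a' A, A -> 'a', S -> 'a' A, S -> 'a']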
Example #8
def children(g, parent):
    """Get Nonterminals that are used in a production or nonterminal productions

    Parameters
    ----------
    g : nltk.CFG

    parent : nltk.Production or nltk.Nonterminal

    Returns
    -------
    children : set of Nonterminal

    See Also
    --------
    nltk.CFG, nltk.Nonterminal, nltk.Production
    """
    res = set()

    if isinstance(parent, Production):
        prods = [parent]
    else:
        prods = g.productions(parent)

    for prod in prods:
        for item in prod.rhs():
            if is_nonterminal(item):
                res.add(item)

    return res
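For instance (sketch):

from nltk import CFG

g = CFG.fromstring("S -> NP VP\nNP -> 'john'\nVP -> 'runs'")
print(children(g, g.start()))  # {NP, VP} (set order may vary)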
Example #9
def create_taskgrammar(grammar, task, encoders):
    logger.info('Creating specific grammar for task %s', task)
    productions = grammar.productions(Nonterminal(task))
    start_token = Nonterminal('S')
    new_productions = []

    for start_production in productions:
        first_token = start_production.rhs()[0]
        if is_nonterminal(first_token) and first_token.symbol().endswith('_TASK'):
            for new_start_production in grammar.productions(first_token):
                new_productions.append(Production(start_token, new_start_production.rhs()))
        else:
            new_productions.append(Production(start_token, start_production.rhs()))

    for production in grammar.productions():
        for new_production in new_productions:
            if production.lhs() in new_production.rhs() and production not in new_productions:
                if production.lhs().symbol() == 'ENCODERS':  # Use encoders only for types of features in the dataset
                    if len(encoders) > 0:
                        new_productions.append(Production(production.lhs(), [Nonterminal(e) for e in encoders]))
                    else:
                        new_productions.append(Production(production.lhs(), ['E']))
                else:
                    new_productions.append(production)

    task_grammar = CFG(start_token, new_productions)

    with open(TASK_GRAMMAR_PATH, 'w') as fout:
        fout.write('\n'.join([str(x) for x in task_grammar.productions()]))

    return task_grammar
Example #10
    def apply(self, chart, grammar, edge):
        if edge.is_incomplete():
            return
        found = edge.lhs()
        for prod in grammar.productions(rhs=found):
            bindings = {}
            if isinstance(edge, FeatureTreeEdge):
                _next = prod.rhs()[0]
                if not is_nonterminal(_next):
                    continue

                # We rename vars here, because we don't want variables
                # from the two different productions to match.
                used_vars = find_variables((prod.lhs(), ) + prod.rhs(),
                                           fs_class=FeatStruct)
                found = found.rename_variables(used_vars=used_vars)

                result = unify(_next, found, bindings, rename_vars=False)
                if result is None:
                    continue

            new_edge = FeatureTreeEdge.from_production(
                prod, edge.start()).move_dot_forward(edge.end(), bindings)
            if chart.insert(new_edge, (edge, )):
                yield new_edge
Example #11
    def apply(self, chart, grammar, left_edge, right_edge):
        # Make sure the rule is applicable.
        if not (left_edge.end() == right_edge.start() and
                left_edge.is_incomplete() and
                right_edge.is_complete() and
                isinstance(left_edge, FeatureTreeEdge)):
            return
        found = right_edge.lhs()
        nextsym = left_edge.nextsym()
        if isinstance(right_edge, FeatureTreeEdge):
            if not is_nonterminal(nextsym): return
            if left_edge.nextsym()[TYPE] != right_edge.lhs()[TYPE]: return
            # Create a copy of the bindings.
            bindings = left_edge.bindings()
            # We rename vars here, because we don't want variables
            # from the two different productions to match.
            found = found.rename_variables(used_vars=left_edge.variables())
            # Unify B1 (left_edge.nextsym) with B2 (right_edge.lhs) to
            # generate B3 (result).
            result = unify(nextsym, found, bindings, rename_vars=False)
            if result is None: return
        else:
            if nextsym != found: return
            # Create a copy of the bindings.
            bindings = left_edge.bindings()

        # Construct the new edge.
        new_edge = left_edge.move_dot_forward(right_edge.end(), bindings)

        # Add it to the chart, with appropriate child pointers.
        if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
            yield new_edge
Example #12
    def apply(self, chart, grammar, edge):
        if edge.is_complete(): return
        nextsym, index = edge.nextsym(), edge.end()
        if not is_nonterminal(nextsym): return

        # If we've already applied this rule to an edge with the same
        # next & end, and the chart & grammar have not changed, then
        # just return (no new edges to add).
        nextsym_with_bindings = edge.next_with_bindings()
        done = self._done.get((nextsym_with_bindings, index), (None, None))
        if done[0] is chart and done[1] is grammar:
            return

        for prod in grammar.productions(lhs=nextsym):
            # If the left corner in the predicted production is
            # leaf, it must match with the input.
            if prod.rhs():
                first = prod.rhs()[0]
                if is_terminal(first):
                    if index >= chart.num_leaves(): continue
                    if first != chart.leaf(index): continue

            # We rename vars here, because we don't want variables
            # from the two different productions to match.
            if unify(prod.lhs(), nextsym_with_bindings, rename_vars=True):
                new_edge = FeatureTreeEdge.from_production(prod, edge.end())
                if chart.insert(new_edge, ()):
                    yield new_edge

        # Record the fact that we've applied this rule.
        self._done[nextsym_with_bindings, index] = (chart, grammar)
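These three `apply` methods correspond to the bottom-up predict/combine, fundamental, and top-down predict rules of NLTK's feature chart parser; a minimal sketch of a parse that runs through such rules (the grammar and sentence are made up):

from nltk.grammar import FeatureGrammar
from nltk.parse import FeatureChartParser

fg = FeatureGrammar.fromstring("""
% start S
S -> NP VP
NP -> 'john'
VP -> 'runs'
""")
for tree in FeatureChartParser(fg).parse('john runs'.split()):
    print(tree)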
Example #15
def is_separated(g):
    """Check if grmmar is separated

    Grammar is separated if all its productions start with a terminal

    And for all nonterminals, no two productions start with the same terminal

    Parameters
    ----------
    g : nltk.CFG

    Returns
    -------
    bool

    See Also
    --------
    nltk.CFG, nltk.Nonterminal
    """
    nonts = nonterminals(g)

    for nont in nonts:
        starts = set()
        for prod in g.productions(nont):
            start = prod.rhs()[0]

            if is_nonterminal(start):
                return False

            if start in starts:
                return False

            starts.add(start)

    return True
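For example (a sketch; assumes the module's nonterminals(g) helper, which collects the grammar's nonterminals):

from nltk import CFG

sep = CFG.fromstring("S -> 'a' S | 'b'")
overlap = CFG.fromstring("S -> 'a' S | 'a'")
print(is_separated(sep), is_separated(overlap))  # True False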
Example #16
def process_unit_productions(productions, nonterminal_dict):
    # maintain a set mirroring the production list, to speed up membership checks
    production_set = set(productions)
    need_another_loop = 0
    to_remove_list = []
    to_add_list = []
    for p in productions:
        if len(p.rhs()) == 1 and is_nonterminal(
                p.rhs()[0]):  # A->B, B is non-terminal
            to_remove_list.append(p)
            if p.rhs()[0] not in nonterminal_dict:
                nonterminal_dict[p.rhs()[0]] = [p.lhs()]
                need_another_loop = 1
            elif p.lhs() not in nonterminal_dict[p.rhs()[0]]:
                a = nonterminal_dict[p.rhs()[0]]
                a.append(p.lhs())
                nonterminal_dict[p.rhs()[0]] = a
                need_another_loop = 1
        elif p.lhs() in nonterminal_dict:  # B->C productions
            a = nonterminal_dict[p.lhs()]  # productions with B on the left
            for item in a:  # for every A in A->B
                new_production = Production(item, p.rhs())  # A->C
                if new_production not in production_set:
                    production_set.add(new_production)  # add to the grammar
                    to_add_list.append(new_production)
                    need_another_loop = 1
    return to_add_list, nonterminal_dict, need_another_loop, to_remove_list
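The return values suggest a fixed-point driver around this function; a hedged sketch of such a loop (the `productions` list here is hypothetical):

nonterminal_dict = {}
while True:
    to_add, nonterminal_dict, again, to_remove = process_unit_productions(
        productions, nonterminal_dict)
    productions = [p for p in productions if p not in to_remove] + to_add
    if not again:
        break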
Example #18
def _expand(symbol, grammar):
    # assumes: from numpy.random import choice; from math import log
    if is_nonterminal(symbol):
        rules = grammar.productions(lhs=symbol)
        probs = [r.prob() for r in rules]
        rule = choice(rules, p=probs)
        return (rule.rhs(), log(rule.prob()))
    else:
        return ((symbol,), 0.0)
Example #19
def apply_production(sent, prod):
    res = []
    for item in sent:
        if is_nonterminal(item) and item == prod.lhs():
            res.extend(prod.rhs())
        else:
            res.append(item)

    return res
Example #20
def SampleFromPCFG(grammar, start=None):
    if start is None:
        tupleSymbols = (grammar.start(),)
    else:
        tupleSymbols = (start,)
    lprob = 0.0
    # keep expanding until no nonterminals remain, accumulating log-prob
    while any(is_nonterminal(symbol) for symbol in tupleSymbols):
        tupleSymbols, lprob1 = _expand_seq(tupleSymbols, grammar)
        lprob += lprob1
    return (tupleSymbols, lprob)
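_expand_seq is not shown in this example; a companion sketch consistent with _expand above, expanding each symbol once per pass (terminals pass through unchanged):

def _expand_seq(tupleSymbols, grammar):
    out, lprob = (), 0.0
    for symbol in tupleSymbols:
        expansion, lp = _expand(symbol, grammar)
        out += expansion
        lprob += lp
    return out, lprob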
Example #21
    def apply(self, chart, grammar, edge):
        if edge.is_incomplete(): return
        for prod in grammar.productions(rhs=edge.lhs()):
            if isinstance(edge, FeatureTreeEdge):
                _next = prod.rhs()[0]
                if not is_nonterminal(_next): continue

            new_edge = FeatureTreeEdge.from_production(prod, edge.start())
            if chart.insert(new_edge, ()):
                yield new_edge
Example #23
def generate(symbol, grammar_dict):
    if symbol in grammar_dict:
        output = []
        rule = random.choice(grammar_dict[symbol])
        for element in rule:
            if grammar.is_nonterminal(element):
                output.append(generate(element.symbol(), grammar_dict))
            else:
                output.append(element)
        return ' '.join(output)
    return symbol
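`grammar_dict` maps each symbol string to a list of right-hand sides; one hypothetical way to build it from an nltk CFG (assumes `grammar` above is the nltk.grammar module):

import random
from nltk import grammar, CFG

def build_grammar_dict(cfg):
    # map each lhs symbol string to the list of its right-hand sides
    d = {}
    for prod in cfg.productions():
        d.setdefault(prod.lhs().symbol(), []).append(prod.rhs())
    return d

toy = CFG.fromstring("S -> NP VP\nNP -> 'john'\nVP -> 'runs'")
print(generate('S', build_grammar_dict(toy)))  # john runs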
Example #24
def check_is_nonterminal(*nts):
    """
    Checks that each of the given objects is a Nonterminal.

    :param nts: One or more objects, each of which may or may not be a Nonterminal

    :return: None
    """
    for nt in nts:
        if not gr.is_nonterminal(nt):
            raise TypeError("{} must be a nonterminal".format(nt))
    return
Example #26
    def _get_arg_product_rules(self, a_doc_id, a_arg, a_rel, a_parses):
        """Extract syntactic production rules for the given arg.

        Args:
          a_doc_id (str):
            id of the document
          a_arg (str):
            argument to extract productions for
          a_rel (dict):
            discourse relation to extract features for
          a_parses (dict):
            parsed sentences

        Returns:
          set:
            set of syntactic productions

        """
        ret = set()
        # obtain token indices for each arg sentence
        snt_id = None
        snt2tok = self._get_snt2tok(a_rel[a_arg][TOK_LIST])
        # obtain set of leaves corresponding to that argument
        arg_leaves = set()
        subt_leaves = set()
        processed_leaves = set()
        itree = itree_str = inode_path = None
        for snt_id, toks in snt2tok.items():
            itree_str = a_parses[a_doc_id][SENTENCES][snt_id][PARSE_TREE]
            itree = Tree.fromstring(itree_str)
            if not itree.leaves():
                print("Invalid parse tree for sentence {:d}".format(snt_id),
                      file=sys.stderr)
                continue
            # obtain all terminal syntactic nodes from the arg
            for itok in toks:
                inode_path = itree.leaf_treeposition(itok)
                arg_leaves.add(itree[inode_path])
            # check all subtrees (not efficient, but easy to implement)
            for s_t in itree.subtrees():
                subt_leaves.update(s_t.leaves())
                if subt_leaves.issubset(arg_leaves) and \
                   not subt_leaves.issubset(processed_leaves):
                    ret.update(
                        str(p) for p in itree.productions() if any(
                            is_nonterminal(n) for n in p.rhs()))
                    processed_leaves.update(subt_leaves)
                subt_leaves.clear()
                if processed_leaves == arg_leaves:
                    break
            arg_leaves.clear()
            processed_leaves.clear()
        return ret
Example #28
def generate_production(grammar):
    """Convert CNF grammar into dictionary where keys are RHS of the
    rules/productions and values are it's (rules/productions) corresponding
    LHS.

    Args:
        grammar ([type]): Object of type "nltk.grammar.CFG " containing the CNF grammar

    Returns:
        dict: CNF grammar with all productions.
    """
    grammar_dict = {}
    for production in grammar.productions():
        rhs = production.rhs()
        if len(rhs) == 2 and is_nonterminal(rhs[0]) and is_nonterminal(rhs[1]):
            key = (rhs[0].symbol(), rhs[1].symbol())
            if key not in grammar_dict:
                grammar_dict[key] = []
            grammar_dict[key].append(production)

    return grammar_dict
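A quick sketch of the resulting index:

from nltk import CFG

cnf = CFG.fromstring("S -> A B\nA -> 'a'\nB -> 'b'")
print(generate_production(cnf))  # {('A', 'B'): [S -> A B]}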
Example #29
    def gen_frame_line(self, nt):
        sentence = ''
        # sampling without replacement at full length shuffles the productions
        prods = random.sample(self.cfg.productions(lhs=nt),
                              len(self.cfg.productions(lhs=nt)))
        valid = True
        for prod in prods:
            # note: `valid` is never reset, so one unexpandable production
            # also rules out every production after it
            for sym in prod.rhs():
                if is_nonterminal(sym):
                    if len(self.cfg.productions(lhs=sym)) < 1:
                        valid = False
            if valid:
                for sym in prod.rhs():
                    if is_nonterminal(sym):
                        sentence += self.gen_frame_line(sym)
                    else:
                        sentence += sym + ' '
                break
        if not valid:
            return "ERROR"
        else:
            return sentence
Example #30
def pcky(sentence, grammar):
    tokens = word_tokenize(sentence)
    ts = '[0]'
    for i, token in enumerate(tokens):
        ts += ' ' + token + ' [{}]'.format(i + 1)
    print(ts)
    non_terminal = set([
        prod.lhs() for prod in grammar.productions()
        if is_nonterminal(prod.lhs())
    ])
    table = [[{nt: 0
               for nt in non_terminal} for i in range(len(tokens) + 1)]
             for j in range(len(tokens) + 1)]
    for i, token in enumerate(tokens):
        productions = grammar.productions(rhs=token)
        for prod in productions:
            table[i][i + 1][prod.lhs()] = prod.prob()

    for span in range(2, len(tokens) + 1):
        for start in range(len(tokens) - span + 1):
            end = start + span
            for split in range(start + 1, end):
                non_term1 = table[start][split]
                non_term2 = table[split][end]
                for nt1 in non_term1:
                    for nt2 in non_term2:
                        if non_term1[nt1] > 0 and non_term2[nt2] > 0:
                            prodlist = grammar.productions(rhs=nt1)
                            for prod in prodlist:
                                if prod.rhs() == (nt1, nt2):
                                    # note: a later derivation simply
                                    # overwrites an earlier one here,
                                    # rather than keeping the max
                                    table[start][end][prod.lhs()] = (
                                        prod.prob()
                                        * non_term1[nt1] * non_term2[nt2])
                                    print(
                                        '[{}] {}:({:.2f}) [{}] {}:({:.2f})'
                                        ' [{}] -> [{}] {}:({:.5f}) [{}]'.format(
                                            start, nt1, non_term1[nt1],
                                            split, nt2, non_term2[nt2],
                                            end, start, prod.lhs(),
                                            table[start][end][prod.lhs()],
                                            end))

    if table[0][len(tokens)][grammar.start()] > 0:
        print('The sentence is derived from the grammar')
        return True
    else:
        print('The sentence is not derived from the grammar')
        return False
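A hedged demo on a toy PCFG (word_tokenize requires NLTK's punkt tokenizer data to be installed):

from nltk import PCFG

toy = PCFG.fromstring("""
S -> NP VP [1.0]
NP -> 'john' [1.0]
VP -> V NP [1.0]
V -> 'saw' [1.0]
""")
pcky('john saw john', toy)  # prints chart entries, then the verdict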
Example #32
def eliminate_singular_rules(prods, root_token):
    # eliminates all rules whose rhs has only one member, by substitution

    terminals, nonterminals = get_terminals_nonterminals(prods)
    # find all singular rules p with p.lhs = nt
    # check if there are any non-singular rules or whether we can eliminate nt
    singles = {nt: [] for nt in nonterminals}
    others = {nt: [] for nt in nonterminals}
    for prod in prods:
        if is_singular(prod):
            singles[prod.lhs()].append(prod)
        elif len(prod.rhs()) >= 1:
            others[prod.lhs()].append(prod)
        else:
            raise ValueError("rhs must have at least one member! " + str(prod))

    lhs_has_others = set(others)

    new_prods = prods
    # first replace all singular rules starting with root token:
    new_prods = recursively_replace_root_singulars(new_prods, root_token)

    # for each lhs with singulars:
    for lhs, these_singles in singles.items():
        new_prods = recursively_replace_lhs(
            [p for p in new_prods if p not in these_singles], lhs,
            these_singles, lhs in lhs_has_others)
        if len(these_singles) > 0:
            # after we replaced one lhs, need to index the remaining rules all over again
            break

    # check if we still have any singles left, and repeat until done
    # (note: this tests the input prods, not new_prods)
    if any(len(prod.rhs()) == 1 and is_nonterminal(prod.rhs()[0])
           for prod in prods):
        return eliminate_singular_rules(new_prods, root_token)
    else:
        return new_prods
Example #33
def endings(g, n):
    """Get right hand sides that consist only of terminals

    Parameters
    ----------
    g : nltk.CFG

    n : nltk.Nonterminal

    Returns
    -------
    endings : set of tuple
        Right-hand sides consisting only of terminals

    See Also
    --------
    nltk.CFG, nltk.Nonterminal
    """
    res = set()

    for prod in g.productions(n):
        if all(not is_nonterminal(item) for item in prod.rhs()):
            res.add(prod.rhs())

    return res
Example #34
def get_symbol(element):
    if is_nonterminal(element):
        return element.symbol()
    else:
        return element
Example #35
def apply_rule(S, this_index, this_rule, grammar, checks=False):
    this_token = dict(S[this_index])
    this_inner_token = this_token['token']
    # do some safety checks
    if checks:
        assert (this_inner_token == this_rule.lhs())
        if ('cycle' in this_inner_token._symbol or 'num' in this_inner_token._symbol) \
            and 'size' not in this_token:
            # 'cycle' and 'num' tokens only appear in cycles, where they are assigned ring_sizes
            raise ValueError("'cycle' and 'num' tokens only appear in cycles, where they are assigned ring_sizes")

    # get the expansion
    new_tokens = [{'token': x} for x in this_rule.rhs()]

    propagate_strings = ['cycle', 'num']
    num_nonterms = ['num', 'num1']
    # if the expansion is a new ring, assign the numeral to use
    num_map = {}
    if 'ring' in this_token['token']._symbol:
        num_id = uuid.uuid4()
        num1_id = uuid.uuid4()
        for x in new_tokens:
            if is_nonterminal(x['token']) and \
                    any([ps in x['token']._symbol for ps in propagate_strings]):
                x['size'] = 1
                if grammar is not None:
                    if x['token'] == Nonterminal('num1'): # this is very hacky, to do better want modular aromatic cycles
                        x['num'] = num1_id
                    else:
                        x['num'] = num_id
                else:
                    x['num'] = None
        else:
            # for/else: the loop above has no break, so this branch always runs
            this_token['num'] = None
        this_token['size'] = 0

    elif 'num' in this_token:
        if this_token['token']._symbol in ['num', 'num1']:
            # tag the resulting terminal so we know it's a cycle numeral, not a charge numeral
            for x in new_tokens:
                x['is_cycle_numeral'] = True
        # if this_token is a cycle propagation token, propagate the numeral and size counter
        for x in new_tokens:
            if is_nonterminal(x['token']) and \
                    any([ps in x['token']._symbol for ps in propagate_strings]):
                x['num'] = this_token['num']
                x['size'] = this_token['size'] + rule_adds_atom(this_rule)

    if checks:
        for x in new_tokens:
            if is_nonterminal(x['token']) and \
                    any([ps in x['token']._symbol for ps in propagate_strings]):
                assert('num' in x and 'size' in x)

    for x in new_tokens:
        try:
            x['term_dist'] = term_dist_calc(x)
        except Exception:
            # ignore tokens whose terminal distance cannot be computed
            pass

    return new_tokens
Example #36
def check_canonical(g):
    """Check grammar for canonical rules violation

    These rules are (in simple words):

    #. Starting symbol must not appear in any rhs
    #. There should be no unproductive or unreachable nonterminals
    #. All nonterminals, except the starting one, must have more than one production
    #. For every nonterminal except the starting one, its productions must not all end with the same terminal
    #. Every pair of nonterminals, except pairs with the starting one, must produce different languages
    #. For every nonterminal except the starting one, its productions must not all end with the same nonterminal

    Parameters
    ----------
    g : nltk.CFG
        Must be also separated grammar

    Returns
    -------
    broken_rules : set of int
        The numbers of the rules that are broken.
        If this set is empty, the grammar may be canonical

    See Also
    --------
    nltk.CFG, nltk.Nonterminal
    """
    if not is_separated(g):
        raise ValueError("Non-separated grammar was given")

    nonts = nonterminals(g)

    broken_rules = set()

    ends = {nont: set() for nont in nonts}
    counts = {nont: 0 for nont in nonts}

    for prod in g.productions():
        ends[prod.lhs()].add(prod.rhs()[-1])
        counts[prod.lhs()] += 1

        for item in prod.rhs():
            if item == g.start():
                broken_rules.add(1)

    for end in ends.values():
        if len(end) == 1:
            if is_nonterminal(end.pop()):
                broken_rules.add(6)
            else:
                broken_rules.add(4)

    for nont, num in counts.items():
        if nont == g.start():
            continue

        if num == 1:
            broken_rules.add(3)

    trash1 = unproductive(g)
    trash2 = unreachable(g)

    if trash1 or trash2:
        broken_rules.add(2)

    for n1, n2 in itertools.combinations(nonts, 2):
        if nonterm_equal(g, n1, n2):
            broken_rules.add(5)

    return broken_rules
Example #37
def is_propagator(x):
    return is_nonterminal(x) and any(
        ps in x._symbol for ps in propagate_strings)
Example #38
def min_pnet(g):
    """Generate a minimal Pnet that can be restored back to grammar

    Also calculates `t` and `h` values for a grammar
    These values are properties of the grammar, and used in restoration

    They are accessible via `graph` field of the Pnet

    Parameters
    ----------
    g : nltk.CFG

    Returns
    -------
    net : Pnet

    See Also
    --------
    pnet.Pnet, nltk.grammar
    """
    nont_sents = _minimal_different_sents(g)

    t = len(max((s for sents in nont_sents.values() for s in sents), key=len))
    nets = {nont: Pnet(sents) for nont, sents in nont_sents.items()}
    start = g.start()

    res = Pnet(prod.rhs() for prod in g.productions(start))
    res.graph['t'] = t

    completed = {start}
    change = True

    while change:
        change = False

        for (s, e, k) in list(res.edges(keys=True)):
            if is_nonterminal(k):
                if k in completed:
                    res.remove_edge(s, e, k)
                    res.insert(nets[k], s, e)
                else:
                    change = True
                    completed.add(k)
                    temp = Pnet(prod.rhs() for prod in g.productions(k))
                    res.remove_edge(s, e, k)
                    res.insert(temp, s, e)

    tree = res.subnet_tree()

    h = 0
    for subnet in tree.nodes():
        parent_start = subnet[0]
        for child_net in tree.successors(subnet):
            child_start = child_net[0]

            pathlen = len(max(nx.all_simple_edge_paths(res, parent_start, child_start), key=len))
            h = max(h, pathlen)

    res.graph['h'] = h

    return res
Example #39
def is_singular(prod):
    return len(prod.rhs()) == 1 and is_nonterminal(prod.rhs()[0])