Example #1
def create_taskgrammar(grammar, task, encoders):
    logger.info('Creating specific grammar for task %s' % task)
    productions = grammar.productions(Nonterminal(task))
    start_token = Nonterminal('S')
    new_productions = []

    for start_production in productions:
        first_token = start_production.rhs()[0]
        if is_nonterminal(first_token) and first_token.symbol().endswith('_TASK'):
            for new_start_production in grammar.productions(first_token):
                new_productions.append(Production(start_token, new_start_production.rhs()))
        else:
            new_productions.append(Production(start_token, start_production.rhs()))

    for production in grammar.productions():
        for new_production in new_productions:
            if production.lhs() in new_production.rhs() and production not in new_productions:
                if production.lhs().symbol() == 'ENCODERS':  # Use encoders only for types of features in the dataset
                    if len(encoders) > 0:
                        new_productions.append(Production(production.lhs(), [Nonterminal(e) for e in encoders]))
                    else:
                        new_productions.append(Production(production.lhs(), ['E']))
                else:
                    new_productions.append(production)

    task_grammar = CFG(start_token, new_productions)

    with open(TASK_GRAMMAR_PATH, 'w') as fout:
        fout.write('\n'.join([str(x) for x in task_grammar.productions()]))

    return task_grammar
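The function above depends on project-specific globals (logger, TASK_GRAMMAR_PATH). A minimal, self-contained sketch of the NLTK calls it is built on, filtering productions by left-hand side and re-rooting the grammar under a new start symbol (grammar contents invented for illustration):

from nltk import CFG, Nonterminal, Production

grammar = CFG.fromstring("""
    ROOT -> CLASSIFICATION_TASK
    CLASSIFICATION_TASK -> IMPUTATION CLASSIFIER
    IMPUTATION -> 'imputer'
    CLASSIFIER -> 'random_forest'
""")

# All productions whose lhs is CLASSIFICATION_TASK become S-productions:
task_prods = grammar.productions(Nonterminal('CLASSIFICATION_TASK'))
new_start = Nonterminal('S')
new_prods = [Production(new_start, p.rhs()) for p in task_prods]
task_grammar = CFG(new_start, new_prods + [p for p in grammar.productions()
                                           if p.lhs() != Nonterminal('ROOT')])
print(task_grammar)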
Example #2
def get_all_pronouns(tree):
    pronouns_available = []
    for production in tree.productions():
        lhs = production.lhs()
        if lhs in (Nonterminal('PRP'), Nonterminal('PossPro')) and isinstance(production.rhs()[0], str):
            pronouns_available.append(production.rhs()[0])
    return pronouns_available
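A quick usage sketch for get_all_pronouns (tree invented for illustration; assumes the snippet's usual nltk imports):

from nltk import Tree

tree = Tree.fromstring(
    "(S (NP (PRP I)) (VP (V saw) (NP (PossPro my) (N dog))))")
print(get_all_pronouns(tree))  # ['I', 'my']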
Example #3
def reinsert_unary_chains(tree, old_grammar):
  old_unary_productions = [p for p in old_grammar.productions() if len(p) == 1 and p.is_nonlexical()]

  nodeList = [tree]
  while nodeList != []:
    node = nodeList.pop()
    if not isinstance(node, Tree):
      continue
    
    assert len(node) <= 2

    nodeCopy = node.copy()
    children_rhs = [Nonterminal(child.label()) if not isinstance(child, str) else child for child in node]

    possibilities = []
    possibility = [Nonterminal(node.label())]
    query = Production(possibility[-1], children_rhs)
    while query not in old_grammar.productions():
      new_possibilities = [possibility + [p.rhs()[0]] for p in old_unary_productions if p.lhs() == possibility[-1]]
      possibilities.extend(new_possibilities)
      possibility = possibilities.pop(0)
      query = Production(possibility[-1], children_rhs)
      
    # Once a chain has been found, add it back in:
    node[0:] = [] # remove children
    lastnode = node
    for nt in possibility[1:]:
      newnode = Tree(nt.symbol(), [])
      lastnode[0:] = [newnode]
      lastnode = newnode
    lastnode[0:] = [child for child in nodeCopy]

    for child in lastnode:
      nodeList.append(child)
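A self-contained round-trip check of reinsert_unary_chains (toy grammar and tree invented for illustration; assumes the snippet's imports of Tree, Production, and Nonterminal from nltk):

from nltk import CFG, Tree

old_grammar = CFG.fromstring("""
    S -> NP VP
    NP -> PRP
    PRP -> 'it'
    VP -> 'works'
""")
tree = Tree.fromstring("(S (NP it) (VP works))")  # unary NP -> PRP link removed
reinsert_unary_chains(tree, old_grammar)
print(tree)  # (S (NP (PRP it)) (VP works))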
Example #4
def main(args):

    sentence = args.sentence.lower()
    args.sentence = sentence
    tokens = sentence.split()
    grammar = loadGrammar(args)
    nonterm = getnonterm(grammar)
    terminalProductionRules = getTerminalProbability(args, grammar, nonterm)
    HSrules = grammar.productions(Nonterminal('HS'))
    for rule in HSrules:
        grammar.productions().remove(rule)

    ESrules = grammar.productions(Nonterminal('ES'))
    for rule in ESrules:
        grammar.productions().remove(rule)

    grammar.productions().extend(terminalProductionRules)

    for token in tokens:
        grammar.productions().append(
            ProbabilisticProduction(Nonterminal(token.upper()),
                                    [str(token)],
                                    prob=1))

    #print "Grammars"
    grammarlist = str(grammar).split('\n')[1:]

    #print "Transfered"
    strgrammar = ''
    for p in grammar.productions():
        rhs = p.rhs()
        rhsstr = ''
        for r in rhs:
            if is_terminal(r):
                rhsstr += '\'' + str(r) + '\' '
            else:
                rhsstr += str(r) + ' '
        strgrammar += str(p.lhs()) + ' -> ' + rhsstr + ' [' + '{0:.8f}'.format(
            p.prob()) + ']\n'
    #print strgrammar

    grammar = PCFG.fromstring(strgrammar.split('\n'))
    #'''
    #grammar = loadGrammar(args)

    #tokens = args.sentence.lower().split()
    #nonterm = getnonterm(grammar)

    CYK(tokens, nonterm, grammar)
    #with open(args.grammar_file, 'r') as f:
    #        content = f.read()

    #trees = corpus2trees(content)
    #productions = trees2productions(trees)
    #listnonterm = []
    #grammar = nltk.grammar.induce_pcfg(nltk.grammar.Nonterminal('SS'), productions)
    #print grammar

    #'''
Example #5
    def test_production_from_grammar(self):
        grammar_str = """
        S -> NP VP
        PP -> P NP
        NP -> Det N | NP PP
        VP -> V NP | VP PP
        Det -> 'a' | 'the'
        N -> 'dog' | 'cat'
        V -> 'chased' | 'sat'
        P -> 'on' | 'in'
        """

        grammar = parse_cfg(grammar_str)
        productions = grammar.productions()

        expect_production = Production(
            lhs=Nonterminal("S"), rhs=[Nonterminal("NP"),
                                       Nonterminal("VP")])
        error_msg = "Expected to find '{}' in:\n{}".format(
            expect_production, grammar_str)
        self.assertIn(expect_production, productions, error_msg)

        expect_production = Production(lhs=Nonterminal("N"), rhs=['dog'])
        error_msg = "Expected to find '{}' in:\n{}".format(
            expect_production, grammar_str)
        self.assertIn(expect_production, productions, error_msg)

        # A plain string lhs/rhs is not equal to a Nonterminal, and quotes
        # are not part of a terminal's value, so neither of these matches.
        expect_not_in = Production(lhs="S", rhs=["NP", "VP"])
        self.assertNotIn(expect_not_in, productions)

        expect_not_in = Production(lhs=Nonterminal("N"), rhs=["'dog'"])
        self.assertNotIn(expect_not_in, productions)
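parse_cfg is the NLTK 2.x entry point; under NLTK 3 the same checks would be written against CFG.fromstring, e.g. (a sketch):

from nltk import CFG, Nonterminal, Production

grammar = CFG.fromstring("S -> NP VP\nNP -> 'dogs'\nVP -> 'bark'")
productions = grammar.productions()
assert Production(Nonterminal("S"), [Nonterminal("NP"), Nonterminal("VP")]) in productions
assert Production(Nonterminal("NP"), ["dogs"]) in productions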
Example #6
    def traverse(self, node):
        assert self.grammar is not None
        prob = 0.0
        length = 0
        if node.height() == 2:
            return (prob, length)
        lhs = Nonterminal(node.label())
        productions = self.grammar.productions(lhs)

        #find the productions from
        flag = False
        rhs_list = []
        for c in node:
            rhs_list.append(Nonterminal(c.label()))
        tuple_rhs = tuple(rhs_list)
        for p in productions:
            if p.lhs() == lhs and p.rhs() == tuple_rhs:
                flag = True
                prob += math.log(p.prob())
                break
        if not flag:
            prob += math.log(eps)
        length += 1
        for c in node:
            ret = self.traverse(c)
            prob += ret[0]
            length += ret[1]
        return (prob, length)
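The same score can be computed without explicit recursion via Tree.productions(); a sketch (eps and the toy grammar are assumptions, and unlike the method above this version also scores lexical rules):

import math
from nltk import PCFG, Tree

eps = 1e-6
g = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> 'dogs' [1.0]
    VP -> 'bark' [1.0]
""")
t = Tree.fromstring("(S (NP dogs) (VP bark))")

logprob = 0.0
for prod in t.productions():
    matches = [p for p in g.productions(lhs=prod.lhs()) if p.rhs() == prod.rhs()]
    logprob += math.log(matches[0].prob()) if matches else math.log(eps)
print(logprob)  # 0.0 here, since every rule has probability 1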
Example #7
    def test_current_production(self):
        inputs_ = [("""
                (S
                    (sentence
                        (type_1_sentence_coord_1
                        (type_1_sentence_coord_2
                            (type_2_sentence
                            (THERE There)
                            (AUX is)
                            (Noun_Phrase
                                (det (DET an))
                                (Noun_w_support
                                (Adj_phrase
                                    (Adj_core (JJ small))
                                    (AND and)
                                    (Adj_phrase (Adj_core (JJ red))))
                                (Noun_Count (NN apple)))))))
                        (PERIOD .)))
                """, Production(Nonterminal("S"), [Nonterminal("sentence")]))]

        for i, (input_, expect_) in enumerate(inputs_):
            tree = Tree.parse(input_)
            production = current_production(tree)

            self.assertEqual(expect_, production)
Example #8
def generate_grammar_and_parsers(parsed_sents):
    # From sentences, extract the parsing tree and transform each tree to a list of CFG productions;
    # generate a set containing all the productions (without repetitions)
    tbank_productions_with_repet = [
        production for parsed_sent in parsed_sents
        for production in parsed_sent.productions()
    ]
    tbank_productions = set(
        tbank_productions_with_repet)  # exclude repetitions
    print("Num. of unique productions read:", len(tbank_productions))

    # Build a CFG from the productions
    print("\nBuilding a CFG...")
    cfg_grammar = CFG(Nonterminal('S'), tbank_productions)  # a CFG
    print(cfg_grammar, end="\n\n")

    # CFG - An Earley parser
    cfg_earley_parser = EarleyChartParser(cfg_grammar, trace=3)
    # Build a PCFG from the productions

    print("Building a PCFG...")
    pcfg_grammar = induce_pcfg(
        Nonterminal('S'),
        tbank_productions_with_repet)  # a PCFG, here repetitions are needed!
    print(pcfg_grammar, end="\n\n")

    # Allocate a bottom-up chart parser for PCFG; see: http://www.nltk.org/_modules/nltk/parse/pchart.html
    pcfg_pchart_parser = InsideChartParser(pcfg_grammar)

    return cfg_earley_parser, pcfg_pchart_parser  # return both parsers
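A hedged usage sketch, feeding it the Penn Treebank sample bundled with NLTK (requires the 'treebank' data package to be downloaded):

from nltk.corpus import treebank

earley_parser, pchart_parser = generate_grammar_and_parsers(
    treebank.parsed_sents()[:3])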
Example #9
 def Parse(self, sent):
     """ Implement the CKY algorithm for PCFGs, populating the dynamic programming 
     table with log probabilities of every constituent spanning a sub-span of a given 
     test sentence (i, j) and storing the appropriate back-pointers. 
     """
     sent.append(" ")
     dynamic_table = defaultdict(float)
     backpointers = defaultdict(tuple)
     #for j,token in enumerate(sent):
     for j in range(0,len(sent)):
         for rule in self._r2l_lex[(sent[j],)]:
             dynamic_table[(j,j+1,rule.lhs())] = log(rule.prob())
         
         for i in range(j-1,-1,-1):
             for k in range(i+1, j):
                 newlist1 = []
                 newlist2 = []
                 for key in dynamic_table.keys():
                     if key[0] == i and key[1] == k:
                         newlist1.append(key[2])
                     if key[0] == k and key[1] == j:
                         newlist2.append(key[2])
                 for b in newlist1:
                     for c in newlist2:
                         rulelist = self._r2l[(b,c)]
                         for rule in rulelist:
                             candidate = (log(rule.prob())
                                          + dynamic_table[(i, k, rule.rhs()[0])]
                                          + dynamic_table[(k, j, rule.rhs()[1])])
                             if (i, j, rule.lhs()) not in dynamic_table or dynamic_table[(i, j, rule.lhs())] < candidate:
                                 dynamic_table[(i, j, rule.lhs())] = candidate
                                 backpointers[(i, j, rule.lhs())] = (k, rule.rhs()[0], rule.rhs()[1])
     if sent == ["Terms", "were", "n't", "disclosed", ".", " "]:
         print(dynamic_table[(0, len(sent) - 1, Nonterminal("S"))])
     return self.BuildTree(dynamic_table,sent,backpointers,(0,len(sent)-1,Nonterminal("S")))
Example #10
 def BuildTree(cky_table, sent):
     n = len(sent)
     if Nonterminal("S") not in cky_table[0][n - 1]:
         # print("not start with S")
         return None
     else:
         tree = BuildTreeHelper(cky_table, sent, 0, n - 1, Nonterminal("S"))
         return tree
Example #11
    def convert(tree):
        # convert from ntlk.tree.Tree to our AnnotatedTree

        if isinstance(tree, nltk.tree.Tree):
            symbol = Nonterminal(tree.label())
            children = list(convert(_) for _ in tree)
            rule = Production(Nonterminal(tree.label()), _child_names(tree))
            rule_selection_id = _find_rule_selection_id(rule)
            return AnnotatedTree(symbol=symbol, children=children, rule=rule, rule_selection_id=rule_selection_id)
        else:
            return AnnotatedTree(symbol=tree)
Example #12
def terminal_distance(grammar, x):
    # due to masking that enforces minimal ring length, must override term distances derived purely from grammar
    if x['token'] == Nonterminal('aliphatic_ring'):
        return 8
    elif x['token'] == Nonterminal('cycle_bond'):
        return max(2, 7 - x['ring_size'])
    elif x['token'] == Nonterminal('cycle_double_bond'):
        # need to go at least to cycle_bond -> num1 -> number
        return max(3, 7 - x['ring_size'])
    else:
        return grammar.terminal_dist(x['token'])
Example #13
def str2production(prod_str):
    """Parse a string like 'S -> NP VP' into an nltk Production.
    Note: every rhs symbol is treated as a Nonterminal."""
    lhs_str, _, rhs_str = prod_str.partition('->')
    lhs = Nonterminal(lhs_str.strip())
    rhs = [Nonterminal(e.strip()) for e in rhs_str.split()]
    return Production(lhs, rhs)
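Usage sketch for the fixed version (remember that every rhs symbol comes back as a Nonterminal, including words):

p = str2production("S -> NP VP")
print(p.lhs(), p.rhs())  # S (NP, VP)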
Example #14
File: pset4.py Project: limz10/NLP
 def BuildTree(cky_table, sent):
     """ Build a tree by following the back-pointers starting from the largest span
         (0, len(sent)) and recursing from larger spans (i, j) to smaller sub-spans
         (i, k), (k, j) and eventually bottoming out at the preterminal level (i, i+1).
     """
     if Nonterminal('S') not in cky_table[(0, len(sent))]:
         return None
     else:
         return InvertedGrammar.recursive_build(cky_table, sent,
                                                Nonterminal("S"), 0,
                                                len(sent))
Example #15
    def __init__(self, grammar=grammar_zinc_new, checks=False):
        # self.mask_gen = get_mask_gen()
        # self.mask_gen.do_terminal_mask = False
        self.term_dist = {}
        self.d_term_dist = {}
        self.grammar = grammar
        self.GCFG = self.grammar.GCFG
        self.checks = checks

        for p in self.GCFG.productions():
            for s in p.rhs():
                if is_terminal(s):
                    # terminals have term distance 0
                    self.term_dist[frozendict({'token': s})] = 0

        self.term_dist[frozendict({'token': Nonterminal('None')})] = 0

        # seed the search with the root symbol
        self.term_dist[frozendict({'token': Nonterminal('smiles')})] = float('inf')

        while True: # iterate to convergence
            # print('*** and one more pass... ***')
            last_term_dist = copy.copy(self.term_dist)
            for sym in last_term_dist.keys():
                if is_terminal(sym['token']):
                    self.term_dist[sym] = 0
                if self.term_dist[sym] > 0:
                    mask = self.get_mask_from_token(sym)
                    # [p for ip, p in enumerate(self.GCFG.productions()) if mask[ip]]
                    if self.checks:
                        assert (not all([x == 0 for x in mask]))
                    for ip, p in enumerate(self.GCFG.productions()):
                        if mask[ip]:
                            # print('trying', sym, p)
                            this_exp = apply_rule([sym], 0, p, None, self.checks)
                            this_term_dist = 1
                            for this_sym in this_exp:
                                if frozendict(this_sym) not in self.term_dist:
                                    self.term_dist[frozendict(this_sym)] = float('inf')
                                    print('added ', this_sym, 'from', sym, 'via', p)
                                    # if 'ring_size' in sym and sym['ring_size'] > 6:
                                    #     print('aaa')
                                this_term_dist += self.term_dist[frozendict(this_sym)]
                            if this_term_dist < self.term_dist[frozendict(sym)]:
                                # if 'ring_size' in sym and sym['ring_size'] > 6:
                                #     print('aaa')
                                print('improving:', p, self.term_dist[frozendict(sym)], this_term_dist,
                                      [self.term_dist[frozendict(this_sym)] for this_sym in this_exp])
                                self.term_dist[frozendict(sym)] = this_term_dist

            if last_term_dist == self.term_dist:
                break
Example #16
 def read_productions(self, productions_filename):
     productions = []
     with io.open(productions_filename, 'r', encoding='utf8') as f:
         for line in f:
             line = line.strip()
             components = line.split(u'+')
             lhs = Nonterminal(components[0])
             rhs = tuple([
                 Nonterminal(nt.strip()) for nt in components[1].split(u' ')
             ])
             prob = float(components[2])
             pp = ProbabilisticProduction(lhs, rhs, prob=prob)
             productions.append(pp)
     self.grammar = PCFG(Nonterminal('S'), productions)
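For reference, the file format implied by the parsing code is one rule per line with lhs, rhs, and probability joined by '+' (inferred from the code, not documented); probabilities for each lhs must sum to 1 for the PCFG constructor to accept them:

S+NP VP+1.0
NP+DT NN+0.7
NP+NN+0.3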
Example #17
def train():
    files = tb.fileids()
    data = list(tb.parsed_sents(files))

    # 80:20 split
    split = int(len(data) * 0.8)
    train_data = data[:split]
    test_data = data[split:]

    P_grammar, P_non_terms, P_vocab, P_term_parents, P_parents_count = pcfg.pcfg(
        train_data)

    total_precision = 0
    total_recall = 0
    total_f1_score = 0
    i = 0
    for test in test_data:
        print('Test', i)
        i += 1
        try:
            words = test.leaves()
            scores, backs = cky_parsing(words, copy(P_grammar),
                                        copy(P_non_terms), copy(P_vocab),
                                        copy(P_term_parents),
                                        copy(P_parents_count))
            start = Tree(Nonterminal('S'), [])
            if scores[0][len(words)][Nonterminal('S')] == 0:
                start = get_start(scores, len(words))
            predicted_tree = build_tree(start, 0, len(words), backs,
                                        P_non_terms)
            clean_tree(predicted_tree)
            predicted_tree.un_chomsky_normal_form()
            precision, recall, f1_score = evaluate(words, predicted_tree, test)
            print(precision, recall, f1_score)
            total_precision += precision
            total_recall += recall
            total_f1_score += f1_score
        except Exception:
            print('***************Failed', i - 1)
            continue

    total_precision /= len(test_data)
    total_recall /= len(test_data)
    total_f1_score /= len(test_data)

    print('Precision', total_precision)
    print('Recall', total_recall)
    print('F1_score', total_f1_score)
Example #18
def pcfg_bcl(C, alpha=ALPHA, gd_thr=LPG_DIFF_THRESHOLD, mc_thr=MC_THRESHOLD):
    print("\ninitializing...")
    global ALPHA
    global LPG_DIFF_THRESHOLD
    global MC_THRESHOLD
    global and_symb_count
    global or_symb_count
    global ignore_mc_ec
    ALPHA = alpha
    LPG_DIFF_THRESHOLD = gd_thr
    MC_THRESHOLD = mc_thr
    and_symb_count = 0
    or_symb_count = 0
    ignore_mc_ec = False
    
    ## create an empty grammar G
    S = Nonterminal("_START_")
    R = [ProbabilisticProduction(S, [""], prob=1.)]
    G = PCFG(S, R)
    
    T = _create_t(C) # create a table T
    
    ## repeat until no further rule to be learned
    i = 0
    while not _finished(T):
        i += 1
        print("\niter. n° %d" % (i,))
        found, G, C, T, N = _learning_by_biclustering(G, C, T)
        if not found:
            print("NO MORE RULES CAN BE LEARNED")
            break
        G, C, T = _attaching(N, G, C, T)
    G = _postprocessing(G, C)
    print("\n", G) # DEBUG
    return G
Example #19
def binarize(grammar):
    """Binarize grammar by introducing new nonterminals"""
    result = []

    for rule in grammar.productions():
        if len(rule.rhs()) > 2:
            # this rule needs to be broken down
            left_side = rule.lhs()
            symbol_names = [
                tsym.symbol() if not isinstance(tsym, str) else '@' + tsym
                for tsym in rule.rhs()
            ]
            for k in range(1, len(rule.rhs()) - 1):
                new_rhs_name = rule.lhs().symbol() + '|<' + '-'.join(
                    symbol_names[k:]) + '>'
                new_sym = Nonterminal(new_rhs_name)
                new_production = Production(left_side,
                                            (rule.rhs()[k - 1], new_sym))
                left_side = new_sym
                result.append(new_production)
            last_prd = Production(left_side, rule.rhs()[-2:])
            result.append(last_prd)
        else:
            result.append(rule)

    n_grammar = CFG(grammar.start(), result)
    return n_grammar
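A quick check of binarize on a ternary rule (toy grammar invented for illustration; assumes the snippet's nltk imports):

from nltk import CFG

g = CFG.fromstring("S -> A B C\nA -> 'a'\nB -> 'b'\nC -> 'c'")
print(binarize(g))
# S -> A S|<B-C> and S|<B-C> -> B C replace S -> A B C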
Example #20
def convert_hybrid(grammar):
    '''
    Convert rules in the form of [A -> 'b' C] where the rhs has both non-terminals and terminals
    into rules in the form of [A -> B C] & [B -> 'b'] with a dummy non-terminal B
    '''
    rules = grammar.productions()
    new_rules = []
    for rule in rules:
        lhs = rule.lhs()
        rhs = rule.rhs()
        # check for hybrid rules
        if rule.is_lexical() and len(rhs) > 1:
            new_rhs = []
            for item in rule.rhs():
                if is_terminal(item):
                    new_sym = Nonterminal(item)
                    new_rhs.append(new_sym)
                    # add new lexical rule with dummy lhs nonterminal
                    new_rules.append(Production(new_sym, (item, )))
                else:
                    new_rhs.append(item)
            # add converted mixed rule with only non-terminals on rhs
            new_rules.append(Production(lhs, tuple(new_rhs)))
        else:
            new_rules.append(rule)

    new_grammar = CFG(grammar.start(), new_rules)

    return new_grammar
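A quick check of convert_hybrid on a hybrid rule (toy grammar invented for illustration; assumes the snippet's nltk imports):

from nltk import CFG

g = CFG.fromstring("S -> 'b' C\nC -> 'c'")
print(convert_hybrid(g))
# S -> b C, b -> 'b', C -> 'c'  (b is the dummy preterminal)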
Example #21
    def parse(self, tokens):
        tagged = nltk.pos_tag(tokens)
        missing = False
        for tok, pos in tagged:
            if not self._grammar._lexical_index.get(tok):
                missing = True
                self._grammar._productions.append(
                    ProbabilisticProduction(Nonterminal(pos), [tok],
                                            prob=0.000001))


# WeightedProduction(Nonterminal(pos), [tok], prob=0.000001))
        if missing:
            self._grammar._calculate_indexes()

        # returns a generator, so call 'next' to get the ProbabilisticTree
        tree = super(PCFGViterbiParser, self).parse(tokens)
        if isinstance(tree, nltk.tree.Tree):
            print('returning a tree')
            return tree
        elif isinstance(tree, types.GeneratorType):
            try:
                return next(tree)
            except StopIteration:
                tweet = ' '.join(tokens)
                print(u"Couldn't parse {}".format(tweet))
                return None
        else:
            error("Type of tree is: {}".format(type(tree)))
Example #22
def code_to_sample(code, grammar, items=[Nonterminal("S")]):
    """Reconstructs expression and productions from parse tree encoding.
    Input:
        code - parse tree encoding in string format, as returned by generate_sample
        grammar - PCFG object that was used to generate the code
        items - list containing start symbol for the grammar. Default: [Nonterminal("S")]
    Output:
        frags - expression in list form. Call "".join(frags) to get string.
        productions - list of used productions in string form. The parse tree is ordered top to bottom, left to right.
        code0 - auxiliary variable used by the recursion. Should be an empty string on return; if not, something went wrong."""
    code0 = code
    frags = []
    productions=[]
    if len(items) == 1:
        if isinstance(items[0], Nonterminal):
            prods = grammar.productions(lhs=items[0])
            prod = prods[int(code0[0])]
            productions += [prod]
            frag, productions_child, code0 = code_to_sample(code0[1:], grammar, prod.rhs())
            frags += frag
            productions += productions_child
        else:
            frags += [items[0]]
    else:
        for item in items:
            frag, productions_child, code0 = code_to_sample(code0, grammar, [item])
            frags += frag
            productions += productions_child
    #print(frags, code0)
    return frags, productions, code0
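A worked example (toy grammar invented for illustration): the code "01" picks production 0 for the outer S, then production 1 for the nested S:

from nltk import PCFG

g = PCFG.fromstring("""
    S -> 'a' S [0.5]
    S -> 'b' [0.5]
""")
frags, prods, rest = code_to_sample("01", g)
print("".join(frags))  # ab
print(rest == "")      # True when the code was consumed exactly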
Example #23
File: cfg.py Project: Geolem/nltk
def demo2():
    from nltk import Nonterminal, Production, CFG

    nonterminals = "S VP NP PP P N Name V Det"
    (S, VP, NP, PP, P, N, Name, V,
     Det) = [Nonterminal(s) for s in nonterminals.split()]
    productions = (
        # Syntactic Productions
        Production(S, [NP, VP]),
        Production(NP, [Det, N]),
        Production(NP, [NP, PP]),
        Production(VP, [VP, PP]),
        Production(VP, [V, NP, PP]),
        Production(VP, [V, NP]),
        Production(PP, [P, NP]),
        Production(PP, []),
        Production(PP, ["up", "over", NP]),
        # Lexical Productions
        Production(NP, ["I"]),
        Production(Det, ["the"]),
        Production(Det, ["a"]),
        Production(N, ["man"]),
        Production(V, ["saw"]),
        Production(P, ["in"]),
        Production(P, ["with"]),
        Production(N, ["park"]),
        Production(N, ["dog"]),
        Production(N, ["statue"]),
        Production(Det, ["my"]),
    )
    grammar = CFG(S, productions)

    text = "I saw a man in the park".split()
    d = CFGDemo(grammar, text)
    d.mainloop()
Example #24
File: cfg.py Project: wrand/tweater
def demo2():
    from nltk import Nonterminal, Production, ContextFreeGrammar
    nonterminals = 'S VP NP PP P N Name V Det'
    (S, VP, NP, PP, P, N, Name, V,
     Det) = [Nonterminal(s) for s in nonterminals.split()]
    productions = (
        # Syntactic Productions
        Production(S, [NP, VP]),
        Production(NP, [Det, N]),
        Production(NP, [NP, PP]),
        Production(VP, [VP, PP]),
        Production(VP, [V, NP, PP]),
        Production(VP, [V, NP]),
        Production(PP, [P, NP]),
        Production(PP, []),
        Production(PP, ['up', 'over', NP]),

        # Lexical Productions
        Production(NP, ['I']),
        Production(Det, ['the']),
        Production(Det, ['a']),
        Production(N, ['man']),
        Production(V, ['saw']),
        Production(P, ['in']),
        Production(P, ['with']),
        Production(N, ['park']),
        Production(N, ['dog']),
        Production(N, ['statue']),
        Production(Det, ['my']),
    )
    grammar = ContextFreeGrammar(S, productions)

    text = 'I saw a man in the park'.split()
    d = CFGDemo(grammar, text)
    d.mainloop()
Example #25
 def productions(self):
     prod = []
     prod.append(Production(Nonterminal(self._label), self.children_name()))
     for i in self._child:
         if isinstance(i, Tree):
             prod.extend(i.productions())
     return prod
Example #26
def process_hybrid_productions(productions):
    new_productions_list = []  # list of new productions
    to_remove_list = []
    # Hybrid production
    for p in productions:
        is_hybrid = 0  # flag that indicates if the current production is hybrid
        if len(p.rhs()) > 1:  # more than one symbol on the right-hand side
            rh_list = []  # new list for right-hand symbols
            for r_symbol in p.rhs():
                if is_terminal(r_symbol):  # terminal symbol
                    dummy_symbol = Nonterminal(r_symbol)  # create dummy nonterminal
                    new_productions_list.append(
                        Production(dummy_symbol, [r_symbol]))  # new unit production
                    rh_list.append(dummy_symbol)
                    is_hybrid = 1  # hybrid production confirmed
                else:  # nonterminal symbol
                    rh_list.append(r_symbol)
            if is_hybrid:
                # We can't modify the production list while iterating over it,
                # so store the changes and apply them afterwards.
                new_productions_list.append(
                    Production(p.lhs(), rh_list))  # replacement rule with dummy symbols
                to_remove_list.append(p)
    return to_remove_list, new_productions_list
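Usage sketch (toy grammar invented for illustration; assumes the snippet's nltk imports), applying the returned changes to a production list:

from nltk import CFG

g = CFG.fromstring("S -> 'b' C\nC -> 'c'")
prods = list(g.productions())
to_remove, new_prods = process_hybrid_productions(prods)
for p in to_remove:
    prods.remove(p)
prods.extend(new_prods)
print(prods)  # [C -> 'c', b -> 'b', S -> b C]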
Example #27
def if_then_else_demo():
    """
    Demo grammar for arithmetic expressions (E/T/F with +, *, parentheses)
    """
    from nltk.grammar import Nonterminal, Production, ContextFreeGrammar
    nonterminals = 'E E1 PLUS T T1 TIMES F LPAREN RPAREN ID'
    (E, E1, PLUS, T, T1, TIMES, F, LPAREN, RPAREN,
     ID) = [Nonterminal(s) for s in nonterminals.split()]
    productions = (
        Production(E, [T, E1]),
        Production(E1, [PLUS, T, E1]),
        Production(E1, []),
        Production(T, [F, T1]),
        Production(T1, [TIMES, F, T1]),
        Production(T1, []),
        Production(F, [LPAREN, E, RPAREN]),
        Production(F, [ID]),
        Production(PLUS, ['+']),
        Production(TIMES, ['*']),
        Production(LPAREN, ['(']),
        Production(RPAREN, [')']),
        Production(ID, ['a']),
        Production(ID, ['b']),
        Production(ID, ['c']),
    )
    grammar = ContextFreeGrammar(E, productions)

    text = "a * b + c".split()
    RecursiveDescentApp(grammar, text).mainloop()
Example #28
def create_rule_series_helper(nonterminal_list):
    if len(nonterminal_list) < 2:
        return []
    # combine the last n-1 symbols into one new symbol
    new_symbol = get_symbol(nonterminal_list[1])
    for i in range(2, len(nonterminal_list)):
        new_symbol = new_symbol + '_' + get_symbol(nonterminal_list[i])
    # symbol on the left-hand side
    lh_symbol = get_symbol(nonterminal_list[0]) + '_' + new_symbol
    productions = [
        Production(Nonterminal(lh_symbol),
                   (nonterminal_list[0], Nonterminal(new_symbol)))
    ]
    productions.extend(create_rule_series_helper(nonterminal_list[1:]))
    return productions
Example #29
def demo():
    from nltk import Nonterminal, parse_cfg
    nonterminals = 'S VP NP PP P N Name V Det'
    (S, VP, NP, PP, P, N, Name, V, Det) = [Nonterminal(s)
                                           for s in nonterminals.split()]
    
    grammar = parse_cfg("""
    S -> NP VP
    PP -> P NP
    NP -> Det N
    NP -> NP PP
    VP -> V NP
    VP -> VP PP
    Det -> 'a'
    Det -> 'the'
    Det -> 'my'
    NP -> 'I'
    N -> 'dog'
    N -> 'man'
    N -> 'park'
    N -> 'statue'
    V -> 'saw'
    P -> 'in'
    P -> 'up'
    P -> 'over'
    P -> 'with'
    """)

    def cb(grammar):
        print(grammar)
    top = Tk()
    editor = CFGEditor(top, grammar, cb)
    Label(top, text='\nTesting CFG Editor\n').pack()
    Button(top, text='Quit', command=top.destroy).pack()
    top.mainloop()
Example #30
    def __call__(self, last_actions):
        """
        Returns the 'smart' mask
        :param last_actions:
        :return:
        """
        if self.t >= self.MAX_LEN:
            raise StopIteration("maximum sequence length exceeded for decoder")

        mask = np.zeros([len(last_actions), len(self.grammar.GCFG.productions())])

        if self.S is None:
            # populate the sequences with the root symbol
            self.S = [[{'token': Nonterminal('smiles')}] for _ in range(len(last_actions))]
            for s in self.S:
                s[0]['term_dist'] = self.term_dist_calc(s[0])
            self.Stree = [[x for x in y] for y in self.S]

        for i, a in enumerate(last_actions):
            self.S[i], mask[i, :] = self.process_one_action(self.S[i], a)

        self.t += 1
        self.prev_actions = last_actions
        self.mask = mask
        return mask
Example #31
    def load(self, filepath):
        with open(filepath) as f:
            cfg_string = f.read()

        # parse from nltk
        cfg_grammar = nltk.CFG.fromstring(cfg_string)
        # self.cfg_parser = cfg_parser = nltk.RecursiveDescentParser(cfg_grammar)
        self.cfg_parser = cfg_parser = nltk.ChartParser(cfg_grammar)

        # our info for rule matching
        self.head_to_rules = head_to_rules = {}
        self.valid_tokens = valid_tokens = set()
        rule_ranges = {}
        total_num_rules = 0
        first_head = None
        for line in cfg_string.split('\n'):
            if len(line.strip()) > 0:
                head, rules = line.split('->')
                head = Nonterminal(head.strip())    # remove space
                rules = [_.strip() for _ in rules.split('|')]    # split and remove space
                rules = [
                    tuple([Nonterminal(_) if not _.startswith("'") else _[1:-1] for _ in rule.split()])
                    for rule in rules
                ]
                head_to_rules[head] = rules

                for rule in rules:
                    for t in rule:
                        if isinstance(t, str):
                            valid_tokens.add(t)

                if first_head is None:
                    first_head = head

                rule_ranges[head] = (total_num_rules, total_num_rules + len(rules))
                total_num_rules += len(rules)

        self.first_head = first_head

        self.rule_ranges = rule_ranges
        self.total_num_rules = total_num_rules
Example #32
    def __init__(self, grammar):
        """
        grammar -- a binarised NLTK PCFG.
        """
        assert grammar.is_binarised()

        self.grammar = grammar
        self.start_sym = grammar.start().symbol()
        self._pi = defaultdict(dict)
        self._bp = defaultdict(dict)

        # Dicts with logprobs of lexical and non-lexical productions
        self.prods_lps_lex = ps_lex = defaultdict(dict)
        self.prods_lps_unlex = ps_unlex = defaultdict(dict)
        for p in grammar.productions():
            # a str
            lhs = N.symbol(p.lhs())
            # a tuple of str
            rhs = p.rhs()
            if p.is_lexical():
                ps_lex[rhs][lhs] = p.logprob()
            else:
                rhs = tuple(map(N.symbol, p.rhs()))
                ps_unlex[rhs][lhs] = p.logprob()
Example #33
def slade(t, g):


	prods2 = {}
	for p in g.productions():
		if(p.lhs() not in prods2):
			prods2[p.lhs()] = []
		# If rhs is not in the lhs key, add it to the rhs list
		if(p.rhs() not in prods2.get(p.lhs())):
			prods2[p.lhs()].append(p.rhs())



	lcount = {}

	xs = []
	# Dict with key = lhs, value = list of rhs
	prods = {}

	# Used to avoid repeated nonterminals
	rhs = set()

	# This set has all Insides non terminals
	inside = set()
	lastInside = -1
	x = -1
	i = -1

	# Count occurrences of each lhs nonterminal
	for p in noRepeats(t.productions()):
		lcount[p.lhs()] = lcount.get(p.lhs(), 0) + 1
		print(p)


	# add inside non terminals to inside set
	for key, value in lcount.items():
		if value > 1:
			inside.add(key)
 
	# Nodes in preorder
	preOrder = t.treepositions(order='preorder')
	for n in preOrder:
		if not isinstance(t[n], str):
			currentProd = t[n].productions()[0]


			# Add prods to dict where key is lhs and value a list of the rhs
			if(currentProd.lhs() not in prods):
				prods[currentProd.lhs()] = []

			# If rhs is not in the lhs key, add it to the rhs list
			if(currentProd.rhs() not in prods.get(currentProd.lhs())):
				prods[currentProd.lhs()].append(currentProd.rhs())
				

			# Remember last inside used, to add the +x
			if(currentProd.lhs() in inside):
				lastInside = int(Nonterminal.symbol(currentProd.lhs()))
				x = prods2[currentProd.lhs()].index(currentProd.rhs())
				xs.append(x)
				i = 0
			else:
				i = prods[currentProd.lhs()].index(currentProd.rhs())

			# # change node name based on Inside
			# if(Nonterminal.symbol(currentProd.lhs()) != 'S'):
			# 	if(int(Nonterminal.symbol(currentProd.lhs())) == lastInside + 1):
			# 		t[n].node = t[n].node + "(" + str(i) + ")" + " + " + str(x)
			# 		xs.append(x);
			# 		# t[n].node = t[n].node + str(i)  + str(x)
			# 	else: 
			# 		t[n].node = t[n].node + "(" + str(i) + ")"
			# 		# t[n].node = t[n].node + str(i) 

	# Print parse tree productions
	# 
	
	# print "Productions: "
	# noRep = noRepeats(t.productions())
	# for p in noRep:
	# 	if p.rhs() in rhs:
	# 		noRep.remove(p)
	# 	rhs.add(p.rhs())	

	# total = 0
	# for p in noRep:
	# 	total += p.__len__() + 1
	# 	print p

	print("ENCODING")
	print(xs)

	# draw parse tree
	t.draw()