Example #1
    def get_grammar(cls, train_trees, starting_symb='SENT'):
        """
        This method returns the grammar computed from the training set.

        Inputs:
        -------

        train_trees (list): List of trees to perform training
        starting_symb (str): The root symbol
        """
        productions = []

        # Chomsky Normal Form
        for tree in train_trees:
            
            # Remove unary rules
            treetransforms.collapse_unary(tree)

            # Transform to CNF
            treetransforms.chomsky_normal_form(tree, horzMarkov=2)

            # Compute the productions and store them
            productions += tree.productions()

        # Define the root symbol
        SENT = Nonterminal(starting_symb)

        # Compute the grammar using PCFG
        grammar = induce_pcfg(SENT, productions)

        grammar.chomsky_normal_form()

        return grammar
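
A minimal usage sketch for the method above, assuming it is a classmethod on some parser class (the name PCFGParser here is hypothetical) and that NLTK's bundled treebank sample is installed; treetransforms, Nonterminal, and induce_pcfg are imported as in the snippet's own module:

from nltk.corpus import treebank

# Deep-copy the trees: collapse_unary/chomsky_normal_form mutate them in place.
trees = [t.copy(deep=True) for t in treebank.parsed_sents()[:100]]

# Penn Treebank trees are rooted at 'S' rather than the French-style 'SENT'.
grammar = PCFGParser.get_grammar(trees, starting_symb='S')  # PCFGParser is a stand-in name
print(len(grammar.productions()))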
Example #2
    def _induce_pcfg(self, corpus):
		"""
		Induce a PCFG from the corpus, with first-level lexicalization and verb groups.
		"""
		prods = sum((lexicalize(t, grup="grup.verb").productions() for t in lemmatized_sents(corpus.corpus)), [])			
		S = nltk.Nonterminal('sentence')
		return nltk.induce_pcfg(S, prods)
Example #3
def get_bigram_and_deep_syntax_feature(review, speller, stop_words, ps, preprocess):
    res = ""
    productions = []

    parser = CoreNLPParser(url='http://localhost:9500')

    for sentence in re.split(r"[.!?]", review):
        try:
            tree = next(parser.raw_parse(sentence))

            # Optimize by creating Chomsky normal form
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)
            productions += tree.productions()

        except StopIteration:
            # End of review reached
            break

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)

    count = 0
    for line in str(grammar).split("\n"):
        if count == 0:
            count += 1
            continue
        elif "'" in line:
            res += re.sub(r"[(->) `\'\"\[\d\]]", "", line) + " "

    res += bipos.get_bigrams_and_unigrams_of_sentence(
        bow.sanitize_sentence(review, speller, stop_words, ps, preprocess))

    return res
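
The function above assumes a CoreNLP server listening on port 9500, plus helpers (speller, stop_words, ps, bipos, bow) defined elsewhere in the repo. A minimal sketch of the setup it expects; the jar path in the comment is illustrative:

# Start the server first (illustrative invocation):
#   java -mx4g -cp "stanford-corenlp-*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9500
from nltk.parse.corenlp import CoreNLPParser

parser = CoreNLPParser(url='http://localhost:9500')
tree = next(parser.raw_parse('The food was great .'))
tree.pretty_print()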
Example #4
def create_pcfg_from_treebank(pickle_it=False, log_it=False, filename="treebank", full=False):
    """
    Creates a PCFG from the Penn Treebank dataset using induce_pcfg
    Optional pickling of this PCFG in pickled-vars/
    """
    if full:
        tb = ptb
    else:
        tb = treebank
    productions = []
    flat_trees = 0
    for item in tb.fileids(): # Goes through all trees
        for tree in tb.parsed_sents(item):
            if tree.height() == 2:  # Gets rid of flat trees
                # print("####Tree not collected#####")
                flat_trees += 1
                continue
            # print(" ".join(tree.leaves()))    # This should print the sentences
            # perform optional tree transformations, e.g.:
            # tree.collapse_unary(collapsePOS = False)# Remove branches A-B-C into A-B+C
            # tree.chomsky_normal_form(horzMarkov = 2)# Remove A->(B,C,D) into A->B,C+D->D
            productions += tree.productions()
    print("%s Flat trees purged" % flat_trees)

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    if pickle_it:
        pickle.dump(grammar, open("%s%s-grammar.p" % (var_dir, filename), "wb"))
    if log_it:
        save_grammar_cleartext(grammar, filename)
        save_lexicon_cleartext(grammar, filename)
    return grammar
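
A possible driver for create_pcfg_from_treebank; this is a sketch that assumes the module-level names used above (treebank, ptb, var_dir, and the cleartext helpers) are already defined, and that every token is in the induced lexicon:

from nltk import ViterbiParser

grammar = create_pcfg_from_treebank()   # 10% treebank sample; full=True needs the licensed PTB
parser = ViterbiParser(grammar)

tokens = 'Mr. Vinken is chairman .'.split()
grammar.check_coverage(tokens)          # raises ValueError if any token is unknown
for tree in parser.parse(tokens):
    print(tree.prob())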
Example #5
def pcfg_reverse(word):
    s = build_tree(word, 0)
    tree = nltk.Tree.fromstring(s)
    productions = tree.productions()
    for p in productions:
        
        ##################################################
        # !!! THIS IS WHERE THE MAGIC HAPPENS !!!        #
        if len(p._rhs) > 1:                              #
            p._rhs = (p._rhs[1], p._rhs[0])              #
            ##############################################
            
    grammar = nltk.induce_pcfg(nltk.Nonterminal("N0"), productions)
#     print(grammar)     # UNCOMMENT FOR A FUN TIME!
    parser = nltk.pchart.InsideChartParser(grammar)
    
    # Shuffle to generate 1000 possible words; only the correct
    # solution will be parseable with our grammar!
    for i in range(1000):
        cand = random.sample(word, len(word))
#         print(cand)               # UNCOMMENT FOR A FUN TIME!
        parser.parse(cand)
        for parse in parser.parse(cand):
            if parse._ProbabilisticMixIn__prob > 0:
#                 print("number of tries: {}".format(i))  # UNCOMMENT!
                return "".join(cand)
    return "no reverse found, try again"
Example #6
def pcfg_demo():
    """
    A demonstration showing how C{WeightedGrammar}s can be created and used.
    """

    from nltk.corpus import treebank
    from nltk import treetransforms
    from nltk import induce_pcfg
    from nltk.parse import pchart

    pcfg_prods = toy_pcfg1.productions()

    pcfg_prod = pcfg_prods[2]
    print('A PCFG production:', repr(pcfg_prod))
    print('    pcfg_prod.lhs()  =>', repr(pcfg_prod.lhs()))
    print('    pcfg_prod.rhs()  =>', repr(pcfg_prod.rhs()))
    print('    pcfg_prod.prob() =>', repr(pcfg_prod.prob()))
    print()

    grammar = toy_pcfg2
    print('A PCFG grammar:', repr(grammar))
    print('    grammar.start()       =>', repr(grammar.start()))
    print('    grammar.productions() =>', end=' ')
    # Use str.replace(...) to line-wrap the output.
    print(repr(grammar.productions()).replace(',', ',\n' + ' ' * 26))
    print()

    print('Coverage of input words by a grammar:')
    for words in (['a', 'boy'], ['a', 'girl']):
        try:
            grammar.check_coverage(words)  # raises ValueError on unknown words
            print(words, 'covered')
        except ValueError:
            print(words, 'not covered')

    # extract productions from three trees and induce the PCFG
    print("Induce PCFG grammar from treebank data:")

    productions = []
    for item in treebank.fileids()[:2]:
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)

            productions += tree.productions()

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    print(grammar)
    print()

    print("Parse sentence using induced grammar:")

    parser = pchart.InsideChartParser(grammar)
    parser.trace(3)

    # doesn't work as tokens are different:
    #sent = treebank.tokenized('wsj_0001.mrg')[0]

    sent = treebank.parsed_sents('wsj_0001.mrg')[0].leaves()
    print(sent)
    for parse in parser.parse(sent):
        print(parse)
Example #7
    def _induce_pcfg(self, corpus):
        """
        Induce a PCFG from the corpus, with first-level lexicalization.
        """
        
        #stemmer = SpanishStemmer()
        
        S = nltk.Nonterminal('sentence')
        
        arboles = []
        for tree in corpus.corpus.parsed_sents():
            # Transform the trees to obtain the rules in Chomsky Normal Form.
            #tree.collapse_unary(collapsePOS = True, collapseRoot = True)
            #tree.chomsky_normal_form(horzMarkov = 2)
            arboles.append(tree.copy())      
    
        # Parent-annotate the level above each leaf with the leaf's lemma.
        productions = []
        for arbol in arboles:
            for t in arbol.treepositions('leaves'):
                arbol[t] = arbol[t].lower()
                t_p = tuple(x[1] for x in enumerate(t) if x[0] != len(t)-1)
                arbol[t_p].set_label(arbol[t_p].label() + "#" + corpus.obtener_lema(arbol[t]))
            productions.extend(arbol.productions())


        return nltk.induce_pcfg(S, productions)
Example #8
def create_grammar() -> PCFG:
    # 21,763 productions with word terminals
    # 8,028 productions with pos terminals
    # 6,275 productions with nonterminals without digits
    # 5,402 productions with nonterminals without punctuation
    # 2,972 productions with nonterminals without suffixes
    # 707 nonterminals
    # 190 nonterminals without digit labels
    # 180 nonterminals without punctuation
    # 63 nonterminals without suffixes
    productions = []
    start_symbol = Nonterminal('S')
    for tree in nltk.corpus.treebank.parsed_sents():
        for production in tree.productions():
            if not valid_nonterminal(production.lhs()):
                continue
            if isinstance(production.rhs()[0], Nonterminal):
                lhs = simplify_nonterminal(production.lhs())
                rhs = tuple(
                    simplify_nonterminal(t) for t in production.rhs()
                    if valid_nonterminal(t))
                productions.append(Production(lhs, rhs))
            else:
                simplified = simplify_nonterminal(production.lhs())
                productions.append(
                    Production(simplified, (simplified.symbol(), )))

    grammar = nltk.induce_pcfg(start_symbol, productions)
    #print(grammar.productions())
    print(len(grammar.productions()))
    nonterminals = set(prod.lhs() for prod in grammar.productions())
    print(sorted(nonterminals))
    print(len(nonterminals))
    return grammar
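
valid_nonterminal and simplify_nonterminal are not shown in this example. Judging by the counts in the comments (dropping digit and punctuation labels, stripping suffixes), they might look roughly like the following hypothetical sketch:

import re
from nltk import Nonterminal

def valid_nonterminal(symbol):
    # Hypothetical stand-in: reject punctuation tags (',', '.') and
    # digit/trace labels such as 'NP-1' or '-NONE-'.
    label = symbol.symbol() if isinstance(symbol, Nonterminal) else str(symbol)
    return bool(re.fullmatch(r'[A-Za-z][A-Za-z$-]*', label))

def simplify_nonterminal(symbol):
    # Hypothetical stand-in: strip PTB function-tag suffixes, e.g. NP-SBJ -> NP.
    label = symbol.symbol() if isinstance(symbol, Nonterminal) else str(symbol)
    return Nonterminal(re.split(r'[-=]', label)[0])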
Example #9
    def induce_weights(self, sentences):
        if self.grammar is None:
            raise Exception("Need to call induce_structure first")

        sentences = [[c for c in s] for s in sentences]

        log_prob_last = 0
        log_prob_curr = float('inf')

        while abs(log_prob_last - log_prob_curr) > 0.0001:
            log_prob_last = log_prob_curr

            parser = ViterbiParser(self.grammar)

            productions = []
            log_prob_curr = 0
            for i, sent in enumerate(sentences):
                print("parsing sentence %i of %i" % (i, len(sentences)))
                found = False
                for tree in parser.parse(sent):
                    found = True
                    log_prob_curr += tree.logprob()
                    productions += tree.productions()
                if not found:
                    print(sent)
                    raise Exception("Unable to parse sentence")

            # print("last log prog", log_prob_last)
            print("curr log prob", log_prob_curr)

            self.grammar = nltk.induce_pcfg(self.start, productions)
Example #10
def learn_PCFG(text_set, start_token):
    s = Nonterminal(start_token)
    production_list = []
    for sent in text_set:
        production_list += sent.productions()

    return induce_pcfg(s, production_list)
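
For example, fed with NLTK's treebank sample (any iterable of nltk.Tree objects will do, and Nonterminal/induce_pcfg are assumed imported as in the snippet):

from nltk.corpus import treebank

grammar = learn_PCFG(treebank.parsed_sents()[:100], 'S')
print(grammar.start(), len(grammar.productions()))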
Example #11
def pcfg_demo():
    """
    A demonstration showing how C{WeightedGrammar}s can be created and used.
    """

    from nltk.corpus import treebank
    from nltk import treetransforms
    from nltk import induce_pcfg
    from nltk.parse import pchart

    pcfg_prods = toy_pcfg1.productions()

    pcfg_prod = pcfg_prods[2]
    print 'A PCFG production:', ` pcfg_prod `
    print '    pcfg_prod.lhs()  =>', ` pcfg_prod.lhs() `
    print '    pcfg_prod.rhs()  =>', ` pcfg_prod.rhs() `
    print '    pcfg_prod.prob() =>', ` pcfg_prod.prob() `
    print

    grammar = toy_pcfg2
    print 'A PCFG grammar:', ` grammar `
    print '    grammar.start()       =>', ` grammar.start() `
    print '    grammar.productions() =>',
    # Use str.replace(...) to line-wrap the output.
    print ` grammar.productions() `.replace(',', ',\n' + ' ' * 26)
    print

    print 'Coverage of input words by a grammar:'
    print grammar.covers(['a', 'boy'])
    print grammar.covers(['a', 'girl'])

    # extract productions from three trees and induce the PCFG
    print "Induce PCFG grammar from treebank data:"

    productions = []
    for item in treebank.items[:2]:
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)

            productions += tree.productions()

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    print grammar
    print

    print "Parse sentence using induced grammar:"

    parser = pchart.InsideChartParser(grammar)
    parser.trace(3)

    # doesn't work as tokens are different:
    #sent = treebank.tokenized('wsj_0001.mrg')[0]

    sent = treebank.parsed_sents('wsj_0001.mrg')[0].leaves()
    print sent
    for parse in parser.nbest_parse(sent):
        print parse
Example #12
    def generate_pcfg_productions(self, questionbank):
        productions = []

        with io.open(questionbank, 'r', encoding='utf8') as f:
            for line in f:
                line = line.strip()
                sent_text = nltk.sent_tokenize(line)

                for sentence in sent_text:
                    #print sentence
                    ss = self.parser.raw_parse_sents((sentence, ))
                    for k in ss:
                        for s in k:
                            buf = "%s" % s
                            buf = six.text_type(buf)
                            s1 = Tree.fromstring(buf)

                            #get rid of the ROOT
                            for node in s1:
                                if node.label() == 'ROOT':
                                    continue
                                else:
                                    s1 = node
                                    break
                            s1.chomsky_normal_form(horzMarkov=2)
                            pdc = []
                            for p in s1.productions():
                                #remove the lexical production
                                if not p.is_lexical():
                                    pdc.append(p)

                            productions += pdc

        S = Nonterminal('S')
        self.grammar = induce_pcfg(S, productions)
Example #13
    def _induce_pcfg(self, corpus):
        """
        Induce a PCFG grammar from the corpus (treebank), replacing words seen only once with UNK.
        """
        
        unk_words = {k for k,v in self.wordfrecs.iteritems() if v == 1}
        
        productions = []
        for tree in corpus.corpus.parsed_sents():
            # Transform the trees to obtain the rules in Chomsky Normal Form.
            #tree.collapse_unary(collapsePOS = True, collapseRoot = True)
            #tree.chomsky_normal_form(horzMarkov = 2)
            
            # Lowercase all leaves.
            for t in tree.treepositions('leaves'):
                tree[t] = tree[t].lower()
            
            productions += tree.productions()

        new_productions = []
        for pr in productions:
            if len(pr.rhs()) == 1 and pr.rhs()[0] in unk_words:
                new_pr = nltk.grammar.Production(pr.lhs(), ['UNK'])
                new_productions.append(new_pr)
            else:
                new_productions.append(pr)
        
        S = nltk.Nonterminal('sentence')
        
        return nltk.induce_pcfg(S, new_productions)
Example #14
    def fit_pcfg(self, X):

        if self.fitted_pcfg:
            raise ValueError("PCFG.pcfg already fitted")

        productions = []
        for sentence in X:
            # nltk format
            t = nltk.tree.Tree.fromstring(sentence,
                                          remove_empty_top_bracketing=True)
            # Chomsky normal form
            self.chomkysation(t)
            # rule extraction
            rules = self.extract_rules(t, lexical=False)
            productions.extend(rules)

        start = nltk.Nonterminal('SENT')
        self.pcfg_ = nltk.induce_pcfg(start, productions)
        self.pcfg_.chomsky_normal_form(flexible=False)

        # collect non-terminal tokens from the production right-hand sides
        for prod in self.pcfg_._productions:
            for token in prod._rhs:
                if not token == 'SENT':
                    self.non_terminals.append(token)
        self.non_terminals.insert(0, start)

        # map each token to an index
        self.pos2index = {}
        for i, token in enumerate(self.non_terminals):
            self.pos2index[token] = i

        self.fitted_pcfg = True
Example #15
def train_grammar(unknown_words=[], nb_reduced_production=6000):

    productions = []

    for item in train:
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)  # Remove branches A-B-C into A-B+C
            tree.chomsky_normal_form(horzMarkov=2)  # Remove A->(B,C,D) into A->B,C+D->D
            #tree_prods = tree.productions()


            productions += tree.productions()


    counter = collections.Counter(productions)
    n_comms = [item for item, count in counter.most_common(nb_reduced_production) for i in range(count)]

    # Add unknown words and terminal rules back into the reduced production set
    unknown_words_prods = []
    for p in productions:
        if isinstance(p._rhs[0], str):
            unknown_words_prods.append(p)
            for u in unknown_words:
                rhs = [u]
                lhs = p._lhs
                new_prod = Production(lhs, rhs)
                unknown_words_prods.append(new_prod)


    n_comms += unknown_words_prods
    S = Nonterminal('S')
    grammar = induce_pcfg(S, n_comms)

    return grammar
Example #16
def to_pcfg(sequences, sections):
    sequences = [s[s >= 0] for s in sequences]
    trees = [Tree.fromstring(to_tree(s, sections)) for s in sequences]
    # [t.collapse_unary(collapsePOS = False) for t in trees]
    # [t.chomsky_normal_form(horzMarkov = 2) for t in trees]
    prods = [p for t in trees for p in t.productions()]
    print(induce_pcfg(Nonterminal('S'), prods))
Example #17
def generate_grammar(f, s):
    productions = []
    for line in f:
        tree = Tree.fromstring(line)
        for production in tree.productions():
            productions.append(production)
    return nltk.induce_pcfg(s, productions)
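
A usage sketch: trees.txt is a hypothetical file with one bracketed tree per line, and f can be any iterable of such strings:

import nltk
from nltk import Tree  # used inside generate_grammar

with open('trees.txt') as f:  # hypothetical input file
    grammar = generate_grammar(f, nltk.Nonterminal('S'))
print(grammar)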
Example #18
def get_parser(training_trees):
    training_prods = sum([list(tree_to_productions(t)) for t in training_trees], list())
    pos_rules = [Production(Nonterminal(lhs), ["$" + rhs]) for lhs, rhs in GRAMMAR_TO_POS.iteritems()]
    training_prods += pos_rules
    training_pcfg = induce_pcfg(Nonterminal("S"), training_prods)
    parser = ViterbiParser(training_pcfg)

    return parser, training_pcfg
Example #19
    def _induce_pcfg(self, corpus):
		"""
		Induce a PCFG from the corpus.
		"""
		prods = sum((t.productions() for t in corpus.corpus.parsed_sents()),[])
		S = nltk.Nonterminal('sentence')
		grammar = nltk.induce_pcfg(S, prods)
		return grammar
Example #20
 def _train_rules_grammar(self):
     print("training grammar")
     self._grammar = nltk.induce_pcfg(
         nltk.Nonterminal('TOP'),
         reduce(lambda a, b: a + b,
                map(lambda t: t.productions(), self._treebank)))
     if RUN_MODE == PURE_CKY_M and UNKOWN_MODE:
         add_unknowns(self._grammar)
     print("finished grammar training")
Example #21
 def build_context_free_grammar(self, data):
     productions = []
     for tree in [Tree.fromstring(tree) for tree in data]:
         tree.collapse_unary(collapsePOS=False)
         tree.chomsky_normal_form(horzMarkov=2)
         productions += tree.productions()
     starting_state = Nonterminal('SENT')
     grammar = induce_pcfg(starting_state, productions)
     return grammar
Example #22
    def create_pcfg(self, trees):
        productions = []
        for tree in trees:
            tree.collapse_unary(collapsePOS=True)
            tree.chomsky_normal_form(horzMarkov=2)
            productions += tree.productions()

        S = Nonterminal('SENT')
        grammar = induce_pcfg(S, productions)

        return grammar
Example #23
def induce(trees: Iterable) -> FancyPCFG:
    productions = []
    for tree in trees:
        #        tree.pretty_print()
        # perform optional tree transformations, e.g.:
        # tree.collapse_unary(collapsePOS = False)# Remove branches A-B-C into A-B+C
        # tree.chomsky_normal_form(horzMarkov = 2)# Remove A->(B,C,D) into A->B,C+D->D
        productions += tree.productions()
    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    return FancyPCFG.fromCFG(grammar)
Example #24
def gen_sample_data_grammars(data, root, grammar_format):
    ''' Generate a CFG/PCFG (per grammar_format) from a list of nltk.Tree objects '''
    grammars = []

    for _data in data:
        for production in _data.productions():
            grammars.append(production)

    if grammar_format == constants.APP_DEFAULT_PCFG_FORMAT:
        grammars = nltk.induce_pcfg(root, grammars)

    return grammars
Example #25
    def induce_structure(self, sentences):

        sentences = [[c for c in s] for s in sentences]

        start_symbols = set()
        productions = []
        prod_table = {}

        # group all digits together
        digit_terminals = set([str(i) for i in range(10)])

        # unary rules
        terminals = set()
        for s in sentences:
            terminals.update(s)
        for t in terminals:
            if t in digit_terminals:
                nt = nltk.Nonterminal("Digit")
            else:
                nt = nltk.Nonterminal("Unary%s" % self.gen_nt())
            p = Production(nt, [t])
            productions.append(p)
            prod_table[tuple(p.rhs())] = p.lhs()

        sentences = self.apply_unary_prod(sentences, prod_table)

        while len(sentences) > 0:
            if self.has_recursion(sentences):
                p = self.generate_recursive_prod(sentences)
            else:
                p = self.generate_most_frequent_prod(sentences)

            productions.append(p)
            prod_table[tuple(p.rhs())] = p.lhs()

            sentences = self.update_with_prod(sentences, prod_table)

            new_sentences = []
            for s in sentences:
                if len(s) == 1:
                    start_symbols.add(s[0])
                else:
                    new_sentences.append(s)

            sentences = new_sentences

        # generate the start productions
        # (iterate over a snapshot, since we append to the list while looping)
        for symbol in start_symbols:
            for p in list(productions):
                if p.lhs() == symbol:
                    productions.append(Production(self.start, p.rhs()))

        self.grammar = nltk.induce_pcfg(self.start, productions)
Example #26
    def _induce_pcfg(self, corpus):
		"""
		Induce a PCFG grammar from the corpus (treebank), replacing words seen only once with UNK.
		"""
		def induce_unk(tree):
			for prod in tree.productions():
				if prod.is_lexical() and self.wordfrecs[prod.rhs()[0]] == 1:
					yield nltk.Production(prod.lhs(),["UNK"])
				else: yield prod
		prods = sum((list(induce_unk(t)) for t in corpus.corpus.parsed_sents()), [])
		S = nltk.Nonterminal('sentence')
		return nltk.induce_pcfg(S, prods)
Example #27
def getGrammar():

    fileid = treebank.fileids()
    trainfiles = fileid[:160]
    #testfiles=fileid[0.8*len(fileid):]

    productions = []
    for item in trainfiles:
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(
                collapsePOS=False)  # Remove branches A-B-C into A-B+C
            tree.chomsky_normal_form(
                horzMarkov=2)  # Remove A->(B,C,D) into A->B,C+D->D
            productions += tree.productions()

    lhs_prod = [p.lhs() for p in productions]
    rhs_prod = [p.rhs() for p in productions]
    set_prod = set(productions)

    list_prod = list(set_prod)

    token_rule = []
    for ele in list_prod:
        if ele.is_lexical():
            token_rule.append(ele)

    set_token_rule = set(p.lhs() for p in token_rule)
    list_token_rule = list(set_token_rule)
    corr_list_token_rule = []
    for word in list_token_rule:
        if str(word).isalpha():
            corr_list_token_rule.append(word)
            continue
    #print(corr_list_token_rule)

    import nltk
    a = []
    for tok in corr_list_token_rule:
        #lhs = nltk.grammar.Nonterminal('UNK')
        lhs = 'UNK'
        rhs = [u'UNK']
        UNK_production = nltk.grammar.Production(lhs, rhs)
        lhs2 = nltk.grammar.Nonterminal(str(tok))
        a.append(nltk.grammar.Production(lhs2, [lhs]))

    token_rule.extend(a)

    list_prod.extend(a)

    S = Nonterminal('S')
    grammar = induce_pcfg(S, list_prod)
    return grammar
Example #28
def update_grammar(productions, unknown):
    lis = pos_tagger.tag(unknown)
    for i in range(len(lis)):
        pos = nonterminals(lis[i][1])[0]
        production_ = Production(pos, [unknown[i]])
        productions.append(production_)
        print production_, "added to productions"

    S = Nonterminal('SENT')
    grammar = induce_pcfg(S, productions)

    return grammar
Example #29
 def build(self, examples=tuple()):
     """
     :param examples:    tuple or list of nltk Trees
     :return: the PCFG induced from the examples' productions
     """
     allproductions = []
     for example in examples:
         q = example
         t = self.grammarify(q)
         t = Tree("S", [t])
         productions = t.productions()
         allproductions += productions
     pcfg = nltk.induce_pcfg(Nonterminal("S"), allproductions)
     return pcfg
Example #30
 def createGrammar(self, userMessages, ctx):
     parser = CoreNLPParser(url='http://localhost:9000')
     parse_trees = []
     for message in userMessages:
         tokenized = nltk.sent_tokenize(message)
         for sentence in tokenized:
             parse_trees.append(list(parser.raw_parse(sentence))[0])
     grammar_rules = set()
     for tree in parse_trees:
         for production in tree.productions():
             grammar_rules.add(production)
     start = nltk.Nonterminal('S')
     grammar = nltk.induce_pcfg(start, grammar_rules)
     return (' '.join((self.generate_sentence(grammar))))
Example #31
def main():
    # print(nltk.corpus.treebank.parsed_sents('wsj_0001.mrg')[0])
    # nltk.corpus.treebank.parsed_sents('wsj_0001.mrg')[0].draw()

    # print("Induce PCFG grammar from treebank data:")
    #
    productions = []
    print(len(treebank.fileids()))
    for item in treebank.fileids(): # Goes through all trees
      for tree in treebank.parsed_sents(item):
        # perform optional tree transformations, e.g.:
        tree.collapse_unary(collapsePOS = False)# Remove branches A-B-C into A-B+C
        tree.chomsky_normal_form(horzMarkov = 2)# Remove A->(B,C,D) into A->B,C+D->D
        productions += tree.productions()
    # #
    # # print(type(productions[0]))
    # #
    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    # # # print(grammar)    # This is a PCFG
    # pickle.dump(grammar, open("tbank-grammar.p", "wb"))
    # t = time.time()
    # grammar = pickle.load(open("tbank-grammar.p", "rb"))
    # textf = open("lexicon.txt", "w")
    # n = textf.write(str(reduce(lambda a, b: a + "\n" + b, list(filter(lambda x: "'" in x, str(grammar).split("\n"))))))
    # textf.close()
    # print(time.time()-t)
    parser = ViterbiParser(grammar)
    # pickle.dump(parser, open("cky-parser.p", "wb"))
    # parser = pickle.load(open("cky-parser.p", "rb"))
    parser.trace(0)
    sent = "John will join the board"
    tokens = sent.split()

    try:
        grammar.check_coverage(tokens)
        print("All words covered")
        parses = parser.parse_all(tokens)
        if parses:
            lp = len(parses)
            print(lp)
            print(parses[0].label())
            # parses[0].draw()
            p = reduce(lambda a,b:a+b.prob(), list(filter(lambda x: x.label() == 'S', parses)), 0.0)
        else:
            p = 0

        print("Probability:", p)
    except ValueError:  # check_coverage signals unknown words with ValueError
        print("Some words not covered")
Example #32
def test_pcfg():
    #trees = ['(S (0 1 2) (0 2 3))', '(S (0 1 2) (0 4 5))', '(S (1 1 2) (1 2 3))']
    #trees = ['(S (0 1 2) (3 4 5 6))', '(S (0 1 2) (7 8) (3 4 5 6))', '(S (0 1 2) (9 4) (10 6))']
    trees = [
        '(S (0 1 2) (3 4 5 6))', '(S (0 1 2) (7 8) (3 4 5 6))',
        '(S (0 1 2) (3 4 6))'
    ]
    trees = [Tree.fromstring(t) for t in trees]
    print(trees)
    prods = [p for t in trees for p in t.productions()]
    print(prods)
    grammar = induce_pcfg(Nonterminal('S'), prods)
    print(grammar)
    sequences = [t.leaves() for t in trees]
    description_length(grammar, sequences)
Example #33
    def __init__(self, training_set, vocabulary):
        """ Initializer for the OmissionDetector
        Trains a PCFG and a HMM to aid in omission detection

        Args:
        training_set -- list of parsed sentences
        vocabulary -- set of known words
        """
        prods = flatten([tree.productions() for tree in training_set])
        pcfg = induce_pcfg(Nonterminal("S"), prods)
        self._grammar = InvertedGrammar(pcfg)

        tagged_training_set = [tagged_sent(s) for s in training_set]
        self._hmm = NgramHMM(TEST_N, vocabulary)
        self._hmm.train(tagged_training_set)
Example #34
def pcfg_train(trees, vocab):
    #    Write a function pcfg_train() that takes as its input a collection
    #    of nltk.tree.Tree objects. For example, it might be passed some
    #    portion of nltk.corpus.treebank.parsed_sents(). This function
    #    should return a nltk.PCFG object.

    all_productions = []

    for t in trees:
        for p in t.productions():
            all_productions.append(nltk.Production(p.lhs(), p.rhs()))

    pcfg = nltk.induce_pcfg(nltk.Nonterminal('S'), all_productions)

    return (pcfg)
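
A quick check of the function above; this is a sketch, and note that the vocab argument is accepted but unused in this snippet:

import nltk
from nltk.corpus import treebank

pcfg = pcfg_train(treebank.parsed_sents()[:200], vocab=set())
print(isinstance(pcfg, nltk.PCFG))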
Example #35
def ex7():
    """
    Using NLTK's own library to generate the probabilities.
    """
    productions = [
        p for tree in treebank.parsed_sents() for p in tree.productions()
    ]
    pcfg = induce_pcfg(Nonterminal("S"), productions)
    # print(pcfg.productions())
    parser = pchart.InsideChartParser(pcfg, beam_size=800)

    for sent in sentences1:
        parsed = list(parser.parse(sent.split()))
        print("Parsing sent: {}".format(sent))
        print(parsed[0])
Example #36
def parse(sentence: str,
          save: bool = True,
          index: str = str(uuid.uuid4())) -> dict:
    results = dict()

    sentence = preprocess(sentence)

    print(f'Sentence: {sentence}')
    results['speech_text'] = sentence

    tokenized = list(pos_parser.tokenize(sentence))

    print(f'Tokenized: {tokenized}')
    print(f'Total words: {len(tokenized)}\n')

    tagged = list(pos_parser.tag(tokenized))
    print(f'Tagged: {tagged}\n')
    results['pos_tagged'] = tagged

    ne_tags = ne_chunk(tagged)
    entities = [(tag.label(), ' '.join(t[0] for t in tag)) for tag in ne_tags
                if hasattr(tag, 'label')]
    entities.extend(tag_entities(tagged))
    print(f'Entities: {entities}\n')
    results['named_entities'] = entities

    parsed = next(pos_parser.raw_parse(sentence))
    print('Grammar')
    parsed.pretty_print()

    root = Nonterminal('S')
    grammar = induce_pcfg(root, parsed.productions())
    print(grammar, '\n')

    productions = [(str(prod._lhs), [str(t) for t in prod._rhs], prod.prob())
                   for prod in grammar._productions]
    results['productions'] = productions

    probabilities = [prod.prob() for prod in grammar._productions]
    pcfg = reduce(mul, probabilities)

    print(pcfg, '\n')
    results['pcfg'] = pcfg

    if save:
        results['tree_bin'] = save_image(parsed, index)

    return results
Example #37
def pcfgParse(sentence):
    
    productions = list()
    root = nltk.Nonterminal('S')
    
    for tree in nltk.corpus.treebank.parsed_sents():
        productions += tree.productions()

    grammar = nltk.induce_pcfg(root, productions)
    PCFGParser = nltk.ViterbiParser(grammar)

    s_sent = sentence.split()
    parsed_sent = PCFGParser.parse(s_sent)

    for p in parsed_sent:
        print p
Example #38
def getParser():
    """


    :return: A Viterbi Parser
    """
    productions = []
    S = nltk.Nonterminal('S')
    for tree in train_corpus:
        productions += tree.productions()
    grammar = nltk.induce_pcfg(S, productions)

    for p in islice(grammar.productions(), 50):
        print p

    return nltk.ViterbiParser(grammar)
Example #39
def construct_grammar(f, S):
    productions = []
    for line in f:
        tree = Tree.fromstring(line)
        # Put the tree into Chomsky normal form (optional transforms below):
        # tree.collapse_unary(collapsePOS = True, collapseRoot = True)
        # tree.chomsky_normal_form(horzMarkov = 2)
        # tree.set_label('TOP')
        for production in tree.productions():
            if len(production.rhs()) == 1 and isinstance(
                    production.rhs()[0], nltk.Nonterminal):
                print(production.rhs())
            productions += [production]
    with open('productions.txt', 'w') as f:
        f.write(str(productions))
    return nltk.induce_pcfg(S, productions)
Example #40
def PCFG_Section():
    toy_pcfg1 = PCFG.fromstring("""
        S -> NP VP [1.0]
        NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
        Det -> 'the' [0.8] | 'my' [0.2]
        N -> 'man' [0.5] | 'telescope' [0.5]
        VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
        V -> 'ate' [0.35] | 'saw' [0.65]
        PP -> P NP [1.0]
        P -> 'with' [0.61] | 'under' [0.39]
    """)

    pcfg_prods = toy_pcfg1.productions()

    pcfg_prod = pcfg_prods[2]
    print('A PCFG production:', pcfg_prod)
    print('pcfg_prod.lhs()  =>', pcfg_prod.lhs())
    print('pcfg_prod.rhs()  =>', pcfg_prod.rhs())
    print('pcfg_prod.prob() =>', pcfg_prod.prob())

    # extract productions from three trees and induce the PCFG
    print("Induce PCFG grammar from treebank data:")

    productions = []
    for item in treebank.fileids()[:2]:
      for tree in treebank.parsed_sents(item):
        # print(" ".join(tree.leaves()))
        # perform optional tree transformations, e.g.:
        # tree.collapse_unary(collapsePOS = False)# Remove branches A-B-C into A-B+C
        # tree.chomsky_normal_form(horzMarkov = 2)# Remove A->(B,C,D) into A->B,C+D->D
        prods = tree.productions()
        # print(prods[0].prob())
        productions += prods

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    # print(grammar)    # This is a PCFG

    ### Parsing section below ###

    print("\nParse sentence using induced grammar:")

    parser = pchart.InsideChartParser(grammar)
    parser.trace(1)

    # The corpus tree itself carries no probability; parse its leaves instead.
    sent = treebank.parsed_sents('wsj_0001.mrg')[0].leaves()
    for parse in parser.parse(sent):
        print(parse.prob())
Example #41
    def _induce_pcfg(self, corpus):
        """
        Induce a PCFG from the corpus.
        """
        
        productions = []
        S = nltk.Nonterminal('sentence')
        for tree in corpus.corpus.parsed_sents():
            # Transform the trees to obtain the rules in Chomsky Normal Form.
            #tree.collapse_unary(collapsePOS = True, collapseRoot = True)
            #tree.chomsky_normal_form(horzMarkov = 2)
            
            # Lowercase all the tree's leaves.
            for t in tree.treepositions('leaves'):
                tree[t] = tree[t].lower()
            
            productions += tree.productions()

        return nltk.induce_pcfg(S, productions)
Example #42
    def _induce_pcfg(self, corpus):
        """
        Induce a PCFG from the corpus, with first-level lexicalization and verb groups.
        """
        S = nltk.Nonterminal('sentence')
        
        # Parent-annotate the level above each leaf with the leaf's lemma.
        productions = []
        for arbol in corpus.corpus.parsed_sents():
            for t in arbol.treepositions('leaves'):
                arbol[t] = arbol[t].lower()
                t_p = tuple(x[1] for x in enumerate(t) if x[0] != len(t)-1)
                arbol[t_p].set_label(arbol[t_p].label() + "#" + corpus.obtener_lema(arbol[t]))
                verbo = corpus.obtener_lema(arbol[t])
                if verbo.endswith(("ar", "er", "ir",)) and "_" not in verbo:
                    t_p2 = tuple(x[1] for x in enumerate(t_p) if x[0] != len(t_p)-1)
                    if arbol[t_p2].label() == "grup.verb":
                        arbol[t_p2].set_label(arbol[t_p2].label() + "#" + verbo)
            productions.extend(arbol.productions())

        return nltk.induce_pcfg(S, productions)
Example #43
def train_pcfg():
    print 'training grammar'
    productions = []
    # print len(treebank.fileids())
    trees = []
    # up to 199 less for shorter grammar for quicker training
    for fileid in treebank.fileids()[0:20]:
        for tree in treebank.parsed_sents(fileid):
            # perform optional tree transformations, e.g.:
            # Remove branches A->B->C into A->B+C so we can avoid infinite
            # productions
            tree.collapse_unary(collapsePOS=False)
            # Remove A->(B,C,D) into A->B,C+D->D (binarization req'd by CKY parser)
            # horizontal and vertical Markovization: remember parents and siblings in tree
            #     This gives a performance boost, but makes the grammar HUGE
            #     If we use these we would need to implement a tag forgetting method
            #tree.chomsky_normal_form(horzMarkov = 0, vertMarkov=0)
            tree.chomsky_normal_form()
            productions += tree.productions()
    S = nltk.Nonterminal('S')
    grammar = nltk.induce_pcfg(S, productions)
    print "grammar trained!"
    return grammar
Example #44
path = 'ancora/ancora-3.0.1es/'
corpus = ancora.AncoraCorpusReader(path)


t = corpus.parsed_sents()[0]
t.draw()
t.productions()

prods = []
for t in corpus.parsed_sents():
    prods += t.productions()
    
#print (prods)

S = nltk.Nonterminal('sentence')
grammar = nltk.induce_pcfg(S, prods)



#prods2 = grammar.productions(lhs=nltk.Nonterminal('ncms000'))
#print (prods2)

print ("===============================================================")
print ("===============================================================")

parser = nltk.ViterbiParser(grammar)
for tree in parser.parse("El gato come pescado crudo .".split()):
    print (tree)
    tree.draw()
    tree.prob()
Example #45
print('done!')


#merge productions
Production += Production_singleCharWord


#
# inducing PCFG from the productions
#
print('\n\nInducing PCFG from the productions occurring in the treebank...')

W=nltk.Nonterminal('W')

baseline_grammar=nltk.induce_pcfg(W, Production)

print('done!')

path_grammar='../working_data/baseline.grammar.pickle'

print('\nSaving the induced grammar to',path_grammar,' ...')
f=open(path_grammar, 'wb')
pickle.dump(baseline_grammar, f)
f.close()
Example #46
def PCFGlearning(dataset, start):
    production_list = []
    S = Nonterminal(start)
    for sent in dataset:
        production_list += sent.productions()
    return induce_pcfg(S, production_list)
Example #47
# normalize the constituent structures
for t in grammar_used:
    t.chomsky_normal_form()
tbank_productions2 = list(treebank.sents())
test_part = tbank_productions2[int(len(tbank_productions) * 0.8):]


# productions
productions = []
for t in grammar_used:
    productions += Tree.productions(t)

# induce PCFG
S = nltk.Nonterminal("S")
grammar = nltk.induce_pcfg(S, productions)
prod = grammar.productions()

# helper function to get the probability of a production
def findProb(lhsa, rhsa, prod):
    for p in prod:
        if p.lhs() == lhsa and p.rhs() == rhsa:
            return (p.prob())


def CKY(words, grammar):
    nonterms = set()
    for g in grammar.productions():
        nonterms.add(g.lhs())
    triples =[]
    lenwords = len(words)
Example #48
from nltk import induce_pcfg
from nltk import treetransforms
from nltk.corpus import treebank
from nltk.grammar import Nonterminal
from nltk.parse import pchart

productions = []
for tree in treebank.parsed_sents():
	# perform optional tree transformations, e.g.:
	tree.collapse_unary(collapsePOS=False)  # Remove branches A-B-C into A-B+C
	tree.chomsky_normal_form(horzMarkov=2)  # Remove A->(B,C,D) into A->B,C+D->D

	productions += tree.productions()

# Print the grammar
S = Nonterminal('S')
grammar = induce_pcfg(S, productions)
print grammar
Example #49
from nltk.treetransforms import chomsky_normal_form


'''
tbank_productions = set(production for sent in treebank.parsed_sents()
                        for production in sent.productions())

'''

treebank_prods = []
for i in range(199): # for all found sets of fileids
    tbstuff = treebank._fileids[i] # get a bunch of 'em
    for tree in treebank.parsed_sents(tbstuff):
        tree.chomsky_normal_form()

        treebank_prods += tree.productions()



tTCpcfg = nltk.induce_pcfg(Nonterminal('S'), list(treebank_prods))

# induce pcfg

# PTCpcfg = nltk.induce_pcfg(tbank_grammar)

# treetransforms: chomsky_normal_form

print("done! You have your WeightedGrammar")


Example #50
#tree.chomsky_normal_form(horzMarkov = 2)

# srules is the set of non-lexical rules with duplicates removed
srules = list(set(drules))
print len(srules)

# slex is the set of lexical rules with duplicates removed
slex = list(set(plex))
print len(slex)

# create a nonprobabilistic parser
grammar = ContextFreeGrammar(Nonterminal('S'),srules+slex)
parser = nltk.parse.chart.ChartParser(grammar)

# create a probabilistic parser
wgrammar = nltk.induce_pcfg(Nonterminal('S'),plex+drules)
wparser = nltk.parse.viterbi.ViterbiParser(wgrammar)

# try out the probabilistic parser on a few sentences
t = "that is very interesting .".split(' ')
trees = wparser.nbest_parse(t,n=10)
print trees

u = "from the beginning to the end , it was an entertaining game .".split(' ')
trees = wparser.nbest_parse(u,n=10)
print trees

v = "the workers dumped sacks into a bin .".split(' ')
trees = wparser.nbest_parse(v,n=10)
print trees
print trees[0].productions()