Example #1
def main(args):

    sentence = args.sentence.lower()
    args.sentence = sentence
    tokens = sentence.split()
    grammar = loadGrammar(args)
    nonterm = getnonterm(grammar)
    terminalProductionRules = getTerminalProbability(args, grammar, nonterm)
    HSrules = grammar.productions(Nonterminal('HS'))
    for rule in HSrules:
        grammar.productions().remove(rule)

    ESrules = grammar.productions(Nonterminal('ES'))
    for rule in ESrules:
        grammar.productions().remove(rule)

    grammar.productions().extend(terminalProductionRules)

    for token in tokens:
        grammar.productions().append(
            ProbabilisticProduction(Nonterminal(token.upper()),
                                    [str(token)],  # Python 3: str replaces unicode
                                    prob=1))

    #print "Grammars"
    grammarlist = str(grammar).split('\n')[1:]

    #print "Transfered"
    strgrammar = ''
    for p in grammar.productions():
        rhs = p.rhs()
        rhsstr = ''
        for r in rhs:
            if is_terminal(r):
                rhsstr += '\'' + str(r) + '\' '
            else:
                rhsstr += str(r) + ' '
        strgrammar += str(p.lhs()) + ' -> ' + rhsstr + ' [' + '{0:.8f}'.format(
            p.prob()) + ']\n'
    # print(strgrammar)

    grammar = PCFG.fromstring(strgrammar.split('\n'))
    #'''
    #grammar = loadGrammar(args)

    #tokens = args.sentence.lower().split()
    #nonterm = getnonterm(grammar)

    CYK(tokens, nonterm, grammar)
    #with open(args.grammar_file, 'r') as f:
    #        content = f.read()

    #trees = corpus2trees(content)
    #productions = trees2productions(trees)
    #listnonterm = []
    #grammar = nltk.grammar.induce_pcfg(nltk.grammar.Nonterminal('SS'), productions)
    #print grammar

    #'''
Example #2
def pcfg_bcl(C, alpha=ALPHA, gd_thr=LPG_DIFF_THRESHOLD, mc_thr=MC_THRESHOLD):
    print("\ninitializing...")
    global ALPHA
    global LPG_DIFF_THRESHOLD
    global MC_THRESHOLD
    global and_symb_count
    global or_symb_count
    global ignore_mc_ec
    ALPHA = alpha
    LPG_DIFF_THRESHOLD = gd_thr
    MC_THRESHOLD = mc_thr
    and_symb_count = 0
    or_symb_count = 0
    ignore_mc_ec = False
    
    ## create an empty grammar G
    S = Nonterminal("_START_")
    R = [ProbabilisticProduction(S, [""], prob=1.)]
    G = PCFG(S, R)
    
    T = _create_t(C) # create a table T
    
    ## repeat until no further rule to be learned
    i = 0
    while not _finished(T):
        i += 1
        print("\niter. n° %d" % (i,))
        found, G, C, T, N = _learning_by_biclustering(G, C, T)
        if not found:
            print("NO MORE RULES CAN BE LEARNED")
            break
        G, C, T = _attaching(N, G, C, T)
    G = _postprocessing(G, C)
    print("\n", G) # DEBUG
    return G
Example #3
    def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
        """
        parsed_sents -- list of training trees.
        start -- start symbol.
        horzMarkov -- None for default. A number n >= 0 for horizontal markov.
        """
        self.start = start

        count_Y_Z = defaultdict(lambda: defaultdict(int))
        count_X = defaultdict(int)
        for t in parsed_sents:
            # Work on a copy: trees are mutable and we don't want to
            # modify the original.
            unle_trees = unlexicalize(t.copy(deep=True))
            # chomsky normal form with horizontal markov.
            unle_trees.chomsky_normal_form(horzMarkov=horzMarkov)
            # collapse subtrees with a single child.
            unle_trees.collapse_unary(collapsePOS=True)
            for prod in unle_trees.productions():
                count_Y_Z[prod.lhs()][prod.rhs()] += 1
                count_X[prod.lhs()] += 1

        # create a list of productions.
        productions = []
        for X, c_X in count_X.items():
            for (Y_Z, c_Y_Z) in count_Y_Z[X].items():
                q = c_Y_Z / float(c_X)
                productions.append(ProbabilisticProduction(X, Y_Z, prob=q))

        self.production = productions

        grammar = PCFG(Nonterminal(start), productions)
        self.parser = CKYParser(grammar)
Example #4
def random_sentences(grammar_string, n=None, depth=5):
    grammar = PCFG.fromstring(grammar_string)
    i = 0
    while True:
        if i == n: return
        tree = generate(grammar, depth=depth)
        yield ' '.join(utils.flatten(tree))
        i += 1
Example #5
def test1(test_str):
  toy_pcfg = PCFG.fromstring("""
  S -> A A [0.8] | A B [.1] | A S [.1]
  A -> A A [.6] | A B [.2] | 'a' [.2]
  B -> B A [.3] | A B [.2] | 'b' [.5]
  """)
  for t in CYK(toy_pcfg,test_str.split(" "),5):
    t.draw()
Example #6
def test1(test_str):
    toy_pcfg = PCFG.fromstring(
        """
  S -> A A [0.8] | A B [.1] | A S [.1]
  A -> A A [.6] | A B [.2] | 'a' [.2]
  B -> B A [.3] | A B [.2] | 'b' [.5]
  """
    )
    for t in CYK(toy_pcfg, test_str.split(" "), 5):
        t.draw()
Example #7
def loadPCFG(input_PCFG_filename):
    '''
    Loader function for a PCFG.
    args: input_PCFG_filename (str) -- path to a grammar file
    returns: nltk.grammar.PCFG object
    '''
    with open(input_PCFG_filename, 'r') as f:
        string = f.read()
    return PCFG.fromstring(string)
Example #8
    def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
        """
        parsed_sents -- list of training trees.
        """
        # { A -> B : count(A -> B) }
        productions_counts = defaultdict(int)
        # { A : count(A) }
        lhs_count = defaultdict(int)  # left_hand_side_count

        self.start = start  # For the CKY parser's grammar
        self.prods = []  # List of productions

        # Copy each tree, because unlexicalize modifies the tree in place.
        # Original: unlexicalize_tree = [unlexicalize(t) for t in parsed_sents]
        unlex_sents = [unlexicalize(t.copy(deep=True)) for t in parsed_sents]

        for t in unlex_sents:
            t.chomsky_normal_form(horzMarkov=horzMarkov)
            t.collapse_unary(collapsePOS=True, collapseRoot=True)
            for prod in t.productions():
                # type(prod): <class 'nltk.grammar.Production'>
                # type(prod.lhs): <class 'nltk.grammar.Nonterminal'>
                # type(prod.rhs): <class 'tuple'>
                #   Each element of prod.rhs() is of type:
                #       <class 'nltk.grammar.Nonterminal'>
                productions_counts[prod] += 1
                lhs_count[prod.lhs()] += 1

        for prod, count_prod in productions_counts.items():
            # type(prod): <class 'nltk.grammar.Production'>
            # prod: A -> B
            # type(count_prod): int
            # count_prod: count(A -> B)
            count_lhs = lhs_count.get(prod.lhs(), 0)

            # type(prod.lhs): <class 'nltk.grammar.Nonterminal'>
            # type(prod.rhs): <class 'tuple'>
            q_ML = float(count_prod) / count_lhs
            self.prods += [ProbabilisticProduction(prod.lhs(),
                                                   prod.rhs(),
                                                   prob=q_ML)]
            # Each element of self.prods is of type:
            #     <class 'nltk.grammar.ProbabilisticProduction'>

        # type(PCFG(...)) = <class 'nltk.grammar.PCFG'>
        # PCFG(start, productions)
        #       type(start): Nonterminal
        #       type(productions): list(Production)
        grammar = PCFG(Nonterminal(start), self.prods)
        self.my_parser = CKYParser(grammar)
Example #9
def fill_missing_words(grammar: PCFG, missing_words: Set[str]):
    # UNK -> word1 | word2 | ... | wordN
    unknown = Nonterminal('UNK')
    unk_rules = [
        Production(unknown, [missing_word]) for missing_word in missing_words
    ]

    # Add UNK as a possibility to all rules with strings in the right hand side
    corrected_rules: List[Nonterminal] = []
    rule: ProbabilisticProduction
    for rule in grammar.productions():

        # right hand side has a string somewhere
        if any(isinstance(element, str) for element in rule.rhs()):

            # rule has already been corrected
            if rule.lhs() in corrected_rules:
                continue

            unk_rules.append(Production(rule.lhs(), [unknown]))

            corrected_rules.append(rule.lhs())

    return induce_pcfg(grammar.start(), grammar.productions() + unk_rules)
Example #10
def _learning_by_biclustering(G, C, T):
    print("learning...")
    global biclusters
    global ignore_mc_ec
    
    ## find the valid bicluster Bc in T that leads to the maximal posterior gain (Eq.2)
    BC = None
    
    ## first attempt
    attempts = 3
    while BC is None and attempts > 0:
        attempts -= 1
        BC = _get_best_bicluster(T, C)
    
    if BC is None:
        ignore_mc_ec = True
    
        ## second attempt
        attempts = 2
        while BC is None and attempts > 0:
            attempts -= 1
            BC = _get_best_bicluster(T, C)
    
        if BC is None:
            return False, G, C, T, None
        ignore_mc_ec = False
        
    ## create an AND symbol N and two OR symbols A, B
    N = Nonterminal("_AND_"+str(_get_and_symb_index()))
    A = Nonterminal("_OR_"+str(_get_or_symb_index()))
    B = Nonterminal("_OR_"+str(_get_or_symb_index()))
    bc = BC.to_numpy()  # .as_matrix() was removed in modern pandas
    s = np.sum(bc)
    row_prob = np.sum(bc, 1)/s
    col_prob = np.sum(bc, 0)/s
    ## create the rules
    rules = []
    rules += [ProbabilisticProduction(A, [_format_nt(BC.index[i])], prob=row_prob[i])
              for i in range(BC.shape[0])]
    rules += [ProbabilisticProduction(B, [_format_nt(BC.columns[j])], prob=col_prob[j])
              for j in range(BC.shape[1])]
    rules += [ProbabilisticProduction(N, [A, B], prob=1.)]
    ## updates
    G_updated = PCFG(G.start(), G.productions() + rules) # add the new rules to G
    C_reduced = _reduce_corpus(C, BC, N) # reduce the corpus
    T_updated = _create_t(C_reduced) # update T
    biclusters[(N.symbol(),A.symbol(),B.symbol())] = BC # save BC for the learned group
    return True, G_updated, C_reduced, T_updated, N
Example #11
def langley_2(depth=5, n=500):
    G = PCFG.fromstring("""
    S -> NP VP [1.0]
    VP -> V NP [1.0]
    NP -> Det N [0.5] | Det N RC [0.5]
    RC -> Rel VP [1.0]
    Det -> 'the' [0.5] | 'a' [0.5]
    V -> 'saw' [0.5] | 'heard' [0.5]
    N -> 'cat' [0.3333] | 'dog' [0.3333] | 'mouse' [0.3333]
    Rel -> 'that' [1.0]
    """)
    C = ""  # corpus
    ## all possible sentences
    print("\n")
    for n, sent in enumerate(generate.generate(G, depth=depth, n=n), 1):
        s = ' '.join(sent)
        C += s + '. '
        print('%3d. %s%s' % (n, s, '.'))
    return G, C
Example #12
def baseline(depth=5, n=500):
    ## nonterminal symbols
    S = Nonterminal("S")
    NP = Nonterminal("NP")
    VP = Nonterminal("VP")
    PP = Nonterminal("PP")
    Det = Nonterminal("Det")
    Vt = Nonterminal("Vt")
    Vc = Nonterminal("Vc")
    Vi = Nonterminal("Vi")
    N = Nonterminal("N")
    P = Nonterminal("P")
    ## probabilistic production rules
    R = [
        ProbabilisticProduction(S, [NP, VP], prob=1.),
        ProbabilisticProduction(NP, [Det, N], prob=1.),
        ProbabilisticProduction(VP, [Vt, NP], prob=1 / 3),
        ProbabilisticProduction(VP, [Vc, PP], prob=1 / 3),
        ProbabilisticProduction(VP, [Vi], prob=1 / 3),
        ProbabilisticProduction(PP, [P, NP], prob=1.),
        ProbabilisticProduction(Det, ["a"], prob=.5),
        ProbabilisticProduction(Det, ["the"], prob=.5),
        ProbabilisticProduction(Vt, ["touches"], prob=.5),
        ProbabilisticProduction(Vt, ["covers"], prob=.5),
        ProbabilisticProduction(Vi, ["rolls"], prob=.5),
        ProbabilisticProduction(Vi, ["bounces"], prob=.5),
        ProbabilisticProduction(Vc, ["is"], prob=1.),
        ProbabilisticProduction(N, ["circle"], prob=1 / 3),
        ProbabilisticProduction(N, ["square"], prob=1 / 3),
        ProbabilisticProduction(N, ["triangle"], prob=1 / 3),
        ProbabilisticProduction(P, ["above"], prob=.5),
        ProbabilisticProduction(P, ["below"], prob=.5)
    ]
    G = PCFG(S, R)  # grammar
    C = ""  # corpus
    ## all possible sentences
    print("\n")
    for n, sent in enumerate(generate.generate(G, depth=depth, n=n), 1):
        s = ' '.join(sent)
        C += s + '. '
        print('%3d. %s%s' % (n, s, '.'))
    return G, C
Example #13
def langley_1(depth=5, n=500):
    ## nonterminal symbols
    S = Nonterminal("S")
    NP = Nonterminal("NP")
    VP = Nonterminal("VP")
    AP = Nonterminal("AP")
    Adj = Nonterminal("Adj")
    Det = Nonterminal("Det")
    Vt = Nonterminal("Vt")
    Vi = Nonterminal("Vi")
    N = Nonterminal("N")
    ## probabilistic production rules
    R = [
        ProbabilisticProduction(S, [NP, VP], prob=1.),
        ProbabilisticProduction(VP, [Vi], prob=.5),
        ProbabilisticProduction(VP, [Vt, NP], prob=.5),
        ProbabilisticProduction(NP, [Det, N], prob=.5),
        ProbabilisticProduction(NP, [Det, AP, N], prob=.5),
        ProbabilisticProduction(AP, [Adj], prob=.5),
        ProbabilisticProduction(AP, [Adj, AP], prob=.5),
        ProbabilisticProduction(Det, ["the"], prob=1.),
        ProbabilisticProduction(Vt, ["saw"], prob=.5),
        ProbabilisticProduction(Vt, ["heard"], prob=.5),
        ProbabilisticProduction(Vi, ["ate"], prob=.5),
        ProbabilisticProduction(Vi, ["slept"], prob=.5),
        ProbabilisticProduction(N, ["cat"], prob=.5),
        ProbabilisticProduction(N, ["dog"], prob=.5),
        ProbabilisticProduction(Adj, ["big"], prob=.5),
        ProbabilisticProduction(Adj, ["old"], prob=.5)
    ]
    G = PCFG(S, R)  # grammar
    C = ""  # corpus
    ## all possible sentences
    print("\n")
    for n, sent in enumerate(generate.generate(G, depth=depth, n=n), 1):
        s = ' '.join(sent)
        C += s + '. '
        print('%3d. %s%s' % (n, s, '.'))
    return G, C
Example #14
def _postprocessing(G, C):
    print("\npostprocessing...")
    ## drop the _START_ -> ... rule
    rules = []
    for prod in G.productions():
        if G.start().symbol() not in prod.lhs().symbol():
            rules.append(prod)
    if len(rules) == 0:
        return G
    ## create an OR symbol S
    S = Nonterminal("_START_")
    sss = {} # single symbol sentences
    ## for each sentence s in C do
    ##   if s is fully reduced to a single symbol x then
    ##   add S -> x to G, or if the rule already exists, increase its weight by 1
    for sentence in sent_tokenize(C):
        sentence = re.sub(r'[^\w\s]', '', sentence)
        t = word_tokenize(sentence)
        if len(t) == 1:
            sss[t[0]] = 1 if not t[0] in sss else sss[t[0]] + 1
    weight_sum = sum([sss[k] for k in sss])
    rules += [ProbabilisticProduction(S, [_format_nt(k)], prob=sss[k]/weight_sum) for k in sss]
    return PCFG(S, rules)
Example #15
    def test_parse_ambiguity(self):
        # Example taken from pages 4, 5, and 8 of Michael Collins's notes
        # "Probabilistic Context-Free Grammars (PCFGs)"
        grammar = PCFG.fromstring("""
                S -> NP VP              [1.0]

                VP -> Vt NP             [0.65]
                VP -> VP PP             [0.35]

                NP -> DT NN             [0.8]
                NP -> NP PP             [0.2]

                PP -> IN NP             [1.0]

                Vt -> saw               [1.0]

                NN -> man               [0.2]
                NN -> telescope         [0.3]
                NN -> dog               [0.5]

                DT -> the               [1.0]

                IN -> with              [1.0]
            """)

        # Changing this:
        # VP -> Vt NP             [0.85]
        # VP -> VP PP             [0.15]
        # yields the other parse tree

        parser = CKYParser(grammar)

        lp, t = parser.parse('the man saw the dog with the telescope'.split())

        # draw_trees(t)

        # check tree
        t2 = Tree.fromstring("""
                    (S
                        (NP
                            (DT the)
                            (NN man)
                        )
                        (VP
                            (VP
                                (Vt saw)
                                (NP
                                    (DT the)
                                    (NN dog)
                                )
                            )
                            (PP
                                (IN with)
                                (NP
                                    (DT the)
                                    (NN telescope)
                                )
                            )
                        )
                    )
                """)

        self.assertEqual(t, t2)

        # check log probability
        lp2 = log2(1.0 * 0.8 * 1.0 * 0.2 * 0.35 * 0.65 * 1.0 * 0.8 * 1.0 *
                   0.5 * 1.0 * 1.0 * 0.8 * 1.0 * 0.3)

        self.assertAlmostEqual(lp, lp2)
Example #16
import nltk
from nltk.corpus import treebank
from itertools import islice
from nltk.grammar import PCFG, induce_pcfg, toy_pcfg1, toy_pcfg2
gram2 = PCFG.fromstring("""
	A -> B B [.3] | C B C [.7]
	B -> B D [.5] | C [.5]
	C -> 'a' [.1] | 'b' [0.9]
	D -> 'b' [1.0]
	""")
prod1 = gram2.productions()[0]
print(prod1)
prod2 = gram2.productions()[1]
print(prod2)
print(prod2.lhs())
print(prod2.rhs())
print(prod2.prob())
print(gram2.start())
print(gram2.productions())
Example #17
    def test_parse(self):
        grammar = PCFG.fromstring("""
                S -> NP VP              [1.0]
                NP -> Det Noun          [0.6]
                NP -> Noun Adj          [0.4]
                VP -> Verb NP           [1.0]
                Det -> 'el'             [1.0]
                Noun -> 'gato'          [0.9]
                Noun -> 'pescado'       [0.1]
                Verb -> 'come'          [1.0]
                Adj -> 'crudo'          [1.0]
            """)

        parser = CKYParser(grammar)

        lp, t = parser.parse('el gato come pescado crudo'.split())

        # check chart
        pi = {
            (1, 1): {
                'Det': log2(1.0)
            },
            (2, 2): {
                'Noun': log2(0.9)
            },
            (3, 3): {
                'Verb': log2(1.0)
            },
            (4, 4): {
                'Noun': log2(0.1)
            },
            (5, 5): {
                'Adj': log2(1.0)
            },
            (1, 2): {
                'NP': log2(0.6 * 1.0 * 0.9)
            },
            (2, 3): {},
            (3, 4): {},
            (4, 5): {
                'NP': log2(0.4 * 0.1 * 1.0)
            },
            (1, 3): {},
            (2, 4): {},
            (3, 5): {
                'VP': log2(1.0) + log2(1.0) + log2(0.4 * 0.1 * 1.0)
            },
            (1, 4): {},
            (2, 5): {},
            (1, 5): {
                'S':
                log2(1.0) +  # rule S -> NP VP
                log2(0.6 * 1.0 * 0.9) +  # left part
                log2(1.0) + log2(1.0) + log2(0.4 * 0.1 * 1.0)
            },  # right part
        }
        self.assertEqualPi(parser._pi, pi)

        # check partial results
        bp = {
            (1, 1): {
                'Det': Tree.fromstring("(Det el)")
            },
            (2, 2): {
                'Noun': Tree.fromstring("(Noun gato)")
            },
            (3, 3): {
                'Verb': Tree.fromstring("(Verb come)")
            },
            (4, 4): {
                'Noun': Tree.fromstring("(Noun pescado)")
            },
            (5, 5): {
                'Adj': Tree.fromstring("(Adj crudo)")
            },
            (1, 2): {
                'NP': Tree.fromstring("(NP (Det el) (Noun gato))")
            },
            (2, 3): {},
            (3, 4): {},
            (4, 5): {
                'NP': Tree.fromstring("(NP (Noun pescado) (Adj crudo))")
            },
            (1, 3): {},
            (2, 4): {},
            (3, 5): {
                'VP':
                Tree.fromstring(
                    "(VP (Verb come) (NP (Noun pescado) (Adj crudo)))")
            },
            (1, 4): {},
            (2, 5): {},
            (1, 5): {
                'S':
                Tree.fromstring("""(S
                    (NP (Det el) (Noun gato))
                    (VP (Verb come) (NP (Noun pescado) (Adj crudo)))
                   )
                """)
            },
        }
        self.assertEqual(parser._bp, bp)

        # check tree
        t2 = Tree.fromstring("""
                (S
                    (NP (Det el) (Noun gato))
                    (VP (Verb come) (NP (Noun pescado) (Adj crudo)))
                )
            """)
        self.assertEqual(t, t2)

        # check log probability
        lp2 = log2(1.0 * 0.6 * 1.0 * 0.9 * 1.0 * 1.0 * 0.4 * 0.1 * 1.0)
        self.assertAlmostEqual(lp, lp2)
Example #18
beta_20 = round(beta/20, 4)  # rounded to four decimal places
print("beta_20: " + str(beta_20))
# beta_20 and compl_beta_20 must always sum to 0.05
compl_beta_20 = 0.05 - beta_20
print("compl_beta_20: " + str(compl_beta_20))

grammar = """
    S -> A V1 B [0.5] | C W1 D [0.45] | C W2 D [""" + str(beta_20) + """] | A W2 B [""" + str(compl_beta_20) + """]
    A -> 'a0' [0.1] | 'a1' [0.1] | 'a2' [0.1] | 'a3' [0.1] | 'a4' [0.1] | 'a5' [0.1] | 'a6' [0.1] | 'a7' [0.1] | 'a8' [0.1] | 'a9' [0.1]
    B -> 'b0' [0.1] | 'b1' [0.1] | 'b2' [0.1] | 'b3' [0.1] | 'b4' [0.1] | 'b5' [0.1] | 'b6' [0.1] | 'b7' [0.1] | 'b8' [0.1] | 'b9' [0.1]
    C -> 'c0' [0.1] | 'c1' [0.1] | 'c2' [0.1] | 'c3' [0.1] | 'c4' [0.1] | 'c5' [0.1] | 'c6' [0.1] | 'c7' [0.1] | 'c8' [0.1] | 'c9' [0.1]
    D -> 'd0' [0.1] | 'd1' [0.1] | 'd2' [0.1] | 'd3' [0.1] | 'd4' [0.1] | 'd5' [0.1] | 'd6' [0.1] | 'd7' [0.1] | 'd8' [0.1] | 'd9' [0.1]    
    V1 -> 'v0' [0.02] | 'v1' [0.02] | 'v2' [0.02] | 'v3' [0.02] | 'v4' [0.02] | 'v5' [0.02] | 'v6' [0.02] | 'v7' [0.02] | 'v8' [0.02] | 'v9' [0.02] | 'v10' [0.02] | 'v11' [0.02] | 'v12' [0.02] | 'v13' [0.02] | 'v14' [0.02] | 'v15' [0.02] | 'v16' [0.02] | 'v17' [0.02] | 'v18' [0.02] | 'v19' [0.02] | 'v20' [0.02] | 'v21' [0.02] | 'v22' [0.02] | 'v23' [0.02] | 'v24' [0.02] | 'v25' [0.02] | 'v26' [0.02] | 'v27' [0.02] | 'v28' [0.02] | 'v29' [0.02] | 'v30' [0.02] | 'v31' [0.02] | 'v32' [0.02] | 'v33' [0.02] | 'v34' [0.02] | 'v35' [0.02] | 'v36' [0.02] | 'v37' [0.02] | 'v38' [0.02] | 'v39' [0.02] | 'v40' [0.02] | 'v41' [0.02] | 'v42' [0.02] | 'v43' [0.02] | 'v44' [0.02] | 'v45' [0.02] | 'v46' [0.02] | 'v47' [0.02] | 'v48' [0.02] | 'v49' [0.02]
    W1 ->'w5' [0.0232] | 'w6' [0.0222] | 'w7' [0.0222] | 'w8' [0.0222] | 'w9' [0.0222] | 'w10' [0.0222] | 'w11' [0.0222] | 'w12' [0.0222] | 'w13' [0.0222] | 'w14' [0.0222] | 'w15' [0.0222] | 'w16' [0.0222] | 'w17' [0.0222] | 'w18' [0.0222] | 'w19' [0.0222] | 'w20' [0.0222] | 'w21' [0.0222] | 'w22' [0.0222] | 'w23' [0.0222] | 'w24' [0.0222] | 'w25' [0.0222] | 'w26' [0.0222] | 'w27' [0.0222] | 'w28' [0.0222] | 'w29' [0.0222] | 'w30' [0.0222] | 'w31' [0.0222] | 'w32' [0.0222] | 'w33' [0.0222] | 'w34' [0.0222] | 'w35' [0.0222] | 'w36' [0.0222] | 'w37' [0.0222] | 'w38' [0.0222] | 'w39' [0.0222] | 'w40' [0.0222] | 'w41' [0.0222] | 'w42' [0.0222] | 'w43' [0.0222] | 'w44' [0.0222] | 'w45' [0.0222] | 'w46' [0.0222] | 'w47' [0.0222] | 'w48' [0.0222] | 'w49' [0.0222] 
    W2 -> 'w0' [0.2] | 'w1' [0.2] | 'w2' [0.2] | 'w3' [0.2] | 'w4' [0.2] 

"""
print(grammar)

ambiguity = PCFG.fromstring(grammar)
print(ambiguity)


archivo_destino = 'ambiguity_alfa_' + str(alfa) + '.txt'
with open(archivo_destino, 'w') as f:
    for sentence in pcfg_generate(ambiguity, n=100000, depth=100):
       f.write(' '.join(sentence) +'\n')



Example #19
def demo(choice=None, draw_parses=None, print_parses=None):
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys, time
    from functools import reduce  # reduce is not a builtin in Python 3
    from nltk import tokenize
    from nltk.parse import pchart

    # Define two demos.  Each demo has a sentence and a grammar.
    toy_pcfg1 = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
    Det -> 'the' [0.8] | 'my' [0.2]
    N -> 'man' [0.5] | 'telescope' [0.5]
    VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
    V -> 'ate' [0.35] | 'saw' [0.65]
    PP -> P NP [1.0]
    P -> 'with' [0.61] | 'under' [0.39]
    """)

    toy_pcfg2 = PCFG.fromstring("""
    S    -> NP VP         [1.0]
    VP   -> V NP          [.59]
    VP   -> V             [.40]
    VP   -> VP PP         [.01]
    NP   -> Det N         [.41]
    NP   -> Name          [.28]
    NP   -> NP PP         [.31]
    PP   -> P NP          [1.0]
    V    -> 'saw'         [.21]
    V    -> 'ate'         [.51]
    V    -> 'ran'         [.28]
    N    -> 'boy'         [.11]
    N    -> 'cookie'      [.12]
    N    -> 'table'       [.13]
    N    -> 'telescope'   [.14]
    N    -> 'hill'        [.5]
    Name -> 'Jack'        [.52]
    Name -> 'Bob'         [.48]
    P    -> 'with'        [.61]
    P    -> 'under'       [.39]
    Det  -> 'the'         [.41]
    Det  -> 'a'           [.31]
    Det  -> 'my'          [.28]
    """)

    demos = [('I saw John with my telescope', toy_pcfg1),
             ('the boy saw Jack with Bob under the table with a telescope',
              toy_pcfg2)]

    if choice is None:
        # Ask the user which demo they want to use.
        print()
        for i in range(len(demos)):
            print('%3s: %s' % (i+1, demos[i][0]))
            print('     %r' % demos[i][1])
            print()
        print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
        choice = int(sys.stdin.readline().strip())-1
    try:
        sent, grammar = demos[choice]
    except (IndexError, TypeError):
        print('Bad sentence number')
        return

    # Tokenize the sentence.
    tokens = sent.split()

    # Define a list of parsers.  We'll use all parsers.
    parsers = [
        pchart.InsideChartParser(grammar),
        pchart.RandomChartParser(grammar),
        pchart.UnsortedChartParser(grammar),
        pchart.LongestChartParser(grammar),
        pchart.InsideChartParser(grammar, beam_size = len(tokens)+1)   # was BeamParser
        ]

    # Run the parsers on the tokenized sentence.
    times = []
    average_p = []
    num_parses = []
    all_parses = {}
    for parser in parsers:
        print('\ns: %s\nparser: %s\ngrammar: %s' % (sent,parser,grammar))
        parser.trace(3)
        t = time.time()
        parses = list(parser.parse(tokens))
        times.append(time.time()-t)
        p = (reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses) if parses else 0)
        average_p.append(p)
        num_parses.append(len(parses))
        for p in parses: all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print('       Parser      Beam | Time (secs)   # Parses   Average P(parse)')
    print('------------------------+------------------------------------------')
    for i in range(len(parsers)):
        print('%18s %4d |%11.4f%11d%19.14f' % (parsers[i].__class__.__name__,
                                             parsers[i].beam_size,
                                             times[i],num_parses[i],average_p[i]))
    parses = all_parses.keys()
    if parses: p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
    else: p = 0
    print('------------------------+------------------------------------------')
    print('%18s      |%11s%11d%19.14f' % ('(All Parses)', 'n/a', len(parses), p))

    if draw_parses is None:
        # Ask the user if we should draw the parses.
        print()
        print('Draw parses (y/n)? ', end=' ')
        draw_parses = sys.stdin.readline().strip().lower().startswith('y')
    if draw_parses:
        from nltk.draw.tree import draw_trees
        print('  please wait...')
        draw_trees(*parses)

    if print_parses is None:
        # Ask the user if we should print the parses.
        print()
        print('Print parses (y/n)? ', end=' ')
        print_parses = sys.stdin.readline().strip().lower().startswith('y')
    if print_parses:
        for parse in parses:
            print(parse)
Example #20
    def test_parse_2(self):
        grammar = PCFG.fromstring("""
                S -> NP VP              [1.0]
                NP -> NP PP             [0.5]
                NP -> Det Noun          [0.5]
                VP -> VP PP             [0.9]
                VP -> Verb NP           [0.1]
                PP -> Prep NP           [1.0]
                Noun -> 'dog'           [0.2]
                Noun -> 'man'           [0.2]
                Noun -> 'town'          [0.6]
                Verb -> 'saw'           [1.0]
                Prep -> 'in'            [1.0]
                Det -> 'the'            [1.0]
            """)

        parser = CKYParser(grammar)

        lp, t = parser.parse('the man saw the dog in the town'.split())

        # check chart
        pi = {
            (1, 1): {
                'Det': log2(1.0)
            },
            (2, 2): {
                'Noun': log2(0.2)
            },
            (3, 3): {
                'Verb': log2(1.0)
            },
            (4, 4): {
                'Det': log2(1.0)
            },
            (5, 5): {
                'Noun': log2(0.2)
            },
            (6, 6): {
                'Prep': log2(1.0)
            },
            (7, 7): {
                'Det': log2(1.0)
            },
            (8, 8): {
                'Noun': log2(0.6)
            },
            (1, 2): {
                'NP': -3.321928094887362
            },
            (2, 3): {},
            (3, 4): {},
            (4, 5): {
                'NP': -3.321928094887362
            },
            (5, 6): {},
            (6, 7): {},
            (7, 8): {
                'NP': -1.736965594166206
            },
            (1, 3): {},
            (2, 4): {},
            (3, 5): {
                'VP': -6.643856189774724
            },
            (4, 6): {},
            (5, 7): {},
            (6, 8): {
                'PP': -1.736965594166206
            },
            (1, 4): {},
            (2, 5): {},
            (3, 6): {},
            (4, 7): {},
            (5, 8): {},
            (1, 5): {
                'S': -9.965784284662087
            },
            (2, 6): {},
            (3, 7): {},
            (4, 8): {
                'NP': -6.058893689053567
            },
            (1, 6): {},
            (2, 7): {},
            (3, 8): {
                'VP': -8.53282487738598
            },
            (1, 7): {},
            (2, 8): {},
            (1, 8): {
                'S': -11.854752972273342
            },
        }
        self.assertEqualPi(parser._pi, pi)

        bp = {
            (1, 1): {
                'Det': Tree.fromstring('(Det the)')
            },
            (2, 2): {
                'Noun': Tree.fromstring('(Noun man)')
            },
            (3, 3): {
                'Verb': Tree.fromstring('(Verb saw)')
            },
            (4, 4): {
                'Det': Tree.fromstring('(Det the)')
            },
            (5, 5): {
                'Noun': Tree.fromstring('(Noun dog)')
            },
            (6, 6): {
                'Prep': Tree.fromstring('(Prep in)')
            },
            (7, 7): {
                'Det': Tree.fromstring('(Det the)')
            },
            (8, 8): {
                'Noun': Tree.fromstring('(Noun town)')
            },
            (1, 2): {
                'NP': Tree.fromstring('(NP (Det the) (Noun man))')
            },
            (2, 3): {},
            (3, 4): {},
            (4, 5): {
                'NP': Tree.fromstring('(NP (Det the) (Noun dog))')
            },
            (5, 6): {},
            (6, 7): {},
            (7, 8): {
                'NP': Tree.fromstring('(NP (Det the) (Noun town))')
            },
            (1, 3): {},
            (2, 4): {},
            (3, 5): {
                'VP':
                Tree.fromstring('(VP (Verb saw) (NP (Det the) (Noun dog)))')
            },
            (4, 6): {},
            (5, 7): {},
            (6, 8): {
                'PP':
                Tree.fromstring('(PP (Prep in) (NP (Det the) (Noun town)))')
            },
            (1, 4): {},
            (2, 5): {},
            (3, 6): {},
            (4, 7): {},
            (5, 8): {},
            (1, 5): {
                'S':
                Tree.fromstring("""(S
                      (NP (Det the) (Noun man))
                      (VP (Verb saw) (NP (Det the) (Noun dog))))""")
            },
            (2, 6): {},
            (3, 7): {},
            (4, 8): {
                'NP':
                Tree.fromstring("""(NP
                      (NP (Det the) (Noun dog))
                      (PP (Prep in) (NP (Det the) (Noun town))))""")
            },
            (1, 6): {},
            (2, 7): {},
            (3, 8): {
                'VP':
                Tree.fromstring("""(VP
                      (VP (Verb saw) (NP (Det the) (Noun dog)))
                      (PP (Prep in) (NP (Det the) (Noun town))))""")
            },
            (1, 7): {},
            (2, 8): {},
            (1, 8): {
                'S':
                Tree.fromstring("""(S
                      (NP (Det the) (Noun man))
                      (VP
                        (VP (Verb saw) (NP (Det the) (Noun dog)))
                        (PP (Prep in) (NP (Det the) (Noun town)))))""")
            },
        }

        self.assertEqual(parser._bp, bp)
Example #21
from nltk.grammar import Nonterminal, PCFG
from pcfg_generate import *


grammar = """
    S -> 'a' V 'b' [0.25] | 'b' V 'a' [0.25] | 'a' W 'a' [0.125] | 'a' W 'b' [0.125] | 'b' W 'a' [0.125] | 'b' W 'b' [0.125]
    V -> 'v0' [0.2] | 'v1' [0.2] | 'v2' [0.2] | 'v3' [0.2] | 'v4' [0.2]
    W -> 'w0' [0.2] | 'w1' [0.2] | 'w2' [0.2] | 'w3' [0.2] | 'w4' [0.2]
"""
print(grammar)

nonconflation = PCFG.fromstring(grammar)

print(nonconflation)

"""
texto = ""
for sentence in pcfg_generate(nonconflation, n=100000, depth=6):
    #print(' '.join(sentence))
    #print(sentence)
    str1 = ' '.join(sentence)
    texto = texto  + str1 + '\n'
"""

with open('nonconflation.txt', 'w') as f:
    for sentence in pcfg_generate(nonconflation, n=100000, depth=100):
       f.write(' '.join(sentence) +'\n')


Example #22
    def test_parse(self):
        grammar = PCFG.fromstring(
            """
                S -> NP VP              [1.0]
                NP -> Det Noun          [0.6]
                NP -> Noun Adj          [0.4]
                VP -> Verb NP           [1.0]
                Det -> 'el'             [1.0]
                Noun -> 'gato'          [0.9]
                Noun -> 'pescado'       [0.1]
                Verb -> 'come'          [1.0]
                Adj -> 'crudo'          [1.0]
            """)

        parser = CKYParser(grammar)

        lp, t = parser.parse('el gato come pescado crudo'.split())

        # check chart
        pi = {
            (1, 1): {'Det': log2(1.0)},
            (2, 2): {'Noun': log2(0.9)},
            (3, 3): {'Verb': log2(1.0)},
            (4, 4): {'Noun': log2(0.1)},
            (5, 5): {'Adj': log2(1.0)},

            (1, 2): {'NP': log2(0.6 * 1.0 * 0.9)},
            (2, 3): {},
            (3, 4): {},
            (4, 5): {'NP': log2(0.4 * 0.1 * 1.0)},

            (1, 3): {},
            (2, 4): {},
            (3, 5): {'VP': log2(1.0) + log2(1.0) + log2(0.4 * 0.1 * 1.0)},

            (1, 4): {},
            (2, 5): {},

            (1, 5): {'S':
                     log2(1.0) +  # rule S -> NP VP
                     log2(0.6 * 1.0 * 0.9) +  # left part
                     log2(1.0) + log2(1.0) + log2(0.4 * 0.1 * 1.0)},  # right part
        }
        self.assertEqualPi(parser._pi, pi)

        # check partial results
        bp = {
            (1, 1): {'Det': Tree.fromstring("(Det el)")},
            (2, 2): {'Noun': Tree.fromstring("(Noun gato)")},
            (3, 3): {'Verb': Tree.fromstring("(Verb come)")},
            (4, 4): {'Noun': Tree.fromstring("(Noun pescado)")},
            (5, 5): {'Adj': Tree.fromstring("(Adj crudo)")},

            (1, 2): {'NP': Tree.fromstring("(NP (Det el) (Noun gato))")},
            (2, 3): {},
            (3, 4): {},
            (4, 5): {'NP': Tree.fromstring("(NP (Noun pescado) (Adj crudo))")},

            (1, 3): {},
            (2, 4): {},
            (3, 5): {'VP': Tree.fromstring(
                "(VP (Verb come) (NP (Noun pescado) (Adj crudo)))")},

            (1, 4): {},
            (2, 5): {},

            (1, 5): {'S': Tree.fromstring(
                """(S
                    (NP (Det el) (Noun gato))
                    (VP (Verb come) (NP (Noun pescado) (Adj crudo)))
                   )
                """)},
        }
        self.assertEqual(parser._bp, bp)

        # check tree
        t2 = Tree.fromstring(
            """
                (S
                    (NP (Det el) (Noun gato))
                    (VP (Verb come) (NP (Noun pescado) (Adj crudo)))
                )
            """)
        self.assertEqual(t, t2)

        # check log probability
        lp2 = log2(1.0 * 0.6 * 1.0 * 0.9 * 1.0 * 1.0 * 0.4 * 0.1 * 1.0)
        self.assertAlmostEqual(lp, lp2)
Example #23
def extract_simple_pcfg(n):
    rules = extract_simple_productions(n)
    pcfg = grammar.induce_pcfg(Nonterminal("S"), rules)
    return PCFG(pcfg.start(), sort_rules(pcfg.productions()))
Example #24
def _attaching(N, G, C, T):
    print("attaching...")
    C_derived = _apply_grammar(G, C)
    ORs = [] # list of OR symbols (Nonterminal)
    for prod in G.productions():
        nt = prod.lhs()
        if "OR" in nt.symbol() and nt not in ORs:
            ORs.append(nt)
    ## for each OR symbol O in G do
    for O in ORs:
        ## if O leads to a valid expanded bicluster
        ## as well as a posterior gain (Eq.3) larger than a threshold then
        
        #
        #   AND-OR group
        
        group = None
        pos = None # left or right (odd-False or even-True)
        ## retrieve O's AND-OR group
        for g in biclusters:
            if O.symbol() in g[1] or O.symbol() in g[2]:
                group = g
                break
        ## retrieve O's position within the group
        num = int(O.symbol()[4:]) # OR number, e.g. "_OR_2" -> 2
        pos = True if num % 2 == 0 else False
        
        #
        #   BC_tilde and BC_tilde_prime
        
        ## create BC_t (BC_tilde)
        BC_t = biclusters[group].copy()
        ## fill BC_t
        for pair in _get_bicluster_pairs(BC_t):
            BC_t.at[pair] = _count_occ(" ".join(pair), C_derived)
        ## create BC_t_1 (BC_tilde_prime) (proposed new rule OR -> AND)
        BC_t_1 = BC_t.copy()
        ## . fill BC_t_1
        if pos == False:
            ## new row (OR on the left)
            new_row = [_count_occ(" ".join((N.symbol(),x)), C) for x in BC_t.columns]
            BC_t_1.loc[N.symbol(),:] = new_row
            BC_t_1 = BC_t_1.astype(int)
        else:
            ## new column (OR on the right)
            new_col = [_count_occ(" ".join((x,N.symbol())), C) for x in BC_t.index]
            BC_t_1.loc[:,N.symbol()] = new_col
            BC_t_1 = BC_t_1.astype(int)
        
        #
        #   EC_tilde and EC_tilde_prime

        ## create and fill EC_t
        EC_t = _create_ec(BC_t, C_derived, _create_t(C_derived))
        ## create EC_t_1
        EC_t_1 = EC_t.copy()
        ## . add the new rows of EC_t_1
        if pos == False:
            ## OR on the left
            new_row_indices = [(N.symbol(),col) for col in BC_t_1.columns]
        else:
            ## OR on the right
            new_row_indices = [(row,N.symbol()) for row in BC_t_1.index]
        ## . fill the new rows of EC_t_1
        for i in new_row_indices:
            i_str = _tuple_to_ec_index(i, True)
            EC_t_1.loc[i_str,:] = [-1]*EC_t_1.shape[1]
            for j in EC_t_1.columns:
                e, c = " ".join(i), list(_ec_index_to_tuple(j, False)) # expression, context
                c = tuple(["" if _represents_int(x) else x for x in c])
                EC_t_1.loc[i_str,j] = _count_occ(" ".join([c[0],e,c[1]]).strip(), C)
        EC_t_1 = EC_t_1.astype(int)
        bc_t_1 = BC_t_1.to_numpy()  # .as_matrix() was removed in modern pandas
        ec_t_1 = EC_t_1.to_numpy()
        bc_t = BC_t.to_numpy()
        ec_t = EC_t.to_numpy()
        
        #
        #   LOG POSTERIOR GAIN DIFFERENCE (Eq.3)
        
        ## skip O unless BC and EC (both versions) are valid (MC)
        if not (_is_mc(bc_t_1) and _is_mc(ec_t_1) and _is_mc(bc_t) and _is_mc(ec_t)):
            continue
        
        lpg_diff = _log_posterior_gain(bc_t_1, ec_t_1)
        lpg_diff -= _log_posterior_gain(bc_t, ec_t)
        
        if lpg_diff > LPG_DIFF_THRESHOLD:
            print("new rule: %s -> %s" % (O.symbol(),N.symbol()))
            bc = BC_t_1.to_numpy()
            s = np.sum(bc)
            row_prob = np.sum(bc, 1)/s
            col_prob = np.sum(bc, 0)/s
            ## rules
            rules = []
            for prod in G.productions():
                if O.symbol() not in prod.lhs().symbol():
                    rules.append(prod)
            ## add the new rules
            if pos == False:
                ## OR on the left
                probs = row_prob
                rhs_symbols = [x for x in BC_t.index]+[N]
                for i in range(BC_t_1.shape[0]):
                    rules.append(ProbabilisticProduction(O, [rhs_symbols[i]], prob=probs[i]))
            else:
                ## OR on the right
                probs = col_prob
                rhs_symbols = [x for x in BC_t.columns]+[N]
                for j in range(BC_t_1.shape[1]):
                    rules.append(ProbabilisticProduction(O, [rhs_symbols[j]], prob=probs[j]))
                
            ## updates
            biclusters[group] = BC_t_1.copy() # update the AND-OR group
            G = PCFG(G.start(), rules) # update G
            C = _reduce_corpus(C, biclusters[group], N, True) # reduce C
            T = _create_t(C) # update T
            
    return G, C, T
Example #25
def loadGrammar(args):
    with open(args.grammar_file, 'r') as f:
        pcfg = PCFG.fromstring(f.read())
    return pcfg
Example #26
            'Usage: %s grammar_file csv_file skiprows nrows (-1 for all) top_K'
            % sys.argv[0])
        print(
            'Example: %s grammar/airbnb_grammar.txt ../data/Airbnb/SanFrancisco_details.csv 0 3 20'
            % sys.argv[0])
        exit(0)
    grammar_txt = open(sys.argv[1]).read()
    csv_file = sys.argv[2]
    skiprows = int(sys.argv[3])
    nrows = int(sys.argv[4])
    if nrows == -1:
        nrows = None
    top_K = int(sys.argv[5])

    # print('Grammar:\n' + grammar_txt + '\n')
    grammar = PCFG.fromstring(grammar_txt)

    # Read CSV file into Pandas DataFrame
    # Handle DtypeWarning: Columns (43) have mixed types. [One entry with zipcode '94107-1273']
    # Fix dollar sign and thousands separators in 'price'
    df = pd.read_csv(csv_file,
                     skiprows=skiprows,
                     nrows=nrows,
                     dtype={
                         'host_id': str,  # np.str was removed; plain str works
                         'zipcode': str
                     },
                     converters={
                         'price':
                         lambda s: float(s.replace('$', '').replace(',', ''))
                     })
Example #27
grammar = PCFG.fromstring("""
    S -> negside eqside [0.5]
    S -> side eqside [0.5]
    digit -> '0' [0.1]
    digit -> '1' [0.1]
    digit -> '2' [0.1]
    digit -> '3' [0.1]
    digit -> '4' [0.1]
    digit -> '5' [0.1]
    digit -> '6' [0.1]
    digit -> '7' [0.1]
    digit -> '8' [0.1]
    digit -> '9' [0.1]
    div -> '/' [1.0]
    divnum -> div number [1.0]
    divnumvar -> divnum variable [1.0]
    dot -> '.' [1.0]
    eq -> '=' [1.0]
    eqside -> eq negside [0.5]
    eqside -> eq side [0.5]
    lparen -> '(' [1.0]
    minus -> '-' [1.0]
    minusterm -> minus term [1.0]
    negside -> minus side [1.0]
    number -> '0' [.05]
    number -> '1' [.05]
    number -> '2' [.05]
    number -> '3' [.05]
    number -> '4' [.05]
    number -> '5' [.05]
    number -> '6' [.05]
    number -> '7' [.05]
    number -> '8' [.05]
    number -> '9' [.05]
    number -> digit number [.05]
    number -> dot number [0.45]
    parenside -> lparen siderparen [1.0]
    plus -> '+' [1.0]
    plusterm -> plus term [1.0]
    rparen -> ')' [1.0]
    side -> '0' [.04]
    side -> '1' [.04]
    side -> '2' [.04]
    side -> '3' [.04]
    side -> '4' [.04]
    side -> '5' [.04]
    side -> '6' [.04]
    side -> '7' [.04]
    side -> '8' [.04]
    side -> '9' [.04]
    side -> 'x' [.04]
    side -> digit number [.04]
    side -> dot number [.04]
    side -> number divnum [.04]
    side -> number divnumvar [.04]
    side -> number parenside [.04]
    side -> number starnum [.04]
    side -> number vardivnum [.04]
    side -> number variable [.04]
    side -> parenside divnum [.04]
    side -> side minusterm [.04]
    side -> side plusterm [.16]
    siderparen -> negside rparen [0.5]
    siderparen -> side rparen [0.5]
    star -> '*' [1.0]
    starnum -> star number [1.0]
    term -> '0' [.05]
    term -> '1' [.05]
    term -> '2' [.05]
    term -> '3' [.05]
    term -> '4' [.05]
    term -> '5' [.05]
    term -> '6' [.05]
    term -> '7' [.05]
    term -> '8' [.05]
    term -> '9' [.05]
    term -> 'x' [.05]
    term -> digit number [.05]
    term -> dot number [.05]
    term -> number divnum [.05]
    term -> number divnumvar [.05]
    term -> number parenside [.05]
    term -> number starnum [.05]
    term -> number vardivnum [.05]
    term -> number variable [.05]
    term -> parenside divnum [.05]
    vardivnum -> variable divnum [1.0]
    variable -> 'x' [1.0]
    varstarnum -> variable starnum [1.0]
""")
Example #28
    def test_parse_2(self):
        grammar = PCFG.fromstring(
            """
                S -> NP VP              [1.0]
                NP -> NP PP             [0.5]
                NP -> Det Noun          [0.5]
                VP -> VP PP             [0.9]
                VP -> Verb NP           [0.1]
                PP -> Prep NP           [1.0]
                Noun -> 'dog'           [0.2]
                Noun -> 'man'           [0.2]
                Noun -> 'town'          [0.6]
                Verb -> 'saw'           [1.0]
                Prep -> 'in'            [1.0]
                Det -> 'the'            [1.0]
            """)

        parser = CKYParser(grammar)

        lp, t = parser.parse('the man saw the dog in the town'.split())

        # check chart
        pi = {
                 (1, 1): {'Det': log2(1.0)},
                 (2, 2): {'Noun': log2(0.2)},
                 (3, 3): {'Verb': log2(1.0)},
                 (4, 4): {'Det': log2(1.0)},
                 (5, 5): {'Noun': log2(0.2)},
                 (6, 6): {'Prep': log2(1.0)},
                 (7, 7): {'Det': log2(1.0)},
                 (8, 8): {'Noun': log2(0.6)},

                 (1, 2): {'NP': -3.321928094887362},
                 (2, 3): {},
                 (3, 4): {},
                 (4, 5): {'NP': -3.321928094887362},
                 (5, 6): {},
                 (6, 7): {},
                 (7, 8): {'NP': -1.736965594166206},

                 (1, 3): {},
                 (2, 4): {},
                 (3, 5): {'VP': -6.643856189774724},
                 (4, 6): {},
                 (5, 7): {},
                 (6, 8): {'PP': -1.736965594166206},

                 (1, 4): {},
                 (2, 5): {},
                 (3, 6): {},
                 (4, 7): {},
                 (5, 8): {},

                 (1, 5): {'S': -9.965784284662087},
                 (2, 6): {},
                 (3, 7): {},
                 (4, 8): {'NP': -6.058893689053567},

                 (1, 6): {},
                 (2, 7): {},
                 (3, 8): {'VP': -8.53282487738598},

                 (1, 7): {},
                 (2, 8): {},

                 (1, 8): {'S': -11.854752972273342},

                 }
        self.assertEqualPi(parser._pi, pi)

        bp = {
            (1, 1): {'Det': Tree.fromstring('(Det the)')},
            (2, 2): {'Noun': Tree.fromstring('(Noun man)')},
            (3, 3): {'Verb': Tree.fromstring('(Verb saw)')},
            (4, 4): {'Det': Tree.fromstring('(Det the)')},
            (5, 5): {'Noun': Tree.fromstring('(Noun dog)')},
            (6, 6): {'Prep': Tree.fromstring('(Prep in)')},
            (7, 7): {'Det': Tree.fromstring('(Det the)')},
            (8, 8): {'Noun': Tree.fromstring('(Noun town)')},
            (1, 2): {'NP': Tree.fromstring('(NP (Det the) (Noun man))')},

            (2, 3): {},
            (3, 4): {},
            (4, 5): {'NP': Tree.fromstring('(NP (Det the) (Noun dog))')},
            (5, 6): {},
            (6, 7): {},
            (7, 8): {'NP': Tree.fromstring('(NP (Det the) (Noun town))')},

            (1, 3): {},
            (2, 4): {},
            (3, 5): {'VP': Tree.fromstring(
                '(VP (Verb saw) (NP (Det the) (Noun dog)))')},
            (4, 6): {},
            (5, 7): {},
            (6, 8): {'PP': Tree.fromstring(
                '(PP (Prep in) (NP (Det the) (Noun town)))')},

            (1, 4): {},
            (2, 5): {},
            (3, 6): {},
            (4, 7): {},
            (5, 8): {},

            (1, 5): {'S': Tree.fromstring(
                """(S
                      (NP (Det the) (Noun man))
                      (VP (Verb saw) (NP (Det the) (Noun dog))))""")},
            (2, 6): {},
            (3, 7): {},
            (4, 8): {'NP': Tree.fromstring(
                """(NP
                      (NP (Det the) (Noun dog))
                      (PP (Prep in) (NP (Det the) (Noun town))))""")},
            (1, 6): {},
            (2, 7): {},
            (3, 8): {'VP': Tree.fromstring(
                """(VP
                      (VP (Verb saw) (NP (Det the) (Noun dog)))
                      (PP (Prep in) (NP (Det the) (Noun town))))""")},
            (1, 7): {},
            (2, 8): {},

            (1, 8): {'S': Tree.fromstring(
                """(S
                      (NP (Det the) (Noun man))
                      (VP
                        (VP (Verb saw) (NP (Det the) (Noun dog)))
                        (PP (Prep in) (NP (Det the) (Noun town)))))""")},
            }

        self.assertEqual(parser._bp, bp)
Example #29
from nltk.grammar import Nonterminal, PCFG
from pcfg_generate import *


sparseness = PCFG.fromstring("""
    S -> A V B [0.5] | C W D [0.5] 
    A -> 'a0' [0.1] | 'a1' [0.1] | 'a2' [0.1] | 'a3' [0.1] | 'a4' [0.1] | 'a5' [0.1] | 'a6' [0.1] | 'a7' [0.1] | 'a8' [0.1] | 'a9' [0.1]
    B -> 'b0' [0.1] | 'b1' [0.1] | 'b2' [0.1] | 'b3' [0.1] | 'b4' [0.1] | 'b5' [0.1] | 'b6' [0.1] | 'b7' [0.1] | 'b8' [0.1] | 'b9' [0.1]
    C -> 'c0' [0.1] | 'c1' [0.1] | 'c2' [0.1] | 'c3' [0.1] | 'c4' [0.1] | 'c5' [0.1] | 'c6' [0.1] | 'c7' [0.1] | 'c8' [0.1] | 'c9' [0.1]
    D -> 'd0' [0.1] | 'd1' [0.1] | 'd2' [0.1] | 'd3' [0.1] | 'd4' [0.1] | 'd5' [0.1] | 'd6' [0.1] | 'd7' [0.1] | 'd8' [0.1] | 'd9' [0.1]
    V -> 'v0' [0.1] | 'v1' [0.1] | 'v2' [0.1] | 'v3' [0.1] | 'v4' [0.1] | 'v5' [0.1] | 'v6' [0.1] | 'v7' [0.1] | 'v8' [0.1] | 'v9' [0.1]    
    W -> 'w0' [0.1] | 'w1' [0.1] | 'w2' [0.1] | 'w3' [0.1] | 'w4' [0.1] | 'w5' [0.1] | 'w6' [0.1] | 'w7' [0.1] | 'w8' [0.1] | 'w9' [0.1]    
""")
print(sparseness)

texto = ""
context_ab = ""
context_cd = ""
for sentence in pcfg_generate(sparseness, n=100000, depth=6):
    #print(' '.join(sentence))
    #print(sentence)
    str1 = ' '.join(sentence)
    texto = texto + str1 + '\n' 

for i in range(10):
    context_ab = context_ab + 'a' + str(i) + ' ' + 'u' + str(i) + ' ' + 'b' + str(i) + ' ' + '\n'

for i in range(10):
    context_cd = context_cd + 'c' + str(i) + ' ' + 'x' + str(i) + ' ' + 'd' + str(i) + ' ' + '\n'

texto = texto + context_ab + context_cd
Example #30
grammar = PCFG.fromstring("""
    S -> side eqside [0.766312]
    side -> side plusterm [0.215816]
    side -> side minusterm [0.204728]
    side -> number variable [0.192571]
    number -> digit number [0.2953]
    digit -> '2' [0.20094]
    number -> '8' [0.0512296]
    variable -> 'x' [1.0]
    minusterm -> minus term [1.0]
    minus -> '-' [1.0]
    term -> digit number [0.185462]
    digit -> '6' [0.062679]
    number -> '0' [0.0918352]
    plusterm -> plus term [1.0]
    plus -> '+' [1.0]
    term -> number divnum [0.205817]
    number -> '2' [0.114112]
    divnum -> div number [1.0]
    div -> '/' [1.0]
    number -> '5' [0.0972758]
    eqside -> eq negside [0.402479]
    eq -> '=' [1.0]
    negside -> minus side [1.0]
    digit -> '1' [0.356414]
    S -> negside eqside [0.233688]
    side -> number divnum [0.0921555]
    digit -> '7' [0.0299559]
    number -> '3' [0.0644647]
    eqside -> eq side [0.597521]
    side -> 'x' [0.0759721]
    side -> '6' [0.00732374]
    digit -> '9' [0.0175681]
    number -> '6' [0.0557699]
    number -> '4' [0.0737897]
    number -> '7' [0.0733781]
    digit -> '3' [0.150809]
    term -> number variable [0.314671]
    side -> number divnumvar [0.0415636]
    divnumvar -> divnum variable [1.0]
    side -> digit number [0.074099]
    number -> dot number [0.0103797]
    dot -> '.' [1.0]
    number -> '1' [0.0461748]
    term -> 'x' [0.032113]
    digit -> '8' [0.0327874]
    digit -> '4' [0.0841404]
    digit -> '5' [0.0397696]
    number -> '9' [0.0262901]
    side -> '1' [0.0088784]
    side -> number parenside [0.0178505]
    parenside -> lparen siderparen [1.0]
    lparen -> '(' [1.0]
    siderparen -> side rparen [0.841012]
    term -> '5' [0.0212008]
    rparen -> ')' [1.0]
    term -> number parenside [0.0322466]
    term -> '3' [0.00984322]
    side -> parenside divnum [0.00844759]
    term -> parenside divnum [0.00383039]
    digit -> '0' [0.0249365]
    side -> '3' [0.00307185]
    term -> '4' [0.0399964]
    term -> '9' [0.0138963]
    term -> '1' [0.0294406]
    side -> '5' [0.00533828]
    term -> '8' [0.0189293]
    side -> '4' [0.00842886]
    side -> '2' [0.0126808]
    term -> '2' [0.035008]
    term -> number divnumvar [0.0251203]
    side -> '0' [0.0074174]
    side -> number vardivnum [0.00829774]
    vardivnum -> variable divnum [1.0]
    term -> number vardivnum [0.00944237]
    side -> dot number [0.0017045]
    side -> number starnum [0.00129243]
    starnum -> star number [1.0]
    star -> '*' [1.0]
    side -> '9' [0.00340901]
    term -> '7' [0.0130946]
    siderparen -> negside rparen [0.158988]
    term -> dot number [0.00218243]
    term -> number starnum [0.0014698]
    term -> '0' [0.00080171]
    side -> '8' [0.00681801]
    side -> '7' [0.00213531]
    term -> '6' [0.00543381]
""")
Example #31
    def test_ambiguo(self):
        grammar = PCFG.fromstring("""
                S -> NP VP              [1.0]
                VP -> Vt NP             [0.3]
                VP -> VP PP             [0.7]
                NP -> NP PP             [0.6]
                NP -> DT NN             [0.4]
                PP -> IN NP             [1.0]
                Vt -> 'saw'             [1.0]
                NN -> 'man'             [0.33]
                NN -> 'telescope'       [0.33]
                NN -> 'dog'             [0.34]
                DT -> 'the'             [1.0]
                IN -> 'with'            [1.0]
            """)

        parser = CKYParser(grammar)

        lp, t = parser.parse('the man saw the dog with the telescope'.split())

        pi = {
            (1, 1): {
                'DT': 0.0
            },
            (2, 2): {
                'NN': -1.5994620704162712
            },
            (3, 3): {
                'Vt': 0.0
            },
            (4, 4): {
                'DT': 0.0
            },
            (5, 5): {
                'NN': -1.5563933485243853
            },
            (6, 6): {
                'IN': 0.0
            },
            (7, 7): {
                'DT': 0.0
            },
            (8, 8): {
                'NN': -1.5994620704162712
            },
            (1, 2): {
                'NP': -2.9213901653036336
            },
            (2, 3): {},
            (3, 4): {},
            (4, 5): {
                'NP': -2.8783214434117474
            },
            (5, 6): {},
            (6, 7): {},
            (7, 8): {
                'NP': -2.9213901653036336
            },
            (1, 3): {},
            (2, 4): {},
            (3, 5): {
                'VP': -4.6152870375779536
            },
            (4, 6): {},
            (5, 7): {},
            (6, 8): {
                'PP': -2.9213901653036336
            },
            (1, 4): {},
            (2, 5): {},
            (3, 6): {},
            (4, 7): {},
            (5, 8): {},
            (1, 5): {
                'S': -7.536677202881587
            },
            (2, 6): {},
            (3, 7): {},
            (4, 8): {
                'NP': -6.536677202881587
            },
            (1, 6): {},
            (2, 7): {},
            (3, 8): {
                'VP': -8.051250375711346
            },
            (1, 7): {},
            (2, 8): {},
            (1, 8): {
                'S': -10.972640541014979
            }
        }

        self.assertEqualPi(parser._pi, pi)

        t2 = Tree.fromstring("""
            (S
              (NP (DT the) (NN man))
              (VP
                (VP (Vt saw) (NP (DT the) (NN dog)))
                (PP (IN with) (NP (DT the) (NN telescope)))))
            """)

        self.assertEqual(t, t2)
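The expected chart values above are base-2 log probabilities: each lexical cell holds log2 of the rule probability, and each binary cell adds the two sub-span scores to the log probability of the combining rule. Two of the entries can be checked by hand against the grammar in this test:

    from math import log2

    # Lexical cell: pi[(2, 2)]['NN'] from NN -> 'man' [0.33]
    print(log2(0.33))                    # -1.5994620704162712

    # Binary cell: pi[(1, 2)]['NP'] from NP -> DT NN [0.4],
    # combining pi[(1, 1)]['DT'] = 0.0 and pi[(2, 2)]['NN']
    print(log2(0.4) + 0.0 + log2(0.33))  # -2.9213901653036336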
Example No. 32
def demo(choice=None, draw_parses=None, print_parses=None):
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys, time
    from nltk import tokenize
    from nltk.parse import pchart

    # Define two demos.  Each demo has a sentence and a grammar.
    toy_pcfg1 = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
    Det -> 'the' [0.8] | 'my' [0.2]
    N -> 'man' [0.5] | 'telescope' [0.5]
    VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
    V -> 'ate' [0.35] | 'saw' [0.65]
    PP -> P NP [1.0]
    P -> 'with' [0.61] | 'under' [0.39]
    """)

    toy_pcfg2 = PCFG.fromstring("""
    S    -> NP VP         [1.0]
    VP   -> V NP          [.59]
    VP   -> V             [.40]
    VP   -> VP PP         [.01]
    NP   -> Det N         [.41]
    NP   -> Name          [.28]
    NP   -> NP PP         [.31]
    PP   -> P NP          [1.0]
    V    -> 'saw'         [.21]
    V    -> 'ate'         [.51]
    V    -> 'ran'         [.28]
    N    -> 'boy'         [.11]
    N    -> 'cookie'      [.12]
    N    -> 'table'       [.13]
    N    -> 'telescope'   [.14]
    N    -> 'hill'        [.5]
    Name -> 'Jack'        [.52]
    Name -> 'Bob'         [.48]
    P    -> 'with'        [.61]
    P    -> 'under'       [.39]
    Det  -> 'the'         [.41]
    Det  -> 'a'           [.31]
    Det  -> 'my'          [.28]
    """)

    demos = [('I saw John with my telescope', toy_pcfg1),
             ('the boy saw Jack with Bob under the table with a telescope',
              toy_pcfg2)]

    if choice is None:
        # Ask the user which demo they want to use.
        print()
        for i in range(len(demos)):
            print('%3s: %s' % (i + 1, demos[i][0]))
            print('     %r' % demos[i][1])
            print()
        print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
        choice = int(sys.stdin.readline().strip()) - 1
    try:
        sent, grammar = demos[choice]
    except (IndexError, TypeError):
        print('Bad sentence number')
        return

    # Tokenize the sentence.
    tokens = sent.split()

    # Define a list of parsers.  We'll use all parsers.
    parsers = [
        pchart.InsideChartParser(grammar),
        pchart.RandomChartParser(grammar),
        pchart.UnsortedChartParser(grammar),
        pchart.LongestChartParser(grammar),
        pchart.InsideChartParser(grammar,
                                 beam_size=len(tokens) + 1)  # was BeamParser
    ]

    # Run the parsers on the tokenized sentence.
    times = []
    average_p = []
    num_parses = []
    all_parses = {}
    for parser in parsers:
        print('\ns: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar))
        parser.trace(3)
        t = time.time()
        parses = list(parser.parse(tokens))
        times.append(time.time() - t)
        p = (reduce(lambda a, b: a + b.prob(), parses, 0) /
             len(parses) if parses else 0)
        average_p.append(p)
        num_parses.append(len(parses))
        for p in parses:
            all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print(
        '       Parser      Beam | Time (secs)   # Parses   Average P(parse)')
    print(
        '------------------------+------------------------------------------')
    for i in range(len(parsers)):
        print('%18s %4d |%11.4f%11d%19.14f' %
              (parsers[i].__class__.__name__, parsers[i].beam_size, times[i],
               num_parses[i], average_p[i]))
    parses = all_parses.keys()
    if parses: p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else: p = 0
    print(
        '------------------------+------------------------------------------')
    print('%18s      |%11s%11d%19.14f' %
          ('(All Parses)', 'n/a', len(parses), p))

    if draw_parses is None:
        # Ask the user if we should draw the parses.
        print()
        print('Draw parses (y/n)? ', end=' ')
        draw_parses = sys.stdin.readline().strip().lower().startswith('y')
    if draw_parses:
        from nltk.draw.tree import draw_trees
        print('  please wait...')
        draw_trees(*parses)

    if print_parses is None:
        # Ask the user if we should print the parses.
        print()
        print('Print parses (y/n)? ', end=' ')
        print_parses = sys.stdin.readline().strip().lower().startswith('y')
    if print_parses:
        for parse in parses:
            print(parse)
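Because this version of demo() exposes its options as parameters, it can also be driven non-interactively, e.g. from a test harness (a usage sketch):

    # Run the first demo ("I saw John with my telescope") with all five
    # chart parsers, skipping the interactive prompts and the drawing step.
    demo(choice=0, draw_parses=False, print_parses=False)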
Example No. 33
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys
    import time

    from nltk import tokenize
    from nltk.grammar import PCFG
    from nltk.parse import ViterbiParser

    toy_pcfg1 = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
    Det -> 'the' [0.8] | 'my' [0.2]
    N -> 'man' [0.5] | 'telescope' [0.5]
    VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
    V -> 'ate' [0.35] | 'saw' [0.65]
    PP -> P NP [1.0]
    P -> 'with' [0.61] | 'under' [0.39]
    """)

    toy_pcfg2 = PCFG.fromstring("""
    S    -> NP VP         [1.0]
    VP   -> V NP          [.59]
    VP   -> V             [.40]
    VP   -> VP PP         [.01]
    NP   -> Det N         [.41]
    NP   -> Name          [.28]
    NP   -> NP PP         [.31]
    PP   -> P NP          [1.0]
    V    -> 'saw'         [.21]
    V    -> 'ate'         [.51]
    V    -> 'ran'         [.28]
    N    -> 'boy'         [.11]
    N    -> 'cookie'      [.12]
    N    -> 'table'       [.13]
    N    -> 'telescope'   [.14]
    N    -> 'hill'        [.5]
    Name -> 'Jack'        [.52]
    Name -> 'Bob'         [.48]
    P    -> 'with'        [.61]
    P    -> 'under'       [.39]
    Det  -> 'the'         [.41]
    Det  -> 'a'           [.31]
    Det  -> 'my'          [.28]
    """)

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [
        ("I saw the man with my telescope", toy_pcfg1),
        ("the boy saw Jack with Bob under the table with a telescope",
         toy_pcfg2),
    ]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print(f"{i + 1:>3}: {demos[i][0]}")
        print("     %r" % demos[i][1])
        print()
    print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except (ValueError, IndexError):
        print("Bad sentence number")
        return

    # Tokenize the sentence.
    tokens = sent.split()

    parser = ViterbiParser(grammar)
    all_parses = {}

    print(f"\nsent: {sent}\nparser: {parser}\ngrammar: {grammar}")
    parser.trace(3)
    t = time.time()
    parses = parser.parse_all(tokens)
    elapsed = time.time() - t  # do not shadow the time module
    average = (reduce(lambda a, b: a + b.prob(), parses, 0) /
               len(parses) if parses else 0)
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print("Time (secs)   # Parses   Average P(parse)")
    print("-----------------------------------------")
    print("%11.4f%11d%19.14f" % (time, num_parses, average))
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print("------------------------------------------")
    print("%11s%11d%19.14f" % ("n/a", len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print("Draw parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        from nltk.draw.tree import draw_trees

        print("  please wait...")
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print("Print parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        for parse in parses:
            print(parse)
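The same most-probable parse can also be obtained without the interactive scaffolding; a minimal standalone sketch using the toy_pcfg1 grammar defined above:

    from nltk.grammar import PCFG
    from nltk.parse import ViterbiParser

    grammar = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
    Det -> 'the' [0.8] | 'my' [0.2]
    N -> 'man' [0.5] | 'telescope' [0.5]
    VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
    V -> 'ate' [0.35] | 'saw' [0.65]
    PP -> P NP [1.0]
    P -> 'with' [0.61] | 'under' [0.39]
    """)

    parser = ViterbiParser(grammar)
    # parse() yields only the single most likely tree for the sentence.
    for tree in parser.parse("I saw the man with my telescope".split()):
        print(tree.prob())
        print(tree)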