Example #1
import nltk

# Note: grammarutils, get_edge_prob and remove_duplicate are project-local
# helpers assumed to be importable in this module.


def test_earley(grammar, tokens):
    earley_parser = nltk.EarleyChartParser(grammar, trace=1)
    e_chart = earley_parser.chart_parse(tokens)
    for edge in e_chart.edges():
        print(edge, edge.end())

    print(grammarutils.earley_predict(grammar, tokens))
def predict_next_symbols(grammar, tokens):
    tokens = remove_duplicate(tokens)
    symbols = list()
    earley_parser = nltk.EarleyChartParser(grammar, trace=0)
    try:
        e_chart = earley_parser.chart_parse(tokens)
    except ValueError:
        return list()
    end_edges = list()

    for edge in e_chart.edges():
        if edge.end() == len(tokens):
            # Only add terminal symbols (terminals are plain strings)
            if isinstance(edge.nextsym(), str):
                symbols.append(edge.nextsym())
                end_edges.append(edge)

    probs = list()
    for end_edge in end_edges:
        probs.append(get_edge_prob(end_edge, e_chart, grammar))

    # Eliminate duplicates, summing the probabilities of repeated symbols
    symbols_no_duplicate = list()
    probs_no_duplicate = list()
    for s, p in zip(symbols, probs):
        if s not in symbols_no_duplicate:
            symbols_no_duplicate.append(s)
            probs_no_duplicate.append(p)
        else:
            probs_no_duplicate[symbols_no_duplicate.index(s)] += p

    return symbols_no_duplicate, probs_no_duplicate
def compute_grammar_prefix_probability(grammar, tokens):
    earley_parser = nltk.EarleyChartParser(grammar, trace=1)
    try:
        e_chart = earley_parser.chart_parse(tokens)
    except ValueError:
        return 0.0

    prob = 0.0
    # If the sentence is incomplete, return the sum of probabilities of all possible sentences
    for edge_idx, edge in enumerate(e_chart.edges()):
        if edge.end() == len(tokens) and len(edge.rhs()):
            if not edge.nextsym():
                # The sentence is a valid complete sentence
                prob += get_edge_prob(edge, edge_idx, e_chart, grammar, level=0)
            elif isinstance(edge.nextsym(), str):
                # The sentence is a valid prefix
                prob += get_edge_prob(edge, edge_idx, e_chart, grammar, level=0)
    return prob
def compute_sentence_probability(grammar, tokens):
    invalid_prob = 1e-20

    earley_parser = nltk.EarleyChartParser(grammar, trace=0)
    viterbi_parser = nltk.ViterbiParser(grammar)
    try:
        e_chart = earley_parser.chart_parse(tokens)
    except ValueError:
        return 0
        # d, tokens = find_closest_tokens(language, tokens)
        # return invalid_prob ** d

    # If the sentence is complete, return the Viterbi likelihood
    v_parses = viterbi_parser.parse_all(tokens)
    if v_parses:
        prob = sum(tree.prob() for tree in v_parses) / len(v_parses)
        return prob

    # If the sentence is incomplete, return the sum of probabilities of all possible sentences
    prob = 0
    for edge in e_chart.edges():
        if edge.end() == len(tokens) and isinstance(edge.nextsym(), str):
            prob += get_edge_prob(edge, e_chart, grammar)
    return prob
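
These helpers lean on project-local code (get_edge_prob, grammarutils), but the chart inspection underneath is plain NLTK. A minimal self-contained sketch of that inspection, using a hypothetical toy PCFG:

import nltk

# Toy PCFG (illustration only); rule probabilities per left-hand side sum to 1.0
toy_pcfg = nltk.PCFG.fromstring("""
S -> NP VP [1.0]
NP -> 'John' [0.6] | 'Mary' [0.4]
VP -> V NP [1.0]
V -> 'sees' [1.0]
""")

parser = nltk.EarleyChartParser(toy_pcfg)
chart = parser.chart_parse(['John', 'sees'])  # a valid prefix, not a complete sentence

# Edges ending at the last token whose next symbol is a terminal (a plain str)
# are exactly the edges predict_next_symbols collects
for edge in chart.edges():
    if edge.end() == 2 and isinstance(edge.nextsym(), str):
        print(edge, '=>', edge.nextsym())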
Example #5
import nltk


def predict_next_symbols(grammar, tokens):
    def get_production_prob(selected_edge):
        # Find the production rule corresponding to the edge and return its probability
        for production in grammar.productions(lhs=selected_edge.lhs()):
            if production.rhs() == selected_edge.rhs():
                return production.prob()

    def find_parent(selected_edge):
        # Find the parent edges that lead to the selected edge
        p_edges = list()
        for p_edge in e_chart.edges():
            if (p_edge.end() == selected_edge.start()
                    and p_edge.nextsym() == selected_edge.lhs()):
                p_edges.append(p_edge)
        return p_edges

    def get_edge_prob(selected_edge):
        # Compute the probability of the edge by recursion
        prob = get_production_prob(selected_edge)
        if selected_edge.start() != 0:
            parent_prob = 0
            for parent_edge in find_parent(selected_edge):
                parent_prob += get_edge_prob(parent_edge)
            prob *= parent_prob
        return prob

    symbols = list()
    earley_parser = nltk.EarleyChartParser(grammar, trace=0)
    e_chart = earley_parser.chart_parse(tokens)
    end_edges = list()

    for edge in e_chart.edges():
        if edge.end() == len(tokens):
            # Only add terminal symbols (terminals are plain strings)
            if isinstance(edge.nextsym(), str):
                symbols.append(edge.nextsym())
                end_edges.append(edge)

    probs = list()
    for end_edge in end_edges:
        probs.append(get_edge_prob(end_edge))

    # Eliminate duplicates, summing the probabilities of repeated symbols
    symbols_no_duplicate = list()
    probs_no_duplicate = list()
    for s, p in zip(symbols, probs):
        if s not in symbols_no_duplicate:
            symbols_no_duplicate.append(s)
            probs_no_duplicate.append(p)
        else:
            probs_no_duplicate[symbols_no_duplicate.index(s)] += p

    return list(zip(symbols_no_duplicate, probs_no_duplicate))
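
A minimal usage sketch for this variant, with a hypothetical toy PCFG (illustration only):

import nltk

toy_pcfg = nltk.PCFG.fromstring("""
S -> 'a' S 'b' [0.4]
S -> 'a' 'b' [0.6]
""")

# After the prefix ['a', 'a'] the grammar can continue with 'a' or 'b';
# each symbol is paired with its accumulated probability
for symbol, prob in predict_next_symbols(toy_pcfg, ['a', 'a']):
    print(symbol, prob)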
Example #6
import nltk


def count_valid():
    # raw_rules and words are assumed to be module-level globals
    grammar = nltk.CFG.fromstring('\n'.join(raw_rules))
    parser = nltk.EarleyChartParser(grammar)

    cnt = 0
    for word in words:
        try:
            if list(parser.parse(word)):
                cnt += 1
        except ValueError:
            # Raised when the grammar does not cover one of the tokens
            pass

    return cnt
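
A hypothetical setup for the globals count_valid reads (the grammar and word lists below are stand-ins):

raw_rules = [
    "S -> 'a' S 'b'",
    "S -> 'a' 'b'",
]
words = [list('aabb'), list('abab'), list('ab')]

# 'aabb' and 'ab' parse but 'abab' does not, so this prints 2
print(count_valid())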
Example #7
def test_grammar():
    # config, datasets and grammarutils are project-local modules
    paths = config.Paths()
    for pcfg in os.listdir(os.path.join(paths.tmp_root, 'grammar', 'cad')):
        if not pcfg.endswith('.pcfg'):
            continue
        grammar_file = os.path.join(paths.tmp_root, 'grammar', 'cad', pcfg)
        grammar = grammarutils.read_grammar(grammar_file, index=True, mapping=datasets.cad_metadata.subactivity_index)
        corpus_file = os.path.join(paths.tmp_root, 'corpus', 'cad', pcfg.replace('pcfg', 'txt'))
        with open(corpus_file, 'r') as f:
            for line in f:
                tokens = [str(datasets.cad_metadata.subactivity_index[token]) for token in line.strip(' *#\n').split(' ')]
                earley_parser = nltk.EarleyChartParser(grammar, trace=0)
                e_chart = earley_parser.chart_parse(tokens)
                print(e_chart.edges()[-1])
def get_prediciton_parse_tree(grammar, tokens, filename=None):
    complete_tokens = sample_complete_sentence(grammar, tokens)

    earley_parser = nltk.EarleyChartParser(grammar, trace=0)
    e_chart = earley_parser.chart_parse(complete_tokens)

    # Select the first parse tree
    parse_tree = next(e_chart.parses(grammar.start()))

    # Save the parse tree as an image file if a filename is given
    if filename:
        cf = nltk.draw.util.CanvasFrame()
        tc = nltk.draw.TreeWidget(cf.canvas(), parse_tree)

        # Customize your own graph
        tc['node_font'] = 'arial 45 bold'
        tc['leaf_font'] = 'arial 45'
        tc['node_color'] = '#005990'
        tc['leaf_color'] = '#3F8F57'
        tc['line_color'] = '#175252'
        tc['line_width'] = '5'
        tc['xspace'] = 20
        tc['yspace'] = 20

        # Set color for the past observations.
        # Note that tc._leaves holds more nodes than the tree has visible leaves;
        # the last entries are the ones actually displayed.
        for i in range(
                len(tc._leaves) - len(complete_tokens),
                len(tc._leaves) - len(complete_tokens) + len(tokens) - 1):
            tc._leaves[i]['color'] = '#000000'
        # Set color for the current observation
        tc._leaves[len(tc._leaves) - len(complete_tokens) + len(tokens) -
                   1]['color'] = '#FF0000'

        cf.add_widget(tc, 10, 10)  # (10,10) offsets
        # cf.mainloop()
        cf.print_to_file(filename)
        cf.destroy()
        # Convert the PostScript output to PNG (requires ImageMagick's `convert`)
        basename, ext = os.path.splitext(filename)
        os.system('convert {} {}'.format(filename, basename + '.png'))
        os.remove(filename)

    return parse_tree
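
The drawing part is standard NLTK canvas code; a standalone sketch rendering a toy tree to PostScript (requires a Tk display; the tree and styling are hypothetical):

import nltk
from nltk.draw.util import CanvasFrame
from nltk.draw import TreeWidget

tree = nltk.Tree.fromstring('(S (NP John) (VP (V sees) (NP Mary)))')
cf = CanvasFrame()
tc = TreeWidget(cf.canvas(), tree)
tc['node_font'] = 'arial 14 bold'
tc['leaf_color'] = '#3F8F57'
cf.add_widget(tc, 10, 10)
cf.print_to_file('tree.ps')  # PostScript; convert externally if PNG is needed
cf.destroy()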
Example #9
    def __init__(self, cache=None):
        """Inicializa un objecto de tipo AddressParser.

        Args:
            cache (dict): Ver atributo 'self._cache'.

        """
        self._parser = nltk.EarleyChartParser(_load_grammar(_GRAMMAR_PATH))

        self._token_regexp = re.compile(
            '|'.join('(?P<{}>{})'.format(*tt) for tt in _TOKEN_TYPES),
            re.IGNORECASE)

        self._separation_regexp = re.compile(_SEPARATION_REGEXP, re.IGNORECASE)

        self._normalization_regexp = re.compile(
            '|'.join(_NORMALIZATION_REGEXPS), re.IGNORECASE)

        self._cache = cache
Example #10
import nltk
from nltk import CFG
from nltk.parse import RecursiveDescentParser, ShiftReduceParser


def parse_word_tag(word, tag, sentence):
    rule_perphrase_c = """S ->  DP | PP | AP | VP | CP | ADVP
            DP -> Dprime | Dprime QP | Dprime AP  | Dprime CP 
            Dprime -> D | NP | D NP  | D CP 
            NP -> Nprime | Nprime DP | Nprime PP | Nprime AP | Nprime VP | Nprime CP | Nprime ADVP 
            Nprime -> N | N PP | PP N | N QP
            PP -> Pprime | Pprime ADVP | Pprime VP
            Pprime -> P | P DP
            AP -> Aprime | Aprime ADVP | Aprime AP | Aprime CP
            Aprime -> A | A DP 
            VP -> Vprime | Vprime ADVP | Vprime DP | Vprime CP 
            Vprime -> V | V DP | V PRN 
            CP -> Cprime | Cprime VP | Cprime DP | Cprime NP | Cprime AP | Cprime QP | Cprime ADVP
            Cprime -> C | C Cprime
            QP ->  Qprime | Qprime CP
            Qprime -> Q | Q NP
            ADVP -> ADVprime | ADVprime QP | ADVprime DP  | ADVprime AP | ADVprime CP | ADVprime VP
            ADVprime -> ADV | ADV ADVP""" + '\n'

    rule_perphrase_b = """S ->  DP | PP | AP | VP | CP | ADV
            DP -> Dprime | Dprime QP | Dprime AP  | Dprime CP 
            Dprime -> D | D NP | NP | D CP
            NP -> Nprime | Nprime DP | Nprime PP | Nprime AP | Nprime VP | Nprime CP 
            Nprime -> N | N PP | PP N 
            PP -> Pprime | Pprime ADV | Pprime VP
            Pprime -> P | P DP
            AP -> Aprime | Aprime ADV | Aprime AP | Aprime CP
            Aprime -> A | A DP 
            VP -> Vprime | Vprime ADV| Vprime DP | Vprime CP 
            Vprime -> V | V DP | V PRN 
            CP -> Cprime | Cprime VP | Cprime DP | Cprime NP | Cprime QP | Cprime ADV
            Cprime -> C 
            QP ->  Qprime | Qprime CP
            Qprime -> Q""" + '\n'

    rule_perphrase_a = """S ->  DP | PP | AP | VP | CP | ADV
        DP -> Dprime | Dprime QP | Dprime AP  | Dprime CP 
        Dprime -> D NP | NP | D CP
        NP -> Nprime | Nprime DP | Nprime PP | Nprime AP | Nprime VP | Nprime CP 
        Nprime -> N | N PP | PP N 
        PP -> Pprime | Pprime ADV | Pprime VP
        Pprime -> P | P DP
        AP -> Aprime | Aprime ADV
        Aprime -> A | A DP
        VP -> Vprime | Vprime ADV | Vprime DP 
        Vprime -> V | V DP | V PRN | Vprime CP 
        CP -> Cprime | Cprime VP | Cprime DP | Cprime NP | Cprime QP
        Cprime -> C """ + '\n'

    rule_test_c = """S ->  DP Period | VP Period
    DP -> Dprime | Dprime QP | Dprime AP  | Dprime CP 
    Dprime -> D NP | NP | D CP
    NP -> Nprime | Nprime DP | Nprime PP | Nprime AP | Nprime VP | Nprime CP
    Nprime -> N | N PP | PP N 
    PP -> Pprime | Pprime ADV | Pprime VP
    Pprime -> P | P DP
    AP -> Aprime | Aprime ADV
    Aprime -> A | A DP
    VP -> Vprime | Vprime ADV | Vprime DP
    Vprime -> V | V DP | V PRN | Vprime CP 
    CP -> Cprime | Cprime VP | Cprime DP | Cprime NP
    Cprime -> C """ + '\n'


    rule_test = """S ->  DP Period | VP Period
    DP -> Dprime | Dprime QP | Dprime AP 
    Dprime -> D NP | NP
    NP -> Nprime | Nprime DP | Nprime PP | Nprime AP | Nprime VP 
    Nprime -> N | N PP | PP N | N CP PP | PP CP N 
    PP -> Pprime | Pprime ADV | Pprime VP
    Pprime -> P | P DP
    AP -> Aprime | Aprime ADV
    Aprime -> A | A DP
    VP -> Vprime | Vprime ADV | Vprime DP
    Vprime -> V | V DP | V PRN | Vprime CP 
    CP -> Cprime | Cprime VP
    Cprime -> C | C VP | C NP """ + '\n'

    rule_test_b = """S -> DP VP 
    DP ->  Dprime QP | Dprime AP   
    Dprime -> D NP
    PP ->   Pprime ADV | Pprime VP 
    Pprime -> P DP
    AP -> Aprime ADV
    Aprime -> A DP
    VP ->  Vprime ADV | Vprime DP 
    Vprime -> V DP | V PRN | V CP 
    NP ->  Nprime DP | Nprime PP | Nprime AP | Nprime VP 
    Nprime -> N PP | PP N 
    CP -> Cprime VP 
    Cprime -> C VP | C NP """ + '\n'

    rule_abc = """S ->  DP Period 
    DP -> Dprime QP | Dprime AP 
    Dprime -> D NP
    NP -> Nprime DP | Nprime PP | Nprime AP | Nprime VP 
    Nprime -> N PP | PP N | N CP PP | PP CP N 
    PP -> Pprime ADV | Pprime VP
    Pprime -> P DP
    AP -> Aprime ADV
    Aprime -> A DP
    VP -> Vprime ADV | Vprime DP
    Vprime -> V DP | V PRN | Vprime CP 
    CP -> Cprime VP
    Cprime -> C VP | C NP """ + '\n'

    rule_test_b = """S -> DP VP 
    DP ->  Dprime QP | Dprime AP   
    Dprime -> D NP
    PP ->   Pprime ADV | Pprime VP 
    Pprime -> P DP
    AP -> Aprime ADV
    Aprime -> A DP
    VP ->  Vprime ADV | Vprime DP 
    Vprime -> V DP | V PRN | V CP 
    NP ->  Nprime DP | Nprime PP | Nprime AP | Nprime VP 
    Nprime -> N PP | PP N 
    CP -> Cprime VP 
    Cprime -> C VP | C NP """ + '\n'



    rule = """S ->  NP VP Sym | VP NP Sym |  VP Comma NP | NP Comma VP
    DP -> Dprime QP | Dprime AP 
    Dprime -> D NP
    PP -> Pprime ADV | Pprime TP
    Pprime -> P DP
    AP -> Aprime ADV
    Aprime -> A DP
    VP -> Vprime ADV | Vprime DP
    Vprime -> V DP | V PRN | Vprime CP | V Comma DP | V Comma PRN | Comma Vprime CP
    NP -> Nprime DP | Nprime PP | Nprime AP | Nprime VP 
    Nprime -> N PP | PP N | N Comma PP | PP Comma N | N CP PP | PP CP N 
    TP -> Tprime DP | Tprime Q
    Tprime -> Tum VP | Tin VP
    Tprime -> Tma AP
    Tprime -> Tna- PP
    Tprime -> Tmay VP
    Tprime -> Ttaga VP
    CP -> Cprime TP
    Cprime -> C TP | C NP | Comma C TP | Comma C NP""" + '\n'

    rule_backup = """S ->  NP VP | VP NP
    DP -> Dprime QP | Dprime AP 
    Dprime -> D NP
    PP -> Pprime ADV | Pprime TP
    Pprime -> P DP
    AP -> Aprime ADV
    Aprime -> A DP
    VP -> Vprime ADV | Vprime DP
    Vprime -> V DP | V PRN | Vprime CP 
    NP -> Nprime DP | Nprime PP | Nprime AP | Nprime VP 
    Nprime -> N PP | PP N | N CP PP | PP CP N 
    TP -> Tprime DP | Tprime Q
    Tprime -> Tum VP | Tin VP
    Tprime -> Tma AP
    Tprime -> Tna- PP
    Tprime -> Tmay VP
    Tprime -> Ttaga VP
    CP -> Cprime TP
    Cprime -> C TP | C NP """ + '\n'

    i_tag = 0
    tag_rule = []
    sentence_word_tag = ''
    # Map each POS tag to a grammar category. Assumes every tag matches one of
    # the branches below; otherwise tag_rule falls out of alignment with word.
    while i_tag < len(tag):
        if "NN" in tag[i_tag]:
            tag_rule.append('N')
        elif "PR" in tag[i_tag]:
            tag_rule.append('N')
        elif "DT" in tag[i_tag]:
            tag_rule.append('D')
        elif "LM" in tag[i_tag]:
            tag_rule.append('C')
        elif "CCU" in tag[i_tag]:
            tag_rule.append('P')
        elif "CC" in tag[i_tag]:
            tag_rule.append('C')
        elif "VB" in tag[i_tag]:
            tag_rule.append('V')
        elif "JJ" in tag[i_tag]:
            tag_rule.append('A')
        elif "RB" in tag[i_tag]:
            tag_rule.append('ADV')
        elif "CD" in tag[i_tag]:
            tag_rule.append('Q')
        elif "TS" in tag[i_tag]:
            tag_rule.append('D')
        elif "FW" in tag[i_tag]:
            tag_rule.append('N')
        elif "PMP" in tag[i_tag]:
            tag_rule.append('Period')
        elif "PMC" in tag[i_tag]:
            tag_rule.append('C')
        elif "PM" in tag[i_tag]:
            tag_rule.append('Sym')

        # Emit one lexical rule per distinct word (skip repeated words)
        if word[i_tag] not in word[:i_tag]:
            sentence_word_tag += tag_rule[i_tag] + " -> " + "'" + word[i_tag] + "'" + '\n'
        i_tag += 1

    # DP = D' + QP | D' + AP
    # D' = D + NP
    #
    # PP = P' + ADV | P' + TP
    # P' = P + DP
    #
    # AP = A' + ADV
    # A' = A + DP
    #
    # VP = V' + ADV | V' + DP
    # V' = V + DP | V + PRN | V' + CP
    #
    # NP = N' + attribute phrase
    # N' = N + PP

    sentence_split = sentence.split()
    grammar = CFG.fromstring(rule_perphrase_c + sentence_word_tag)

    # #test uncomment to test english structure
    # grammar = CFG.fromstring("""
    # S -> NP VP
    # PP -> P NP
    # NP -> 'the' N | N PP | 'the' N PP
    # VP -> V NP | V PP | V NP PP
    # N -> 'cat'
    # N -> 'dog'
    # N -> 'rug'
    # V -> 'chased'
    # V -> 'sat'
    # P -> 'in'
    # P -> 'on'""")
    # sentence_split = 'the cat chased the dog on the rug'.split()

    # Alternative parsers, instantiated for experimentation but unused below
    rd = RecursiveDescentParser(grammar)
    sr = ShiftReduceParser(grammar)

    chart_parser = nltk.EarleyChartParser(grammar)
    print(tag_rule)
    parse_tree = []
    print('Parse')
    for tree in chart_parser.parse(sentence_split):
        parse_tree.append(tree)

    if parse_tree:
        print(parse_tree[0])
    else:
        print('NO TREE')
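
A hypothetical invocation; the words and tags are made-up stand-ins (the tagset above suggests Tagalog data) meant only to exercise the tag-to-category mapping:

# 'kumain ang bata' tagged as VB, DT, NN maps to V, D, N,
# which rule_perphrase_c can parse as S -> VP -> Vprime DP
parse_word_tag(['kumain', 'ang', 'bata'], ['VB', 'DT', 'NN'], 'kumain ang bata')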
Example #11
import nltk
from nltk.parse import ShiftReduceParser

# The opening of this snippet is truncated in the source; it begins a CFG
# whose top-level productions (S, NP, VP, ...) are not shown:
grammar = nltk.CFG.fromstring("""
PP -> P NP

PropN -> 'Bill' | 'Bob' | 'He'
Det -> 'the' | 'a' | 'an' | 'An' | 'The' | 'A' | 'on'| 'some' 
N -> 'bear' | 'squirrel' | 'park' | 'block' | 'table' | 'river' | 'dog' | 'dogs'| 'pasta' | 'anchovies' | 'restaurant' | 'fork' 
Adj -> 'angry' | 'frightened' | 'furry' 
V -> 'chased' | 'saw' | 'eats' | 'eat' | 'chase' | 'Put' | 'have' 
P -> 'on' | 'in' | 'along' | 'with' 

""")

##sentence1 = "He eats pasta with a fork in the restaurant".split()
##parser1 = nltk.ChartParser(grammar)
##for tree1 in parser1.parse(sentence1):
##    # print(tree1)
##     print (tree1.draw())

sr = ShiftReduceParser(grammar)
sentence1 = "He eats pasta with some anchovies in the restaurant"
tokens = nltk.word_tokenize(sentence1)
for x in sr.parse(tokens):
    x.draw()

print("-------------------------------------------------------------------")

sentence1 = "He eats pasta with some anchovies in the restaurant".split()
parser1 = nltk.EarleyChartParser(grammar, trace=2)

for tree1 in parser1.parse(sentence1):
    print(tree1)
Example #12
import nltk


def example1(s1):
    grammar1 = nltk.data.load('635/syn01.cfg')
    ep = nltk.EarleyChartParser(grammar1)
    for tree in ep.parse(s1.split()):
        tree.draw()
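
The .cfg file is project-specific; with an inline toy grammar (a hypothetical stand-in for '635/syn01.cfg') the same call pattern is self-contained:

import nltk

grammar1 = nltk.CFG.fromstring("""
S -> NP VP
NP -> 'John' | 'Mary'
VP -> V NP
V -> 'sees'
""")
ep = nltk.EarleyChartParser(grammar1)
for tree in ep.parse('John sees Mary'.split()):
    print(tree)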