Ejemplo n.º 1
0
def generate_events_grammar(attribute, parent, phase):
    """Build the CFG for a question about the events of an incident.

    Every nonterminal has exactly one production, so the grammar chains the
    terminals 'Do', 'you', 'think', 'the', <subject>, <verb>, ``attribute``
    and '?'.

    Parameters
    ----------
    attribute
        Terminal placed after the verb.
    parent
        Terminal used as the subject; when None the subject is
        'events that caused the incident'.
    phase
        ``1`` selects the verb 'included'; any other value selects
        'did not include'.

    Returns
    -------
    CFG
        Grammar whose start symbol is ``S``.
    """
    nt = Nonterminal

    subject = 'events that caused the incident' if parent is None else parent
    verb = 'included' if phase == 1 else 'did not include'

    productions = [
        Production(nt('S'), (nt('AUX1'), )),
        Production(nt('AUX1'), ('Do', nt('S1'))),
        Production(nt('S1'), ('you', nt('V1'))),
        Production(nt('V1'), ('think', nt('ART'))),
        Production(nt('ATTR'), (attribute, nt('END'))),
        Production(nt('END'), ('?', )),
        # The article rule is the same whether or not a parent is supplied.
        Production(nt('ART'), ('the', nt('PAR'))),
        Production(nt('PAR'), (subject, nt('V2'))),
        Production(nt('V2'), (verb, nt('ATTR'))),
    ]
    return CFG(nt('S'), productions)
Ejemplo n.º 2
0
def generate_sources_grammar(attribute, parent, phase):
    """Build the CFG for a question about the sources of an incident.

    Every nonterminal has exactly one production, so the grammar chains the
    terminals 'Do', 'you', 'think', 'the', <subject>, <verb>, ``attribute``
    and '?'.

    Parameters
    ----------
    attribute
        Terminal placed after the verb.
    parent
        Terminal used as the subject (under nonterminal ``PAR``); when None
        the subject is 'sources' (under nonterminal ``CLS``).
    phase
        ``1`` selects the verb 'included'; any other value selects
        "didn't include".

    Returns
    -------
    CFG
        Grammar whose start symbol is ``S``.
    """
    gr = [
        Production(Nonterminal('S'), (Nonterminal('AUX1'), )),
        Production(Nonterminal('AUX1'), ('Do', Nonterminal('S1'))),
        Production(Nonterminal('S1'), ('you', Nonterminal('V1'))),
        Production(Nonterminal('V1'), ('think', Nonterminal('ART'))),
        Production(Nonterminal('ATTR'), (attribute, Nonterminal('END'))),
        Production(Nonterminal('END'), ('?', ))
    ]
    if phase == 1:
        v2 = Production(Nonterminal('V2'), ('included', Nonterminal('ATTR')))
    else:
        # A plain apostrophe: the previous literal used an acute accent
        # (U+00B4), which leaked into the generated question text.
        v2 = Production(Nonterminal('V2'),
                        ("didn't include", Nonterminal('ATTR')))
    # Separate local names so the ``parent`` argument is not rebound to a
    # Production.
    if parent is None:
        article = Production(Nonterminal('ART'), ('the', Nonterminal('CLS')))
        subject = Production(Nonterminal('CLS'),
                             ('sources', Nonterminal('V2')))
    else:
        article = Production(Nonterminal('ART'), ('the', Nonterminal('PAR')))
        subject = Production(Nonterminal('PAR'), (parent, Nonterminal('V2')))
    gr.append(v2)
    gr.append(article)
    gr.append(subject)
    return CFG(Nonterminal('S'), gr)
def train():
    """Estimate rule probabilities from ``treebank`` and index the rules.

    Each parsed sentence is converted with ``chomsky_normal_form()`` (in
    place), production frequencies are turned into relative frequencies per
    left-hand side, and the result is passed through ``katz_smooth``
    (defined elsewhere) together with the vocabulary.

    Returns
    -------
    dict
        ``'vocab'``: vocabulary returned by ``katz_smooth``;
        ``'left_rules'`` / ``'right_rules'``: rules with more than one
        right-hand-side symbol, keyed by their first / second symbol;
        ``'unary_rules'``: remaining rules keyed by their only symbol;
        ``'rules_to_prob'``: rule table returned by ``katz_smooth``;
        ``'terminal_nonterms'``: for each left-hand side, the number of
        rules whose right-hand side is a single string.
    """
    print("Collecting sub-corpus from Penn Treebank (nltk.corpus)")

    # Prepare the parse trees extracted from the treebank.
    tbank_trees = []
    for sent in treebank.parsed_sents():
        sent.chomsky_normal_form()
        tbank_trees.append(sent)

    # Build the vocabulary from lower-cased tokens so that it matches the
    # lower-cased terminals used in the lexical rules below (the previous
    # code computed the lower-cased list but counted the raw tokens).
    vocab_size = 10000
    words = [wrd.lower() for wrd in treebank.words()]
    vocab = [wrd for wrd, _ in Counter(words).most_common(vocab_size)]

    rules_to_prob = defaultdict(int)
    nonterm_occurrence = defaultdict(int)

    # Count every production; lexical productions are lower-cased first.
    for sent in tbank_trees:
        for production in sent.productions():
            rhs = production.rhs()
            if len(rhs) == 1 and not isinstance(rhs[0], Nonterminal):
                production = Production(production.lhs(), [rhs[0].lower()])
            nonterm_occurrence[production.lhs()] += 1
            rules_to_prob[production] += 1
    # Turn counts into relative frequencies per left-hand side.
    for rule in rules_to_prob:
        rules_to_prob[rule] /= nonterm_occurrence[rule.lhs()]

    # Smoothing is delegated to katz_smooth, which returns both the rule
    # table and the vocabulary to use from here on.
    rules_to_prob, vocab = katz_smooth(rules_to_prob, vocab)

    left_rules = defaultdict(set)
    right_rules = defaultdict(set)
    unary_rules = defaultdict(set)

    # Index rules by the symbols on their right-hand side.
    for rule in rules_to_prob:
        rhs = rule.rhs()
        if len(rhs) > 1:
            left_rules[rhs[0]].add(rule)
            right_rules[rhs[1]].add(rule)
        else:
            unary_rules[rhs[0]].add(rule)

    # Count, per left-hand side, the rules that rewrite to a single string.
    terminal_nonterms = defaultdict(int)
    for rule in rules_to_prob:
        if len(rule.rhs()) == 1 and isinstance(rule.rhs()[0], str):
            terminal_nonterms[rule.lhs()] += 1

    # Built once, after the loop: previously this dict was assembled inside
    # the loop body, so an empty rule set left it undefined at return.
    pcfg_parser = {
        'vocab': vocab,
        'left_rules': left_rules,
        'right_rules': right_rules,
        'unary_rules': unary_rules,
        'rules_to_prob': rules_to_prob,
        'terminal_nonterms': terminal_nonterms,
    }
    return pcfg_parser
Ejemplo n.º 4
0
def generate_impacts_grammar(attribute, phase):
    """Build the CFG for a question about the impact of an incident.

    Every nonterminal has exactly one production, so the grammar chains the
    terminals 'Do', 'you', 'think', 'the impact of the incident', <verb>,
    ``attribute`` and '?'.

    Parameters
    ----------
    attribute
        Terminal placed after the verb.
    phase
        ``1`` selects the verb 'was'; any other value selects 'was not'.

    Returns
    -------
    CFG
        Grammar whose start symbol is ``S``.
    """
    nt = Nonterminal
    verb = 'was' if phase == 1 else 'was not'

    productions = [
        Production(nt('S'), (nt('AUX1'), )),
        Production(nt('AUX1'), ('Do', nt('S1'))),
        Production(nt('S1'), ('you', nt('V1'))),
        Production(nt('V1'), ('think', nt('ART'))),
        Production(nt('ART'), ('the impact of the incident', nt('V2'))),
        Production(nt('END'), ('?', )),
        # Phase-dependent verb, then the attribute that closes the question.
        Production(nt('V2'), (verb, nt('ATTR'))),
        Production(nt('ATTR'), (attribute, nt('END'))),
    ]
    return CFG(nt('S'), productions)
Ejemplo n.º 5
0
def generate_entities_grammar(attribute, phase):
    """Build the CFG for a question about entities hit by an incident.

    The grammar chains the terminals 'Do', 'you', 'think', ``attribute``,
    <verb>, then either 'impacted' or 'affected', 'by the incident' and '?'.
    ``V3`` is the only nonterminal with two alternatives.

    Parameters
    ----------
    attribute
        Terminal placed right after 'think'.
    phase
        ``1`` selects the verb 'are'; any other value selects 'are not'.

    Returns
    -------
    CFG
        Grammar whose start symbol is ``S``.
    """
    nt = Nonterminal
    verb = 'are' if phase == 1 else 'are not'

    productions = [
        Production(nt('S'), (nt('AUX1'), )),
        Production(nt('AUX1'), ('Do', nt('S1'))),
        Production(nt('S1'), ('you', nt('V1'))),
        Production(nt('V1'), ('think', nt('ATTR'))),
        Production(nt('V3'), ('impacted', nt('OBJ'))),
        Production(nt('V3'), ('affected', nt('OBJ'))),
        Production(nt('OBJ'), ('by the incident', nt('END'))),
        Production(nt('END'), ('?', )),
        # Phase-dependent verb, then the attribute that precedes it.
        Production(nt('V2'), (verb, nt('V3'))),
        Production(nt('ATTR'), (attribute, nt('V2'))),
    ]
    return CFG(nt('S'), productions)
Ejemplo n.º 6
0
def add_production(g, prod):
    """Return a new grammar made of ``g``'s productions plus ``prod``.

    ``g`` itself is not modified; the new grammar keeps ``g``'s start
    symbol and has ``prod`` appended after the existing productions.

    Parameters
    ----------
    g : nltk.CFG

    prod : nltk.Production

    Returns
    -------
    nltk.CFG

    See Also
    --------
    nltk.CFG, nltk.Production
    """
    prods = list(g.productions())
    prods.append(prod)

    # CFG requires the start symbol as its first argument; passing only the
    # production list raises a TypeError.
    return CFG(g.start(), prods)
Ejemplo n.º 7
0
def remove_nonterminal(g, nont):
    """Return a new grammar without any production that mentions ``nont``.

    A production is dropped when ``nont`` is its left-hand side or appears
    anywhere in its right-hand side. ``g`` itself is not modified and the
    new grammar keeps ``g``'s start symbol.

    Parameters
    ----------
    g : nltk.CFG

    nont : nltk.Nonterminal

    Returns
    -------
    nltk.CFG

    See Also
    --------
    nltk.CFG, nltk.Nonterminal
    """
    prods = [
        p for p in g.productions() if p.lhs() != nont and nont not in p.rhs()
    ]

    # CFG requires the start symbol as its first argument; passing only the
    # production list raises a TypeError.
    return CFG(g.start(), prods)
Ejemplo n.º 8
0
def remove_production(g, prod):
    """Return a new grammar without ``prod``.

    When ``prod`` is the only production of its left-hand side, that
    nonterminal is removed altogether through ``remove_nonterminal``.
    Otherwise only productions equal to ``prod`` are dropped. ``g`` itself
    is not modified.

    Parameters
    ----------
    g : nltk.CFG

    prod : nltk.Production

    Returns
    -------
    nltk.CFG

    See Also
    --------
    nltk.CFG, nltk.Production
    """
    lhs = prod.lhs()
    lhs_prods = g.productions(lhs)
    # Only discard the whole nonterminal when its single production really
    # is ``prod``; a production that is absent from ``g`` must not wipe out
    # an unrelated rule that happens to share its left-hand side.
    if len(lhs_prods) == 1 and lhs_prods[0] == prod:
        return remove_nonterminal(g, lhs)

    prods = [p for p in g.productions() if p != prod]

    # CFG requires the start symbol as its first argument; passing only the
    # production list raises a TypeError.
    return CFG(g.start(), prods)
Ejemplo n.º 9
0
def create_templates():
    """Generate instruction templates from the grammar and flag features.

    Builds a CFG from the structural productions below plus the rules
    returned by ``add_rules`` for each terminal category, enumerates every
    sentence with ``nltk.parse.generate.generate``, normalises punctuation
    and whitespace, and removes duplicates.

    Side effects: writes the de-duplicated sentences to 'templates.csv'
    (no index, no header) before the feature columns are added.

    Returns:
      A DataFrame with a 'sentence' column and one boolean column per entry
      of ``STREET_FEATURES`` telling whether the upper-cased feature name
      occurs in the sentence.
    """

    prods = [
        # Specific verb with goal and the rest of instruction body.
        Production(Nonterminal('S'),
                   (Nonterminal('V2'), Nonterminal('V2_BODY'))),
        # A verb and rest of the instruction body assuming goal already
        # mentioned.
        Production(Nonterminal('V2_BODY'),
                   (Nonterminal('V1'), Nonterminal('M_G_ALREADY_V'))),
        # A verb and the rest of the instruction body assuming the goal wasn't
        # mentioned before.
        Production(Nonterminal('S'),
                   (Nonterminal('V1'), Nonterminal('NO_GOAL'))),
        # The goal in the beginning and the rest of the instruction body
        # assuming goal already mentioned.
        Production(Nonterminal('S'),
                   (Nonterminal('V1_GOAL'), Nonterminal('WITH_GOAL'))),
        # A verb and 'to the' and then goal mention and the rest of the
        # instruction body.
        Production(Nonterminal('V1_GOAL'),
                   (Nonterminal('V1'), Nonterminal('V1_CON'))),
        # A goal mention and the rest of the instruction body.
        Production(Nonterminal('WITH_GOAL'),
                   (Nonterminal('GOAL'), Nonterminal('M_G'))),
        # Main part of the instruction without verb in beginning and resuming
        # sentence.
        Production(
            Nonterminal('M_G_ALREADY_V'),
            (Nonterminal('MAIN_NO_V'), Nonterminal('END_NEAR_GOAL_KNOWN'))),
        # Main part of the instruction, adding a new sentence.
        Production(Nonterminal('M_G'),
                   (Nonterminal('MAIN'), Nonterminal('END_NEAR_GOAL_KNOWN'))),
        # End part - (1) near pivot assuming goal already mentioned; and (2)
        # avoid sentence.
        Production(Nonterminal('END_NEAR_GOAL_KNOWN'),
                   (Nonterminal('NEAR_GOAL_START'), Nonterminal('AVOID'))),
        # End part - (1) near pivot assuming goal not mentioned yet; and (2)
        # avoid sentence.
        Production(Nonterminal('END_NEAR_GOAL_KNOWN'),
                   (Nonterminal('NEAR_GOAL_END'), Nonterminal('AVOID'))),
        # Main part of the instruction without verb in beginning and resuming
        # sentence assuming no goal mentioned before.
        Production(
            Nonterminal('NO_GOAL'),
            (Nonterminal('MAIN_NO_V'), Nonterminal('END_NEAR_GOAL_UNKNOWN'))),
        # Add Goal to main part and then resume instruction by adding an
        # ending (near + avoid).
        Production(
            Nonterminal('END_NEAR_GOAL_UNKNOWN'),
            (Nonterminal('GOAL_END'), Nonterminal('END_NEAR_GOAL_KNOWN'))),
        # Add Goal with near and then add an avoid sentence.
        Production(Nonterminal('END_NEAR_GOAL_UNKNOWN'),
                   (Nonterminal('NEAR_GOAL_END'), Nonterminal('AVOID'))),
        # Terminal for IN+DT after verb.
        Production(Nonterminal('V1_CON'), ('to the', )),
    ]

    prods += add_rules('V2', V2)
    prods += add_rules('AVOID', AVOID)
    prods += add_rules('NEAR_GOAL_START', NEAR_GOAL_START)
    prods += add_rules('NEAR_GOAL_END', NEAR_GOAL_END)
    prods += add_rules('GOAL', GOAL)
    prods += add_rules('GOAL_END', GOAL_END)
    prods += add_rules('MAIN_NO_V', MAIN_NO_V)
    prods += add_rules('MAIN', MAIN)
    prods += add_rules('V1', V1)

    grammar = CFG(Nonterminal('S'), prods)

    # Compiled once; the pattern does not change between sentences.
    re_space = re.compile(r'[\s]+')

    # Generate templates.
    templates = []
    for tokens in nltk.parse.generate.generate(grammar):

        sentence = ' '.join(tokens)

        # endswith() also copes with an empty sentence, where indexing the
        # last character would raise IndexError.
        if not sentence.endswith('.'):
            sentence += '.'
        sentence = sentence.replace(" .", ".")
        sentence = sentence.replace(" ,", ",")
        sentence = sentence.replace("..", ".")

        # Collapse runs of whitespace into a single space.
        sentence = re_space.sub(r' ', sentence)

        templates.append(sentence)

    templates_df = pd.DataFrame(templates,
                                columns=['sentence']).drop_duplicates()
    # Save templates
    templates_df.to_csv('templates.csv', index=False, header=False)

    # Flag features.
    for column in STREET_FEATURES:
        marker = column.upper()
        templates_df[column] = templates_df['sentence'].apply(
            lambda text, marker=marker: marker in text)

    return templates_df
Ejemplo n.º 10
0
    if (len(rule) == 2):
        newrules.append(rule)
        allrules.remove(rule)

# In[11]:

# Report how many rules were kept and how many remain unconverted.
print(len(newrules))
print(len(allrules))
# print(newrules)
print(allrules)

# In[24]:

# Create CNF Grammar

cnf_ = CFG(start=Nonterminal('SIGMA'), productions=newrules)

# In[25]:

# Persist the grammar; the context manager closes the file even if
# pickling raises.
with open('cnf_grammar.pkl', 'wb') as f:
    pickle.dump(cnf_, f)

# In[26]:

#  Check CNF

print(cnf_.is_chomsky_normal_form())

# In[27]: