Example 1
def generate_phrase(self, pool):
     try:
         adj = choice(list(pool.adjectives))
         parser = ChartParser(self.grammar)
         gr = parser.grammar()
         phrase = self.produce(gr, gr.start())
         noun = choice(list(pool.comparisons[adj]))
         if en.noun.plural(noun.name) == noun.name:
             article = "the"
         else:
             article = en.noun.article(noun.name).split(" ")[0]
         replace_words = {'adj': adj, 'n': noun, 'det': article}
         for pos in replace_words:
             while pos in phrase:
                 try:
                     phrase = self.replace_pos(pos, replace_words[pos],
                                               phrase)
                 except:
                     return
         for w in phrase:
             if not isinstance(w, Word):
                 phrase[phrase.index(w)] = Word(w)
         return phrase
     except:
         return
Example 2
 def generate_phrase(self, pool):
     parser = ChartParser(self.grammar)
     gr = parser.grammar()
     phrase = self.produce(gr, gr.start())
     noun = choice(list(pool.nouns))
     try:
         replace_words = {
             'n': [noun],
             'v':
             [Word(self.conjugate(v.name)) for v in list(pool.verbs[noun])],
             'adj': pool.epithets[noun],
             'atv':
             [Word(self.conjugate(v, self.person)) for v in self.atv],
             'eva':
             [Word(self.conjugate(v, self.person)) for v in self.eva],
             'ej': pool.emotional_adjectives,
             'en': pool.emotional_nouns,
             'erb': pool.emotional_adverbs,
             'person': [Word(self.persons[self.person][0])],
             'pron': [Word(self.persons[self.person][1])]
         }
     except:
         return
     for pos in replace_words:
         while pos in phrase:
             try:
                 word = choice(replace_words[pos])
                 phrase = self.replace_pos(pos, word, phrase)
             except:
                 return
     for w in phrase:
         if not isinstance(w, Word):
             phrase[phrase.index(w)] = Word(w)
     return phrase
Example 3
def execute(text: str):
    groucho_grammar = CFG.fromstring("""
    S -> NP VP
    PP -> P NP
    NP -> Det N | Det N PP | 'I'
    VP -> V NP | VP PP
    Det -> 'an' | 'my'
    N -> 'elephant' | 'pajamas'
    V -> 'shot'
    P -> 'in'
    """)
    parser = ChartParser(groucho_grammar)

    tokens = word_tokenize(text=SAMPLE_3)
    print(type(tokens))
    print(tokens)
    for tree in parser.parse(tokens=[
            'The',
            'little',
            'bear',
            'saw',
            'the',
            'fine',
            'fat',
            'trout',
            'in',
            'the',
            'brook',
    ]):
        print(tree)
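Note that the snippet tokenizes SAMPLE_3 but then parses a separate hand-typed token list, and groucho_grammar does not cover words like 'bear' or 'trout', so parser.parse will raise a ValueError for them. A sentence the grammar does generate parses cleanly and shows the classic PP-attachment ambiguity:

for tree in parser.parse(word_tokenize("I shot an elephant in my pajamas")):
    print(tree)  # two trees: "in my pajamas" attaches to the NP or the VP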
Example 4
def context_free_grammar():
    cfg = CFG.fromstring("""\
    ################# Rules #################
    S -> NP VP
    S -> PP NP VP
    S -> Wh Aux NP VP 
    NP -> ProperNoun | CC ProperNoun | N | ProperNoun NP | AP N | DET NP | N PP    
    VP -> V | V NP | Adv VP | V NP VP
    AP -> Adj | Adj AP
    PP -> P NP | P NP VP
    
    ################# Lexicons ################# 
    N -> 'milk'| 'shoes' | 'salad' | 'kitchen' | 'midnight' | 'table'
    V -> 'laughs' | 'laughed' | 'drink' | 'wears' | 'serves' | 'drinks' | 'thinks' | 'wear'
    ProperNoun -> 'Bart' | 'Homer' | 'Lisa'
    Aux -> 'do' | 'does'
    CC -> 'and'
    Adj -> 'blue' | 'healthy' | 'green' 
    DET -> 'a' | 'the' 
    Adv -> 'always' | 'never' 
    P -> 'in' | 'before' | 'on' | 'when'
    Wh -> 'when'
    """)
    cfparser = ChartParser(cfg)
    sents = text.splitlines()
    for sent in sents:
        parses = cfparser.parse(sent.split())
        print(sent)
        for tree in parses:
            print(tree)
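The loop above reads a module-level text variable that this snippet never defines. Judging from the matching lexicon in Example 27 below, it plausibly held one test sentence per line; an assumed reconstruction (not part of the original) would look like:

text = """Bart laughs
Homer laughed
Bart and Lisa drink milk
Lisa serves Bart a healthy green salad
Homer never drinks milk in the kitchen before midnight"""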
Example 6
def parse_original_sentences(grammar):
    '''
    Uses given grammar to parse sentences from the file corpus.txt
    Writes the parse trees of each sentence in parsed_corpus.txt
    :param grammar: A context free grammar in the form of nltk.grammar.CFG
    :return: None (Output in parsed_corpus.txt)
    '''
    parser = ChartParser(grammar)
    f = open("corpus.txt", "r")
    f_write = open("parsed_corpus.txt", "w")
    lines = f.readlines()
    count = 1
    working = []
    for line in lines:
        line = line.replace("didnt", "did not")
        s = "Tree {}:\n".format(count)
        sent = word_tokenize(line[:-2])
        for tree in parser.parse(sent):
            s += str(tree) + "\n\n"
            working.append(count)
            break
        count += 1
        f_write.write(s)

    f.close()
    f_write.close()
    print(
        "Parsed form of original corpus sentences using this CFG can be found in parsed_corpus.txt\n"
    )
Example 7
    def __init__(self, grammar):
        """
        Initialize from a CFG.

        :type grammar: CFG
        :param grammar: The grammar for this oracle
        """
        self._parser = ChartParser(grammar)
Example 8
def parse_sentences(grammar):
    parser = ChartParser(grammar)
    sent = input("Parse a sentence (Q to quit): ")
    while sent != "Q":
        tokens = word_tokenize(sent)
        trees = parser.parse(tokens)
        print_trees(trees)
        sent = input("Parse a sentence (Q to quit): ")
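print_trees is not defined in this snippet. A minimal stand-in, assuming it simply prints every parse:

def print_trees(trees):
    for tree in trees:
        print(tree)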
Example 9
def generate_name(G):
    grammar = CFG.fromstring(G)

    parser = ChartParser(grammar)

    gr = parser.grammar()
    tokens = produce(gr, gr.start())
    name = ''.join(tokens)
    return name.title()
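generate_name leans on a produce helper that randomly expands the grammar from its start symbol; it is not shown here, but Example 35 below defines one. A sketch along the same lines:

from random import choice

def produce(grammar, symbol):
    # pick one production for this symbol at random, then recurse
    # on any nonterminals on its right-hand side
    words = []
    production = choice(grammar.productions(lhs=symbol))
    for sym in production.rhs():
        if isinstance(sym, str):
            words.append(sym)
        else:
            words.extend(produce(grammar, sym))
    return words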
Example 10
def generate_parse_tree(sentence, grammar):
    # then generate the parse trees
    tokens = word_tokenize(sentence)
    parser = ChartParser(grammar)
    # print type(grammar), type(parser)
    try:
        return parser.parse(tokens)
    except Exception:
        #print "Sentence '" + sentence + "' cannot be parsed using the given grammar."
        return Tree('Error', ['Error'])
Example 11
def generate_impacts_question(attr, impacts, phase):
    impact = get_attribute_name(attr, impacts)
    parser = ChartParser(generate_impacts_grammar(impact, phase))
    gr = parser.grammar()
    question = {
        'text': ' '.join(produce(gr, gr.start())),
        'answer': 0,
        'questionId': 0,
        'attrId': attr,
        'topicId': 4
    }
    return question
Example 12
def generate_entities_question(attr, entities, phase):
    entity = get_attribute_name(attr, entities)
    parser = ChartParser(generate_entities_grammar(entity, phase))
    gr = parser.grammar()
    question = {
        'text': ' '.join(produce(gr, gr.start())),
        'answer': 0,
        'questionId': 0,
        'attrId': attr,
        'topicId': 3
    }
    return question
Example 13
def get_productions(sentence, grammar):
    trees = []
    sent = sentence.split(' ')
    print(sent)
    cfgGrammar = CFG.fromstring(grammar)

    parser = ChartParser(cfgGrammar)
    for tree in parser.parse(sent):
        trees.append(str(tree).replace("\n", " "))

    # print trees[0]
    t = Tree.fromstring(trees[0])
    return t.productions()
Example 14
class GrammarOracle(Oracle):
    """
    An oracle from a grammar.
    """
    def __init__(self, grammar):
        """
        Initialize from a CFG.

        :type grammar: CFG
        :param grammar: The grammar for this oracle
        """
        self._parser = ChartParser(grammar)

    def generates(self, sentence):
        """
        Decides whether the grammar generates the sentence.

        :type sentence: Sentence
        :param sentence: A sentence

        :rtype: bool
        :return: Whether the grammar generates the sentence
        """
        try:
            parses = self._parser.parse(sentence.get_words())
            return list(parses) != []
        except:
            return False
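generates expects a Sentence object exposing get_words(); neither that class nor the Oracle base is shown. A hypothetical stand-in is enough to exercise the oracle:

from nltk import CFG

class Sentence:
    # minimal stand-in for the Sentence type the oracle assumes
    def __init__(self, words):
        self._words = words

    def get_words(self):
        return self._words

oracle = GrammarOracle(CFG.fromstring("S -> 'a' S 'b' | 'a' 'b'"))
print(oracle.generates(Sentence(['a', 'a', 'b', 'b'])))  # True
print(oracle.generates(Sentence(['a', 'b', 'a'])))       # False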
Example 15
def generate_sources_question(attr, parent_attr, sources, phase):
    id = attr
    attribute = get_attribute_name(attr, sources)
    attribute = analyze_numerus(attribute)
    if parent_attr is not None:
        parent_attr = get_attribute_name(parent_attr, sources)
    parser = ChartParser(
        generate_sources_grammar(attribute, parent_attr, phase))
    gr = parser.grammar()
    question = {
        'text': ' '.join(produce(gr, gr.start())),
        'answer': 0,
        'questionId': 0,
        'attrId': id,
        'topicId': 1
    }
    return question
Example 16
def recognizes(cfg, word):
    """
    cfg : a nltk.grammar.CFG instance
    word : a string with tokens separated with spaces.

    A parser is created at every call of this function.
    """
    return _recognizes(ChartParser(cfg), word.split())
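The private _recognizes helper used here (and by accepted_length, recognizesAll, and accepted_under below) never appears in these snippets. A plausible sketch, treating tokens the grammar does not cover as a rejection:

def _recognizes(parser, tokens):
    try:
        for _ in parser.parse(tokens):
            return True   # at least one parse tree exists
        return False
    except ValueError:    # the grammar does not cover some token
        return False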
Example 17
def parse_blazon(blazon):
    blazon = blazon.lower()
    to_discard = set(string.punctuation)
    to_discard.remove("&")
    blazon = ''.join(c for c in blazon if c not in to_discard)
    # Convert raw data to tokens to be parsed
    tokens = word_tokenize(blazon)

    # Replace instances of '1st', '2nd', etc with their non abbreviated forms
    for (index, item) in enumerate(tokens):
        if (item in abbr_to_full):
            tokens[index] = abbr_to_full[item]
        elif (item == "&"):
            tokens[index] = "and"

    # Sanitise tokens
    tokens = disambiguate_colours(tokens)
    tokens = reorder(tokens)

    # Construct grammar and parser
    with open('app/parser_cfg.txt') as f:
        raw_cfg = f.read()

    parser_grammar = CFG.fromstring(raw_cfg)
    parser = ChartParser(parser_grammar)

    # Parse data into tree
    output_data = None
    for tree in parser.parse(tokens):
        output_data = tree

    if (output_data is None):
        print("Error: Parse failed, please check input is of correct format.")
    else:
        # Convert Tree to dict to prepare it for JSON serialisation
        output_data = tree_to_dict(output_data)
        # If a tincture is in the top level of the dictionary, change its name to "field"
        if ("tincture" in output_data.keys()):
            output_data["field"] = output_data["tincture"]
            output_data.pop("tincture")
        # Convert dict to JSON
        return (output_data)
Example 18
    def verify(self, grammar, tags):
        """ Verify tag sequence as grammatically correct or not """
        # rd_parser = RecursiveDescentParser(grammar)
        rd_parser = ChartParser(grammar)
        valid = False

        try:
            for tree in rd_parser.parse(tags):
                valid = True
                break
        except ValueError:
            print "This is a grammatical structure I don't understand yet."
            return

        if valid:
            print "Valid"
            return True
        else:
            print "Invalid"
            return False
Example 19
def accepted_length(cfg, x):
    """
    Returns a list of every accepted word of a context-free grammar with a specific length
    """
    terminals = _get_terminal_symbols(cfg)
    parser = ChartParser(cfg)
    accepted = []
    for y in product(terminals, repeat=x):
        if _recognizes(parser, y):
            accepted.append(' '.join(y))
    return accepted
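A quick illustrative run, using the _recognizes sketch above and assuming _get_terminal_symbols collects the grammar's terminal strings:

from nltk import CFG

g = CFG.fromstring("S -> 'a' S | 'a'")
print(accepted_length(g, 3))  # ['a a a']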
Example 20
    def generate_phrase(self):
        adj = choice([a for a in self.blackboard.pool.comparisons if len(self.blackboard.pool.comparisons[a]) > 0])
        parser = ChartParser(self.grammar)
        gr = parser.grammar()
        phrase = self.produce(gr, gr.start())
        noun = choice(list(self.blackboard.pool.comparisons[adj]))
        noun.name = en.singularize(noun.name)
        article = en.referenced(noun.name).split(" ")[0]
        replace_words = {'adj': adj, 'n': noun, 'det': article}

        for pos in replace_words:
            while pos in phrase:
                try:
                    phrase = self.replace_pos(
                        pos, replace_words[pos], phrase)
                except:
                    return
        for w in phrase:
            if not isinstance(w, Word):
                phrase[phrase.index(w)] = Word(w)
        return phrase
Example 21
def recognizesAll(cfg, words):
    """
    Returns a list of boolean values corresponding to [recognizes(cfg,w) for w in words].
    cfg : a nltk.grammar.CFG instance
    words must be a list of string with tokens separated with spaces.

    """
    r = []
    parser = ChartParser(cfg)
    for word in words:
        r.append(_recognizes(parser, word.split()))
    return r
Example 22
    def generate_phrase(self, pool):
        noun = random.choice(list(pool.nouns))
        parser = ChartParser(self.grammar)
        gr = parser.grammar()
        phrase = self.produce(gr, gr.start())
        phrase.append("?")

        try:
            adj = choice(pool.epithets[noun])
        except:
            return
        replace_words = {'adj': adj, 'n': noun, 'be': self.conjugate("be")}
        for pos in replace_words:
            while pos in phrase:
                try:
                    phrase = self.replace_pos(pos, replace_words[pos], phrase)
                except:
                    return
        for w in phrase:
            if not isinstance(w, Word):
                phrase[phrase.index(w)] = Word(w)
        return phrase
Example 24
 def generate_phrase(self, pool):
     parser = ChartParser(self.grammar)
     gr = parser.grammar()
     phrase = self.produce(gr, gr.start())
     noun = random.choice(list(pool.nouns))
     adj = choice(pool.epithets[noun])
     replace_words = {
         "adj": adj,
         "n": noun,
         "be": self.conjugate("be", self.person),
         "person": self.persons[self.person][0],
     }
     for pos in replace_words:
         while pos in phrase:
             try:
                 phrase = self.replace_pos(pos, replace_words[pos], phrase)
             except:
                 return
     for w in phrase:
         if not isinstance(w, Word):
             phrase[phrase.index(w)] = Word(w)
     return phrase
Example 25
def accepted_under(cfg, length):
    """
    Returns a list of every accepted word of a context-free grammar under a given length.
    cfg : a nltk.grammar.CFG instance. 
    """
    terminals = _get_terminal_symbols(cfg)

    parser = ChartParser(cfg)
    accepted = []
    for x in range(1, length):
        for y in product(terminals, repeat=x):
            if _recognizes(parser, y):
                accepted.append(' '.join(y))
    return accepted
Example 26
def main():
    cfparser = ChartParser(cfg)
    index = 0
    for sent in text:
        index += 1
        print_tree(sent, cfparser, index)
    print "Input testing sentece or the number of the above one: (q to quit)"
    str = sys.stdin.readline().strip()
    while str != "q":
        try:
            index = int(str)
            print_tree(text[index], cfparser, index)
        except IndexError:
            print "Index out of range. Please check."
        except ValueError:
            print_tree(str, cfparser, -1)
        print "Input testing sentece or the number of the above one: (q to quit)"
        str = sys.stdin.readline().strip()
Example 27
Nominal    ->    NOUN | Nominal PP | ADJ Nominal | Nominal NOUN
PP         ->    Prep NP
AdvC       ->    CONJ S
ProperNoun ->    'Bart' | 'Homer' | 'Lisa'
CONJ       ->    'and' | 'when'
ADV        ->    'always' | 'never'
V          ->    'laughs' | 'laughed' | 'drink' | 'wears' | 'serves' | 'drinks' | 'thinks' | 'wear'
DET        ->    'a' | 'the'
NOUN       ->    'milk' | 'shoes' | 'salad' | 'kitchen' | 'midnight' | 'table'
ADJ        ->    'blue' | 'healthy' | 'green'
Prep       ->    'in' | 'before' | 'on'
WH         ->    'when'
Aux        ->    'do' | 'does'
""")

cfparser = ChartParser(cfg)
text = """
Bart laughs
Homer laughed
Bart and Lisa drink milk
Bart wears blue shoes
Lisa serves Bart a healthy green salad
Homer serves Lisa
Bart always drinks milk
Lisa thinks Homer thinks Bart drinks milk
Homer never drinks milk in the kitchen before midnight
when Homer drinks milk Bart laughs
when does Lisa drinks the milk on the table
when do Lisa and Bart wear shoes
"""
Example 28
import nltk
from nltk import ChartParser

# Load grammar.
grammar = nltk.data.load('labelgrammar.cfg')
parser = ChartParser(grammar)


def analyze_label(label):
    "Analyze a label using our CFG."
    tokenized_label = label.split()
    try:
        analysis = parser.parse(tokenized_label)
        trees = list(analysis)
        for tree in trees:
            print(tree)
        if len(trees) > 0:
            return analysis
        else:
            print('No analysis possible')
            return None

    except ValueError as e:
        print('No analysis possible:', e)
        return None
Example 29
from nltk import data, ChartParser
from nltk import pos_tag
from nltk.corpus import inaugural

data.clear_cache()
G = data.load("file:mygrammar.cfg")
RDP = ChartParser(G)


# extract_short_sents :: Int?, Int?, Corpus?-> [[(String, String)]]
def extract_short_sents(num=8, max_len=8, corpus=inaugural):
    li = []
    num = num if num < len(corpus.fileids()) else len(corpus.fileids())

    for i in range(num):
        for sent in corpus.sents(corpus.fileids()[i]):
            if len(sent) <= max_len:
                li.append(pos_tag(sent))
                if len(li) / 3.0 == i:
                    break

    return li


# parse :: String -> ParseTree
def parse(s):
    return RDP.parse(s.split())


if __name__ == "__main__":
    sents = [
Example 30
from nltk import CFG, ChartParser
from nltk.tokenize import SpaceTokenizer
grammar = CFG.fromstring("""
  S -> NP VP
  NP -> Det N
  VP -> IV
  Det -> 'the'
  N -> 'man'
  IV -> 'walks'
  """)
#>>> grammar
#<Grammar with 6 productions>
#>>> grammar.start()
#S
#>>> grammar.productions()
#[S -> NP VP, NP -> Det N, VP -> IV, Det -> 'the', N -> 'man', IV -> 'walks']
parser = ChartParser(grammar)
parses = parser.parse_all(SpaceTokenizer().tokenize("the man walks"))
#>>> parses
#[Tree('S', [Tree('NP', [Tree('Det', ['the']), Tree('N', ['man'])]), Tree('VP', [Tree('IV', ['walks'])])])]
Example 31
grammar = CFG.fromstring("""
 S -> LImports LRules
 LImports -> Import LImports | 
 Import -> '@import' '"string"' ';'
 LRules -> Rule LRules | 
 Rule -> Selectors '{' LDeclaretions '}'
 LDeclaretions -> Declaration ';' MoreDeclerations
 MoreDeclerations -> LDeclaretions | 
 Selectors -> SimpleSelector MoreSelectors
 MoreSelectors -> Selectors | 
 SimpleSelector -> Astrisk SelectorModifier
 Astrisk -> '*' | 
 SelectorModifier -> '.' 'name' | ':' 'name' | '[' 'name' '=' Term ']' | '#hashid' | 'name'
 Declaration -> 'name' ':' LTerms Important
 Important -> '!ImPoRtAnT' | 
 LTerms -> Term MoreTerms
 MoreTerms -> LTerms | 
 Term -> '1337' | '15%' | '"string"' | 'name' | '#hashid'
 """)

parser = ChartParser(grammar)
gr = parser.grammar()

test_name = "generated"

with open(test_name + '.in', 'w+') as writer:
    writer.write(' '.join(produce(gr, gr.start())))

with open(test_name + '.out', 'w+') as writer:
    writer.write("\n".join(map(str, rules)))
    writer.write("\nSuccess\n")
Example 32
			else:
				words.extend(produce(grammar, sym, minlen))
	return words




grammar = CFG.fromstring('''
F -> N1 '(' P ')' | N2 '(' P ',' P ')'
N1 -> 'half'
N2 -> 'sum'
P -> 'a' | 'b' | F
''')

'''
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
V -> 'shot' | 'killed' | 'wounded'
Det -> 'an' | 'my' 
N -> 'elephant' | 'pajamas' | 'cat' | 'dog'
P -> 'in' | 'outside'
'''

parser = ChartParser(grammar)

gr = parser.grammar()
print(' '.join(produce(gr, gr.start(), 3)))

Example 33
class GDev:
	# 1. We will create a grammar development tool.
	# Define a class called GDev. The __init__ method should take a 
	# name (a string) as input, and store it in the member name.
	def __init__( self, name ):
		self.name = name
		return

	# 2. Define a method called load_grammar. It takes no arguments.
	# It expects the file name.cfg to exist, where name is the GDev name.
	# It loads a grammar from the file and stores it in the member grammar.
	def load_grammar( self ):
		s = open( self.name + '.cfg' ).read()
		self.grammar = CFG.fromstring(s)
		return

	# 3. Define a method called reload. It should call the method 
	# load_grammar, even if the grammar has already been loaded before.
	# Then it should create a chart parser from the loaded grammar, and
	# store the parser in the member parser. 
	def reload( self ):
		self.load_sents()
		self.load_grammar()
		self.parser = ChartParser( self.grammar )
		return

	# 4. Define a method called parse. It should take one argument, a string.
	# It should call word_tokenize on the sentence, and pass the result to 
	# the parser. The parse method should return a single tree. If the parser
	# returns more than one tree, then parse should return just the first one.
	# If the parser does not return any trees, then parse should return None.
	def parse( self, s ):
		try:
			return list( self.parser.parse( word_tokenize( s ) ) )[0]
		except:
			return None

	# 5. Define a method called load_sents. It takes no arguments. It expects
	# the file name.sents to exist. The file should contain one sentence per 
	# line. Each sentence is either good or bad—good sentences are ones that 
	# the grammar ought to generate, and bad sentences are ones that the 
	# grammar should not generate. If the first character on the line is ’*’, 
	# the sentence is bad, and otherwise it is good. The load_sents method 
	# should produce a list of pairs (good, s) where good is True for good 
	# sentences and False for bad ones, and s is the sentence itself (not 
	# including the ’*’). The list of pairs should be stored in the member 
	# sents. Create a file g1.sents containing the sentences Bob warbled, the 
	# dog ate my telescope, and *Bob cat.
	def load_sents( self ):
		self.sents = [ ( True, line.rstrip('\r\n') ) \
						if line[0] != '*' \
						else (False, line.rstrip('\r\n')[1:]) \
						for line in open(self.name + '.sents') ]
		# print( self.sents )


	# 6. Define a method called parses. It should take no arguments. 
	# It should iterate through the pairs (g,s) in sents, and it should 
	# call parse on each sentence s in turn. For each sentence, it should
	# print an empty line, then the sentence, then the result of calling parse.
	def parses( self ):
		for s in self.sents:
			print( '\n' + s[1] )
			print( self.parse( s[1] ) )

	# 7. Write a method called regress that takes no arguments. It should go 
	# through the pairs (good, s) in sents. For each, it should call parse on s.
	# Define the prediction to be True if parse returns a tree, and False otherwise.
	# If the prediction equals good, then the prediction is correct, and otherwise
	# the prediction is wrong. For each pair, print out one line of output. The output
	# line should start with '!!' if the prediction is wrong and '  ' (two spaces) 
	# if it is correct. Then print out a space. Then print '*' if good is False, and a 
	# space if good is True. The output line ends with the sentence s.
	def regress( self ):
		prediction = False
		for s in self.sents:
			if self.parse( s[1] ) is not None: 
				prediction = True
			else:
				prediction = False
			if prediction != s[0]:
				print( '!!' + ' ' , end = '')
			else:
				print( '  ' + ' ' , end = '')
			if s[0] == False:
				print( '*' , end = '')
			else:
				print( ' ' , end = '')
			print( s[1] )

	# 8. Finally, the __call__ method should simply call reload and regress.
	# The idea is to use the set of example sentences to drive grammar development.
	# One adds sentences, calls gd() to see which ones are being handled correctly
	# or not, and then one edits the grammar to fix the prediction errors. After
	# each file edit, one need only call gd() to see the revised grammar's 
	# predictions on the sentences. (Making sure that new revisions do not break 
	# things that previously worked correctly is known as regression testing.)
	def __call__( self ):
		self.reload()
		self.regress()
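A minimal driver for this class, assuming g1.cfg and g1.sents exist as the comments describe:

gd = GDev('g1')
gd()   # reload the grammar and sentences, then print regression results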
Example 34
def parse_sentences(grammar, sent):
	parser = ChartParser(grammar)
	tokens = word_tokenize(sent)
	trees = parser.parse(tokens)
	return trees
Example 35
import nltk
from nltk import ChartParser
from random import choice

def produce(finalgrammar, symbol):
    words = []
    productions = finalgrammar.productions(lhs = symbol)
    production = choice(productions)
    for sym in production.rhs():
        if isinstance(sym, str):
            words.append(sym)
        else:
            words.extend(produce(finalgrammar, sym))
    return words

finalgrammar = nltk.data.load('file:grammarfinal.cfg',cache=False)

parser = ChartParser(finalgrammar)
gr = parser.grammar()
print(' '.join(produce(gr, gr.start())))
Example 36
class QueryParser(object):
    #PYPARSING preterminal definitions
    LBRACE = Suppress(Literal('('))
    RBRACE = Suppress(Literal(')'))
    WRD = Regex("[0-9a-zA-Z_\-\—\,\.\?\!\>\<\=\/\:\;\&\{\}\+]+")
    ABL = LBRACE + Suppress(Literal('ABL')) + WRD + RBRACE
    ABN = LBRACE + Suppress(Literal('ABN')) + WRD + RBRACE
    ABX = LBRACE + Suppress(Literal('ABX')) + WRD + RBRACE
    AP = LBRACE + Suppress(Literal('AP')) + WRD + RBRACE
    AT = LBRACE + Suppress(Literal('AT')) + WRD + RBRACE
    BE = LBRACE + Suppress(Literal('BE')) + WRD + RBRACE
    BED = LBRACE + Suppress(Literal('BED')) + WRD + RBRACE
    BEDZ = LBRACE + Suppress(Literal('BEDZ')) + WRD + RBRACE
    BEG = LBRACE + Suppress(Literal('BEG')) + WRD + RBRACE
    BEM = LBRACE + Suppress(Literal('BEM')) + WRD + RBRACE
    BEN = LBRACE + Suppress(Literal('BEN')) + WRD + RBRACE
    BER = LBRACE + Suppress(Literal('BER')) + WRD + RBRACE
    BEZ = LBRACE + Suppress(Literal('BEZ')) + WRD + RBRACE
    CC = LBRACE + Suppress(Literal('CC')) + WRD + RBRACE
    CD = LBRACE + Suppress(Literal('CD')) + WRD + RBRACE
    CS = LBRACE + Suppress(Literal('CS')) + WRD + RBRACE
    DO = LBRACE + Suppress(Literal('DO')) + WRD + RBRACE
    DOD = LBRACE + Suppress(Literal('DOD')) + WRD + RBRACE
    DOZ = LBRACE + Suppress(Literal('DOZ')) + WRD + RBRACE
    DT = LBRACE + Suppress(Literal('DT')) + WRD + RBRACE
    DTI = LBRACE + Suppress(Literal('DTI')) + WRD + RBRACE
    DTS = LBRACE + Suppress(Literal('DTS')) + WRD + RBRACE
    DTX = LBRACE + Suppress(Literal('DTX')) + WRD + RBRACE
    EX = LBRACE + Suppress(Literal('EX')) + WRD + RBRACE
    FW = LBRACE + Suppress(Literal('FW')) + WRD + RBRACE
    HL = LBRACE + Suppress(Literal('HL')) + WRD + RBRACE
    HV = LBRACE + Suppress(Literal('HV')) + WRD + RBRACE
    HVD = LBRACE + Suppress(Literal('HVD')) + WRD + RBRACE
    HVG = LBRACE + Suppress(Literal('HVG')) + WRD + RBRACE
    HVN = LBRACE + Suppress(Literal('HVN')) + WRD + RBRACE
    HVZ = LBRACE + Suppress(Literal('HVZ')) + WRD + RBRACE
    IN = LBRACE + Suppress(Literal('IN')) + WRD + RBRACE
    JJ = LBRACE + Suppress(Literal('JJ')) + WRD + RBRACE
    JJR = LBRACE + Suppress(Literal('JJR')) + WRD + RBRACE
    JJS = LBRACE + Suppress(Literal('JJS')) + WRD + RBRACE
    JJT = LBRACE + Suppress(Literal('JJT')) + WRD + RBRACE
    MD = LBRACE + Suppress(Literal('MD')) + WRD + RBRACE
    NC = LBRACE + Suppress(Literal('NC')) + WRD + RBRACE
    NN = LBRACE + Suppress(Literal('NN')) + WRD + RBRACE
    NNS = LBRACE + Suppress(Literal('NNS')) + WRD + RBRACE
    NP = LBRACE + Suppress(Literal('NP')) + WRD + RBRACE
    NPS = LBRACE + Suppress(Literal('NPS')) + WRD + RBRACE
    NR = LBRACE + Suppress(Literal('NR')) + WRD + RBRACE
    NRS = LBRACE + Suppress(Literal('NRS')) + WRD + RBRACE
    OD = LBRACE + Suppress(Literal('OD')) + WRD + RBRACE
    PN = LBRACE + Suppress(Literal('PN')) + WRD + RBRACE
    PPL = LBRACE + Suppress(Literal('PPL')) + WRD + RBRACE
    PPLS = LBRACE + Suppress(Literal('PPLS')) + WRD + RBRACE
    PPO = LBRACE + Suppress(Literal('PPO')) + WRD + RBRACE
    PPS = LBRACE + Suppress(Literal('PPS')) + WRD + RBRACE
    PPSS = LBRACE + Suppress(Literal('PPSS')) + WRD + RBRACE
    QL = LBRACE + Suppress(Literal('QL')) + WRD + RBRACE
    QLP = LBRACE + Suppress(Literal('QLP')) + WRD + RBRACE
    RB = LBRACE + Suppress(Literal('RB')) + WRD + RBRACE
    RBR = LBRACE + Suppress(Literal('RBR')) + WRD + RBRACE
    RBT = LBRACE + Suppress(Literal('RBT')) + WRD + RBRACE
    RN = LBRACE + Suppress(Literal('RN')) + WRD + RBRACE
    RP = LBRACE + Suppress(Literal('RP')) + WRD + RBRACE
    TL = LBRACE + Suppress(Literal('TL')) + WRD + RBRACE
    TO = LBRACE + Suppress(Literal('TO')) + WRD + RBRACE
    UH = LBRACE + Suppress(Literal('UH')) + WRD + RBRACE
    VB = LBRACE + Suppress(Literal('VB')) + WRD + RBRACE
    VBD = LBRACE + Suppress(Literal('VBD')) + WRD + RBRACE
    VBG = LBRACE + Suppress(Literal('VBG')) + WRD + RBRACE
    VBN = LBRACE + Suppress(Literal('VBN')) + WRD + RBRACE
    VBZ = LBRACE + Suppress(Literal('VBZ')) + WRD + RBRACE
    WDT = LBRACE + Suppress(Literal('WDT')) + WRD + RBRACE
    WPO = LBRACE + Suppress(Literal('WPO')) + WRD + RBRACE
    WPS = LBRACE + Suppress(Literal('WPS')) + WRD + RBRACE
    WQL = LBRACE + Suppress(Literal('WQL')) + WRD + RBRACE
    WRB = LBRACE + Suppress(Literal('WRB')) + WRD + RBRACE
    PRETERM = ABL ^ ABN ^ ABX ^ AP ^ AT ^ BE ^ BED ^ BEDZ ^ BEG ^ BEM ^ BEN ^ BER ^ BEZ ^ CC ^ CD ^ CS ^ DO ^ DOD ^ DOZ ^ DT ^ DTI ^ DTS ^ DTX ^ EX ^ FW ^ HL ^ HV ^ HVD ^ HVG ^ HVN ^ HVZ ^ IN ^ JJ ^ JJR ^ JJS ^ JJT ^ MD ^ NC ^ NN ^ NNS ^ NP ^ NPS ^ NR ^ NRS ^ OD ^ PN ^ PPL ^ PPLS ^ PPO ^ PPS ^ PPSS ^ QL ^ QLP ^ RB ^ RBR ^ RBT ^ RN ^ RP ^ TL ^ TO ^ UH ^ VB ^ VBD ^ VBG ^ VBN ^ VBZ ^ WDT ^ WPO ^ WPS ^ WQL ^ WRB
    UKWORD = Group(LBRACE + Literal('WORD') + PRETERM + RBRACE)

    #PYPARSING - DSL primary entity
    company = Group(LBRACE + Literal('company') + OneOrMore(WRD) + RBRACE)
    entity = Group(LBRACE + Literal('entity') + OneOrMore(WRD) + RBRACE)
    relation = LBRACE + Literal('relation') + OneOrMore(WRD) + RBRACE
    attribute = LBRACE + Literal('attribute') + OneOrMore(WRD) + RBRACE
    CASHFLOW = LBRACE + Literal('CASHFLOW') + OneOrMore(WRD) + RBRACE
    BALANCESHEET = LBRACE + Literal('BALANCESHEET') + OneOrMore(WRD) + RBRACE
    INCOMESTMT = LBRACE + Literal('INCOMESTMT') + OneOrMore(WRD) + RBRACE
    REPORT = Group(LBRACE + Suppress(Literal('REPORT')) + (CASHFLOW ^ BALANCESHEET ^ INCOMESTMT) + RBRACE)
    DATE = Group(LBRACE + Literal('DATE') + WRD + RBRACE)
    RELATION = LBRACE + Suppress(Literal('RELATION')) + relation + RBRACE
    ATTRIBUTE = LBRACE + Suppress(Literal('ATTRIBUTE')) + attribute + RBRACE
    COMPANY = LBRACE + Suppress(Literal('COMPANY')) + company + RBRACE
    ENTITY = LBRACE + Suppress(Literal('ENTITY')) + entity + RBRACE
    GREATERTHAN = LBRACE + Literal('GREATERTHAN') + Suppress(WRD) + RBRACE
    LESSTHAN = LBRACE + Literal('LESSTHAN') + Suppress(WRD) + RBRACE
    EQUAL = LBRACE + Literal('EQUAL') + Suppress(WRD) + RBRACE
    GTEQUAL = LBRACE + Literal('GTEQUAL') + Suppress(WRD) + RBRACE
    LTEQUAL = LBRACE + Literal('LTEQUAL') + Suppress(WRD) + RBRACE
    USD = LBRACE + Literal('USD') + Suppress(Regex("[$]+")) + RBRACE
    UNIT = LBRACE + Literal('UNIT') + USD + RBRACE
    EQUALITY = LBRACE + Suppress(Literal('EQUALITY')) + (GREATERTHAN ^ LESSTHAN ^ EQUAL ^ GTEQUAL ^ LTEQUAL) + RBRACE
    QUANTITY = LBRACE + Suppress(Literal('QUANTITY')) + Optional(UNIT) + CD + RBRACE
    QUANTIFIER = LBRACE + Suppress(Literal('QUANTIFIER')) + EQUALITY + QUANTITY + RBRACE

    #PYPARSING - AST parsing rules
    FILTER = Group(LBRACE + Literal('FILTER') + (ATTRIBUTE ^ RELATION) + RBRACE)
    MODIFIER = Group(LBRACE + Literal('MODIFIER') + (DATE ^ QUANTIFIER) + RBRACE)
    FUNCTIONLIST = Forward()
    FUNCTION = LBRACE + Suppress(Literal('FUNCTION')) + FILTER + Optional(MODIFIER) + RBRACE
    FUNCTIONLIST << LBRACE + Suppress('FUNCTIONLIST') + FUNCTION + Optional(FUNCTIONLIST) + RBRACE
    SUBJECT = LBRACE + Suppress(Literal('SUBJECT')) + (ENTITY ^ COMPANY) + RBRACE
    FILTEROBJECT = Group(LBRACE + Literal('FILTEROBJECT') + REPORT + RBRACE)
    DSLI = Group(LBRACE + Literal('DSLI') + (SUBJECT ^ FUNCTION) + RBRACE)
    QBODY = Forward()
    QUERYOBJ = LBRACE + Suppress(Literal("QUERYOBJ")) + (DSLI ^ FILTEROBJECT ^ UKWORD) + RBRACE
    QBODY << LBRACE + Suppress(Literal('QBODY')) + QUERYOBJ + Optional(QBODY) + RBRACE
    IS = LBRACE + Suppress(Literal('IS')) + (BE ^ BED ^ BEDZ ^ BER ^ BEZ) + RBRACE
    WHICHQ = LBRACE + Suppress(Literal('WHICHQ')) + WPS + IS + QBODY + RBRACE
    HOWQ = LBRACE + Suppress(Literal('HOWQ')) + WRB + IS + QBODY + RBRACE
    WHATQ = LBRACE + Suppress(Literal('WHATQ')) + WDT + IS + QBODY + RBRACE
    QUESTION = Group(LBRACE + Suppress(Literal('QUESTION')) + (WHICHQ ^ HOWQ ^ WHATQ ^ QBODY) + RBRACE)
    QUERY = LBRACE + Suppress(Literal('QUERY')) + OneOrMore(QUESTION) + RBRACE

    DSLOBJ = Suppress(SkipTo(company ^ FILTER)) + (company ^ FILTER)

    def __init__(self, tokens):
        """init parser with tokens and parser build from CFG
        :param tokens: tagged query tokens
        """
        self.tokens = tokens
        self.CFGParser = ChartParser(self.__getCFG())

    def _getAST(self):
        """Gets the words from the token list and passes them
        through the parser to build an AST
        :return nltk AST
        """
        parseTokens = [t[0] for t in self.tokens]
        ASTs = []
        try:
            syntaxTrees = self.CFGParser.parse(parseTokens)
            for tree in syntaxTrees:
                ASTs.append(tree)
                devLogger.info("AST generated: " + str(tree))
            if not(len(ASTs)):
                devLogger.warn("Did not generate any AST. AST list empty.")
        except Exception as e:
            devLogger.error("Could not parse tokens into AST: " + str(e))
        return ASTs

    def __getCFG(self):
        """Creates the CFG by combining the class defined rules,
        the standard preterminal rules for POS tags -> e, and
        finally the POS to word rules for the given query
        :return nltk CFG
        """
        tg = tokenGrammar
        for t in self.tokens:
            tg += "\n" + t[1] + ' -> ' + "'" + t[0] + "'"
            devLogger.info("Preterminal added to grammar: " + str(t))
        return nltk.CFG.fromstring(tg)

    def parseAST(self):
        """Parses the NLTK AST into a DSL string and view filters
        :return (List(DSL String),List(Filter references))
        """
        ast = self._getAST()
        dslItems = []
        filterObjects = []

        # TODO: right now we only consider the first AST. In future we will have to pick the best AST
        if len(ast) >= 1:
            astLimmited = ast[0]
        else:
            astLimmited = False

        if astLimmited:
            try:
                parsedAST = self.QUERY.parseString(astLimmited.pprint())
                devLogger.info("Parsed AST: " + str(parsedAST))
            except Exception as e:
                parsedAST = []
                devLogger.error("Could not parse AST: " + str(e))
            for parsed in parsedAST.asList():
                filterObjects = [self.getFilterObjects(item) for item in parsed if item[0] == 'FILTEROBJECT']
                dslStr = DSLString(filterObjects)
                for item in parsed:
                    if item[0] == 'DSLI':
                        dslStr.addDSLI(item[1:])
                dslItems.append(dslStr.getString())

        if len(filterObjects) < 1:
            filterObjects = [DefaultDataFilter]

        devLogger.info('DSL query list is: ' + str(dslItems))
        devLogger.info('Filter reference list is: ' + str(filterObjects))
        return dslItems, filterObjects


    def getFilterObjects(self, parsedItem):
        """Links to the appropriate filter class
        :param parsedItems: List(List()) of parsed query items
        :return Filter reference
        """
        def filterSwitch(x):
            return {
                'CASHFLOW': CashFlowFilter,
                'BALANCESHEET': BalanceSheetFilter,
                'INCOMESTMT': IncomeStatementFilter,
            }.get(x, False)

        return filterSwitch(parsedItem[1][0])
Example 37
 def __init__(self, tokens):
     """init parser with tokens and parser build from CFG
     :param tokens: tagged query tokens
     """
     self.tokens = tokens
     self.CFGParser = ChartParser(self.__getCFG())
Example 38
	def reload( self ):
		self.load_sents()
		self.load_grammar()
		self.parser = ChartParser( self.grammar )
		return
Example 39
def make_sentence(corpus, term_rules, *args, **kwargs):
    '''
    
    Generate sentences with random structure and word choice
    using a context-free grammar
    
    The start point is taken from the sentence itself.
    
    Parameters
    ----------
    
    corpus : str
        a string containing the full, cleaned corpus
        
    term_rules : str
        a string containing all the terminal rules for the corpus
        
    maxdepth : int
        The maximum allowed recursion depth before throwing a
        ValueError
        
    fixed_grammar : bool
        Turn off the random sentence selection and use a fixed grammar
        instead.
    
    sample_sentence : str
        When fixed_grammar is turned on, this is the sentence that will
        be parsed. This can be finicky with grammars containing specially
        punctuated constructions like quotations or positions

    args[0] : dict()
        Optional: a dictionary of kgrams and their subsequent words. If this
        variable exists then cfgen will use this to pick the next words with
        conditional weighting (The presence of this argument turns on Markov
        text generation features.)
        
    Notes
    -----
    
    Add the ability to turn off the kgram parsing, ideally by counting
    the number of unnamed arguments
    ----> Added this option
    
    '''

    markov_flag = (not len(args) == 0)
    if markov_flag:
        kgram_dict = args[0]

    fixed_grammar = kwargs.pop('fixed_grammar', False)
    sample_sentence = kwargs.pop('sample_sentence', '')
    maxdepth = kwargs.pop('maxdepth', 25)

    if fixed_grammar:
        if sample_sentence == '':
            warnings.warn('When using fixed_grammar, user should specify ' \
                          'the keyword argument "sample_sentence." Using a default simple sentence.')
            sample_sentence = 'The cow jumped over the moon.'
        else:
            pass

    flag = False
    attempts = 0
    while not flag and attempts < 30:
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

        if has_parser and not fixed_grammar:
            rsent = choice(tokenizer.tokenize(corpus))
        elif fixed_grammar:
            rsent = sample_sentence
        elif not has_parser and not fixed_grammar:
            # select from a parsed corpus of pre-approved grammars
            print("Usage library being built")
            rsent = "The dog walked up the stairs slowly."
        else:
            print("Usage library being built")
            rsent = "The dog walked up the stairs slowly."

        parsed_syntax = parse_sentence(rsent)
        # print(parsed_syntax)
        cfg_str = term_rules + parsed_syntax
        try:
            startpt = parsed_syntax[:parsed_syntax.find(' ->')]
            startpt = nltk.grammar.Nonterminal(startpt)
            grammar = CFG.fromstring(cfg_str)
            parser = ChartParser(grammar)
            gr = parser.grammar()
            if markov_flag:
                out_txt = (' '.join(
                    produce_kgram(gr,
                                  startpt,
                                  kgram_dict,
                                  maxdepth=maxdepth,
                                  sent=[])))
            else:
                out_txt = (' '.join(produce(gr, startpt, maxdepth=maxdepth)))
            flag = True
        except ValueError:
            warnings.warn(
                'Badly formed sentence encountered, resampling the corpus.')
            attempts = attempts + 1

    # now re-tag special characters
    swappairs = zip(replacements, to_replace)
    for member in swappairs:
        out_txt = out_txt.replace(member[0], member[1])

    return out_txt
Example 40
Sp -> P
Sa -> 'tells' 'you' 'that' | 'says' | 'says' 'that' | 'claims' | 'claims' 'that' | 'tells you'
St -> PG Is Class | PG Quant Is Class | 
Quant -> Comp Count
Comp -> 'exactly'
Count -> 'one'
Not -> 'neither' | 'nor' 
PG -> 'i' | PG PG | Not P | P | 'of' PG | PG 'and' PG
P -> 'zoey' | 'mel' | 'peggy' | 'zippy' | 'sue' | 'sally' | 'homer' | 'bozo' | 'marge' | 'zed' | 'alice' | 'ted' | 'bart' | 'bob' | 'betty'
Is -> 'is' 'a' | 'are'
Class -> Kni | Kna
Kni -> 'knight' | 'knights'
Kna -> 'knave' | 'knaves'
""")

def preprocess(sent):
    return "".join([letter for letter in sent.lower() if letter in "qwertyuiopasdfghjklzxcvbnm "]).split()

sents = ["Zoey tells you that mel is a Knave",
         "Mel says, `Neither Zoey nor I are knaves.'",
         "Peggy tells you that 'of Zippy and I, exactly one is a knight'."]
sents = [preprocess(sent) for sent in sents]
parser = ChartParser(kk_grammar)
for sent in sents:
    for tree in parser.parse(sent):
        print(tree)




Example 41
import nltk
from nltk import ChartParser

# Load grammar.
grammar = nltk.data.load('../../Grammar/full_grammar.cfg')
parser = ChartParser(grammar)

with open('human_chunks.txt') as f:
    noun_chunks = [line.strip().split() for line in f]

not_covered = []
for chunk in noun_chunks:
    try:
        result = parser.parse(chunk)
        print(f"Valid: {chunk}")
    except ValueError:
        print(f"Not covered: {chunk}")
        chunk = ' '.join(chunk) + '\n'
        not_covered.append(chunk)

with open("not-covered.txt", 'w') as f:
    f.writelines(not_covered)

num_chunks = len(noun_chunks)
num_covered = len(noun_chunks) - len(not_covered)
num_not_covered = len(not_covered)
print(f"Number of unique noun chunks: {num_chunks}")
print(f"Covered: {num_covered} ({(num_covered/num_chunks) * 100}%)")
print(
    f"Not covered: {num_not_covered} ({(num_not_covered/num_chunks) * 100}%)")