Example 1
    def test_production_from_grammar(self):
        grammar_str = """
        S -> NP VP
        PP -> P NP
        NP -> Det N | NP PP
        VP -> V NP | VP PP
        Det -> 'a' | 'the'
        N -> 'dog' | 'cat'
        V -> 'chased' | 'sat'
        P -> 'on' | 'in'
        """

        grammar = parse_cfg(grammar_str)
        productions = grammar.productions()

        expect_production = Production(
            lhs=Nonterminal("S"), rhs=[Nonterminal("NP"),
                                       Nonterminal("VP")])
        error_msg = "Expect to find '{}', but can not see in \n{}".format(
            expect_production, grammar_str)
        self.assertIn(expect_production, productions, error_msg)

        expect_production = Production(lhs=Nonterminal("N"), rhs=['dog'])
        error_msg = "Expect to find '{}', but can not see in \n{}".format(
            expect_production, grammar_str)
        self.assertIn(expect_production, productions, error_msg)

        expect_not_in = Production(lhs="S", rhs=["NP", "VP"])
        self.assertNotIn(expect_not_in, productions, error_msg)

        expect_not_in = Production(lhs=Nonterminal("N"), rhs=["'dog'"])
        self.assertNotIn(expect_not_in, productions, error_msg)
Example 2
def demo():
    """
    A demonstration of the recursive descent parser.
    """

    from nltk import parse, parse_cfg

    grammar = parse_cfg("""
    S -> NP VP
    NP -> Det N | Det N PP
    VP -> V NP | V NP PP
    PP -> P NP
    NP -> 'I'
    N -> 'man' | 'park' | 'telescope' | 'dog'
    Det -> 'the' | 'a'
    P -> 'in' | 'with'
    V -> 'saw'
    """)

    for prod in grammar.productions():
        print(prod)

    sent = 'I saw a man in the park'.split()
    parser = parse.RecursiveDescentParser(grammar, trace=2)
    for p in parser.nbest_parse(sent):
        print(p)
Example 3
def app():
    """
    Create a recursive descent parser demo, using a simple grammar and
    text.
    """
    from nltk.grammar import parse_cfg

    grammar = parse_cfg(
        """
    # Grammatical productions.
        S -> NP VP
        NP -> Det N PP | Det N
        VP -> V NP PP | V NP | V
        PP -> P NP
    # Lexical productions.
        NP -> 'I'
        Det -> 'the' | 'a'
        N -> 'man' | 'park' | 'dog' | 'telescope'
        V -> 'ate' | 'saw'
        P -> 'in' | 'under' | 'with'
    """
    )

    sent = "the dog saw a man in the park".split()

    RecursiveDescentApp(grammar, sent).mainloop()
Example 4
def demo():
    N = 42
    print('Generating the first %d sentences for demo grammar:' % (N, ))
    print(demo_grammar)
    grammar = parse_cfg(demo_grammar)
    for n, sent in enumerate(generate(grammar, n=N), 1):
        print('%3d. %s' % (n, ' '.join(sent)))
Example 5
def demo():
    N = 42
    print('Generating the first %d sentences for demo grammar:' % (N,))
    print(demo_grammar)
    grammar = parse_cfg(demo_grammar)
    for n, sent in enumerate(generate(grammar, n=N), 1):
        print('%3d. %s' % (n, ' '.join(sent)))
Example 6
def _generate_demo():
    g = parse_cfg("""
      S -> NP VP
      NP -> Det N
      VP -> V NP
      Det -> 'the'
      Det -> 'a'
      N -> 'man' | 'park' | 'dog' | 'telescope'
      V -> 'saw' | 'walked'
      P -> 'in' | 'with'
    """)
    for s in generate(g):
        print(' '.join(s))
Example 7
def _generate_demo():
    g = parse_cfg("""
      S -> NP VP
      NP -> Det N
      VP -> V NP
      Det -> 'the'
      Det -> 'a'
      N -> 'man' | 'park' | 'dog' | 'telescope'
      V -> 'saw' | 'walked'
      P -> 'in' | 'with'
    """)
    for s in generate(g):
        print(' '.join(s))
Example 8
def generateRawTemplates(depth):
    gram = parse_cfg(grammarstring)
    rawTemplates = generate(gram, depth=depth)
    templatefiles = []

    for index, state in enumerate(rawTemplates):
        filename = os.path.join("./templates","template"+str(index))
        with open(filename, 'w') as templatefile:
            templatefile.write(' '.join(state))
            templatefiles.append(filename)

    print("{} template files generated".format(len(templatefiles)))

    return templatefiles
Example 9
def parseSentence(toks, grammarWoTerm, cfg):
    if cfg:
        gr = grammar.parse_cfg(grammarWoTerm)
        parser = parse.BottomUpChartParser(gr)
    else:
        termRules = []
        for kr in toks:
            termRules.append(kr2terminals.getRuleFromKr(kr))

        fullGrammar = '\n'.join(grammarWoTerm + termRules)

        gr = grammar.parse_fcfg(fullGrammar)
        parser = parse.FeatureBottomUpChartParser(gr)

    chart = parser.chart_parse(toks)
    return chart
Example 10
    def __init__(self, grammar, length=1):
        """Convert the grammar to Chomsky Normal Form and do preprocessing.
        
        `grammar` can be:
            (1) an instance of nltk.grammar.ContextFreeGrammar,
            (2) a string representing the path to a .cfg file, or
            (3) a string that can be parsed into a grammar by parse_cfg

        `length` is the maximum string length that should be preprocessed.
        """
        if length < 1:
            raise ValueError('length must be greater than 0.')

        # self.grammar must be an instance of nltk.grammar.ContextFreeGrammar
        if isinstance(grammar, ContextFreeGrammar):
            self.grammar = grammar
        elif isinstance(grammar, str) and grammar.endswith('.cfg'):
            self.grammar = nltk.data.load('file:' + grammar)
        elif isinstance(grammar, str):
            self.grammar = parse_cfg(grammar)
        else:
            raise ValueError('grammar must be an nltk.grammar.ContextFreeGrammar '
                             'or a str.')
        
        if not self.grammar.is_chomsky_normal_form():
            #raise ValueError('Input grammar must be in CNF '
            #                 '(conversion method isn\'t implemented)')
            self.grammar = convert_to_cnf(self.grammar)
            assert self.grammar.is_chomsky_normal_form()

        self.productions = self.grammar.productions()

        # TODO: Is it ok to assume all nonterminals occur on a LHS?
        # Technically yes, but check whether nltk's is_cnf ensures it.
        self.nonterminals = set([p.lhs() for p in self.productions])

        self.terminals = set([token for prod in self.productions 
                              for token in prod.rhs()
                              if not isinstance(token, Nonterminal)])

        # Initialize self._counts then populate it in _preprocess(). 
        # self.length is the string length that has been preprocessed.
        self._counts = {}
        self.length = 0
        self._preprocess(length)
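
A minimal usage sketch for the constructor above, assuming a pre-3.0 NLTK where parse_cfg is available. The enclosing class name is not shown in this snippet, so CNFCounter below is a hypothetical stand-in; the toy grammar is already in Chomsky Normal Form, so no conversion is triggered.

# CNFCounter is a hypothetical name for the class whose __init__ is shown above.
counter = CNFCounter("""
    S -> A B
    A -> 'a'
    B -> 'b'
""", length=3)
print(counter.nonterminals)   # the set of Nonterminals: S, A, B
print(counter.terminals)      # the set of terminals: 'a', 'b'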
Example 11
def app():
    """
    Create a recursive descent parser demo, using a simple grammar and
    text.
    """
    from nltk.grammar import parse_cfg
    grammar = parse_cfg("""
    # Grammatical productions.
        S -> NP VP
        NP -> Det N PP | Det N
        VP -> V NP PP | V NP | V
        PP -> P NP
    # Lexical productions.
        NP -> 'I'
        Det -> 'the' | 'a'
        N -> 'man' | 'park' | 'dog' | 'telescope'
        V -> 'ate' | 'saw'
        P -> 'in' | 'under' | 'with'
    """)

    sent = 'the dog saw a man in the park'.split()

    RecursiveDescentApp(grammar, sent).mainloop()
Example 12
def demo():
    """
    A demonstration of the shift-reduce parser.
    """

    from nltk import parse, parse_cfg

    grammar = parse_cfg("""
    S -> NP VP
    NP -> Det N | Det N PP
    VP -> V NP | V NP PP
    PP -> P NP
    NP -> 'I'
    N -> 'man' | 'park' | 'telescope' | 'dog'
    Det -> 'the' | 'a'
    P -> 'in' | 'with'
    V -> 'saw'
    """)

    sent = 'I saw a man in the park'.split()

    parser = parse.ShiftReduceParser(grammar, trace=2)
    for p in parser.nbest_parse(sent):
        print(p)
Example 13
def demo():
    """
    A demonstration of the shift-reduce parser.
    """

    from nltk import parse, parse_cfg

    grammar = parse_cfg("""
    S -> NP VP
    NP -> Det N | Det N PP
    VP -> V NP | V NP PP
    PP -> P NP
    NP -> 'I'
    N -> 'man' | 'park' | 'telescope' | 'dog'
    Det -> 'the' | 'a'
    P -> 'in' | 'with'
    V -> 'saw'
    """)

    sent = 'I saw a man in the park'.split()

    parser = parse.ShiftReduceParser(grammar, trace=2)
    for p in parser.nbest_parse(sent):
        print(p)
Example 14
    def test_from_cfg(self):
        grammar = parse_cfg("""S -> 's' | A B\n A -> 'a'\n B -> 'b'""")
Example 15
	def __init__(self, Tagfile = "dummy.crf", Grammar = "dummy.cfg", Promptings = "dummy.prmz", Defsfile = "dummy.best",\
					Topic = "Potpourri", IM = False, TRON = 1):
		"""Initialize the grammar by first loading in the file of acceptible Evidence Implies Inference (EII) format parses,
			then reading in and reformatting the input sentence, before testing its acceptibility and printing output
			accordingly."""

		if (TRON > 2):	print(Tagfile, Grammar, Promptings, Defsfile, Topic, IM, TRON)

		## Zeroth, set the file and parameter defaults.
		self.tagfile = Tagfile
		self.grammar = Grammar
		self.promptings = Promptings
		self.defsfile = Defsfile
		self.old_topic = Topic
		self.for_IM = IM
		self.nolines = 0
		self.N = 4
		self.new_topic = self.old_topic
		self.noun_tags = ["NN", "NNS", "FW"]
		self.promptText = "\tUSER>  "
		
		## Flag for whether this is the initial run, used for resolving word definitions.
		initial = True
		altered = dict()
		words = dict()

		## First, read in the grammar file.
		gram_str = ''
		with open(self.grammar, 'r') as gramfile:
			for line in gramfile:
				gram_str += line
		
		## Then, initialize the tagger.
		# HMMT = hmmtagger.getHMMTagger(self.tagfile, self.nolines)
		# self.HMMTagger = HMMT.trainHMM()
		(self.SeqTagger, self.train_data) = self.getTagger(self.tagfile, self.nolines, self.N)
		if (TRON > 1):	self.write("This tagger is %0.4f%% accurate." % self.score())

		## Next, we will read in the ambiguous sentence prompts and conversions.
		(self.messages, self.transforms) = self.init_messages(self.promptings)
		self.ideal = self.getBest(self.defsfile)

		received = ""
		playing = True
		self.write("Welcome to Revelator!\n")
		while (playing):

			## After that, read in the play or commands from the user.  For now, we will take only one at a time.  
			## TODO 1:  Adapt for multiple sentences per input.  Consider batch_parse and the like.
			## We will keep taking input until the user wishes to quit.
			received = self.getPlay(self.promptText)

			## Skip the rest of stuff if the user just types in a command.
			ternary = self.aCommand(received)
			if (ternary != 0):
				self.write("")
				if (ternary == -1):	playing = False
				continue

			## And this lets us self-define words.
			# (received, words, altered) = define.paran(received, words, altered, self.ideal)

			## We can self-define words in this part of the code.
			## N. B. that we will tokenize and tag the input twice, once before and once after, so that we can extract
			##   the parenthetical definitions and not have them affect the final tagging.
			(words, altered, allNPs, received) = self.call_paranNP(received, words, altered, self.ideal, TRON > 2)

			## Bail-out code:
			# playing = False
			# print(received)
			# continue

			## Add the terminals to the grammar.  We will parse the if- and then-clauses separately.
			(if_clause, then_clause, gram_str, pos_if, pos_then) = self.listen(gram_str, received)
			if (TRON > 2):
				print "IF:  ", if_clause
				print "THEN:  ", then_clause

			## This gets rid of unnecessary punctuation.
			# if_clause = self.strip_punc(if_clause)
			# then_clause = self.strip_punc(then_clause)

			## Now, we get to the grammar.
			EII_grammar = grammar.parse_cfg(gram_str)
			self.EII_Earley = EarleyChartParser(EII_grammar, trace = TRON)

			## Following that, we (re)define common nouns.
			(altered, self.ideal, words) = self.get_defs(if_clause.split(), pos_if.split(), altered, self.ideal, words, initial, allNPs)
			(altered, self.ideal, words) = self.get_defs(then_clause.split(), pos_then.split(), altered, self.ideal, words, initial, allNPs)

			## Verify the definitions (debug only).
			# for NP in allNPs.values():
			#	print("NP", NP, "is defined as", words[NP], ".")

			## Then, test this sentence against our grammar.
			## IF clause:
			valid_if = self.parse(if_clause, pos_if, "evidence")
			## THEN clause:
			valid_then = self.parse(then_clause, pos_then, "inference")
			initial = False

			## If valid, print a copy of the evidence and inference to STDOUT.  Otherwise, tell the user what was wrong.
			if (valid_if and valid_then):
				self.printEI(if_clause, then_clause, altered, words, allNPs)
			elif (valid_if):
				self.write("I was able to understand your evidential statement, but your inferential statement did not parse.  Could you please restate\nyour entire play?")
			else:
				self.write("I was able to understand your statement of inference, but your statement of evidence did not parse.  Could you please restate\nyour entire play?")

			## Reset defaults.
			self.promptText = "\tUSER>  "
		self.write("Have a nice day!  Come play again sometime soon!\n")
Example 16
                for frag in _multiply(frag1, frag2):
                    frags.append(frag)
    return frags


def _multiply(frag1, frag2):
    frags = []
    if len(frag1) == 1:
        frag1 = [frag1]
    if len(frag2) == 1:
        frag2 = [frag2]
    for f1 in frag1:
        for f2 in frag2:
            frags.append(f1 + f2)
    return frags


grammar = parse_cfg("""
  S -> NP VP
  NP -> Det N
  VP -> V NP
  Det -> 'the'
  Det -> 'a'
  N -> 'man' | 'park' | 'dog' | 'telescope'
  V -> 'saw' | 'walked'
  P -> 'in' | 'with'
""")

for sent in generate(grammar):
    print(' '.join(sent))
Example 17
def load(resource_url,
         format='auto',
         cache=True,
         verbose=False,
         logic_parser=None,
         fstruct_parser=None):
    """
    Load a given resource from the NLTK data package.  The following
    resource formats are currently supported:
      - C{'pickle'}
      - C{'yaml'}
      - C{'cfg'} (context free grammars)
      - C{'pcfg'} (probabilistic CFGs)
      - C{'fcfg'} (feature-based CFGs)
      - C{'fol'} (formulas of First Order Logic)
      - C{'logic'} (Logical formulas to be parsed by the given logic_parser)
      - C{'val'} (valuation of First Order Logic model)
      - C{'raw'}

    If no format is specified, C{load()} will attempt to determine a
    format based on the resource name's file extension.  If that
    fails, C{load()} will raise a C{ValueError} exception.

    @type resource_url: C{str}
    @param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is C{"nltk:"}, which searches
        for the file in the NLTK data package.
    @type cache: C{bool}
    @param cache: If true, add this resource to a cache.  If C{load}
        finds a resource in its cache, then it will return it from the
        cache rather than loading it.  The cache uses weak references,
        so a resource will automatically be expunged from the cache
        when no more objects are using it.
        
    @type verbose: C{bool}
    @param verbose: If true, print a message when loading a resource.
        Messages are not displayed when a resource is retrieved from
        the cache.
    
    @type logic_parser: C{LogicParser}
    @param logic_parser: The parser that will be used to parse logical 
    expressions.
    @type fstruct_parser: C{FeatStructParser}
    @param fstruct_parser: The parser that will be used to parse the
    feature structure of an fcfg.
    """
    # If we've cached the resource, then just return it.
    if cache:
        resource_val = _resource_cache.get(resource_url)
        if resource_val is not None:
            if verbose:
                print('<<Using cached copy of %s>>' % (resource_url,))
            return resource_val

    # Let the user know what's going on.
    if verbose:
        print('<<Loading %s>>' % (resource_url,))

    # Determine the format of the resource.
    if format == 'auto':
        resource_url_parts = resource_url.split('.')
        ext = resource_url_parts[-1]
        if ext == 'gz':
            ext = resource_url_parts[-2]
        format = AUTO_FORMATS.get(ext)
        if format is None:
            raise ValueError('Could not determine format for %s based '
                             'on its file\nextension; use the "format" '
                             'argument to specify the format explicitly.' %
                             resource_url)

    # Load the resource.
    if format == 'pickle':
        resource_val = pickle.load(_open(resource_url))
    elif format == 'yaml':
        resource_val = yaml.load(_open(resource_url))
    elif format == 'cfg':
        resource_val = cfg.parse_cfg(_open(resource_url).read())
    elif format == 'pcfg':
        resource_val = cfg.parse_pcfg(_open(resource_url).read())
    elif format == 'fcfg':
        resource_val = cfg.parse_fcfg(_open(resource_url).read(),
                                      logic_parser=logic_parser,
                                      fstruct_parser=fstruct_parser)
    elif format == 'fol':
        resource_val = sem.parse_logic(_open(resource_url).read(),
                                       logic_parser=sem.logic.LogicParser())
    elif format == 'logic':
        resource_val = sem.parse_logic(_open(resource_url).read(),
                                       logic_parser=logic_parser)
    elif format == 'val':
        resource_val = sem.parse_valuation(_open(resource_url).read())
    elif format == 'raw':
        resource_val = _open(resource_url).read()
    else:
        assert format not in FORMATS
        raise ValueError('Unknown format type!')

    # If requested, add it to the cache.
    if cache:
        try:
            _resource_cache[resource_url] = resource_val
        except TypeError:
            # We can't create weak references to some object types, like
            # strings and tuples.  For now, just don't cache them.
            pass

    return resource_val
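
A short usage sketch for load() above. The resource path is an assumption; any resource URL whose extension resolves to 'cfg' takes the cfg.parse_cfg branch, and 'grammars/sample_grammars/toy.cfg' ships with the NLTK data package in typical installations.

g = load('grammars/sample_grammars/toy.cfg')  # resolved via the default "nltk:" protocol
print(g.start())                              # the grammar's start symbol
for prod in g.productions()[:3]:
    print(prod)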
Example 18
def demo():
    print('Generating all sentences for demo grammar:')
    print(demo_grammar)
    grammar = parse_cfg(demo_grammar)
    for sent in generate(grammar):
        print(' '.join(sent))
Example 19
def _multiply(frag1, frag2):
    frags = []
    if len(frag1) == 1:
        frag1 = [frag1]
    if len(frag2) == 1:
        frag2 = [frag2]
    for f1 in frag1:
        for f2 in frag2:
            frags.append(f1+f2)
    return frags

grammar = parse_cfg(
    """
    S -> NP VP
    NP -> Det N | Pro
    NP_PP -> Det N_PP
    PP -> P NP_PP
    VP -> V PP
    Det -> 'the'
    Det -> 'a'
    Pro -> 'he' | 'she' | 'they' | 'we'
    N -> 'man' | 'boy' | 'person' | 'woman' | 'girl'
    N_PP -> 'store' | 'supermarket'
    V -> 'went' | 'walked' | 'drove' | 'ran'
    P -> 'to'
    """)

for sent in generate(grammar):
    print(' '.join(sent))
Example 20
            for frag2 in _generate_all(grammar, items[1:]):
                for frag in _multiply(frag1, frag2):
                    frags.append(frag)
    return frags
            
def _multiply(frag1, frag2):
    frags = []
    if len(frag1) == 1:
        frag1 = [frag1]
    if len(frag2) == 1:
        frag2 = [frag2]
    for f1 in frag1:
        for f2 in frag2:
            frags.append(f1+f2)
    return frags

grammar = parse_cfg("""
  S -> NP VP
  NP -> Det N
  VP -> V NP
  Det -> 'the'
  Det -> 'a'
  N -> 'man' | 'park' | 'dog' | 'telescope'
  V -> 'saw' | 'walked'
  P -> 'in' | 'with'
""")

for sent in generate(grammar):
    print(' '.join(sent))
Example 21
def demo():
    print('Generating all sentences for demo grammar:')
    print(demo_grammar)
    grammar = parse_cfg(demo_grammar)
    for sent in generate(grammar):
        print(' '.join(sent))
Example 22
def _generate_all(grammar, items, depth):
    if items:
        for frag1 in _generate_one(grammar, items[0], depth):
            for frag2 in _generate_all(grammar, items[1:], depth):
                yield frag1 + frag2
    else:
        yield []

def _generate_one(grammar, item, depth):
    if depth > 0:
        if isinstance(item, Nonterminal):
            for prod in grammar.productions(lhs=item):
                for frag in _generate_all(grammar, prod.rhs(), depth-1):
                    yield frag
        else:
            yield [item]

radio_grammar = """
S -> NP VP PP
	NP -> NNP Det Det NNP
	VP -> 'is' Det N N Det
	PP -> JJ JJ NNP
	NNP -> 'In Rainbows' | 'Basement' | 'Radiohead'
	JJ -> 'English' | 'alternative' | 'rock' | 'band'
	Det -> 'the' | 'a' | 'by' | 'from'
	N -> 'video' | 'album' | 'rock' | 'band'
"""
grammar = parse_cfg(radio_grammar)
for n, sent in enumerate(generate(grammar, n=100), 1):
    print('%3d. %s' % (n, ' '.join(sent)))