def test_simple(self):
    grammar = CFG.fromstring(
        """
        S -> NP VP
        PP -> P NP
        NP -> Det N | NP PP P
        VP -> V NP | VP PP
        VP -> Det
        Det -> 'a' | 'the'
        N -> 'dog' | 'cat'
        V -> 'chased' | 'sat'
        P -> 'on' | 'in'
        """
    )
    self.assertFalse(grammar.is_flexible_chomsky_normal_form())
    self.assertFalse(grammar.is_chomsky_normal_form())
    grammar = grammar.chomsky_normal_form(flexible=True)
    self.assertTrue(grammar.is_flexible_chomsky_normal_form())
    self.assertFalse(grammar.is_chomsky_normal_form())

    grammar2 = CFG.fromstring(
        """
        S -> NP VP
        NP -> VP N P
        VP -> P
        N -> 'dog' | 'cat'
        P -> 'on' | 'in'
        """
    )
    self.assertFalse(grammar2.is_flexible_chomsky_normal_form())
    self.assertFalse(grammar2.is_chomsky_normal_form())
    grammar2 = grammar2.chomsky_normal_form()
    self.assertTrue(grammar2.is_flexible_chomsky_normal_form())
    self.assertTrue(grammar2.is_chomsky_normal_form())
class Grammar(object):
    def __init__(self, dev=False):
        super(Grammar, self).__init__()
        self.dev = dev

    grammar1 = CFG.fromstring("""
        S -> NP VP
        NP -> "DT" Nom | "NNP" | "PRP"
        Nom -> "JJ" Nom | N
        VP -> V "JJ" | V NP | V S | V NP PP | V "RB"
        V -> "VBD" | "VB" | "VBG" | "VBN" | "VBP" | "VBZ"
        N -> "NN" | "NNP" | "NNS" | "NNPS"
        PP -> "IN" NP
        """)

    grammar2 = CFG.fromstring("""
        S -> NP VP
        NP -> "DT" Nom | "NNP" | "PRP"
        Nom -> "JJ" Nom | N | Nom N
        VP -> V "JJ" | V NP | V S | V NP PP | V "RB" | V PP | V
        V -> "VBD" | "VB" | "VBG" | "VBN" | "VBP" | "VBZ"
        N -> "NN" | "NNP" | "NNS" | "NNPS"
        PP -> "IN" NP | "TO" NP
        """)

    def buildFromTreebank(self):
        """Build a context-free grammar based on the UPenn treebank."""
        tbank_productions = set()
        for sent in treebank.parsed_sents():
            for production in sent.productions():
                if production.is_lexical():
                    # Replace each lexical RHS with its POS tag, so the
                    # grammar operates over tag sequences rather than words.
                    new_rhs = [str(production._lhs)]
                    production = Production(production._lhs, new_rhs)
                tbank_productions.add(production)
        tbank_grammar = CFG(Nonterminal('S'), list(tbank_productions))
        return tbank_grammar

    def verify(self, grammar, tags):
        """Verify a tag sequence as grammatically correct or not."""
        # rd_parser = RecursiveDescentParser(grammar)
        rd_parser = ChartParser(grammar)
        valid = False
        try:
            for tree in rd_parser.parse(tags):
                valid = True
                break
        except ValueError:
            print("This is a grammatical structure I don't understand yet.")
            return
        if valid:
            print("Valid")
            return True
        else:
            print("Invalid")
            return False
def chartParser():
    """Chart parsing demo."""
    from nltk.grammar import CFG
    from nltk.parse.chart import ChartParser, BU_LC_STRATEGY

    # Grammar in BNF form; start symbol: S, terminals: words.
    grammar = CFG.fromstring("""
        S -> T1 T4
        T1 -> NNP VBZ
        T2 -> DT NN
        T3 -> IN NNP
        T4 -> T3 | T2 T3
        NNP -> 'Tajmahal' | 'Agra' | 'Bangalore' | 'Karnataka'
        VBZ -> 'is'
        IN -> 'in' | 'of'
        DT -> 'the'
        NN -> 'capital'
        """)
    # trace=True shows the parsing steps; BU_LC_STRATEGY (bottom-up
    # left-corner) is the default strategy, so it could be omitted.
    cp = ChartParser(grammar, BU_LC_STRATEGY, trace=True)

    sentence = 'Bangalore is the capital of Karnataka'
    tokens = sentence.split()
    chart = cp.chart_parse(tokens)  # parse the token list into a chart object
    parses = list(chart.parses(grammar.start()))  # all parse trees in the chart
    print('Total Edges:', len(chart.edges()))  # number of edges in the chart
    for tree in parses:
        print(tree)
        tree.draw()
def demo():
    # Imports needed to run this demo standalone.
    from tkinter import Button, Label, Tk

    from nltk import Nonterminal, CFG
    from nltk.draw.cfg import CFGEditor

    nonterminals = 'S VP NP PP P N Name V Det'
    (S, VP, NP, PP, P, N, Name, V, Det) = [Nonterminal(s)
                                           for s in nonterminals.split()]
    grammar = CFG.fromstring("""
        S -> NP VP
        PP -> P NP
        NP -> Det N
        NP -> NP PP
        VP -> V NP
        VP -> VP PP
        Det -> 'a'
        Det -> 'the'
        Det -> 'my'
        NP -> 'I'
        N -> 'dog'
        N -> 'man'
        N -> 'park'
        N -> 'statue'
        V -> 'saw'
        P -> 'in'
        P -> 'up'
        P -> 'over'
        P -> 'with'
        """)

    def cb(grammar):
        print(grammar)

    top = Tk()
    editor = CFGEditor(top, grammar, cb)
    Label(top, text='\nTesting CFG Editor\n').pack()
    Button(top, text='Quit', command=top.destroy).pack()
    top.mainloop()
def app():
    """
    Create a recursive descent parser demo, using a simple grammar and text.
    """
    from nltk.grammar import CFG
    # RecursiveDescentApp lives in nltk.app.rdparser_app; imported here so
    # the snippet runs standalone.
    from nltk.app.rdparser_app import RecursiveDescentApp

    grammar = CFG.fromstring(
        """
        # Grammatical productions.
        S -> NP VP
        NP -> Det N PP | Det N
        VP -> V NP PP | V NP | V
        PP -> P NP
        # Lexical productions.
        NP -> 'I'
        Det -> 'the' | 'a'
        N -> 'man' | 'park' | 'dog' | 'telescope'
        V -> 'ate' | 'saw'
        P -> 'in' | 'under' | 'with'
        """
    )

    sent = 'the dog saw a man in the park'.split()

    RecursiveDescentApp(grammar, sent).mainloop()
def demo(N=23):
    from nltk.grammar import CFG

    print('Generating the first %d sentences for demo grammar:' % (N,))
    print(demo_grammar)
    grammar = CFG.fromstring(demo_grammar)
    for n, sent in enumerate(generate(grammar, n=N), 1):
        print('%3d. %s' % (n, ' '.join(sent)))
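demo() above (and the similar generators below) rely on a module-level demo_grammar string and on generate from nltk.parse.generate, neither of which appears in the snippets. A sketch of the missing pieces, modeled on the demo grammar that ships with nltk.parse.generate (the exact rules in the original module may differ):

from nltk.parse.generate import generate  # the generator demo() calls

# Assumed demo_grammar; modeled on nltk.parse.generate's own demo grammar,
# not taken from the snippets above.
demo_grammar = """
  S -> NP VP
  NP -> Det N
  PP -> P NP
  VP -> 'slept' | 'saw' NP | 'walked' PP
  Det -> 'the' | 'a'
  N -> 'man' | 'park' | 'dog'
  P -> 'in' | 'with'
"""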
def demo(N=26):
    from nltk.grammar import CFG

    sentences = list()  # typo fixed (was "senteneces"); not used below
    # print('Generating the first %d sentences for demo grammar:' % (N,))
    # print(demo_grammar)
    grammar = CFG.fromstring(demo_grammar)
    for n, sent in enumerate(generate(grammar, n=N), 1):
        print(' '.join(sent))
def banjoify(rules, song):
    import random

    from nltk.grammar import CFG, Nonterminal
    from nltk.parse.generate import generate

    arrangement = []
    # parse_abc is assumed to be defined elsewhere, yielding a
    # (pitch, duration) pair for each note of the song.
    for pitch, duration in parse_abc(song):
        grammar = CFG.fromstring(rules.format(pitch=pitch))
        options = list(generate(grammar, start=Nonterminal(duration)))
        phrase = random.choice(options)
        arrangement.append(''.join(phrase))
    return ' '.join(arrangement)
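banjoify() depends on a rules template containing a '{pitch}' placeholder, which is not shown in the source. A hedged sketch of what such a template could look like (the nonterminal names q and h for quarter and half notes are assumptions):

# Hypothetical rules template for banjoify(); '{pitch}' is substituted per
# note, and each duration symbol (q = quarter, h = half) expands to a
# short picking phrase.
BANJO_RULES = """
    q -> '{pitch}' '{pitch}'
    h -> q q
"""

# For example, generate(CFG.fromstring(BANJO_RULES.format(pitch='G')),
#                       start=Nonterminal('q'))
# yields [['G', 'G']], which banjoify() joins into the phrase 'GG'.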
def gensentence(N=20):
    global abc

    print('Generating the first %d sentences for gensentence grammar:' % (N,))
    # print(gensentence_grammar)
    grammar = CFG.fromstring(gensentence_grammar)
    for n, sent in enumerate(generate(grammar, n=N), 1):
        # Keep the most recent sentence in a module-level variable.
        # (A dead "abc = {}" assignment, immediately overwritten, was removed.)
        abc = ' '.join(sent)
        print('%3d. %s' % (n, abc))
def generate_context_free_grammar_novel_text(
        self,
        number_of_words_in_sentence=0,
        number_of_sentences_per_record=0,
        number_of_records=0):
    """
    This method utilizes NLTK's Context Free Grammar parser objects to
    parse an available *.cfg file and generate novel text from it.

    @param number_of_words_in_sentence: The number of words to generate
        in each novel sentence.
    @type number_of_words_in_sentence: int
    @param number_of_sentences_per_record: The number of sentences per
        record to generate.
    @type number_of_sentences_per_record: int
    @param number_of_records: The total number of records to generate.
    @type number_of_records: int
    @return: str
    """
    words = []
    punct_selector = [". ", "! ", "? "]
    punctuation_stop_symbols = dict(
        (ord(char), None) for char in string.punctuation)
    parser = None
    grammar = None
    try:
        if isinstance(self._corpus, CFG):
            _grammar = self._corpus
            if _grammar is not None:
                parser = ChartParser(_grammar)
                grammar = parser.grammar()  # grammar() is a method; the call was missing
        elif isinstance(self._corpus, FeatureGrammar):
            _grammar = self._corpus
            if _grammar is not None:
                parser = FeatureChartParser(_grammar)
                grammar = parser.grammar()
        elif isinstance(self._corpus, PCFG):
            _grammar = self._corpus
            if _grammar is not None:
                parser = InsideChartParser(_grammar)
                grammar = parser.grammar()
        else:
            grammar = CFG.fromstring(self._corpus)
        if grammar is not None:
            for _ in range(number_of_records):
                novel_sentence = []
                for _ in range(number_of_sentences_per_record):
                    sentence = " ".join(
                        [sent for _, sent in enumerate(
                            generate_text(grammar, depth=2,
                                          n=number_of_words_in_sentence))]
                    )
                    # Strip punctuation, then append a random sentence-final mark.
                    sentence = sentence.translate(
                        punctuation_stop_symbols) + random.choice(punct_selector)
                    sentence = sentence.capitalize()
                    novel_sentence.append(sentence)
                words.append("".join(novel_sentence))
    except Exception as error:  # Python 3 syntax (was "except Exception, error")
        self.logger.error(
            "TextGenerator.generate_context_free_grammar_novel_text: "
            "Error occurred - {0}".format(str(error))
        )
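The method above assumes self._corpus and self.logger are set on the class, and that several names are imported at module level. A plausible import block; treating generate_text as an alias for NLTK's sentence generator is an assumption, not something the source confirms:

import random
import string

from nltk.grammar import CFG, PCFG, FeatureGrammar
from nltk.parse import ChartParser, FeatureChartParser, InsideChartParser
# Assumption: generate_text is nltk's generator under another name.
from nltk.parse.generate import generate as generate_text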
def construct_cfg_from_string():
    '''
    Reads CFG rules from cfg.txt and uses nltk to make a grammar
    from the given rules.

    :return: CFG (nltk.grammar.CFG)
    '''
    with open("cfg.txt", "r") as f:
        grammar_string = f.read()
    grammar = CFG.fromstring(grammar_string)
    return grammar
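Since cfg.txt itself is not shown in the source, here is a minimal, hypothetical file the loader could consume, plus a usage sketch:

# Write a minimal, hypothetical cfg.txt; the project's real file may differ.
rules = """
S -> NP VP
NP -> 'the' N
VP -> V NP
N -> 'dog' | 'cat'
V -> 'chased'
"""
with open("cfg.txt", "w") as f:
    f.write(rules)

grammar = construct_cfg_from_string()
print(grammar.start())             # S
print(len(grammar.productions()))  # 6 productions after expanding the '|'s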
def generate_text(grammar, N):
    from nltk.grammar import CFG
    import nltk.parse.generate as gen

    print('Generating the first %d sentences for demo grammar:' % (N,))
    print(grammar)
    grammar = CFG.fromstring(grammar)
    grm_list = gen.generate(grammar, n=N)
    for n, sent in enumerate(grm_list):
        print('%3d. %s' % (n, ' '.join(sent)))
def generateRawTemplates(depth):
    gram = CFG.fromstring(grammarstring)
    rawTemplates = generate(gram, depth=depth)
    templatefiles = []
    for index, state in enumerate(rawTemplates):
        filename = os.path.join("./templates", "template" + str(index))
        with open(filename, 'w') as templatefile:
            templatefile.write(' '.join(state))
        templatefiles.append(filename)
    print(str(len(templatefiles)) + " template files generated")
    return templatefiles
def generate_tweet(grammar):
    from nltk.grammar import CFG
    import nltk.parse.generate as gen
    from random import randint

    print(grammar)
    grammar = CFG.fromstring(grammar)
    # SIZE is assumed to be a module-level constant.
    grm_list = gen.generate(grammar, n=SIZE)
    # TODO: check the maximum size? any way to retrieve it?
    rd = randint(0, SIZE)
    cpt = 0
    for n, sent in enumerate(grm_list):
        if rd == cpt:
            print("Your tweet : ")
            print('%3d. %s' % (n, ' '.join(sent)))
        cpt += 1
def generate(filename, start=None, depth=None, n=None):
    """
    Generates all sentences from a CFG.

    :param filename: path to file containing the grammar.
    :param start: The Nonterminal from which to start generating sentences.
    :param depth: The maximal depth of the generated tree.
    :param n: The maximum number of sentences to return.
    :return: A list of generated sentences as strings.
    """
    grammar = CFG.fromstring(_read_grammar(filename))
    if not start:
        start = grammar.start()
    if depth is None:
        depth = sys.maxsize
    sentences = _generate_all(grammar, [start], depth)  # renamed from "iter", which shadowed the builtin
    if n:
        sentences = itertools.islice(sentences, n)
    return [' '.join(string).replace(' , ', ', ') for string in sentences]
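The private helpers _read_grammar and _generate_all are not shown in the source; a minimal sketch of how they might look, with _generate_all simply delegating to NLTK's generator (the original implementation may differ):

import itertools
import sys

from nltk.grammar import CFG
# Alias to avoid clashing with the generate() wrapper defined above.
from nltk.parse.generate import generate as nltk_generate


def _read_grammar(filename):
    # Hypothetical helper: read the grammar file into a single string.
    with open(filename) as f:
        return f.read()


def _generate_all(grammar, items, depth):
    # Hypothetical helper: expand the single start symbol in `items`
    # up to the given depth, yielding lists of terminal tokens.
    return nltk_generate(grammar, start=items[0], depth=depth)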
def create_learner():
    if isfile("imp_learner.p"):
        with open("imp_learner.p", "rb") as learner_file:
            learner = pickle.load(learner_file)
        return learner
    else:
        grammar = CFG.fromstring("""
            Pgm -> Id ',' Pgm | Stmt
            Stmt -> Block | Id '=' Aexp ';' | Stmt Stmt
            Stmt -> 'if(' Bexp ')' Block 'else' Block
            Stmt -> 'while(' Bexp ')' Block
            Block -> '{}' | '{' Stmt '}'
            Bexp -> 'true' | Aexp '<=' Aexp | '!' Bexp
            Bexp -> Bexp '&&' Bexp | Bexp '||' Bexp | '(' Bexp ')'
            Aexp -> Int | Id | Aexp '+' Aexp | Aexp '-' Aexp
            Aexp -> Aexp '*' Aexp | Aexp '/' Aexp | '(' Aexp ')'
            Id -> 'a' | 'b'
            Bool -> 'true' | 'false'
            Int -> '0' | '1'
            """)
        return PrimalLearner.from_grammar(grammar, k=1)
def generate_context_free_grammar_novel_text(
        self, corpus, number_of_words_in_sentence,
        number_of_sentences_per_record, number_of_records):
    '''
    This method utilizes NLTK's Context Free Grammar parser objects to
    parse an available *.cfg file and generate novel text from it.

    Params:
    -------
    - corpus (str or nltk grammar): The grammar, or grammar string, to
      generate text from.
    - number_of_words_in_sentence (int): The number of words to generate
      in each novel sentence.
    - number_of_sentences_per_record (int): The number of sentences per
      record to generate.
    - number_of_records (int): The total number of records to generate.

    Returns: str
    '''
    words = []
    punct_selector = ['. ', '! ', '? ']
    punctuation_stop_symbols = dict(
        (ord(char), None) for char in string.punctuation)
    parser = None
    grammar = None
    try:
        if isinstance(corpus, CFG):
            _grammar = corpus
            if _grammar is not None:
                parser = ChartParser(_grammar)
                grammar = parser.grammar()  # grammar() is a method; the call was missing
        elif isinstance(corpus, FeatureGrammar):
            _grammar = corpus
            if _grammar is not None:
                parser = FeatureChartParser(_grammar)
                grammar = parser.grammar()
        elif isinstance(corpus, PCFG):
            _grammar = corpus
            if _grammar is not None:
                parser = InsideChartParser(_grammar)
                grammar = parser.grammar()
        else:
            grammar = CFG.fromstring(corpus)
        if grammar is not None:
            for _ in range(number_of_records):
                novel_sentence = []
                for _ in range(number_of_sentences_per_record):
                    sentence = ' '.join([
                        sent for _, sent in enumerate(
                            generate_text(grammar, depth=2,
                                          n=number_of_words_in_sentence))
                    ])
                    # Strip punctuation, then append a random sentence-final mark.
                    sentence = sentence.translate(
                        punctuation_stop_symbols) + random.choice(punct_selector)
                    sentence = sentence.capitalize()
                    novel_sentence.append(sentence)
                words.append(''.join(novel_sentence))
    except Exception as error:  # Python 3 syntax (was "except Exception, error")
        logging.error('TextGenerator: Error occurred - {0}'.format(str(error)))
# -*- coding: utf-8 -*-
import pytest

from nltk.grammar import CFG
from nltk.parse.chart import BottomUpChartParser

# Open the file as UTF-8 and pass a str to fromstring; its encoding
# argument is intended for byte input, so it is dropped here.
with open("subject-verb.grammar", encoding="utf-8") as f:
    grammar = CFG.fromstring(f.read())

tests = {
    "subject_verb_agreement": [
        "Je regarde la television",
        "Tu regardes la television",
        "Il regarde la television",
        "Nous regardons la television",
        "Vous regardez la television",
        "Ils regardent la television",
    ],
    "test_noun_phrases_and_proper_names": [
        "le chat",
        "la television",
        "les chats",
        "les televisions",
        "Jackie",
        "Montreal",
    ],
    "test_direct_object_pronouns": ["il la regarde"],
    "test_attribute_adjectives": [
        "le chat noir",
        "le chat heureux",
        "le beau chat",
        "le joli chat",
        "la derniere semaine",
        "la semaine derniere",
        "les chats noirs",
        "la television noire",
        "les televisions noires",
    ],
}


@pytest.mark.parametrize(
    "test",
    ((test_name, sentence)
     for test_name, sentences in tests.items()
     for sentence in sentences))
def test(test):
grammar = CFG.fromstring(""" S -> Fallback Err Fallback S -> Fallback Fallback -> AllTags Fallback Fallback -> S -> AllTags AllTags -> 'END' | 'QUOT' | '(' | ')' | ',' | '--' | '.' | 'CC' | 'CD' | 'DT' | 'EX' | 'FW' | 'IN' | 'JJ' | 'JJR' | 'JJS' | 'LS' | 'MD' | 'NN' | 'NNP' | 'NNPS' | 'NNS' | 'PDT' | 'POS' | 'PRP' | 'PRP$' | 'RB' | 'RBR' | 'RBS' | 'RP' | 'SYM' | 'TO' | 'UH' | 'VB' | 'VBD' | 'VBG' | 'VBN' | 'VBP' | 'VBZ' | 'WDT' | 'WP' | 'WP$' | 'WRB' | '``' | Det | ':' Det -> DetPl | DetSg | DetNeut DetNeut -> 'the' | 'some' | 'another' | 'no' | 'his' | 'her' | 'his/her' | 'any' DetSg -> 'a' | 'an' | 'this' | 'every' | 'another' | 'that' | 'each' | 'neither' DetPl -> 'all' | 'both' | 'these' | 'those' Err -> ErrUD | ErrAGD | ErrFD | ErrAGV NotNPHead -> 'END' | 'QUOT' | '(' | ')' | ',' | '--' | '.' | 'CC' | 'DT' | 'EX' | 'FW' | 'IN' | 'LS' | 'MD' | 'NN' | 'NNP' | 'NNPS' | 'NNS' | 'PDT' | 'POS' | 'PRP' | 'PRP$' | 'RB' | 'RBR' | 'RBS' | 'RP' | 'SYM' | 'TO' | 'UH' | 'VB' | 'VBD' | 'VBG' | 'VBN' | 'VBP' | 'VBZ' | 'WDT' | 'WP' | 'WP$' | 'WRB' | '``' | ':' CDList -> 'CD' CDList CDList -> JJList -> 'JJ' JJList JJList -> 'JJR' JJList JJList -> 'JJS' JJList JJList -> ErrAGD -> DetPl JJList 'NN' ErrAGD -> DetSg JJList CDList JJList 'NNS' ErrFD -> 'a' AllTags ErrFD -> 'an' AllTags ErrUD -> Det JJList 'NNP' ErrUD -> Det JJList CDList JJList 'NNPS' """)
# S -> NP VP       # Start state S
# A -> B | C       # Arrow and vbar
# C -> "a" | "b"   # Terminals in quotes (quoted symbols are terminals, not non-terminals)
##################################################

# Replace with your file name here
filename = "a2q2.txt"
with open(filename) as f:
    content = f.read()

# Spent too long on this and gave up; I just manually converted accents
# within the grammar file.
content = content.lower().replace('é', 'e').replace('è', 'e').replace('ê', 'e') \
    .replace('á', 'a').replace('à', 'a').replace('â', 'a') \
    .replace('ó', 'o').replace('ò', 'o').replace('ô', 'o')

# content is already a str; fromstring's encoding argument is intended
# for byte input, so it is dropped here.
grammar = CFG.fromstring(content)
parser = BottomUpChartParser(grammar)


def parse(sentence, nonempty):
    trees = parser.parse(sentence.lower().split())
    data = list(trees)
    if nonempty:
        print(data)
        assert len(data) > 0
    else:
        assert len(data) == 0


validSentences = [
grammar = CFG.fromstring(""" S -> COMMAND QUERY COMMAND -> COMMAND1 | COMMAND2 | COMMAND3 COMMAND1 -> 'tampil' COMMAND2 -> 'tunjuk' | 'lihat' COMMAND3 -> 'hitung' | 'kalkulasi' QUERY -> RELATION | CONDITION | CONDITION CONDITION | CONDITION CONJ CONDITION | CONDITION QUERY | CONDITION CONJ QUERY CONJ -> AND | OR AND -> 'dan' | 'serta' OR -> 'atau' CONDITION -> FIELDS OPERATOR NUMBER | FIELDS RELATION | FIELDS RELATION SPATIALOP RELCOND | FIELDS RELATION NOT SPATIALOP RELCOND | FIELDS RELCOND | PART RELATION SPATIALOP GEOCOND | RELCOND | RELATION SPATIALOP GEOCOND | RELATION NOT SPATIALOP GEOCOND | RELATION SPATIALOP RELCOND | RELATION NOT SPATIALOP RELCOND | SPATIALOP RELATION SPATIALOP RELCOND | SPATIALOP RELATION NOT SPATIALOP RELCOND | SPATIALOP RELCOND | SPATIALOP RELCOND RELCOND | SPATIALOP OPERATOR NUMBER | VALUES PART -> 'daerah' | 'bagian' | 'potong' GEOCOND -> GEOMETRY POINT COOR CONJ POINT COOR | GEOMETRY COOR SIZE NUMBER GEOMETRY -> SQUARE | RECTANGLE SQUARE -> 'persegi' RECTANGLE -> 'segiempat' | 'persegi' 'panjang' POINT -> LU | RU | LB | RB LU -> 'titik' 'kiri' 'atas' RB -> 'titik' 'kanan' 'bawah' RELCOND -> RELATION VALUES | RELATION FIELDS VALUE | RELATION FIELDS NUMBER | RELATION OPERATOR -> 'lebih' 'dari' | 'kurang' 'dari' | 'sama' 'dengan' | 'lebih' 'dari 'sama 'dengan' | 'kurang' 'dari' 'sama' 'dengan' NOT -> 'tidak' | 'bukan' SPATIALOP -> PANJANG | LUAS | KELILING | INSIDE | OUTSIDE | JARAK JARAK -> 'jarak' INSIDE -> 'dalam' | 'pada' OUTSIDE -> 'luar' PANJANG -> 'panjang' LUAS -> 'luas' KELILING -> 'keliling' FIELDS -> FIELD FIELD | FIELD | FIELD FIELDS | FIELD CONJ FIELDS VALUES -> VALUE VALUE | VALUE | VALUE VALUES """)
from nltk.grammar import CFG
from nltk.parse.chart import ChartParser, BU_LC_STRATEGY

grammar = CFG.fromstring("""
    S -> T1 T4
    T1 -> NNP VBZ
    T2 -> DT NN
    T3 -> IN NNP
    T4 -> T3 | T2 T3
    NNP -> 'Tajmahal' | 'Agra' | 'Bangalore' | 'Karnataka'
    VBZ -> 'is'
    IN -> 'in' | 'of'
    DT -> 'the'
    NN -> 'capital'
    """)

cp = ChartParser(grammar, BU_LC_STRATEGY, trace=True)

sentence = "Bangalore is the capital of Karnataka"
tokens = sentence.split()
chart = cp.chart_parse(tokens)
parses = list(chart.parses(grammar.start()))
print("Total Edges :", len(chart.edges()))
for tree in parses:
    print(tree)
    tree.draw()
from nltk.grammar import CFG
from nltk.parse import EarleyChartParser

cfg = CFG.fromstring("""
    S -> NP VP
    NP -> DET NN
    NP -> DET NP
    NP -> JJ NN
    VP -> VB NP
    DET -> 'a' | 'the'
    JJ -> 'lucky'
    NN -> 'man' | 'woman'
    VB -> 'loves' | 'shoots'
    """)

cfgparser = EarleyChartParser(cfg)

s = 'a man loves a woman'.split()
for tree in cfgparser.parse(s):
    print(tree.pformat())
    tree.draw()

s = 'the man shoots a woman'.split()
for tree in cfgparser.parse(s):
    print(tree.pformat())
    tree.draw()

s = 'a lucky woman loves a man'.split()
for tree in cfgparser.parse(s):
    print(tree.pformat())
    tree.draw()
def load_grammar(grammar_path):
    logger.info('Loading grammar in %s' % grammar_path)
    with open(grammar_path) as fin:
        grammar_string = fin.read()
    return CFG.fromstring(grammar_string)
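A quick usage sketch for load_grammar; the module-level logger and the grammar file path are assumptions for illustration:

import logging

from nltk.grammar import CFG

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)  # assumed module-level logger

grammar = load_grammar("my_grammar.cfg")  # hypothetical path
print(grammar.start())
print(len(grammar.productions()))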