def test2():
    """Parse a sample sentence with a recursive-descent parser.

    Relies on a module-level ``grammar`` object defined elsewhere in this
    file; it must cover the vocabulary of the sample sentences.
    """
    from nltk.parse import RecursiveDescentParser
    rd = RecursiveDescentParser(grammar)
    sentence1 = 'the cat chased the dog'.split()
    sentence2 = 'the cat chased the dog on the rug'.split()
    # Bug fix: the original used the Python 2 statement form
    # `print rd.parse(sentence2)`, a SyntaxError under Python 3.
    # parse() returns an iterator, so materialize it to show the trees
    # instead of a generator repr.
    print(list(rd.parse(sentence2)))
class GridGen():
    """Builds a CFG that enumerates level-1 metrical grids: strings of
    stressed ('x') / unstressed ('o') marks over syllable sequences,
    with at least one stress mark per word (culminativity)."""

    def __init__(self):
        # Enumerate every 'x'/'o' string of length 1..7 as a PrWd body.
        expansions = []
        frontier = ['']
        for _ in range(7):  # set maximum level-1 length
            frontier = [prefix + ' ' + mark
                        for prefix in frontier
                        for mark in ['x', 'o']]  # level-1 grid
            expansions.extend(frontier)
        rules = ['PrWd -> ' + body for body in expansions]
        # Culminativity (at least one level-1 grid mark)
        rules = [rule for rule in rules if re.search('x', rule)]
        # Expansions of syllable preterminals
        rules += ['x -> "σ"', 'o -> "σ"']
        grammar = CFG.fromstring('\n'.join(rules))
        print(f'# of productions in grammar: {len(grammar.productions())}')
        self.grammar = grammar
        self.parser = RecursiveDescentParser(grammar)

    def parses(self, inpt):
        """Return every parse of the space-separated syllable string
        *inpt* as a ParentedTree."""
        return [ParentedTree.convert(tree)
                for tree in self.parser.parse(inpt.split())]
def cfg_en(self):
    """Parse a short English sentence with a hand-written toy CFG and
    assert that at least one parse tree is found."""
    print("test_nltk_cfg_en")
    # Define the English grammar rules.
    grammar = nltk.CFG.fromstring("""
        S -> NP VP
        VP -> V NP | V NP PP
        V -> "saw" | "ate"
        NP -> "John" | "Mary" | "Bob" | Det N | Det N PP
        Det -> "a" | "an" | "the" | "my"
        N -> "dog" | "cat" | "cookie" | "park"
        PP -> P NP
        P -> "in" | "on" | "by" | "with"
    """)
    sent = "Mary saw Bob".split()
    rd_parser = RecursiveDescentParser(grammar)
    # Collect every parse of the sentence.
    result = [tree for tree in rd_parser.parse(sent)]
    assert len(result) > 0, " CFG tree parse fail."
    print(result)
def test_nltk_cfg_qtype(self):
    """Load a question-type CFG from the config directory and parse a
    sample question with it.

    Tokens absent from the grammar's lexicon are dropped before parsing
    so the recursive-descent parser does not fail outright on unknown
    words.
    """
    print("test_nltk_cfg_qtype")
    gfile = os.path.join(curdir, os.path.pardir, "config",
                         "grammar.question-type.cfg")
    question_grammar = nltk.data.load('file:%s' % gfile)

    def get_missing_words(grammar, tokens):
        """
        Find list of missing tokens not covered by grammar
        """
        missing = [tok for tok in tokens
                   if not grammar._lexical_index.get(tok)]
        return missing

    sentence = "what is your name"
    sent = sentence.split()
    missing = get_missing_words(question_grammar, sent)
    # Keep only the words the grammar actually knows.
    target = [x for x in sent if x not in missing]
    rd_parser = RecursiveDescentParser(question_grammar)
    result = []
    print("target: ", target)
    for tree in rd_parser.parse(target):
        # Bug fix: the original appended the stale loop variable `x`
        # (left over from the filtering loop above) instead of the
        # parse tree, so `result` never contained any trees.
        result.append(tree)
        print("Question Type\n", tree)
    if len(result) == 0:
        print("Not Question Type")
def cfg_zh(self):
    """Parse a Chinese sentence ("we respect the teacher") with a toy
    CFG, draw the tree when the parse is unambiguous, and warn when it
    is not."""
    # Bug fix: the original VP rule listed the `V NP` alternative twice
    # (`V NP | V NP | V N`), a redundant duplicate production; NP also
    # has no expansion, so only `V N` can ever match.
    grammar = nltk.CFG.fromstring("""
        S -> N VP
        VP -> V NP | V N
        V -> "尊敬"
        N -> "我们" | "老师"
    """)
    sent = "我们 尊敬 老师".split()
    rd_parser = RecursiveDescentParser(grammar)
    result = []
    for i, tree in enumerate(rd_parser.parse(sent)):
        result.append(tree)
        print("Tree [%s]: %s" % (i + 1, tree))
    assert len(result) > 0, "Can not recognize CFG tree."
    if len(result) == 1:
        print("Draw tree with Display ...")
        result[0].draw()
    else:
        # Typo fix in the warning message: "more then" -> "more than".
        print("WARN: Get more than one trees.")
        print(result)
def Process(str, file):
    """Parse *str* with the module-level grammar `pgrammar`; on success
    generate a CAD file from the first parse and write it to *file*,
    otherwise flag the unparseable input on stdout.

    NOTE(review): the parameters shadow the builtins `str` and `file`;
    renaming them could break keyword callers, so they are kept as-is.
    """
    parser = RecursiveDescentParser(pgrammar)
    trees = list(parser.parse(str.split()))
    if not trees:
        # No parse found: flag the offending input.
        print("************* " + str)
    else:
        GenerateCadFile(ParentedTree.convert(trees[0])).write(file)
def check_syntax(text):
    """Validate each line of *text* against a dynamically-built CFG.

    The grammar's LANG terminals are the ISO 639-1 (alpha_2) codes taken
    from pycountry, and its FUNC terminals are the attribute names of
    the module-level `node_func`.

    Returns:
        'parsed' when every line yields at least one tree,
        'syntax error' when a line yields no tree,
        'syntax/lexical error' when parsing raises (unknown token, etc.).
    """
    # here we list all possible languages that can be used in the grammar
    lang_pos = {}
    for l in pycountry.languages:
        p = pycountry.languages.get(name=l.name)
        try:
            lang_pos[p.name] = p.alpha_2
        except AttributeError:
            # language has no two-letter code; skip it
            pass
    # Build the LANG rule.  Bug fix: the original emitted the FIRST
    # alternative unquoted, so CFG.fromstring treated it as a
    # nonterminal rather than a terminal; every alternative is quoted
    # now.  enumerate() also replaces the O(n^2)
    # list(lang_pos.keys()).index(lg) lookup.
    lang_command = '''LANG ->'''
    for idx, lg in enumerate(lang_pos):
        if idx == 0:
            lang_command += ''' '{x}' '''.format(x=lang_pos[lg])
        else:
            lang_command += '''| '{x}' '''.format(x=lang_pos[lg])
    # here we list all possible functions , given that the system was
    # already re-written to accomodate the changes
    # (same quoting fix and enumerate() as above)
    func_command = '''FUNC ->'''
    for idx, attr in enumerate(dir(node_func)):
        if idx == 0:
            func_command += ''' '{x}' '''.format(x=attr)
        else:
            func_command += '''| '{x}' '''.format(x=attr)
    # here we substitute the pre made rules in the grammar itself
    grammar = CFG.fromstring(('''
    S -> 'plug' '<' FUNC '>' 'as' LANG | 'unplug' '<' LANG '>'
    command1
    command2
    '''.replace('command1', lang_command)).replace('command2',
                                                   func_command))
    grammar_rd = RecursiveDescentParser(grammar)
    # here we check the syntax and the lexical using the already
    # described cfg, line by line
    for t in text.split('\n'):
        parsed = []
        try:
            for tree in grammar_rd.parse(t.split()):
                parsed.append(tree)
            if len(parsed) != 0:
                print(parsed)
            else:
                return 'syntax error'
        except Exception:
            # token unknown to the grammar, or parser failure
            return 'syntax/lexical error'
    # Bug fix: 'parsed' is returned only after ALL lines were checked;
    # the original's placement returned after the first line.
    return 'parsed'
def sensibility_test(transcribeText, backdoor):
    """Check whether *transcribeText* parses under NLTK's DRT book grammar.

    When *backdoor* is truthy the check is skipped and 1 is returned
    immediately; otherwise the result is only printed (returns None).
    """
    if backdoor:
        print('Sentence is sensible')
        return 1
    else:
        grammar = nltk.data.load('grammars/book_grammars/drt.cfg')
        # sr = ShiftReduceParser(grammar=grammar)
        # Bug fix: RecursiveDescentParser requires a grammar argument;
        # the original called it with none, so the resulting TypeError
        # was swallowed by the bare except and EVERY sentence was
        # reported as "not sensible".
        rd = RecursiveDescentParser(grammar)
        try:
            # NOTE(review): parse() expects a token list — if
            # transcribeText is a raw string the caller should split it
            # first; confirm against call sites.
            for t in rd.parse(transcribeText):
                print(t)
            print('Sentence is sensible')
        except Exception:
            print('Sentence is not sensible')
class FootGen():
    """Builds a CFG enumerating metrical foot parses of syllable
    strings: a prosodic word is a sequence of feet and syllables
    containing exactly one main-stress foot (culminativity)."""

    def __init__(self):
        # Expansions of PrWd: every sequence of length 1..5 drawn from
        # {MainFt, Ft, Syll}.
        expansions = []
        frontier = ['']
        for _ in range(5):
            frontier = [prefix + ' ' + cat
                        for prefix in frontier
                        for cat in ['MainFt', 'Ft', 'Syll']]
            expansions.extend(frontier)
        prwd_rules = ['PrWd -> ' + body for body in expansions]
        # Culminativity (exactly one main-stress foot)
        prwd_rules = [rule for rule in prwd_rules
                      if re.search('Main', rule)
                      and not re.search('Main.*Main', rule)]
        #print(len(prwd_rules))
        # Expansions of (Main)Ft: monosyllabic, trochaic, or iambic.
        mainft_rules = ['MainFt -> ' + body for body in
                        ['MainStressSyll', 'MainStressSyll Syll',
                         'Syll MainStressSyll']]
        ft_rules = ['Ft -> ' + body for body in
                    ['StressSyll', 'StressSyll Syll', 'Syll StressSyll']]
        # Expansions of (Main)(Stress)Syll
        syll_rules = ['MainStressSyll -> s1', 'StressSyll -> s2',
                      'Syll -> s0']
        # Expansions of syllable preterminals
        term_rules = ['s1 -> "σ"', 's2 -> "σ"', 's0 -> "σ"']
        all_rules = (prwd_rules + mainft_rules + ft_rules
                     + syll_rules + term_rules)
        grammar = CFG.fromstring('\n'.join(all_rules))
        print(f'# of productions in grammar: {len(grammar.productions())}')
        self.grammar = grammar
        self.parser = RecursiveDescentParser(grammar)

    def parses(self, inpt):
        """Return every parse of the space-separated syllable string
        *inpt* as a ParentedTree."""
        return [ParentedTree.convert(tree)
                for tree in self.parser.parse(inpt.split())]
def test_sample(self):
    """Parse "I saw a dog" with a minimal CFG and print each tree."""
    print("test_sample")
    # This is a CFG grammar, where:
    # Start Symbol : S
    # Nonterminal : NP,VP,DT,NN,VB
    # Terminal : "I", "a" ,"saw" ,"dog"
    grammar = nltk.grammar.CFG.fromstring("""
        S -> NP VP
        NP -> DT NN | NN
        VP -> VB NP
        DT -> "a"
        NN -> "I" | "dog"
        VB -> "saw"
    """)
    tokens = "I saw a dog".split()
    rd = RecursiveDescentParser(grammar)
    for parse_tree in rd.parse(tokens):
        print(parse_tree)
def Theona():
    """Voice-assistant pipeline: speech in -> intent parse -> web lookup -> speech out.

    Stages (mirroring the Step comments below): ASR via Google speech
    recognition, SLU via NP-chunking plus CFG parsing, DM via a Daum web
    search, NLG via template substitution, and TTS via the `say` command.
    Relies on module-level helpers defined elsewhere in this file
    (sentence_generation, audio_play, ConsecutiveNPChunker,
    tree_reconstruct, CFG_grammar, NLG_transoformation, lang_adjust).
    """
    # Spoken/printed introduction.
    intro1, intro2, intro3 = sentence_generation('open')
    audio_play('boost.wav')
    os.system(intro1)
    # Train an NP chunker on CoNLL-2000 — done on every call, hence the
    # "2-4 minutes" warning below.
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    print('Training data... It will take 2-4 minutes.')
    chunker = ConsecutiveNPChunker(train_sents)
    os.system(intro2)
    # Theona Introduction
    audio_play('start_up.wav')
    os.system(intro3)

    # Step1. ASR
    # Use recognizer to record the speech.
    recorder = sr.Recognizer()
    starting = sentence_generation('hello')
    with sr.Microphone() as mike:
        print('Hello. Please speaking.')
        audio_play('pong.wav')
        os.system(starting)
        my_sound = recorder.listen(mike)
        print('Processing...')
    # Speech signal to text. Supported by google Speech api: Internet needs to be connected.
    tmp_words = recorder.recognize_google(my_sound)
    words = str(tmp_words)
    # test printing...
    print(words)

    # Step2. SLU
    # 1. find the specific places to users.
    #words = 'show me starbucks'
    # Tokenize the sentence.
    tokenized = word_tokenize(words)
    # Parsing the sentence to find out goal and entity clearly.
    pos_tagged = nltk.pos_tag(tokenized)
    chunk_words = chunker.parse(pos_tagged)
    reorder_words = tree_reconstruct(chunk_words)
    # Build the grammar for parsing.
    GOAL_FIND,ENTITY_PLACE = nonterminals('GOAL_FIND,ENTITY_PLACE')
    usr_goal = ENTITY_PLACE
    usr_find = GOAL_FIND
    VP,NP,O = nonterminals('VP,NP,O')
    grammar = CFG_grammar()
    rd_parser = RecursiveDescentParser(grammar)
    # Parsing the sentence.
    parsed_words = []
    for parsing in rd_parser.parse(reorder_words):
        print(parsing)
        # Find GOAL and ENTITY
        # NOTE(review): usr_place is only bound when an ENTITY_PLACE node
        # is found; the re.sub below would raise NameError otherwise.
        for detect in parsing:
            if detect.label() == 'GOAL_FIND':
                usr_goal = detect.leaves()[0]
            if detect.label() == 'ENTITY_PLACE':
                usr_place = detect.leaves()[0]
    finding = sentence_generation('finding')
    finding = re.sub('<place>',usr_place,finding)
    audio_play('tone.wav')
    os.system(finding)
    # 2. Provide weather information to users.

    # Step3. DM
    # Collect information from the internet.
    # Location
    google_url = "https://www.google.co.kr/?gfe_rd=cr&ei=8YoTV-OdF8WL8AWGp5DgDg&gws_rd=ssl#newwindow=1&q="
    daum_url = 'http://search.daum.net/search?w=tot&DA=YZR&t__nil_searchbox=btn&sug=&sugo=&sq=&o=&q='
    # Connect to the internet to proceed the users' request: goal and entity.
    if usr_goal == 'find':
        # Searching in Daum.
        usr_request_url = daum_url + usr_place + '&tltm=1'
        request = requests.get(usr_request_url)
        soup = BeautifulSoup(request.content,'html.parser')
        # Searching in Google.
        #usr_request_url = google_url + usr_place
        #request = requests.get(usr_request_url)
        #soup = BeautifulSoup(request)
        # Collect information.
        # Find the closest 5 places around the location in which you start to request.
        all_data = soup.find_all('div',{'class','cont_place'})
        first_data = all_data[0]
        # Address
        address_info = all_data[0].find_all('a',{'class','more_address'})[0].text
        # Phone Number
        phone_info = all_data[0].find_all('span',{'class','f_url'})[0].text
        # Location (map)
        map_info = all_data[0].find('a').get('href')
    # Weather

    # Step4. NLG
    # Generate an appropriate sentence.
    # NOTE(review): the lines below assume the 'find' branch above ran
    # (address_info / phone_info bound) — confirm intended placement.
    answer_text = NLG_transoformation('find')
    # Adjust the words if it is Korean.
    address_info = lang_adjust(address_info)
    # Substitude the markers to proper words
    answer_text = re.sub('<place>',usr_place,answer_text)
    answer_text = re.sub('<address>',address_info,answer_text)
    answer_text = re.sub('<phone>',phone_info,answer_text)

    # Step5. TTS
    audio_play('tone.wav')
    os.system('say ' + answer_text)
GOAL_FIND -> 'find' GOAL_FIND -> 'show' GOAL_FIND -> 'tell' O -> 'me' P -> 'in' ENTITY_PLACE -> 'starbucks' ENTITY_PLACE -> 'Starbucks' ENTITY_PLACE -> 'Coffee Bean' ENTITY_PLACE -> 'Coffeebean' """) rd_parser = RecursiveDescentParser(grammar) # Parsing the sentence. parsed_words = [] for parsing in rd_parser.parse(tokenized): print(parsing) # Find GOAL and ENTITY for detect in parsing: if detect.label() == 'GOAL_FIND': usr_goal = detect.leaves()[0] if detect.label() == 'ENTITY_PLACE': usr_place = detect.leaves()[0] finding = sentence_generation('finding') finding = re.sub('<place>',usr_place,finding) os.system(finding) # 2. Provide weather information to users.
N -> 'tree' N -> 'fish' Adj -> 'angry' Adj -> 'frightened' Adj -> 'little' Adj -> 'tall' V -> 'chased' V -> 'said' V -> 'thought' V -> 'was' V -> 'put' P -> 'on' """) # In[4]: rd = RecursiveDescentParser(grammar1) sentence1 = 'mary saw a telescope in the park'.split() for t in rd.parse(sentence1): print(t) t.draw() # In[ ]: #before executing this line restart the kernel and clear all outputs rd = RecursiveDescentParser(grammar2) sentence2 = 'the bear chased the frightened squirrel'.split() for s in rd.parse(sentence2): print(s) s.draw()
def parser(plain_text, set_name={
        'name': ['x', 'y', 'z'],
        'iten': ['a', 'b', 'c'],
        'def': ['m', 'n', 'o']
}):
    """Parse the 'cloudy' mini-language contained in *plain_text*.

    set_name supplies the user-chosen identifiers spliced into the CFG:
    'name' -> set names, 'iten' -> item names, 'def' -> membership
    function names.  Returns 'end of parsing' on success, otherwise an
    error string naming the offending line.

    NOTE(review): set_name is a mutable default argument; it is only
    read here, but callers must not rely on mutating it.
    """
    #first we define the cfg that will generate all possible sentences for the language
    # based on the names of set's and ite's name the user have choosen
    # formatting the functions names
    line_grammar = "MEM_FUNC -> "
    counter = 0
    while counter < len(set_name['def']):
        if counter == 0:
            formated_newstring = " '{a}' ".format(
                a=set_name['def'][counter])
        else:
            formated_newstring = " | '{a}' ".format(
                a=set_name['def'][counter])
        line_grammar += formated_newstring
        counter += 1
    # formatting for set names
    line_grammar_0 = "NAME -> "
    counter = 0
    while counter < len(set_name['name']):
        if counter == 0:
            formated_newstring = " '{a}' ".format(
                a=set_name['name'][counter])
        else:
            formated_newstring = " | '{a}' ".format(
                a=set_name['name'][counter])
        line_grammar_0 += formated_newstring
        counter += 1
    # formatting for item names
    line_grammar_1 = "NAME_I -> "
    counter = 0
    while counter < len(set_name['iten']):
        if counter == 0:
            formated_newstring = " '{a}' ".format(
                a=set_name['iten'][counter])
        else:
            formated_newstring = " | '{a}' ".format(
                a=set_name['iten'][counter])
        line_grammar_1 += formated_newstring
        counter += 1
    # Grammar skeleton: the lacune_* placeholders are replaced by the
    # dynamically-built NAME / NAME_I / MEM_FUNC rules created above.
    prime_cloudy_grammar = ((("""
    T -> COM_D END | INIT_A COM_A END | 'start_cloud{' | '}end_cloud'
    COM_D -> 'name::=' NAME '{' ITEN ';' MEM ';'
    lacune_1
    lacune_2
    ATTR -> ITEN ';' MEM ';'
    ITEN -> 'iten::=' NAME_I
    MEM -> 'membership::=' '(' MEM_FUNC ')'
    lacune_3
    INIT_A -> 'active=>' '{'
    COM_A -> NAME Q NAME | NAME_I O NAME |'plot=>' CONJ 'using:' PLOT_S
    PLOT_S -> 'line' | 'venn'
    CONJ -> NAME Q NAME | NAME
    Q -> '-u' | '-i' | '-c'
    O -> 'in' | 'out' | '<m>'
    END -> '}end'
    """.replace('lacune_1', line_grammar_0)).replace(
        'lacune_2', line_grammar_1)).replace('lacune_3', line_grammar))
    # using the nltk's tool to generate , we create the formal cfg
    _cloudy_grammar = CFG.fromstring(prime_cloudy_grammar)
    #for sentences_test in generate(_cloudy_grammar,n=200):
    #print(' '.join(sentences_test))
    # then we create the parser for this grammar
    cloudy_rd = RecursiveDescentParser(_cloudy_grammar)
    # split the input text into lines
    code_total = plain_text.split('\n')
    counter = 0
    while counter < len(code_total):
        test = code_total[counter].split()
        # all code must start and end with specific sample of code as follows
        if counter == 0 and 'start_cloud{' in test:
            print("starting parsing")
            pass
        elif counter != 0:
            pass
        else:
            return 'start_cloud statment not found'
        # the end cloud statment determines where the parser will stop parsing the code
        if "}end_cloud" in test:
            print('end of parsing')
            return 'end of parsing'
        else:
            pass
        try:
            parsed_check = []
            for parsed in cloudy_rd.parse(test):
                parsed_check.append(parsed)
            # if the length of the list which contain the parsed sentences is equal to 0 then
            # it means that the sentence wasnt well, so there's a some syntax error
            if len(parsed_check) != 0:
                pass
            else:
                return 'Syntax error on: (' + str(
                    code_total[counter]) + ' ) at line : ' + str(counter)
        except:
            # if some lexical component not allowed is used then the system can recognize it faster
            return 'Lexical error on : (' + str(
                code_total[counter]) + ') at line : ' + str(counter)
        counter += 1
def createTree2(self):
    """Build a grammar via self.makeGrammar() and print every parse of
    the token list self.myWords."""
    rd_parser = RecursiveDescentParser(self.makeGrammar())
    for parse_tree in rd_parser.parse(self.myWords):
        print(parse_tree)
# Demo script: parallel CFG / PCFG grammars over the tiny ambiguous
# vocabulary {'a', 'flower', 'blooms'}, parsed with a recursive-descent
# parser.
from nltk.corpus import treebank  # NOTE(review): imported but unused below
from nltk import PCFG, CFG

# Plain (unweighted) grammar.
cfg_grammar = CFG.fromstring("""
    S -> NP VP
    NP -> ART N | N N | N | NP PP
    VP -> V | V NP | V NP PP
    PP -> P NP
    ART -> 'a'
    N -> 'flower' | 'a' | 'blooms'
    V -> 'blooms' | 'flower'
""")
# Same rules with production probabilities attached.
pcfg_grammar = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> ART N [0.53] | N N [0.09] | N [0.14] | NP PP [0.24]
    VP -> V [0.386] | V NP [0.393] | V NP PP [0.22]
    PP -> P NP [1.0]
    ART -> 'a' [1.0]
    N -> 'flower' [0.8] | 'a' [0.1] | 'blooms' [0.1]
    V -> 'blooms' [0.8] | 'flower' [0.2]
""")

from nltk.parse import RecursiveDescentParser

print(cfg_grammar)
# NOTE(review): the parser is built on pcfg_grammar while cfg_grammar is
# only printed; RecursiveDescentParser does not use the probabilities,
# so this may have been meant to use cfg_grammar — confirm intent.
rd = RecursiveDescentParser(pcfg_grammar)
text = "a flower blooms".split()
for t in rd.parse(text):
    print(t)
#rd.draw()