def main(config):
    """Parse an induced grammar, optionally save it, then Viterbi-parse sentences.

    :param config: namespace with attributes ``grammar`` (induced grammar
        source), ``output`` (optional path for the grammar string),
        ``textfile`` (optional sentences file, one per line),
        ``output_parse`` (directory for tree renderings) and
        ``number_parses`` (max sentences to parse).
    """
    grammar_string = parse_induced_grammar(config.grammar)
    if config.output:
        with open(config.output, 'w') as f:
            f.write(grammar_string)
    grammar = PCFG.fromstring(grammar_string)
    # Force TOP as the start symbol. Uses a private attribute because PCFG
    # exposes no public setter -- NOTE(review): not sure whether this is
    # allowed or breaks things (kept from the original).
    grammar._start = Nonterminal('TOP')
    # The original checked `config.textfile` twice in a row; the directory
    # creation now lives in the single guarded branch that uses it.
    if config.textfile:
        # Create directory for parse trees if it does not already exist.
        if not os.path.exists(config.output_parse):
            os.makedirs(config.output_parse)
        parser = ViterbiParser(grammar)
        with open(config.textfile, 'r') as f:
            lines = f.read().splitlines()
        for i, line in enumerate(lines):
            if i == config.number_parses:
                break
            print(f"Parsing sentence {i+1}")
            sent = line.split()
            for t in parser.parse(sent):
                TreeView(t)._cframe.print_to_file(f"{config.output_parse}/tree_{i}")
def handle_syntax_parser_mode(tree, sem_rule_set):
    """Optionally display *tree*, decorated with *sem_rule_set*, in a Tk window.

    Reads the module-level ``args`` namespace and only acts when
    ``args.gui`` is truthy.
    """
    # print "Parse Tree: "
    # print tree
    if args.gui:
        # Opens an NLTK TreeView window purely for its side effect;
        # no reference to the view is kept.
        TreeView(decorate_parse_tree(tree, sem_rule_set, set_productions_to_labels=True))
def parse(self, tokens: List[str]) -> List[str]:
    """Parse one tokenized sentence and collect a sentiment label per parse.

    :param tokens: tokens of a sentence
    :return: all possible sentiment labels ('negative'/'positive'/'neutral')
    """
    labels = []
    for tree in self.cp.parse(tokens):
        if self.print:
            print(tree)
        if self.draw:
            tree.draw()
        if self.save:
            # save the tree diagram
            TreeView(tree)._cframe.print_to_file(
                'saved_results/Tree' + str(self.tree_no) + '_diagram.ps')
            # save the tree text
            with open('saved_results/Tree' + str(self.tree_no) + '_text.txt',
                      "w", encoding='utf-8') as writer:
                writer.write(str(tree))
        # map the root's SENTI attribute onto a label; anything that is not
        # explicitly negative/positive counts as neutral
        root_senti = tree.label()['SENTI']
        if root_senti in ('negative', 'positive'):
            labels.append(root_senti)
        else:
            labels.append('neutral')
        self.tree_no += 1
    return labels
def parse(self, tokens: List[str]) -> Tuple[list, Dict[str, List[str]]]:
    """Parse one tokenized sentence with the S-rooted chunker.

    :param tokens: tokens of a sentence
    :return: (labels, mapping label -> tree strings); falls back to
        ('unknown', '(unknown)') when no parse yields a sentiment
    """
    labels: List[str] = []
    trees_by_label: Dict[str, List[str]] = defaultdict(list)
    # parse the sentence where S is the root
    for tree in self.cp_s.parse(tokens):
        if self.print:
            print(tree)
        if self.draw:
            tree.draw()
        if self.save:
            # save the tree diagram
            TreeView(tree)._cframe.print_to_file(
                'saved_results/Tree' + str(self.tree_no) + '_diagram.ps')
            # save the tree text
            with open('saved_results/Tree' + str(self.tree_no) + '_text.txt',
                      "w", encoding='utf-8') as writer:
                writer.write(str(tree))
        # record the root's SENTI attribute when it is a known label
        label = tree.label()['SENTI']
        if label in ['negative', 'positive', 'neutral']:
            labels.append(label)
            trees_by_label[label].append(str(tree))
        self.tree_no += 1
    if not labels:
        labels.append('unknown')
        trees_by_label['unknown'].append('(unknown)')
    return labels, trees_by_label
def save_image_from_tree(raw_tree, name, type='standfor'):
    """Render a bracketed tree string to a PNG under the static images dir.

    :param raw_tree: bracketed (s-expression) tree string
    :param name: output file name without extension
    :param type: sub-directory under images/ (name kept for callers even
        though it shadows the builtin)
    :return: the static URL path of the generated PNG
    """
    target_dir = STATICFILES_DIRS[0] + '/images/' + type + '/'
    complete_path = target_dir + name
    static_path = '/static/images/' + type + '/' + name
    parsed = Tree.fromstring(raw_tree)
    TreeView(parsed)._cframe.print_to_file(complete_path + '.ps')
    # ImageMagick turns the PostScript dump into a browser-friendly PNG.
    os.system('convert %s.ps %s.png' % (complete_path, complete_path))
    return static_path + '.png'
def get_graph(tokenized_word_list):
    """POS-tag the tokens, chunk them with a regexp grammar, prune short
    chunks, print the survivors, and dump the tree to ``output_2.ps``.

    :param tokenized_word_list: list of word tokens for one sentence
    """
    tagged = nltk.pos_tag(tokenized_word_list)
    grammar = r"""
      NP: {<DT|JJ>}          # chunk determiners and adjectives
          }<[\.VI].*>+{      # chink any tag beginning with V, I, or .
          <.*>}{<DT>         # split a chunk at a determiner
          <DT|JJ>{}<NN.*>    # merge chunk ending with det/adj
                             # with one starting with a noun
      """
    cp = nltk.RegexpParser(grammar)
    results = cp.parse(tagged)
    # Bug fix: the original deleted elements while iterating (which skips
    # the element right after each deletion) and papered over it by running
    # the identical pass six times, re-printing survivors every pass.
    # Deleting by index in reverse order prunes everything in one pass.
    for index in range(len(results) - 1, -1, -1):
        # len <= 2 covers (word, tag) leaf pairs and two-child chunks
        if len(results[index]) <= 2:
            del results[index]
    for result in results:
        print(result)
    TreeView(results)._cframe.print_to_file('output_2.ps')
    print("done")
def save_tree_png(tree, ouput):
    '''Save a bracketed tree string as a PNG image (Chinese characters are
    not supported by the PostScript rendering).

    :param tree: bracketed (s-expression) tree string
    :param ouput: target image path (parameter name kept as-is for callers)
    '''
    print("save_tree_png", tree)
    t = Tree.fromstring(tree)
    ps = "%s.ps" % ouput
    TreeView(t)._cframe.print_to_file(ps)
    print(">> Generate Tree Image [%s], tree string [%s] ..." % (ouput, tree))
    # ImageMagick converts the intermediate PostScript file into the image.
    exec_cmd("convert %s %s" % (ps, ouput))
def write_to_file(tree, filename):
    """Render *tree* to ``<filename>.png`` via a temporary PostScript file.

    Bug fix: the original ignored the *filename* parameter and always wrote
    a hard-coded name; the parameter is now honoured.

    :param tree: a tree object accepted by nltk's TreeView
    :param filename: output base name; ".png" is appended
    """
    import tempfile
    png_path = f"{filename}.png"
    if os.path.exists(png_path):
        os.remove(png_path)
    with tempfile.NamedTemporaryFile() as file:
        ps_path = "{0:}.ps".format(file.name)
        TreeView(tree)._cframe.print_to_file(ps_path)
        with Image.open(ps_path) as img:
            # scale=4 rasterizes the PostScript at 4x resolution
            img.load(scale=4)
            img.save(png_path)
def syntax_analysis(sentence):
    """Constituency-parse *sentence* with Stanford CoreNLP and render the
    tree to a randomly named PNG under images/.

    :param sentence: raw sentence string
    :return: the generated PNG file name (without the images/ prefix)
    """
    syntax_model = StanfordCoreNLP(
        app_root + '/Model/stanford-corenlp-full-2018-10-05')
    tree = syntax_model.parse(sentence)
    # Do not forget to close! The backend server will consume a lot of memory.
    syntax_model.close()
    t = Tree.fromstring(tree)
    TreeView(t)._cframe.print_to_file('output.ps')
    # Bug fix: the original called random.randint once, threw the result
    # away, and drew a second number for the file name; the dead call is
    # removed and a single draw is used.
    filename = 'output{}.png'.format(random.randint(0, 999))
    command = 'convert output.ps images/' + filename
    os.system(command)
    return filename
def analyze_text(sent: Optional[str] = None):
    """POS-tag and CFG-parse a French sentence; render the parse tree to a
    base64-encoded PNG.

    :param sent: sentence to analyse (French)
    :return: dict with the sentence, its POS tags, the parse string, and
        the base64 PNG (empty string when parsing/rendering failed)
    """
    grammar_str = importCFG('backend/models/french_CFG.txt')
    moses = MosesTokenizer(lang='fr')
    grammar = nltk.CFG.fromstring(grammar_str.split('\n'))
    tagged_sent = tagger.tag(moses.tokenize(sent, escape=False))
    tags = [token[1] for token in tagged_sent]
    parsed = []
    image = ''
    try:
        parsed = parse(tags, grammar)
        from nltk.draw.tree import TreeView
        # Fix: build the TreeView once -- the original instantiated it
        # twice, opening two Tk windows and doing the layout work twice.
        tv = TreeView(parsed)
        (x0, y0, w, h) = tv._cframe.scrollregion()
        ps = tv._cframe._canvas.postscript(
            x=x0, y=y0,
            width=w + 2, height=h + 2,
            pagewidth=w + 2,   # points = 1/72 inch
            pageheight=h + 2,  # points = 1/72 inch
            pagex=0, pagey=0,
            colormode='color')
        # Bump the zero-sized PostScript font so the labels are visible.
        ps = ps.replace(" 0 scalefont ", " 9 scalefont ")
        img = Image.open(io.BytesIO(ps.encode('utf-8')))
        img.load(scale=5)
        image_out = io.BytesIO()
        img.save(image_out, format="png")
        # image_out.seek(0)
        image = base64.b64encode(image_out.getvalue()).decode()
    except Exception as e:
        logging.exception("An exception was thrown!")
    return {
        "sentence": sent,
        "tagged": tagged_sent,
        "parsed": str(parsed),
        "image": image
    }
def parse(self, tokens: List[str]) -> None:
    """Parse one tokenized sentence; print every tree, optionally draw it,
    and optionally save it as a PostScript file.

    :param tokens: tokens of a sentence
    """
    for tree in self.cp.parse(tokens):
        # always echo the tree
        print(tree)
        if self.print:
            # display the tree diagram in a Tk window
            tree.draw()
        if self.save:
            out_name = 'results/output' + str(self.tree_no) + '.ps'
            TreeView(tree)._cframe.print_to_file(out_name)
            self.tree_no += 1
def draw(self, img_outdir=None): """ Use NLTK method help visualize the dependency * Example: from nltk.tree import Tree demo = '(ROOT(IP(NP(PN 我))(VP (VV 叫) (NP (NN 小米)))))' Tree.fromstring(a).draw() """ # to save the tree into image, see at: https://stackoverflow.com/questions/23429117/saving-nltk-drawn-parse-tree-to-image-file if self.dependencyString: tc = Tree.fromstring(self.dependencyString) if not img_outdir: img_outdir = "/" # output the image: # REMAIN: display of Chinese characters: if img_outdir: if not img_outdir.endswith('.ps'): TreeView(tc)._cframe.print_to_file( img_outdir + "default_dependencyTree.ps") sys_name = img_outdir + "default_dependencyTree.ps" else: TreeView(tc)._cframe.print_to_file(img_outdir) sys_name = img_outdir # end if # rebuild the file name and output: convert_name = "%s.png" % (sys_name.split(".")[0]) out = os.system('convert %s %s' % (sys_name, convert_name)) if out > 0: logger.error( "[Dependency Draw] install ImageMagicK or the path is invaild. " ) else: logger.error("[Dependency Parser] output tree image at [%s] " % convert_name) # return tc tc.pretty_print()
def test():
    """Demo: parse a Croatian date expression with a recursive-descent CFG
    parser, printing and drawing every parse and saving a view to output.ps."""
    STOLJG = """
    S -> DAN '.' MJESEC GODINA'.'
    ZNAMENKA -> '1'|'2'|'3'|'4'|'5'|'6'|'7'|'8'|'9'
    ZNAMENKA0 -> ZNAMENKA | '0'
    DAN -> ZNAMENKA | '1' ZNAMENKA0 | '2' ZNAMENKA0 | '30' | '31'
    MJESEC -> 'siječnja' | 'veljače' | 'ožujka' | 'travnja' | 'svibnja' | 'lipnja' | 'srpnja' | 'kolovoza' | 'listopada' | 'studenog' | 'prosinca'
    GODINA -> ZNAMENKA | ZNAMENKA ZNAMENKA0 | ZNAMENKA ZNAMENKA0 ZNAMENKA0 | ZNAMENKA ZNAMENKA0 ZNAMENKA0 ZNAMENKA0
    """
    grammar = CFG.fromstring(STOLJG)
    parser = RecursiveDescentParser(grammar)
    sent = '2 1 . siječnja 1 9 0 1 .'.split()
    print(sent)
    for parse_tree in parser.parse(sent):
        print(parse_tree)
        parse_tree.draw()
        TreeView(parse_tree)._cframe.print_to_file('output.ps')
def parse(self, tokens: List[str]) -> None:
    """Parse one tokenized sentence; print every tree, optionally draw it,
    and optionally save both a diagram (.ps) and a text dump (.txt).

    :param tokens: tokens of a sentence
    """
    for tree in self.cp.parse(tokens):
        # always echo the tree
        print(tree)
        if self.print:
            # display the tree diagram in a Tk window
            tree.draw()
        if self.save:
            prefix = 'results/Tree' + str(self.tree_no)
            # save the tree diagram
            TreeView(tree)._cframe.print_to_file(prefix + '_diagram' + '.ps')
            # save the tree text
            with open(prefix + '_text' + '.txt', "w", encoding='utf-8') as writer:
                writer.write(str(tree))
            self.tree_no += 1
def tweet(request, id):
    """Django view: render a Twitter post with its tokens, NE chunks,
    sentiment scores, and a (static demo) parse-tree image.

    :param request: Django HTTP request
    :param id: primary key of the TwitterPost to display
    """
    import nltk
    from senti_classifier import senti_classifier
    post = TwitterPost.objects.all().filter(id=id).first()
    sentence = post.text
    tokens = nltk.word_tokenize(sentence)
    pos_score, neg_score = senti_classifier.polarity_scores([post.text])
    tagged = nltk.pos_tag(tokens)
    import os
    from nltk.tree import Tree
    from nltk.draw.tree import TreeView
    # Render a fixed demo tree and publish it as static/tree.png.
    demo_tree = Tree.fromstring('(S (NP this tree) (VP (V is) (AdjP pretty)))')
    TreeView(demo_tree)._cframe.print_to_file('output.ps')
    os.system('convert output.ps output.png')
    os.system('cp output.png static/tree.png')
    entities = nltk.chunk.ne_chunk(tagged)
    data = {"twitt": post, "tokens": tokens, "tags": entities,
            "pos_score": pos_score, "neg_score": neg_score}
    return render(request, "tweet.html", data)
def speechTrees(text):
    """Sentence-tokenize *text*, POS-tag each sentence, and chunk it with a
    small phrase-structure grammar, showing each chunk tree in a TreeView.

    Any exception is printed and swallowed (best-effort behaviour kept from
    the original).

    :param text: raw text to analyse
    """
    custTokenizer = PunktSentenceTokenizer(text)
    # Fix: the original copied `text` into a redundant `sample_text` alias.
    tokenized = custTokenizer.tokenize(text)
    try:
        # The grammar and parser are loop-invariant; build them once.
        chunkGram = """
        S: {<NP><VP>}
        NP: {<DT>?<JJ>*<NN>}
            {<DT>?<JJ>*<PRP>}
            {<DT>?<JJ>*<NNP>}
            {<DT>?<JJ>*<NNS>}
        PP: {<P><NP>}
        VP: {<VB><NP>|<VP><PP>}
        IF: {<TO><VB>}
        P: {<IN><NP>}
        """
        chunkParser = nltk.RegexpParser(chunkGram)
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            chunked = chunkParser.parse(tagged)
            # Opens a Tk window as a side effect; as in the original, only
            # the last sentence's view reference is retained.
            tree = TreeView(chunked)
    except Exception as e:
        print(str(e))
def print_tree(t, file_name):
    """Write the NLTK tree *t* to *file_name* as PostScript."""
    from nltk.draw.tree import TreeView
    TreeView(t)._cframe.print_to_file(file_name)
input('Look at the following code for dependency parsing.\n') toy_dep_grammar = nltk.DependencyGrammar.fromstring(""" 'shot' -> 'I' | 'elephant' | 'in' 'elephant' -> 'an' | 'in' 'in' -> 'pajamas' 'pajamas' -> 'my' """) pdp = nltk.ProjectiveDependencyParser(toy_dep_grammar) sent = 'I shot an elephant in my pajamas'.split() print(f'parsing {sent.__repr__}...') ptrees = pdp.parse(sent) for i, tree in enumerate(ptrees): print(tree, tree.height()) TreeView(tree)._cframe.print_to_file(f'dep_tree{i}.ps') print('to convert images, run...\n $ convert tree0.ps tree0.png') input('[enter] to continue.\n') print('\n' + '#' * 79) print('''Constituency parsing Some theories of syntax assume that there are implicit structures to annotate, such as Noun Phrases (NP), Verb Phrases (VP), etc. ''') # CFG = Context-Free Grammar toy_cfg_grammar = nltk.CFG.fromstring(""" S -> NP VP
from nltk import RecursiveDescentParser, CFG, pos_tag, word_tokenize
from nltk.draw.tree import TreeView
from os import system, remove

# Recursive-descent parser over POS *tags*, not words: the grammar's
# terminals are Penn Treebank tag names.
rdparser = RecursiveDescentParser(CFG.fromstring("""S -> NP VP
PP -> P | P NP | P VP
NP -> Det NP PP1 | Adj N PP1 | N PP1 | N NP PP1
PP1 -> PP PP1 |
VP -> V NP PP1 | V PP1
Det -> 'DT'
N -> 'NN' | 'NNS' | 'NNPS' | 'NNP' | 'PRP' | 'PRP$'
V -> 'VBZ' | 'VBD' | 'VBP' | 'VBG'
Adj -> 'JJ'
P -> 'IN'"""))

# Strip punctuation that would confuse the tagger, then tag the input.
taggedsent = pos_tag(word_tokenize(''.join(c for c in input('Enter a sentence:') if c not in ':,;."')))
j = 1
for tree in rdparser.parse([x[1] for x in taggedsent]):
    # Re-attach the original words to the leaves (the parse was over tags,
    # so each single-child subtree gets its corresponding word back).
    i = iter(taggedsent)
    for s in tree.subtrees():
        if len(s) == 1:
            s[0] = next(i)[0]
    tv = TreeView(tree)
    tv._size.set(18)
    tv.resize()
    tv._cframe.canvas()['scrollregion'] = (0, 0, 1000, 500)
    tv._cframe.print_to_file('output' + str(j) + '.ps')
    # Fall back to a text dump when ImageMagick conversion fails.
    if system('convert output' + str(j) + '.ps -alpha off output' + str(j) + '.png') != 0:
        print(tree)
    remove('output' + str(j) + '.ps')
    j += 1
# Named Entity Recognition (NER) text = '''In August, Samsung lost a US patent case to Apple and was ordered to pay its rival $1.05bn (£0.66bn) in damages for copying features of the iPad and iPhone in its Galaxy range of devices. Samsung, which is the world's top mobile phone maker, is appealing the ruling. A similar case in the UK found in Samsung's favour and ordered Apple to publish an apology making clear that the South Korean firm had not copied its iPad when designing its own devices.''' sentences = nltk.sent_tokenize(text) for sentence in sentences: tokens = nltk.word_tokenize(sentence) tokens_pos_tagged = nltk.pos_tag(tokens) tokens_pos_tagged_and_named_entities = ne_chunk(tokens_pos_tagged) print() print('ORIGINAL SENTENCE', sentence) print('NAMED ENTITY RECOGNITION OUTPUT', tokens_pos_tagged_and_named_entities) # Constituency/dependency parsing constituent_parser = nltk.RegexpParser(''' NP: {<DT>? <JJ>* <NN>*} # NP P: {<IN>} # Preposition V: {<V.*>} # Verb PP: {<P> <NP>} # PP -> P NP VP: {<V> <NP|PP>*} # VP -> V (NP|PP)*''') tokens = ['In', 'the', 'house', 'the', 'yellow', 'cat', 'saw', 'the', 'dog'] tagged = nltk.pos_tag(tokens) print(tagged) constituent_structure = constituent_parser.parse(tagged) print(constituent_structure) constituent_structure # Save tree to file TreeView(constituent_structure)._cframe.print_to_file('output.pdf')
# Python 2 script: induce a PCFG from a treebank file and parse a sample
# sentence with both Viterbi and InsideChart parsers.
print "loading data.."
data = loadData('parseTrees.txt')
print "generating trees.."
treeData = getTreeData(data)
print "gathering rules"
# Collect every production from every tree into the module-level `rules`.
for t in treeData:
    rules.extend(t.productions())
print("Number of rules: " + str(len(rules)))
print "constructing PCFG"
S = Nonterminal('S')
grammar = induce_pcfg(S, rules)
print "PCFG:"
print(grammar)
sentense = "show me the meals on the flight from Phoenix".split()
print "parsing with Viterbi parser..."
viterbi_parser = nltk.ViterbiParser(grammar)
# viterbi_parser.trace(3)
for tree in viterbi_parser.parse(sentense):
    print(tree)
print "parsing with InsideChart parser..."
inside_parser = nltk.InsideChartParser(grammar)
# NOTE(review): this enables tracing on the *Viterbi* parser although the
# InsideChart parser is used below -- presumably `inside_parser.trace(3)`
# was intended; confirm before changing.
viterbi_parser.trace(3)
idx = 0
for tree in inside_parser.parse(sentense):
    print(tree)
    # Each parse gets its own PostScript rendering.
    TreeView(tree)._cframe.print_to_file('output' + str(idx) + '.ps')
    idx = idx + 1
print "done!"
log_path = "/Users/Tony/Documents/intellIJWorkSpace/HRL-RavenClawJava/log/sessions/11001D2016-06-03T22-05-21.log" # log_path = "/Users/Tony/Documents/intellIJWorkSpace/HRL-RavenClawJava/log/sessions/11001D2016-06-04T00-26-55.log" log_path = "data/11001D2016-06-04T23-21-10.log" reader.parse_session_log(log_path) sys_str_tree = reader.cur_log.get('parseTree') sys_tree = nltk.Tree.fromstring(sys_str_tree) terminals = sys_tree.leaves() reader.print_turns() p.inc_parse(x0) p.print_last_chart() p.inc_parse(x1) p.print_last_chart() p.inc_parse(x2) p.print_last_chart() p.parse(terminals) p_trees = p.get_parses(in_string=False) exit() if p_trees is not None: TreeView(sys_tree)._cframe.print_to_file('imgs/original.ps') print "Found " + str(len(p_trees)) + " trees." for idx, tree in enumerate(p_trees): TreeView(tree)._cframe.print_to_file('imgs/' + str(idx) + '.ps') break else: print "No parse found!" p.print_chart(with_parse=False)
# Check short, well-formed sentences from `file_content` against ATD and
# show corrected parse trees for the first five wrong ones.
lines = sent_tokenize(file_content)
# Keep only sentences with <= 10 tokens, terminal punctuation, and an
# initial capital letter.
lines = [line for line in lines
         if line != '' and len(word_tokenize(line)) <= 10
         and line[-1] in '.?!' and line[0].isupper()]
print(len(lines))
wrong_lines_count = 0
pic_count = 0
for i, line in enumerate(lines):
    if wrong_lines_count == 5:
        break
    print('Original line: ' + line)
    tree = next(parser.raw_parse(line))
    # Bug fix: the original compared the token LIST itself to 10
    # (`word_tokenize(line) == 10`, always False), so no tree picture was
    # ever saved; compare the token count instead.
    if pic_count < 5 and len(word_tokenize(line)) == 10:
        filename = get_valid_filename(line)
        TreeView(tree)._cframe.print_to_file(filename + '.ps')
        pic_count += 1
    # Bug fix: materialize the errors once -- the original drained the
    # iterator with list() for the length check and then passed the
    # exhausted iterator to correct().
    errors = list(ATD.checkDocument(line))
    if len(errors) == 0:
        print('**No errors** ({}/{})'.format(i + 1, len(lines)))
        continue
    else:
        print()
        correct_line = correct(line, errors)
        tree.pretty_print()
        print('Correct line: ' + correct_line)
        correct_tree = next(parser.raw_parse(correct_line))
        correct_tree.pretty_print()
        wrong_lines_count += 1
print('Number of wrong sentences: {}'.format(wrong_lines_count))
from nltk.draw.tree import TreeView
from PyPDF2 import PdfFileWriter, PdfFileReader, PdfFileMerger
import os

# Render every parse line in every file under `inputdir` to its own PDF
# page and accumulate them into a merged document per input file.
merger = PdfFileMerger()
inputdir = 'shiftreduce-parsed'
outputdir = 'visualized/shiftreduce'

for folder in os.listdir(inputdir):
    newpath = os.path.join(inputdir, folder)
    newoutputpath = os.path.join(outputdir, folder)
    os.makedirs(newoutputpath)
    for file in os.listdir(newpath):
        # Bug fix: the original leaked the file handle (open() without
        # close); a context manager guarantees it is released.
        with open(newpath + '/' + file) as content:
            id = 1
            for line in content.readlines():
                parse_tree = nltk.tree.Tree.fromstring(line)
                name = newoutputpath + '/sentence' + str(id) + '.pdf'
                TreeView(parse_tree)._cframe.print_to_file(name)
                merger.append(name)
                id += 1
        # NOTE(review): the merger is never reset, so each write contains
        # all pages accumulated so far -- kept from the original; confirm.
        merger.write(newoutputpath + '/' + file)
sentenceList = sentence if isinstance(sentence, str): sentenceList = sentence.split(' ') print('Original sentence: ' + ' '.join(sentenceList)) printParses(allParses(sentenceList)) def mainScript(): #Preprocessing Sentence 22 processSentence( 'In July the Environmental Protection Agency imposed a gradual ban on virtually all uses of asbestos' ) #Preprocessing Sentence 7 processSentence('There is no asbestos in our products now') #Preprocessing Sentence 13 processSentence('The top money funds are currently yielding well over 9 %') mainScript() #Drawing the CFG trees for the given sentences TreeView(sentence22)._cframe.print_to_file('s22.ps') sentence22_after_adding_rules_in_grammar = Tree.fromstring( '(S(PP-TMP (IN In) (NP (NNP July)))(, ,)(NP-SBJ (DT the) (NNP Environmental) (NNP Protection) (NNP Agency))(VP(VBD imposed)(NP(NP (DT a) (JJ gradual) (NN ban))(PP (IN on) (NP (ADJP (RB virtually) (DT all)) (NNS uses))))(PP-CLR (IN of) (NP (NN asbestos))))(. .))' ) TreeView(sentence22_after_adding_rules_in_grammar)._cframe.print_to_file( 's22b.ps') TreeView(sentence7)._cframe.print_to_file('s7.ps') TreeView(sentence13)._cframe.print_to_file('s13.ps')
from nltk.tree import Tree
from nltk.draw.tree import TreeView
from string_tree import *
import os

# Render the rule string (imported from string_tree) and convert it to PNG.
parsed = Tree.fromstring(ruleString)
TreeView(parsed)._cframe.print_to_file('output.ps')
os.system(
    'C:/"Program Files"/ImageMagick-7.0.7-Q16/magick.exe convert output.ps output.png'
)
del work_list_copy[i + 1:i + 3] work_list = work_list_copy tree_list = tree_list_copy else: i += 1 # look for o_bracket Formula connective Formula connective c_bracket i = 0 while i < len(work_list): if work_list[i] == 'o_bracket' and work_list[ i + 1] == 'Formula' and work_list[ i + 2] == 'connective' and work_list[ i + 3] == 'Formula' and work_list[i + 4] == 'c_bracket': aux_tree = Tree('Formula', tree_list[i:i + 5]) tree_list_copy[i] = aux_tree work_list_copy[i] = 'Formula' del tree_list_copy[i + 1:i + 5] del work_list_copy[i + 1:i + 5] work_list = work_list_copy tree_list = tree_list_copy else: i += 1 parseTree = Tree('Start', tree_list) TreeView(parseTree)._cframe.print_to_file('ParseTree.ps')