def main():
    parser = StanfordParser(path_to_jar=script_wrapper.stanford_parser_jar,
                            path_to_models_jar=script_wrapper.stanford_model_jar)
    st = StanfordNERTagger(
        model_filename='../lib/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz',
        path_to_jar="../lib/stanford-ner-2015-12-09/stanford-ner-3.6.0.jar")
    raw_sent = "Dempsey was drafted by Major League Soccer club New England Revolution."
    sent = word_tokenize(raw_sent)
    ne_tuple = st.tag(sent)  # need a write interface for tokenized sentences (http://nlp.stanford.edu/software/crf-faq.shtml#tokenized)
    print ne_tuple
    print parser.raw_parse(raw_sent).next()
    return

    # find named entities
    f = 0
    ne_list = []
    for (ne, label) in ne_tuple:
        if label == 'PERSON':
            f = 1
        if f and label != 'PERSON':
            break
        if f:
            ne_list.append(ne)
    # print ne_list

    init_file(main_tree)
    # open questions: 1. don't know how to get NPs; 2. is there a quicker way to find PERSON?

    # try head to ask who/what
    pattern = "S < NP=np"
    head = check_output(['bash',  # 'bash' must be the first argument
                         tregex_path, '-s', pattern, init_tree_file])
    print head


def get_main_verbs(tree):
    pattern = '/(VB.?)/=main >+ (VP) (S > ROOT)'
    main_verbs = check_output(['bash',  # 'bash' must be the first argument
                               tregex_path, '-s', pattern, init_tree_file])
    print main_verbs
    main_verbs = main_verbs.split('\n')[:-1]
    main_verbs = [Tree.fromstring(main_verb) for main_verb in main_verbs]
    return main_verbs
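# A possible answer to the open questions noted in main() above (a minimal
# sketch, not part of the original source): NPs can be collected from the
# parse tree with Tree.subtrees(), and PERSON tokens can be pulled straight
# from the tagger output without the flag variable.
def extract_nps_and_persons(parse_tree, ne_tuple):
    # every NP subtree, as a list of token strings
    nps = [subtree.leaves() for subtree in parse_tree.subtrees()
           if subtree.label() == 'NP']
    # all tokens the NER tagger labeled PERSON
    persons = [token for (token, label) in ne_tuple if label == 'PERSON']
    return nps, persons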
class Parser(object):
    """
    A natural language parser is a program that works out the grammatical
    structure of sentences, for instance, which groups of words go together
    (as "phrases") and which words are the subject or object of a verb.
    Probabilistic parsers use knowledge of language gained from hand-parsed
    sentences to try to produce the most likely analysis of new sentences.
    These statistical parsers still make some mistakes, but commonly work
    rather well. Their development was one of the biggest breakthroughs in
    natural language processing in the 1990s.
    """

    def __init__(self, model_path, path_to_jar, path_to_models_jar):
        # nltk package
        from nltk.parse.stanford import StanfordParser
        self.__model_path = model_path
        self.__path_to_jar = path_to_jar
        self.__path_to_model_jar = path_to_models_jar
        self.__stf_parser = StanfordParser(path_to_jar=path_to_jar,
                                           path_to_models_jar=path_to_models_jar,
                                           model_path=model_path,
                                           encoding='utf-8')

    def parse_sentence(self, text):
        """
        Arguments:
            text -- input text string to be parsed
        Returns:
            list of the parsed result in the form (parent_tag(tag, word))
        """
        self.__text = text
        return list(self.__stf_parser.raw_parse(text))

    def tree_print(self):
        """
        Arguments: None
        Returns: None
        """
        for line in self.__stf_parser.raw_parse(self.__text):
            for sentence in line:
                print(sentence)

    def tree_draw(self):
        """
        Arguments: None
        Returns: None
        """
        for line in self.__stf_parser.raw_parse(self.__text):
            for sentence in line:
                sentence.draw()
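# Example usage of the Parser wrapper above (a sketch; the jar and model
# paths are placeholders and must point at a local Stanford Parser install).
parser = Parser(
    model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
    path_to_jar='/path/to/stanford-parser.jar',
    path_to_models_jar='/path/to/stanford-parser-models.jar')
trees = parser.parse_sentence('The quick brown fox jumps over the lazy dog.')
parser.tree_print()  # prints the constituency tree for the last parsed text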
def sdfprocess(rvdata):
    parser = StanfordParser(
        path_to_jar='/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1.jar',
        path_to_models_jar='/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1-models.jar',
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
        java_options='-mx15000m')
    sdfdata = []
    cnn = 0
    widgets = ['Progress: ', Percentage(), ' ', Bar(marker=RotatingMarker()),
               ' ', ETA(), ' ', FileTransferSpeed()]
    pbar = ProgressBar(widgets=widgets, maxval=len(rvdata)).start()
    for eg in rvdata:
        # if cnn % 100 == 0: print "%f%% of document %d finished" % (cnn * 100.0 / len(rvdata), partidx + 1)
        cmt = eg[3].decode('utf-8')  # 3 is the index of the comment
        sentences = nltk.sent_tokenize(cmt)
        parsedls = []
        for snt in sentences:
            sntparsed = parser.raw_parse(snt)
            parsedls.append(sntparsed)
        sdfdata.append(eg[:3] + [parsedls])
        # print cnn
        # print sdfparsed
        # print sdfdata
        # if cnn > 5: break
        pbar.update(cnn + 1)
        cnn += 1
    pbar.finish()
    return sdfdata
def main():
    """Main function of script."""
    args = utils.read_arguments(__doc__)

    # Read dataset. Each row of x_matrix is a sentence.
    x_matrix, y_vector = utils.pickle_from_file(args['input_filename'])

    # Get Stanford model
    parser = StanfordParser(
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
        encoding='utf8')

    # Get parse trees.
    parsed_matrix = []
    for index, document in tqdm(enumerate(x_matrix), total=len(x_matrix)):
        parsed_document = []
        for paragraph_index, paragraph in enumerate(document):
            parsed_paragraph = []
            for sentence_index, sentence in enumerate(paragraph):
                try:
                    parsed_paragraph.append(list(
                        parser.raw_parse(six.text_type(sentence.decode('utf-8')))))
                except UnicodeDecodeError:
                    logging.warning('Skip sentence {}-{}-{} for unicode error'.format(
                        index, paragraph_index, sentence_index))
                    y_vector[index].pop(sentence_index)
            parsed_document.append(parsed_paragraph)
        parsed_matrix.append(parsed_document)

    # Save output
    logging.info('Saving {} documents'.format(len(parsed_matrix)))
    utils.pickle_to_file((parsed_matrix, y_vector), args['output_filename'])
    logging.info('All operations finished')
def cStructure():
    print '######## C Structure'
    parser = StanfordParser(path_to_jar=path_to_jar,
                            path_to_models_jar=path_to_models_jar)
    example = parser.raw_parse("Who were the CEO of IBM?")
    # example = parser.raw_parse("Steve Jobs was Founder of Apple. He was born in United States of America.")
    # for line in example:
    #     for sentence in line:
    #         sentence.draw()
    # print type(example)
    example = list(example)
    # print example
    abcabc = example[0]
    abcabc1 = abcabc[0]
    print type(abcabc)
    hello = str(abcabc)
    print type(abcabc)
    print hello
    # print abcabc1.label()
    for a in abcabc:
        # print a.height()
        if a.height() > 1:
            extractNP(a)
    print myNounPhrasesTree
def check(sent):
    parser = StanfordParser()

    # Parse the example sentence
    print(sent)
    t = list(parser.raw_parse(sent))[0]
    print(t)
    t = ParentedTree.convert(t)
    print(t)
    t.pretty_print()

    try:
        subj = find_subject(t)
    except:
        subj = []
    try:
        pred = find_predicate(t)
    except:
        pred = []
    try:
        obj = find_object(t)
    except:
        obj = []

    print(subj)
    print(pred)
    print(obj)
    return subj, pred, obj
def ConstituencyParser(sentence):
    from nltk.parse.stanford import StanfordParser

    # create parser object
    scp = StanfordParser(path_to_jar='/path/to/stanford-parser.jar',
                         path_to_models_jar='/path/to/stanford-parser-models.jar')

    # get parse tree
    result = list(scp.raw_parse(sentence))
    return result
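# Example call for ConstituencyParser above (a sketch; assumes the placeholder
# jar paths inside the function point at a real Stanford Parser install).
trees = ConstituencyParser('The quick brown fox jumps over the lazy dog')
trees[0].pretty_print()  # render the constituency tree as ASCII art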
def extract_h4_parser(self, sentence):
    phrases = []
    parser = StanfordParser(
        model_path="E:/Stanford parser/stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    trees = parser.raw_parse(str(sentence))
    for tree in trees:
        for subtree in tree.subtrees(lambda t: t.height() == 4):
            phrases.append(str(subtree))
    return phrases
def parseSentence(inputSentence):
    parser = StanfordParser(
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    parsedSentence = parser.raw_parse(inputSentence)
    sent = printSentence(parsedSentence)
    ret = str(sent).replace("\n", "").replace('    ', "") \
                   .replace("(", "{").replace(")", "}").replace(" {", "{")
    return ret
def cn_parse(sent):
    """
    Constituency-parse a Chinese sentence; remember that model_path must be
    changed to the Chinese model.
    """
    parser = StanfordParser(
        r'E:\standford_nlp\stanford-parser-full-2018-02-27\stanford-parser.jar',
        r'E:\standford_nlp\stanford-parser-full-2018-02-27\stanford-parser-3.9.1-models.jar',
        model_path='edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
    return list(parser.raw_parse(sent))[0]
def en_parse(sent):
    """
    Constituency-parse an English sentence.
    """
    parser = StanfordParser(
        r'E:\standford_nlp\stanford-parser-full-2018-02-27\stanford-parser.jar',
        r'E:\standford_nlp\stanford-parser-full-2018-02-27\stanford-parser-3.9.1-models.jar')
    return list(parser.raw_parse(sent))[0]
def parseThisSent(sentences):
    parser = StanfordParser(path_to_models_jar=my_path_to_models_jar3,
                            path_to_jar=my_path_to_jar3)
    c = list(parser.raw_parse(sentences))
    B = c[0].copy()

    # get all NPs and VPs
    phrasesList = []
    for s in B.subtrees(lambda t: t.label() == 'NP' or t.label() == 'VP'):
        phrasesList += [(" ".join(s.leaves()))]
    return phrasesList
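# Example usage of parseThisSent (a sketch; my_path_to_jar3 and
# my_path_to_models_jar3 must already point at the Stanford Parser jars).
phrases = parseThisSent("The dog chased the cat across the yard.")
print(phrases)  # noun phrases like 'The dog' and verb phrases like 'chased the cat ...'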
class SyntaxTreeParser:
    def __init__(self):
        self.parser = StanfordParser()
        if not self.parser:
            raise RuntimeError('Stanford Parser could not be initialized.')

    def raw_parse(self, sent):
        tree = next(self.parser.raw_parse(sent))
        return tree

    def parse(self, sent):
        one_sent = sent
        # tag the tokens first if the input is not already (word, tag) pairs
        if not isinstance(sent[0], tuple):
            one_sent = nltk.pos_tag(sent)
        tree = self.parser.tagged_parse(one_sent)
        return tree
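# Example usage of SyntaxTreeParser (a sketch; assumes CLASSPATH is set so the
# no-argument StanfordParser() constructor can find the jars).
stp = SyntaxTreeParser()
tree = stp.raw_parse('The quick brown fox jumps over the lazy dog.')
tagged_tree = stp.parse(['The', 'dog', 'barks'])  # plain tokens are POS-tagged first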
def parse(text, normalize=True):  # ToDo: change behavior
    """Parses a string, an iterable of strings, or nested iterables of strings."""
    # saves stanford_parser as a global variable,
    # so that it is not recreated every time parse is executed
    if 'stanford_parser' not in globals():
        global stanford_parser
        stanford_parser = StanfordParser(conf.stanford_parser, conf.stanford_models)
    if hasattr(text, '__iter__'):
        return [parse(t) for t in text]
    else:
        if normalize:
            text = canonicalize(text)
        trees = stanford_parser.raw_parse(text)
        return trees
class Parser(object):
    """Parse sentence structure."""

    def __init__(self, jar_path, model_path):
        self.parser = StanfordParser(jar_path, model_path)
        self.dep_parser = StanfordDependencyParser(jar_path, model_path)

    def __call__(self, doc):
        doc['parse'] = ParentedTree.convert(self.parse(doc['text']))
        doc['dep_parse'] = self.dep_parse(doc['text'])

    def parse(self, statement):
        return next(self.parser.raw_parse(statement))

    def dep_parse(self, statement):
        return next(self.dep_parser.raw_parse(statement))
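# Example usage of the callable Parser above (a sketch; the jar paths are
# placeholders for a local Stanford Parser install).
nlp = Parser('/path/to/stanford-parser.jar', '/path/to/stanford-parser-models.jar')
doc = {'text': 'The cat sat on the mat.'}
nlp(doc)  # adds 'parse' (constituency tree) and 'dep_parse' (dependency graph) keys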
def average_parse_tree_height(doc):
    remove_url = clean_formula(doc)
    parser = StanfordParser()
    sentence = remove_url.replace(';', '.').replace('?', '.').replace('!', '.').split('.')
    sentence = [item for item in sentence if item]
    sentence = filter(operator.methodcaller('strip'), sentence)
    depth = lambda L: isinstance(L, list) and (max(map(depth, L)) + 1) if L else 1
    total_level = 0
    total_count = 0
    for s in sentence:
        if len(s.split()) < 20:
            total_level += depth(list(parser.raw_parse(s)))
            total_count += 1
    if total_count > 0:
        average = total_level / total_count
    else:
        average = 0
    return average
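# Note: the depth lambda above works because nltk.Tree subclasses list. An
# equivalent, arguably clearer measurement (a sketch, not the original
# author's code) uses Tree.height() on the first parse directly:
def parse_tree_height(parser, sentence):
    tree = next(parser.raw_parse(sentence))  # first (best) parse
    return tree.height()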
class OldStanfordLibParser(Parser):
    """For StanfordParser < 3.6.0"""

    def __init__(self):
        self.parser = StanfordParser()

    def parse(self, line):
        """Returns tree objects from a sentence

        Args:
            line: Sentence to be parsed into a tree

        Returns:
            Tree object representing parsed sentence
            None if parse fails
        """
        tree = list(self.parser.raw_parse(line))[0]
        tree = tree[0]
        return tree
def analyzing_sentence_structure(sentence):
    scp = StanfordParser(path_to_jar=LoadCommonSense.STF_PATH + "stanford-parser.jar",
                         path_to_models_jar=LoadCommonSense.STF_PATH + "stanford-parser-4.2.0-models.jar")
    sentence = " ".join(clean_text(sentence, False))
    try:
        result = list(scp.raw_parse(sentence))
    except BaseException:
        return {"subject": "", "verb": "", "object": ""}, sentence, 1

    tree_result = result[0].subtrees()
    sentence_json = {"subject": "", "verb": "", "object": ""}
    temp_object = []
    flag = 0  # set once the subject NP has been captured
    for each in tree_result:
        tree_label = each.label()
        find_verb = re.findall(r"VB.*", tree_label)
        if tree_label == "NP" and flag != 1:
            flag += 1
            sentence_json["subject"] = each.leaves()
        elif find_verb:
            sentence_json["verb"] = each.leaves()
            break
        elif tree_label == 'ROOT':
            temp_object = each.leaves()

    if not sentence_json["subject"]:
        sentence_json["subject"] = ["i"]
        temp_object = sentence_json["subject"] + temp_object
        sentence_new = "i " + sentence
    else:
        sentence_new = sentence

    error = 0
    try:
        sentence_json["object"] = [
            word for word in temp_object
            if word not in (sentence_json["subject"] + sentence_json["verb"])
        ]
    except BaseException:
        error = 1
    return sentence_json, sentence_new, error
def get_parse_tree(df):
    print('\ntraversing phrase...')

    # path settings for the Stanford parser
    # Windows version
    # java_path = r'C:\Program Files\Java\jdk1.8.0_151\bin'
    # os.environ['JAVAHOME'] = java_path
    # stanford_parser = StanfordParser(path_to_jar='c:/stanford-parser-full/stanford-parser.jar',
    #                                  path_to_models_jar='c:/stanford-parser-full/stanford-parser-3.5.2-models.jar')

    # Linux version
    java_path = r'/usr/lib/jvm/java-1.8.0-openjdk-amd64/bin'
    os.environ['JAVAHOME'] = java_path
    stanford_parser = StanfordParser(
        path_to_jar='/home/akunaefi/PycharmProjects/StanfordParser/stanford-parser-full-2015-04-20/stanford-parser.jar',
        path_to_models_jar='/home/akunaefi/PycharmProjects/StanfordParser/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')

    for index, words in enumerate(df['lemmatized_review']):
        # list_of_frases = []
        print('--------------------- ', index)
        print('review: ', words)
        # sent = text_cleaner(words)
        # df_clause.loc[index, 'Review'] = sent
        parsed_sent = stanford_parser.raw_parse(words)
        tree = parsed_sent.__next__()

        list_of_nps = traverse_tree(tree, 'NP')
        num_of_np = len(list_of_nps)
        print('NOUN = ', num_of_np)

        list_of_vps = traverse_tree(tree, 'VP')
        num_of_vp = len(list_of_vps)
        print('VERB = ', num_of_vp)

        list_of_mds = traverse_tree(tree, 'MD')
        num_of_md = len(list_of_mds)
        print('MODAL = ', num_of_md)

        df.loc[index, 'num_np'] = num_of_np
        df.loc[index, 'num_vp'] = num_of_vp
        df.loc[index, 'num_md'] = num_of_md

    return df.copy()
def average_number_of_subordinate_clauses_per_sentence(doc):
    remove_url = clean_formula(doc)
    parser = StanfordParser()
    sentence = remove_url.replace(';', '.').replace('?', '.').replace('!', '.').split('.')
    sentence = [item for item in sentence if item]
    sentence = filter(operator.methodcaller('strip'), sentence)
    subtexts = []
    total_count = 0
    for s in sentence:
        if len(s.split()) < 20:
            t = list(parser.raw_parse(s))[0]
            total_count += 1
            for subtree in t.subtrees():
                if subtree.label() == "S" or subtree.label() == "SBAR":
                    subtexts.append(' '.join(subtree.leaves()))
    if total_count > 0:
        average = len(subtexts) / total_count
    else:
        average = 0
    return average
class Stanford:
    def __init__(self):
        """
        The Stanford Parser is required; download it from
        http://nlp.stanford.edu/software/lex-parser.shtml and unpack it somewhere.
        """
        # insert path to java home
        if os.name == "nt":
            os.environ['JAVAHOME'] = r'C:\Program Files\Java\jdk1.8.0_66\bin\java.exe'
            # insert path to the directory containing stanford-parser.jar and stanford-parser-3.5.2-models.jar
            self.english_parser = StanfordParser(
                'C:/Python34/Lib/site-packages/stanford-parser-full-2015-04-20/stanford-parser.jar',
                'C:/Python34/Lib/site-packages/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')
        elif os.name != "posix":
            os.environ['JAVAHOME'] = 'C:/Program Files (x86)/Java/jdk1.8.0_65/bin/java.exe'
            # insert path to the directory containing stanford-parser.jar and stanford-parser-3.5.2-models.jar
            self.english_parser = StanfordParser(
                'C:/Python34/Lib/site-packages/stanford-parser-full-2015-04-20/stanford-parser.jar',
                'C:/Python34/Lib/site-packages/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')
        else:
            os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-1.8.0-openjdk-amd64'
            # insert path to the directory containing stanford-parser.jar and stanford-parser-3.5.2-models.jar
            self.english_parser = StanfordParser(
                expanduser("~") + '/lib/stanford-parser-full-2015-04-20/stanford-parser.jar',
                expanduser("~") + '/lib/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')

    def get_sent_depth(self, s):
        # remove linebreaks for the syntax tree
        s = s.replace('\n', ' ').replace('\r', ' ')
        sentence = self.english_parser.raw_parse(s)
        current_tree = None
        depth = 0
        for line in sentence:
            current_tree = line
            depth = line.height() - 1
        # normalize the depth to [0, 1]: depths of 4 and below map to 0, 24 and above to 1
        sent_depth_feature_value = (depth - 4) / 20
        if sent_depth_feature_value < 0:
            return current_tree, 0
        if sent_depth_feature_value > 1:
            return current_tree, 1
        return current_tree, round(sent_depth_feature_value, 2)
def cStructure(user_input):
    # print '######## C Structure ########'
    parser = StanfordParser(path_to_jar=path_to_jar,
                            path_to_models_jar=path_to_models_jar)
    example = parser.raw_parse(user_input)
    example = list(example)
    # print example
    getTree = example[0]
    # print getTree
    getTreeTwo = getTree[0]
    # print getTreeTwo
    # print type(getTree)
    treeToString = str(getTree)
    # print type(treeToString)
    # print treeToString
    for element in getTree:
        # print element.height()
        if element.height() > 1:
            extractPhrases(element)
class Parser(object):
    def __init__(self, jar_path, model_path):
        self.parser = StanfordParser(jar_path, model_path)
        self.dep_parser = StanfordDependencyParser(jar_path, model_path)

    def __call__(self, doc):
        doc['parse'] = ParentedTree.convert(self.parse(doc['text']))
        doc['dep_parse'] = self.dep_parse(doc['text'])

    def parse(self, statement):
        # raw_parse uses StanfordParser to parse a sentence. It takes a
        # sentence as a string; before parsing, it will be automatically
        # tokenized and tagged by the Stanford Parser.
        return next(self.parser.raw_parse(statement))

    def dep_parse(self, statement):
        return next(self.dep_parser.raw_parse(statement))
class Parser:
    def __init__(self):
        self.stanford_parser = StanfordParser(model_path=MODEL_PATH)

    def fill_in_the_blank(self, text):
        parse_tree = list(self.stanford_parser.raw_parse(text))[0]
        ans_list = self.leaves(parse_tree)
        with_blanks = text
        for ans in ans_list:
            for word in ans:
                with_blanks = with_blanks.replace(word, "_" * len(word))
        print with_blanks
        return (with_blanks, ans_list)

    def leaves(self, tree):
        """Finds NP (noun phrase) leaf nodes of a chunk tree."""
        answers = []
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
            nnp_exists = list(subtree.subtrees(filter=lambda t: t.label() == 'NNP'))
            if nnp_exists:
                answers.append(subtree.leaves())
        return answers
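# Example usage of the fill-in-the-blank Parser above (a sketch; MODEL_PATH
# is assumed to name a valid lexparser model such as englishPCFG.ser.gz).
quiz = Parser()
blanked, answers = quiz.fill_in_the_blank("Barack Obama was born in Hawaii.")
# blanked: proper-noun NPs replaced by underscores; answers: the removed phrases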
def stanfordParser():
    # add environment variables
    os.environ['CLASSPATH'] = "stanford-parser/stanford-parser-full-2018-10-17"
    os.environ['JAVAHOME'] = "D:/Program Files/java/bin"

    # path of the Stanford Parser model
    parser = StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

    # input / output paths
    path_input = "ressources/TBAQ-txt-annot-StanfordParser/TimeBank-txt-annot/TimeBank/"
    path_output = 'ressources/TBAQ-txt-annot-StanfordParser/TimeBank-txt-annot/TimeBank_StanfordParser/'

    for filename in os.listdir(path_input):
        print(filename)
        file = codecs.open(path_input + filename, 'r', 'utf8').read()
        # replace the line breaks at the start of the file -- they prevented sentence tagging
        file = file.lstrip().replace('\r\n\r\n\r\n', ' ').replace('\r\n\r\n', ' ').replace('\r\n', ' ')
        # sentence tokenization
        sents = nltk.sent_tokenize(file)
        # empty string to accumulate the output
        parsedText = ""
        # run the Stanford parser on each sentence
        for sent in sents:
            constituancies = list(parser.raw_parse(sent))
            # print(constituancies)
            # regexes to reshape the output into the format expected by addDiscourse
            constituancies = re.sub(r'''(\[)(Tree)(\()('ROOT')(\,)( )(\[)(Tree)''', '(S1 (S ', str(constituancies))
            constituancies = re.sub(r'''(((\,)( )(\[)(Tree))|((\,)( )(Tree)))''', ' ', str(constituancies))
            constituancies = re.sub(r'''(\[)((')|("))|((')|("))(\])''', "'", str(constituancies))
            constituancies = re.sub(r'''(\])''', ')', str(constituancies))
            constituancies = re.sub(r'''(')''', '', str(constituancies))
            # append the tagged sentences to the output string
            parsedText += str(constituancies)
        # open the output file
        with open((path_output + filename), 'w', encoding='utf8') as fileW:
            # write the file
            fileW.write(parsedText)
        sentence = line.strip()
        # print sentence
    else:
        sentence = sentence + " " + line.strip()
    l = words[-1]
    # print 'awesome' in 'weather is awesome here dude'
    # print l
    edus.append(line.strip())
    # print line.strip()
    # print ".\""
    # print line.strip()[-2:]
    if line.strip()[-1] == "." or line.strip()[-2:] == ".\"":
        # print sentence
        # print edus
        rootWord = _parse_output(english_parser.raw_parse(sentence), edus, dep)
        # print 'end'
        sentence = None
        edus = deque()

# for sentence in sentences:
#     rootWord = _parse_output(english_parser.raw_parse(sentence))
#     dep.write(str(sentence).split())
#     dep.write("@#%^&*")
#     dep.write(str(rootWord))
#     dep.write("\n")
#     print i
#     i = i + 1

if sentence != None:
    rootWord = _parse_output(english_parser.raw_parse(sentence), edus, dep)
    # print 'end'
    sentence = None
class NLQueryEngine(LoggingInterface):
    """
    Grammar mapping for knowledge queries of the form:
    - What is the X of Y
    - What is X's Y
    """

    def __init__(self, properties={'lang': 'en'}):
        LoggingInterface.__init__(self)
        self.parser = StanfordParser(model_path=MODELS_PATHS[properties['lang']])
        self.wd = WikiData()
        self.wd.set_properties(properties)
        self.properties = properties

    def subject_query(self, qtype, subject, action, jj=None, prop=None,
                      prop2=None, prop3=None):
        """Transforms matched context into query parameters and performs query

        Args:
            qtype: Matched type of query (what, who, where, etc.)
            subject: Matched subject (Obama)
            action: Matched verb action (is, was, ran)
            jj (optional): Matched adverb
            prop (optional): Matched prop
            prop2 (optional): Matched prop
            prop3 (optional): Matched prop

        Returns:
            Answer: Answer from query, or empty Answer if None
        """
        if self.properties['lang'] == 'en':
            if jj == 'old':  # How old is Obama?
                prop = 'age'
            if jj in ['tall', 'high']:  # How tall is Yao Ming / Eiffel tower?
                prop = 'height'
        elif self.properties['lang'] == 'de':
            if jj == 'alt':  # Wie alt ist Obama?
                prop = 'age'
            if jj in ['hoch', 'groß']:  # Wie hoch ist die Zugspitze?
                prop = 'height'
            if prop in ['sprache', 'sprachen']:  # Welche Sprache spricht man in Sweden?
                prop = 'language official'

        if prop2:
            prop = prop + ' ' + prop2
        if prop3 and not prop:
            prop = prop3

        if not prop:
            if self.properties['lang'] == 'en' and action not in ['is', 'was']:
                prop = action
            elif self.properties['lang'] == 'de' and action not in [
                    'ist', 'sind', 'war', 'hat', 'wurde', 'bedeutet']:
                prop = action

        ans = self.get_property(qtype, subject, prop)
        if not ans:
            ans = Answer()

        ans.params = {
            'qtype': qtype,
            'subject': subject,
            'prop': prop,
        }
        return ans

    def get_prop_tuple(self, prop=None, value=None, op=None, value_units=None, pp_t=None):
        """Returns a property tuple (prop, value, op). E.g. (population, 1000000, >)

        Args:
            prop (str): Property to search for (e.g. population)
            value (str): Value property should equal (e.g. 10000000)
            op (str): Operator for value of property (e.g. >)

        Returns:
            tuple: Property tuple, e.g: (population, 10000000, >)
        """
        self.info('Prop tuple: {0},{1},{2},{3},{4}', prop, value, op, value_units, pp_t)
        if op in ['in', 'by', 'of', 'from']:
            oper = op
        elif op in ['over', 'above', 'more', 'greater']:
            oper = '>'
        elif op in ['under', 'below', 'less']:
            oper = '<'
        else:
            self.error('NO OP {0}', op)
            return None

        # Infer property to match value
        if prop is None:
            if value_units is not None:
                if value_units in ['people']:
                    prop = 'population'
        if not prop:
            return None

        props = [(prop, value, oper)]
        if pp_t:
            prop_tuple = match_rules(pp_t, RULES[properties['lang']]['prop_rules'],
                                     self.get_prop_tuple)
            if not prop_tuple:
                return None
            props += prop_tuple

        return props

    def find_entity_query(self, qtype, inst, prop_match_t=None, prop_match2_t=None):
        """Transforms matched context into query parameters and performs query
        for queries to find entities

        Args:
            qtype (str): Matched type of query (what, who, where, etc.)
            inst (str): Matched instance of entity to match (Obama)
            action (str): Matched verb action (is, was, ran)
            prop_match_t (Tree): Matched property Tree
            prop_match2_t (Tree): Matched property Tree

        Returns:
            Answer: Answer from query, or empty Answer if None
        """
        props = []
        if prop_match_t:
            prop = match_rules(prop_match_t,
                               RULES[self.properties['lang']]['prop_rules'],
                               self.get_prop_tuple)
            if not prop:
                return
            props += prop

        if prop_match2_t:
            prop = match_rules(prop_match2_t,
                               RULES[self.properties['lang']]['prop_rules'],
                               self.get_prop_tuple)
            if not prop:
                return
            props += prop

        if not inst.isupper():
            inst = singularize(inst)

        ans = self.wd.find_entity(qtype, inst, props)
        if not ans:
            ans = Answer()

        ans.params = {
            'qtype': qtype,
            'inst': inst,
            'props': props,
        }
        return ans

    def get_property(self, qtype, subject, prop):
        """Gets property of a subject

        Example: get_property('who', 'Obama', 'wife') = 'Michelle Obama'

        Args:
            subject: Subject to get property of
            prop: Property to get of subject

        Todo:
            * Add other APIs here

        Returns:
            Answer: Answer from query
        """
        return self.wd.get_property(qtype, subject, prop)

    def preprocess(self, sent):
        """Preprocesses a query by adding punctuation"""
        if sent[-1] != '?':
            sent = sent + '?'
        return sent

    def cleanup(self, sent):
        """Remove some stop words"""
        stopwords = ['der', 'die', 'das', 'ein', 'eine', 'einen']
        words = sent.split()
        result = [word for word in words if word.lower() not in stopwords]
        return ' '.join(result)

    def query(self, sent, format_='plain'):
        """Answers a query

        If format is plain, will return the answer as a string
        If format is raw, will return the raw context of query

        Args:
            sent: Query sentence
            format_: Format of answer to return (Default to plain)

        Returns:
            dict: Answer context
            str: Answer as a string

        Raises:
            ValueError: If format_ is incorrect
        """
        sent = self.preprocess(sent)
        sent = self.cleanup(sent)
        tree = next(self.parser.raw_parse(sent))
        pos = [tag for word, tag in tree.pos()]

        if self.properties['lang'] == 'de':
            if len(set(['PWS', 'PWAV', 'PWAT']) & set(pos)) == 0:
                print("Tree before:")
                for e in tree:
                    print(str(e))
                sent = "Was ist " + sent
                tree = next(self.parser.raw_parse(sent))
        # TODO
        # elif self.properties['lang'] == 'en':
        #     if len(set(['WHNP']) & set(pos)) == 0:
        #         print("Tree before:")
        #         for e in tree:
        #             print(str(e))
        #         sent = "What is " + sent
        #         tree = next(self.parser.raw_parse(sent))

        context = {'query': sent, 'tree': tree}
        for e in tree:
            print(str(e))

        ans = first([
            match_rules(tree, RULES[self.properties['lang']]['find_entity_rules'],
                        self.find_entity_query),
            match_rules(tree, RULES[self.properties['lang']]['subject_prop_rules'],
                        self.subject_query),
        ])
        print("-> " + str(ans))

        if not ans:
            ans = Answer()

        ans.query = sent
        ans.tree = str(tree)

        if format_ == 'raw':
            return ans.to_dict()
        elif format_ == 'plain':
            return ans.to_plain()
        else:
            raise ValueError('Undefined format: %s' % format_)
class SentenceParser:
    __parser = None
    __alpha = 1.0
    __beta = 1.0
    __gamma = 0.1
    __var_d = 0.0
    __var_s = 0.0

    def __init__(self):
        self.__parser = StanfordParser()
        self.__var_d = 12.0 / math.log(2.0)
        self.__var_s = 4.0 * 1.0 / math.log(2)

    def __parse_sent(self, sentence):
        result = self.__parser.raw_parse(sentence)
        return result.next()

    def __obtain_nps(self, sentence):
        parse_tree = self.__parse_sent(sentence)
        nps = set()
        for phrase in parse_tree.subtrees():
            if phrase.label() != "NP":
                continue
            nps.add(' '.join(phrase.leaves()))
        sent_tokens = " ".join(parse_tree.leaves())
        # Get the smallest NPs
        nps_smallest = set()
        for np1 in nps:
            if all(np2 not in np1 for np2 in nps if np2 != np1):
                nps_smallest.add(np1)
        return sent_tokens, nps_smallest

    def __gaussian_weight(self, distance, variance):
        return math.exp(-0.5 * (distance ** 2) / variance)

    def __weight_tokens(self, mid, nps, sentences, sent_id):
        st = PorterStemmer()
        sent_target = sentences[sent_id]
        token_id = [idx for idx, token in enumerate(sent_target.strip().split(" "))
                    if mid in token][0]
        sent_lengths = [len(s.split(" ")) for s in sentences]
        nps_base = {np: " ".join(st.stem(token) for token in np.split(" "))
                    for np in nps}
        nps_proc = {}
        for sent_idx, sent in enumerate(sentences):
            sent_stem = " ".join(st.stem(token) for token in sent.split(" "))
            for np_ori, np in nps_base.iteritems():
                if np_ori not in nps_proc:
                    nps_proc[np_ori] = {}
                if "dist_sent" not in nps_proc[np_ori] or abs(sent_idx - sent_id) < nps_proc[np_ori]["dist_sent"]:
                    # always update the info
                    if np not in sent_stem:
                        continue
                    np_idx = sent_stem.rindex(np)
                    np_token_idx = len(sent_target[:np_idx].strip().split(" "))
                    dist_start = len(sent_stem[:np_idx].strip().split(" "))
                    dist_end = len(sent_stem[np_idx + len(np):].strip().split(" "))
                    dist_sent = abs(sent_idx - sent_id)
                    dist_token = -1
                    if dist_sent == 0:
                        if mid in np_ori:
                            dist_token = 0
                        elif np_token_idx < token_id:
                            dist_token = token_id - np_token_idx - (len(np.split(" ")) - 1) - 1
                        elif np_token_idx > token_id:
                            dist_token = np_token_idx - token_id - 1
                    elif sent_idx < sent_id:
                        dist_token = dist_end + sum(sent_lengths[sent_idx + 1:sent_id]) + token_id
                    elif sent_idx > sent_id:
                        dist_token = (len(sent_target.strip().split(" ")) - 1 - token_id) \
                                     + sum(sent_lengths[sent_id + 1:sent_idx]) + dist_start
                    nps_proc[np_ori]["dist_sent"] = dist_sent
                    nps_proc[np_ori]["dist_token"] = dist_token
                np_count = sent_stem.count(np)
                nps_proc[np_ori]["tf"] = (nps_proc[np_ori].get("tf") or 0) + np_count
        nps_weight = {}
        for np, vals in nps_proc.iteritems():
            term1 = self.__alpha * self.__gaussian_weight(vals["dist_token"], self.__var_d)
            term2 = self.__beta * self.__gaussian_weight(vals["dist_sent"], self.__var_s)
            term3 = self.__gamma * vals["tf"]
            nps_weight[np] = (term1 + term2 + term3) / (self.__alpha + self.__beta + self.__gamma)
        return nps_weight

    def obtain_nps_from_sentences(self, mid, text):
        lst_sentences = sent_tokenize(text)
        lst_sent_pr = []
        set_nps = set()
        sent_match_id = -1
        for sent_idx, sent in enumerate(lst_sentences):
            if sent_match_id == -1 and mid in sent:
                sent_match_id = sent_idx
            sent_tokens, nps = self.__obtain_nps(sent)
            lst_sent_pr.append(sent_tokens)
            set_nps.update(nps)
        dct_nps_weight = self.__weight_tokens(mid, set_nps, lst_sent_pr, sent_match_id)
        return lst_sent_pr, dct_nps_weight
from nltk.parse.stanford import StanfordParser
from nltk.tag import StanfordNERTagger

import en
import utils

sentences = utils.get_tokenized_sentences("data/set1/a1.txt")
parser = StanfordParser()
print len(sentences)
print len([x for x in sentences if "is" in x])
[parser.raw_parse(x) for x in sentences]
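# The no-argument StanfordParser() constructor above only works when NLTK can
# locate the jars itself. A minimal setup sketch (the paths are assumptions
# for a local install, not part of the original snippet):
import os
os.environ['CLASSPATH'] = '/path/to/stanford-parser-full-2015-04-20'
os.environ['STANFORD_MODELS'] = '/path/to/stanford-parser-full-2015-04-20'
# NLTK then searches CLASSPATH / STANFORD_MODELS for stanford-parser.jar and
# the matching models jar when StanfordParser() is constructed.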
def simplify(sent):
    from anytree import NodeMixin, Node, AnyNode, RenderTree
    from nltk.parse.stanford import StanfordParser

    def make_tree(tree, t, sent_list):
        # this fn. converts an nltk tree to an anytree
        if tree not in sent_list:
            ttt = AnyNode(id=str(tree.label()), parent=t)
            for tt in tree:
                make_tree(tt, ttt, sent_list)
        else:
            AnyNode(id=str(tree), parent=t)

    parser = StanfordParser()

    # SBAR CASE
    def find_sbar(t):
        if t.id == 'SBAR':
            global sbar
            sbar = t
        for tt in t.children:
            find_sbar(tt)

    def find_vp_in_sbar(t):
        if t.id == 'VP':
            global vp_sbar
            vp_sbar.append(t)
        for tt in t.children:
            find_vp_in_sbar(tt)

    def find_np_in_sbar(t):
        global f
        global ff
        if t.id == 'VP':
            ff = False
        if (t.id == 'NP') and f == True and ff == True:
            global np_sbar
            np_sbar = t
            f = False
        for tt in t.children:
            find_np_in_sbar(tt)

    def find_vp(t):
        if t.id == 'SBAR':
            return
        global f
        if t.id == 'VP' and f == True:
            global vp
            vp = t
            f = False
        for tt in t.children:
            find_vp(tt)

    def find_np(t):
        if t.id == 'SBAR':
            return
        global f
        if t.id == 'NP' and f == True:
            global np
            np = t
            f = False
        for tt in t.children:
            find_np(tt)

    def find_vbz(t):
        if t.id == 'SBAR':
            return
        global f
        if t.id == 'VBZ' and f == True:
            global vbz
            vbz = t.children[0].id
            f = False
        for tt in t.children:
            find_vbz(tt)

    def make_sent(t):
        global simple_sentences
        if t.id in sent_list:
            simple_sentences[-1].append(t.id)
        for tt in t.children:
            make_sent(tt)

    # sent = sent8
    parse_trees = parser.raw_parse(sent)
    global sent_list
    sent_list = [s for s in sent.split()]
    tree = next(parse_trees)[0]
    # tree.draw()
    t = AnyNode(id='ROOT')
    make_tree(tree, t, sent_list)

    global sbar
    sbar = t
    global vp_sbar
    global f
    global ff
    global np_sbar
    global vp
    global np
    global vbz
    vp_sbar = []
    vp = t
    np = t
    vbz = 'bn2'
    np_sbar = t

    find_sbar(t)
    find_vp_in_sbar(sbar)
    f = True
    ff = True
    find_np_in_sbar(sbar)
    f = True
    find_vp(t)
    f = True
    find_np(t)
    f = True
    find_vbz(t)

    global simple_sentences
    simple_sentences = []
    simple_sentences.append([])
    make_sent(np)
    make_sent(vp)
    for i in range(len(vp_sbar)):
        simple_sentences.append([])
        if np_sbar == t:
            make_sent(np)
        else:
            make_sent(np_sbar)
        if vbz != 'bn2':
            simple_sentences[-1].append(vbz)
        make_sent(vp_sbar[i])
    # print(simple_sentences)

    simple = []
    for sentence in simple_sentences:
        string = ''
        for word in sentence:
            string += word + ' '
        string += '.'
        simple.append(string)

    def is_any_sbar(t):
        if t.id == 'SBAR':
            global f
            f = True
            return
        for tt in t.children:
            is_any_sbar(tt)

    f = False
    is_any_sbar(t)
    if f == False:
        simple = [sent]
    return simple
'''
Created on Mar 11, 2016

@author: zhongzhu
'''
import os

from nltk.parse.stanford import StanfordDependencyParser
from nltk.parse.stanford import StanfordParser
from nltk.tag import StanfordNERTagger
from nltk.tag.stanford import StanfordPOSTagger

st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
st.tag('What is the airspeed of an unladen swallow ?'.split())

st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
st.tag('Rami Eid is studying at Stony Brook University in NY'.split())

parser = StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
list(parser.raw_parse("the quick brown fox jumps over the lazy dog"))

dep_parser = StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
print [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")]
from nltk.tokenize import word_tokenize

import script_wrapper as stanford_parser

sentence = "Dempsey was drafted by Major League Soccer club New England Revolution."

st = StanfordNERTagger(
    model_filename='../lib/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz',
    path_to_jar="../lib/stanford-ner-2015-12-09/stanford-ner-3.6.0.jar")
tags = st.tag(word_tokenize(sentence))
print(tags)

# group consecutive tokens that share an NER label into entities
prev_tag_name = tags[0][1]
cur_entity = tags[0][0]
entities = {}
for i in range(1, len(tags)):
    cur_tag = tags[i]
    cur_token = cur_tag[0]
    cur_tag_name = cur_tag[1]
    if cur_tag_name == prev_tag_name:
        cur_entity = cur_entity + " " + cur_token
    else:
        if not prev_tag_name in entities:
            entities[prev_tag_name] = []
        entities[prev_tag_name].append(cur_entity)
        cur_entity = cur_token
    prev_tag_name = cur_tag_name
del entities['O']
print(entities)

parser = StanfordParser(path_to_jar=stanford_parser.stanford_parser_jar,
                        path_to_models_jar=stanford_parser.stanford_model_jar)
print(parser.raw_parse("Dempsey was drafted by Major League Soccer club New England Revolution.").next())
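# A more compact way to do the same grouping (a sketch, not the original
# author's code): itertools.groupby collapses consecutive tokens that share
# an NER label, including the final entity the manual loop drops.
from itertools import groupby

def group_entities(tags):
    entities = {}
    for label, group in groupby(tags, key=lambda pair: pair[1]):
        if label != 'O':
            entities.setdefault(label, []).append(' '.join(tok for tok, _ in group))
    return entities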
with open("example_article.txt") as f: tokenizer = PunktSentenceTokenizer() sentences = tokenizer.tokenize(f.read().decode('utf-8').replace("\n"," ")) parser=StanfordParser() print len(sentences) print len([ x for x in sentences if "is" in x]) sentences[0] = "I am going to watch a movie in the evening." sentences[0] = "I have always wondered how I have always been so good on the guitar." sentences[0] = "Our dinner has been eaten by the dog." sentences[0] = "Playing golf is my favorite pastime" sentences[0] = "He plays golf for a living" sentences[0] = sentences[0].rstrip('.') parseTree = list(parser.raw_parse((sentences[0]))) print sentences[0] # the parse tree for the entire sentence root = parseTree[0] print type(root) print root print root.pretty_print() print root.label() print ' '.join(root.leaves()) posTags = {} posTags['phrases'] = ['ADJP','ADVP','CONJP','FRAG','INTJ','LST','NAC','NP','NX','PP','PRN','PRT','QP','RRC','UCP','VP','WHADJP','WHAVP','WHNP','WHPP','X','WHADVP']
import os
import sys

from nltk.parse.stanford import StanfordParser

if __name__ == '__main__':
    if not os.environ.has_key('STANFORD_PARSE_CLASSPATH'):
        if not len(sys.argv) == 2:
            print 'no stanford parse folder identified'
            stanford_path = raw_input('please give stanford parse folder path : ')
    else:
        stanford_path = os.environ['STANFORD_PARSE_CLASSPATH']

    parser = StanfordParser(path_to_jar=stanford_path + '/stanford-parser.jar',
                            path_to_models_jar=stanford_path + '/stanford-parser-3.5.1-models.jar')

    # sentence = 'A man previously convicted of harassing Yahoo CEO Marissa Mayer has been arrested by Austin police on suspicion of sending her sexually graphic emails, according to police records released on Friday.'
    # sentence = 'Type 2 diabetes (T2D) and Alzheimer`` disease (AD) are two major health issues nowadays. T2D is an ever increasing epidemic, affecting millions of elderly people worldwide, with major repercussions in the patients daily life.'
    # sentence = 'MiR-145 is reported to be significantly down-regulated in ovarian cancer.'
    # sentence = 'In this report, we find out that up-regulation of miR-145 in OVCAR-3 and SKOV-3 cells inhibit cell proliferation and promote cell apoptosis.'
    sentence = 'promoted the proliferation of ovarian cancer cells'
    parse_result = list(parser.raw_parse(sentence))
    print parse_result
    print 'print out sentence structure'
    parse_result[0].draw()
# -*- coding: utf-8 -*-
"""
Created on Sat May 13 01:29:33 2017

@author: DIP
"""

from nltk.parse.stanford import StanfordParser

sentence = 'The quick brown fox jumps over the lazy dog'

# create parser object
scp = StanfordParser(path_to_jar='E:/stanford/stanford-parser-full-2015-04-20/stanford-parser.jar',
                     path_to_models_jar='E:/stanford/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')

# get parse tree
result = list(scp.raw_parse(sentence))
tree = result[0]

# print the constituency parse tree
print(tree)

# visualize constituency parse tree
tree.draw()
def parse(self, sentence):
    """Set the parse tree property for the given sentence."""
    parser = StanfordParser(model_path=self.esp_model_path,
                            path_to_models_jar=self.path_to_models_jar,
                            path_to_jar=self.path_to_jar,
                            encoding='utf8')
    self.parse_tree = parser.raw_parse(sentence)
    return self.parse_tree
# num_vocab = len(set([w.lower() for w in childStoryCorpus.words(fileid)]))
# print ((float(num_chars)/float(num_words)), float(num_words)/float(num_sents), float(num_words)/float(num_vocab), fileid)

for fileid in childStoryCorpus.fileids():
    print(fileid)
    file_path = os.path.join(childStoryCorpusDir, fileid)
    with open(file_path, 'r') as orgf:
        for line in orgf:
            for s in tokenize.sent_tokenize(line):
                print(s)
                # print(st.tag(tokenize.word_tokenize(s)))
                # print(st.tag(s.split()))
                print(list(parser.raw_parse(s)))
                # for line in parser.raw_parse(s):
                #     for sentence in line:
                #         sentence.draw()

# s = robotStoryCorpus.sents(fileid)
# for s in robotStoryCorpus.sents(fileid):
#     print (s)
#     sentences = parser.parse_sents(s)
#     for tree in sentences:
#         list(tree)
#     for line in sentences:
#         for sentence in line:
class Converter(Dialog):
    def __init__(self, conversion_path=CONVERSION_PATH):
        with open(conversion_path, 'r') as f:
            self.metrics = json.load(f)
        self.inflect = inflect.engine()
        self.stemmer = SnowballStemmer('english')
        self.parser = StanfordParser(model_path=MODELS_PATH)

    def parse(self, text):
        parsed = self.parser.raw_parse(text)
        return list(parsed)

    def interpret(self, sents, **kwargs):
        measures = []
        confidence = 0
        results = dict()
        root = sents[0]

        if "WRB" in [tag for word, tag in root.pos()]:
            confidence += .2

        for clause in breadth_first(root, maxdepth=8):
            if isinstance(clause, Tree):
                if clause.label() in ["S", "SQ", "WHNP"]:
                    for token, tag in clause.pos():
                        if tag in ["NN", "NNS"]:
                            measures.append(token)
                        elif tag in ["CD"]:
                            results["quantity"] = token

        measures = list(set([self.stemmer.stem(mnt) for mnt in measures]))
        if len(measures) == 2:
            confidence += .4
            results["src"] = measures[0]
            results["dst"] = measures[1]
            if results["src"] in self.metrics.keys():
                confidence += .2
                if results["dst"] in self.metrics[results["src"]]['Destination']:
                    confidence += .2

        return results, confidence, kwargs

    def convert(self, src, dst, quantity=1.0):
        src, dst = tuple(map(self.stemmer.stem, (src, dst)))
        if dst not in self.metrics:
            raise KeyError("cannot convert to '{}' units".format(src))
        if src not in self.metrics[dst]['Destination']:
            raise KeyError("cannot convert from {} to '{}'".format(src, dst))
        units = self.metrics.get(dst).get('Units')[
            self.metrics.get(dst).get('Destination').index(src)
        ]
        return units * float(quantity), src, dst

    def round(self, num):
        num = round(float(num), 4)
        if num.is_integer():
            return int(num)
        return num

    def pluralize(self, noun, num):
        return self.inflect.plural_noun(noun, num)

    def numericalize(self, amt):
        if amt > 100.0 and amt < 1e6:
            return humanize.intcomma(int(amt))
        if amt >= 1e6:
            return humanize.intword(int(amt))
        elif isinstance(amt, int) or amt.is_integer():
            return humanize.apnumber(int(amt))
        else:
            return humanize.fractional(amt)

    def respond(self, sents, confidence, **kwargs):
        if confidence < 0.5:
            return "Sorry, I don't know that one."
        try:
            quantity = sents.get('quantity', 1)
            amount, source, target = self.convert(**sents)
            amount = self.round(amount)
            quantity = self.round(quantity)
            source = self.pluralize(source, quantity)
            target = self.pluralize(target, amount)
            verb = self.inflect.plural_verb("is", amount)
            quantity = self.numericalize(quantity)
            amount = self.numericalize(amount)
            return "There {} {} {} in {} {}".format(
                verb, amount, target, quantity, source
            )
        except KeyError as e:
            return "I'm sorry I {}".format(str(e))
    if thresh != -1 and p < thresh:
        return
    for i in range(0, len(r)):
        decoder(r[i], r.label(), k + 1)


dep_parser = StanfordParser(path_to_jar="./stanford-parser.jar",
                            path_to_models_jar="./stanford-models.jar")
load_model()

import sys

filename = sys.argv[1]
text = list(open(filename).readlines())
text = [s.strip() for s in text]
for i in range(len(text)):
    s1 = clean_str(text[i])
    if s1 == "":
        continue
    print 201
    print text[i]
    a = list(dep_parser.raw_parse(s1))
    for _ in range(200):
        decoder(a[0], "root", 0)
    print
            wordsSplit[f] = wordsClean
            break

    # choose tagger and tag sentence
    sentClean = str(wordsSplit)
    sentTagged = st.tag(sentClean)

    # Feature 2: Completeness (capital word in initial position, punctuation mark final)
    if wordsSplit[0][0].isupper() and (sent1.endswith(".") or sent1.endswith("!") or sent1.endswith("?")):
        comp = 1
    class_arrays.append(comp)

    # Feature 5: Complexity (Stanford): how deeply embedded is the sentence?
    # parse sentence with the Stanford parser
    parse = list(parser.raw_parse(sentClean.decode("utf-8")))
    sentParse = str(parse).split(" ")
    for i in range(0, len(sentParse)):
        if "Tree('S'" in sentParse[i]:
            complexity = complexity + 1
    class_arrays.append(complexity)

    # Feature 6: position of target word - is it at the end?
    sentWOPunc = sentClean.translate(string.maketrans("", ""), string.punctuation)
    if sentWOPunc.endswith(target1):
        posT = 1
    class_arrays.append(posT)

    # Make sentence without punctuation and target word lower case
import os
import sys

import nltk
from nltk.parse.stanford import StanfordParser

f = open(sys.argv[1])
text = f.read()
text = text.decode('utf-8')
sents = nltk.sent_tokenize(text)
print sents

modelPath = 'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
parser = StanfordParser(model_path=modelPath)
for s in sents:
    print list(parser.raw_parse(s))
"""
@author: DIP
"""

sentence = 'The brown fox is quick and he is jumping over the lazy dog'

# set java path
import os
java_path = r'C:\Program Files\Java\jdk1.8.0_102\bin\java.exe'
os.environ['JAVAHOME'] = java_path

from nltk.parse.stanford import StanfordParser

scp = StanfordParser(path_to_jar='E:/stanford/stanford-parser-full-2015-04-20/stanford-parser.jar',
                     path_to_models_jar='E:/stanford/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')
result = list(scp.raw_parse(sentence))
print result[0]
result[0].draw()

import nltk
from nltk.grammar import Nonterminal
from nltk.corpus import treebank

training_set = treebank.parsed_sents()
print training_set[1]

# extract the productions for all annotated training sentences
treebank_productions = list(
    set(production
        for sent in training_set
        for production in sent.productions()))
with open("data/rt-polarity.neg.txt",mode="r") as f: neg_sent.append(f.readline()) neg_sent.append(f.readline()) neg_sent.append(f.readline()) with open("data/rt-polarity.pos.txt",mode="r") as f: pos_sent.append(f.readline()) pos_sent.append(f.readline()) pos_sent.append(f.readline()) trees = [] labels = [0]*3 + [1]*3 sents = pos_sent + neg_sent for sent in sents: a = list(parser.raw_parse(sent)) hytree = a[0] chomsky_normal_form(hytree) trees.append(hytree[0]) rnn = RecursiveNeuralNetworl(embsize=300,mnb_size=6,wordvector=wordvector) trees[0].pretty_print() for tree,label in zip(trees,labels): root_node, softmax_layer, cost, pred = rnn.forward(tree,label) print("correct {0}, predict {1}, cost {2}".format(label,pred,cost))
st_ner = StanfordNERTagger(model_filename=stanford_ner_model,
                           path_to_jar=stanford_ner_jar)
# print st_ner.tag('Rami Eid is studying at Stony Brook University in New York'.split())
print st_ner.tag("Gandalf deduces Sauron will attack Gondor 's capital Minas Tirith , riding there with Pippin?".split())

print "========= Checking PARSER ========="
stanford_parser = 'stanford/stanford-parser-full-2015-04-20/'
eng_model_path = stanford_parser + "englishPCFG.caseless.ser.gz"
stanford_parser_model = stanford_parser + 'stanford-parser-3.5.2-models.jar'
stanford_parser_jar = stanford_parser + 'stanford-parser.jar'

st_parser = StanfordParser(model_path=eng_model_path,
                           path_to_models_jar=stanford_parser_model,
                           path_to_jar=stanford_parser_jar)

parser_result = (st_parser.raw_parse('Rami Eid is studying at Stony Brook University in Los Angeles'))
for S in parser_result:
    if S[0][0].label() == 'NP' and S[0][1].label() == 'VP':
        subject_words = S[0][0].leaves()
        print subject_words
        print st_ner.tag(subject_words)

'''
if type(node) is nltk.Tree:
    if node.label() == ROOT:
        print "======== Sentence ========="
        print "Sentence:", " ".join(node.leaves())
    else:
        print "Label:", node.label()
        print "Leaves:", node.leaves()
from nltk.tokenize.stanford import StanfordTokenizer
'''
from practnlptools.tools import Annotator

# Testing the abilities of nltk
testString = "I made a poop in my pants"

# part of speech tagging
print(nltk.pos_tag(nltk.word_tokenize(testString)))
print(nltk.__version__)

annotator = Annotator()
notes = annotator.getAnnotations("There are people dying make this world a better place for you and for me.")
print notes['syntax_tree']
'''

import os

from nltk.parse.stanford import StanfordParser

parserPath = r"C:\Users\Roger Liu\Desktop\NLP-Final-Project\Libraries\stanford-parser-full-2015-04-20"
os.environ['JAVAHOME'] = 'C:/Program Files/Java/jdk1.8.0_40/bin'  # or your java path
os.environ['CLASSPATH'] = parserPath + '/stanford-parser.jar'
# I am using version 3.5.2 because apparently it is the more stable version;
# replace 3.5.2 with whatever version you're using
os.environ['STANFORD_MODELS'] = parserPath + '/stanford-parser-3.5.2-models.jar'

sentence = "Stanford parser is slow"
parser = StanfordParser()
print list(parser.raw_parse(sentence))
print "what"