def syntax_tree_parser(self):
    """
    Get the syntax tree.
    :return: syntax tree
    """
    if self.syntax_tree is not None:
        return self.syntax_tree
    parser = CoreNLPParser(url='http://localhost:8999')
    self.syntax_tree = list(parser.parse(nltk.word_tokenize(self.text)))[0]
    return self.syntax_tree
def nltk_stuff(data):
    parser = CoreNLPParser(url='http://localhost:9000')
    num_nodes = 0
    num_s = 0
    height = 0
    # random.sample needs a sequence, so materialize the dict items first
    subset = dict(random.sample(list(data.items()), 5000))
    get_lexicon_stats(subset)
    print("\n")
    for _, value in subset.items():
        parsed = list(parser.parse(value.split()))
        height += parsed[0].height()
        curr_num_nodes, curr_num_s = count_nodes(parsed[0][0], 0, 0)
        num_nodes += curr_num_nodes
        num_s += curr_num_s
    print("Num clauses: {}".format(num_s))
    print("Num nodes: {}".format(num_nodes))
    print("Total height: {}".format(height))
    print("Avg nodes per tweet: {}".format(num_nodes / len(subset)))
    print("Avg height of tweet: {}".format(height / len(subset)))
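# count_nodes and get_lexicon_stats are helpers that are not defined in this
# snippet. A minimal sketch of what count_nodes might look like, given how it is
# called above (it returns a running node count and a count of 'S' clause nodes):
import nltk


def count_nodes(tree, num_nodes, num_s):
    # Leaves are plain strings rather than Tree objects; they add to neither count.
    if not isinstance(tree, nltk.Tree):
        return num_nodes, num_s
    num_nodes += 1
    if tree.label() == 'S':
        num_s += 1
    for child in tree:
        num_nodes, num_s = count_nodes(child, num_nodes, num_s)
    return num_nodes, num_s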
    if len(children) != len(temp.leaves()):
        for j in children:
            if j != ' '.join(tree.label().split()[1:]) and (j not in [i[1] for i in deplist]):
                deplist.append([' '.join(tree.label().split()[1:]), j])
    return deplist


def sorting(elem):
    return elem[1].split()[2]


parser = CoreNLPParser(url='http://localhost:9000')

# Read the sentence and form the parse tree
parser_output_splitted = parser.parse(sentence.split())
for j in parser_output_splitted:
    temp = j

# Part 1
parser_output = traverse_tree(temp)
print(parser_output)

# Part 2
precedence_list = [
    'VP', 'VB', 'VBG', 'VBZ', r'V*', 'NP', 'NN', r'N*', 'ADVP', 'JJ', 'P',
    'RB', 'PRP$', 'PRP'
]
deptree = dep_tree(parser_output)
print(deptree)
def read_data():
    tokenizer = RegexpTokenizer(r'\w+')
    tokenized_sentences = []
    sentences = []
    for _, _, files in os.walk("../../data/parsing_corpus"):
        for filename in files:
            with open("../../data/parsing_corpus/" + filename, "r") as f:
                contents = f.read().split("\n")
            for line in contents:
                tokenized = tokenizer.tokenize(line)
                if len(tokenized) <= 50:
                    tokenized_sentences.append(tokenized)
                    sentences.append(line)
    return tokenized_sentences, sentences


tokenized_sentences, sentences = read_data()
parser = CoreNLPParser(url='http://localhost:9000')

# Keep trees aligned with their source sentences: skip empty token lists in both.
parsed_sentences = []
tree_sentences = []
for tokens, sentence in zip(tokenized_sentences, sentences):
    if tokens:
        parsed_sentences.append(sentence)
        tree_sentences.append(list(parser.parse(tokens)))

with open('./CFG trees.txt', 'w') as filehandle:
    for sentence, trees in zip(parsed_sentences, tree_sentences):
        filehandle.write(sentence)
        filehandle.write("\n")
        filehandle.write(str(trees))
        filehandle.write("\n\n")
if f"{token['text']}-{token['tag']}" not in all_words: all_words.add(f"{token['text']}-{token['tag']}") # If not then check all the synsets pos = None synsets = wn.synsets(token["text"], pos=pos) for syn in synsets: for lem in syn.lemmas(): # Print lemmas for all the synonyms token_synonyms.append(lem.name()) for hyp in syn.hypernyms(): # Print all hypernyms token_hypernyms.append(hyp.name()) lookup_data.append((token['text'], token['tag'], token_synonyms, token_hypernyms, synsets)) from nltk.parse import CoreNLPParser import io import time # # You have to start the server first using # $ bash ./start_server.sh" command # parser = CoreNLPParser(url='http://localhost:9069') with open('wordnet_trees.txt', mode='a') as results_file: # Go through all pairs in the corpus for i, pair in enumerate(corpus): input_text = pair.text tokens_text = [word for word in nltk.word_tokenize(input_text) if word.isalnum()] for tree in parser.parse(tokens_text): tree.pretty_print(stream=results_file)
def reorder_syntactic_tokenized_sentence_regex(logger=None,
                                               lst_sentence=None,
                                               use_stanford_parser=True,
                                               verbose=True):

    #### WARNING!!! str_structure must be wrapped between start_nest and end_nest
    #### so that the tree gets a root point (as a nested string)
    def parse_nested_structure_with_regex(str_structure,
                                          logger=None,
                                          verbose=True,
                                          start_nest='[',
                                          end_nest=']'):
        if logger is not None:
            logger.info(' ******** input nested structure : {}'.format(str_structure))

        lst_structure = list(str_structure)

        #### ######## STEP 1: detect positions of nest start-symbols and end-symbols
        #### ...we get the positions of the start and end nest symbols as boolean lists...
        lst_bool_start_nest_pos = [
            lst_structure[i_chr] == start_nest for i_chr in range(len(lst_structure))
        ]
        lst_bool_end_nest_pos = [
            lst_structure[i_chr] == end_nest for i_chr in range(len(lst_structure))
        ]

        df_nested_structure = pd.DataFrame({
            'symbol_start': [],
            'symbol_end': [],
            'layer_x': [],
            'layer_depth': [],
            'father_layer_x': [],
            'father_layer_depth': [],
            'pos_char_start': [],
            'pos_char_end': [],
            'string_fragment': []
        })

        #### ######## STEP 2: detect nested substructures
        while True:
            #### ...initial positions of the root tree...
            current_layer_x = 0
            current_layer_depth = -1
            #### ...initial positions in the lists of detected parentheses...
            current_start_pos = 0
            current_end_pos = 0
            #### ...run over the start parentheses still unmatched...
            for i_start in range(len(lst_bool_start_nest_pos)):
                if lst_bool_start_nest_pos[i_start]:
                    current_start_pos = i_start
                    current_layer_depth = current_layer_depth + 1
                    for i_end in range(i_start + 1, len(lst_bool_start_nest_pos)):
                        if lst_bool_start_nest_pos[i_end]:
                            break
                        if lst_bool_end_nest_pos[i_end]:
                            current_end_pos = i_end
                            break
                    if current_end_pos > 0:
                        if logger is not None and verbose:
                            logger.info('**** nested parenthesis found!! ****')
                            logger.info(
                                'start pos: {0} - end pos: {1} - expression: {2}'.format(
                                    current_start_pos, current_end_pos,
                                    ''.join(lst_structure[current_start_pos:current_end_pos + 1])))
                            logger.info('layer_x: {0} - layer_depth: {1}'.format(
                                current_layer_x, current_layer_depth))
                        #### ...define current_layer_x...
                        if len(df_nested_structure[df_nested_structure['layer_depth'] ==
                                                   current_layer_depth]) > 0:
                            df_nested_x = df_nested_structure[
                                df_nested_structure['layer_depth'] == current_layer_depth]
                            current_layer_x = df_nested_x['layer_x'].max() + 1
                        #### ...append the substructure found to the output dataframe...
                        lst_row_to_append = [
                            start_nest, end_nest, current_layer_x, current_layer_depth,
                            0, 0, current_start_pos, current_end_pos,
                            ''.join(lst_structure[current_start_pos:current_end_pos + 1])
                        ]
                        row_to_append = pd.Series(lst_row_to_append,
                                                  index=df_nested_structure.columns)
                        df_nested_structure = df_nested_structure.append(
                            row_to_append, ignore_index=True)
                        #### ...mark the matched parentheses as consumed...
                        lst_bool_start_nest_pos[current_start_pos] = False
                        lst_bool_end_nest_pos[current_end_pos] = False
                        break
            #### ...once every parenthesis has been matched, finish the while loop...
            if True not in lst_bool_start_nest_pos or True not in lst_bool_end_nest_pos:
                if True not in lst_bool_start_nest_pos and True not in lst_bool_end_nest_pos:
                    break
                else:
                    if logger is not None:
                        logger.error(' - troubles with nested string: {0}'.format(str_structure))
                    raise Exception

        #### ######## STEP 3: set the father node/tree (setting its layer_x and layer_depth indexes)
        #### ...we compute the substring lengths; with these we can tell which structure contains which...
        df_nested_structure['substring_length'] = (
            df_nested_structure['pos_char_end'] - df_nested_structure['pos_char_start'])
        #### ...for each nested structure found, we get the shortest structure containing it...
        for i_nest, row in df_nested_structure.iterrows():
            pos_start = df_nested_structure.iloc[i_nest]['pos_char_start']
            pos_end = df_nested_structure.iloc[i_nest]['pos_char_end']
            df_nest_aux = df_nested_structure[
                (df_nested_structure['pos_char_start'] < pos_start)
                & (df_nested_structure['pos_char_end'] > pos_end)]
            df_nest_aux = df_nest_aux[df_nest_aux['substring_length'] ==
                                      df_nest_aux['substring_length'].min()]
            if len(df_nest_aux) > 0:
                df_nested_structure.loc[i_nest, 'father_layer_x'] = df_nest_aux.iloc[0]['layer_x']
                df_nested_structure.loc[i_nest, 'father_layer_depth'] = df_nest_aux.iloc[0]['layer_depth']

        df_nested_structure = df_nested_structure.drop('substring_length', axis=1)
        return df_nested_structure

    try:
        #### ...to track the words we have already used...
        lst_sentence_ctrl = list(lst_sentence)
        #### ...to store the output...
        sentence_reordered = list(lst_sentence)

        parser = None
        if use_stanford_parser:
            parser = CoreNLPParser(url='http://localhost:9000')
        else:
            if logger is not None:
                logger.warning('the Stanford CoreNLP server must be up and running')

        #### ...we get the syntactic root tree(s)...
        lst_root_trees = []
        for tree in parser.parse(sentence_reordered):
            lst_root_trees.append(tree)
        #### ...if we get a list of trees, we pick the first one...
        root_tree = lst_root_trees[0]
        print(root_tree)

        #### ...we apply the parsing again...
        p = list(parser.parse(sentence_reordered))
        #### ...just in case... as a string so it can be handled with regex...
        str_tree = str(p)
        df_tree_structure = parse_nested_structure_with_regex(str_tree,
                                                              logger=logger,
                                                              verbose=True)
        print(df_tree_structure)

        if logger is not None:
            logger.info('reordered sentence: {}'.format(sentence_reordered))
        #### ...warn if, for some reason, len(input sentence) != len(output sentence)
        if logger is not None and len(lst_sentence) != len(sentence_reordered):
            logger.warning('input and output sentences have different lengths')
    except Exception as e:
        if logger is not None:
            logger.exception('ERROR reordering phrase: {}'.format(lst_sentence))
        raise e

    return sentence_reordered
from nltk.parse import CoreNLPParser
from nltk.corpus import treebank
from printModule import *

sentence1 = "Patient with HGM value greater than 55 g/L"
sentence = "When did princess Diana die?"

# Lexical Parser
parser = CoreNLPParser(url='http://localhost:9000')

# Parse tokenized text.
print("\nParse tokenized text")
# Reference example (output shown in the comment below the call):
# print(list(parser.parse('What is the airspeed of an unladen swallow ?'.split())))
print(list(parser.parse(sentence.split())))
# Output for the 'unladen swallow' example:
# [Tree('ROOT', [Tree('SBARQ', [Tree('WHNP', [Tree('WP', ['What'])]), Tree('SQ', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['airspeed'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('DT', ['an']), Tree('JJ', ['unladen'])])]), Tree('S', [Tree('VP', [Tree('VB', ['swallow'])])])])]), Tree('.', ['?'])])])]

print("\nRaw string")
# Parse raw string.
print(list(parser.raw_parse(sentence)))
# Output for the 'unladen swallow' example:
# [Tree('ROOT', [Tree('SBARQ', [Tree('WHNP', [Tree('WP', ['What'])]), Tree('SQ', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['airspeed'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('DT', ['an']), Tree('JJ', ['unladen'])])]), Tree('S', [Tree('VP', [Tree('VB', ['swallow'])])])])]), Tree('.', ['?'])])])]

# Neural Dependency Parser
print("\nNeural Dependency Parser")
from nltk.parse.corenlp import CoreNLPDependencyParser

dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
parses = dep_parser.parse(sentence.split())
# [[(governor, dep, dependent) for governor, dep, dependent in parse.triples()] for parse in parses]
# Output for the 'unladen swallow' example:
# [[(('What', 'WP'), 'cop', ('is', 'VBZ')), (('What', 'WP'), 'nsubj', ('airspeed', 'NN')), (('airspeed', 'NN'), 'det', ('the', 'DT')), (('airspeed', 'NN'), 'nmod', ('swallow', 'VB')), (('swallow', 'VB'), 'case', ('of', 'IN')), (('swallow', 'VB'), 'det', ('an', 'DT')), (('swallow', 'VB'), 'amod', ('unladen', 'JJ')), (('What', 'WP'), 'punct', ('?', '.'))]]

# Tokenizer
parser = CoreNLPParser(url='http://localhost:9000')
from nltk.parse import CoreNLPParser
from nltk.tree import *

parser = CoreNLPParser(url='http://localhost:9000')
ls = list(parser.parse("Alfonso XIII of Spain's birth place is Madrid.".split()))
ls = ParentedTree.fromstring(str(ls[0]))


def getSubject(t):
    for s in t.subtrees(lambda t: t.label() == 'NP'):
        for n in s.subtrees(lambda n: n.label().startswith('NN')):
            return (n[0], getAttributes(n))


def getPredicate(t):
    v = None
    for s in t.subtrees(lambda t: t.label() == 'VP'):
        for n in s.subtrees(lambda n: n.label().startswith('VB')):
            v = n
    return (v[0], getAttributes(v))


def getObject(t):
    for s in t.subtrees(lambda t: t.label() == 'VP'):
        for n in s.subtrees(lambda n: n.label() in ['NP', 'PP', 'ADJP']):
            if n.label() in ['NP', 'PP']:
                for c in n.subtrees(lambda c: c.label().startswith('NN')):
                    return (c[0], getAttributes(c))
            else:
                # assumed completion: for ADJP objects, fall back to the adjective
                for c in n.subtrees(lambda c: c.label().startswith('JJ')):
                    return (c[0], getAttributes(c))
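# Hypothetical usage of the extractors above. getAttributes is referenced but not
# defined in this snippet, so this assumes it is provided elsewhere.
print(getSubject(ls))    # e.g. a (noun, attributes) pair for the subject NP
print(getPredicate(ls))
print(getObject(ls))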
from nltk.parse import CoreNLPParser
from nltk.tree import Tree

parser = CoreNLPParser(url='http://localhost:9000')

f = open("../Fragments_for_testing/text2", "r")
sentence = f.read()

# Flatten the parse into a plain string and strip the list/Tree syntax so it can
# be re-read as an s-expression.
list_tree = str(list(parser.parse(sentence.split())))
list_tree = list_tree.replace('Tree', '')
list_tree = list_tree.replace('\'', '')
list_tree = list_tree.replace(',', '')
list_tree = list_tree.replace('[', '')
list_tree = list_tree.replace(']', '')
list_tree = list_tree.replace('(. .)', '')
list_tree = list_tree.replace('(. !)', '')
list_tree = list_tree.replace('ROOT', 'S1')

tree = Tree.fromstring(list_tree)
tree.draw()
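# A sketch of an alternative (not from the original snippet) that avoids the
# string-replace round trip: parser.parse() already yields nltk.Tree objects, so
# the root label can be renamed directly. Punctuation subtrees are kept here,
# unlike the replace-based version above. Assumes the same input file and a
# CoreNLP server on localhost:9000.
from nltk.parse import CoreNLPParser

parser = CoreNLPParser(url='http://localhost:9000')
with open("../Fragments_for_testing/text2", "r") as f:
    text = f.read()

for tree in parser.parse(text.split()):
    tree.set_label('S1')  # rename ROOT, mirroring the replace('ROOT', 'S1') above
    tree.draw()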
from nltk import word_tokenize, sent_tokenize, bigrams, trigrams
from nltk.tag import StanfordPOSTagger
from nltk.parse import CoreNLPParser
from docx import Document

# cd Documents\Repos\investigacion\source\

# Initial configuration of the Stanford POS tagger
# (check that the paths are correct, otherwise it will not work)
tagger = r'C:\Users\lau_9\Documents\Repos\investigacion\stanford\stanford-postagger\models\spanish.tagger'
parser = r'C:\Users\lau_9\Documents\Repos\investigacion\stanford\stanford-corenlp\spanishPCFG.ser.gz'
jar1 = r'C:\Users\lau_9\Documents\Repos\investigacion\stanford\stanford-postagger\stanford-postagger.jar'
jar2 = r'C:\Users\lau_9\Documents\Repos\investigacion\stanford\stanford-corenlp\stanford-corenlp-3.9.1.jar'

etiquetador = StanfordPOSTagger(tagger, jar1)
parseador = CoreNLPParser(url='http://localhost:9000')

print(list(parseador.parse(
    'El viejo hombre se sentó solo sobre la montaña a observar el horizonte'.split())))
import nltk
from nltk.parse import CoreNLPParser

# Create the CoreNLPParser object
parser = CoreNLPParser(url='http://localhost:9000')

# Read sentences from the input file
with open('../data/input.txt', 'r') as infile:
    sentences = [line.strip().split() for line in infile.readlines()]

# Create the output file and write one parse tree per sentence
with open('../output.txt', 'w') as out:
    for sent in sentences:
        parsetree = list(parser.parse(sent))
        out.write(" ".join(sent) + "\n\n")
        out.write(str(parsetree[0]))
        out.write(
            "\n\n=================================================================================\n\n"
        )
def reorder_syntactic_tokenized_sentence(logger=None,
                                         lst_sentence=None,
                                         use_stanford_parser=True,
                                         verbose=True):
    try:
        #### ...to track the words we have already used...
        lst_sentence_ctrl = list(lst_sentence)
        #### ...to store the output...
        sentence_reordered = list(lst_sentence)

        parser = None
        if use_stanford_parser:
            parser = CoreNLPParser(url='http://localhost:9000')
        else:
            if logger is not None:
                logger.warning('the Stanford CoreNLP server must be up and running')

        #### ...we apply the parsing...
        # p = list(parser.parse(sentence_reordered))
        #### ...just in case... as a string to handle with regex...
        # str_tree = str(p)
        # print(str_tree)

        #### ...we get the first syntactic root tree (just in case we have several trees)...
        lst_root_trees = []
        for tree in parser.parse(sentence_reordered):
            lst_root_trees.append(tree)
        #### ...if we get a list of trees, we pick the first one...
        root_tree = lst_root_trees[0]

        #### ...we collect all subtrees in a list...
        lst_subtrees = []
        for subtree in root_tree.subtrees():
            lst_subtrees.append(subtree)
        #### ...select leaves from bottom to top (approximately)...
        #### ...subtrees are traversed and loaded from top to bottom, so we reverse the list...
        lst_subtrees.reverse()
        sentence_reordered.clear()

        #### ...in the while loop we take the leaves of each subtree and remove them from the control list;
        #### ...the loop stops when all subtrees have been visited or every word of the original phrase is used
        if logger is not None:
            logger.info('syntactic reorder of input phrase: {}'.format(lst_sentence))
        i = 0
        while i < len(lst_subtrees):
            subtree = lst_subtrees[i]
            leaves = subtree.leaves()
            label = subtree.label()
            for leave in leaves:
                if leave in lst_sentence_ctrl:
                    sentence_reordered = sentence_reordered + [leave]
                    #### ...delete the used word from lst_sentence_ctrl
                    lst_sentence_ctrl.remove(leave)
            if logger is not None and verbose:
                logger.info('subtree {} from {}'.format(i + 1, len(lst_subtrees)))
                logger.info(subtree)
                logger.info('tree label: {}'.format(label))
                logger.info('input phrase: {}'.format(lst_sentence))
                logger.info('tree leaves: {}'.format(leaves))
                logger.info('reordered words: {}'.format(sentence_reordered))
                logger.info('remaining words to order: {}'.format(lst_sentence_ctrl))
            i = i + 1
            #### ...end the loop if lst_sentence_ctrl is already empty (all words used)
            if len(lst_sentence_ctrl) == 0:
                break

        if logger is not None:
            logger.info('reordered sentence: {}'.format(sentence_reordered))
        #### ...warn if, for some reason, len(input sentence) != len(output sentence)
        if logger is not None and len(lst_sentence) != len(sentence_reordered):
            logger.warning('input and output sentences have different lengths')
    except Exception as e:
        if logger is not None:
            logger.exception('ERROR reordering phrase: {}'.format(lst_sentence))
        raise e

    return sentence_reordered
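# A hypothetical call of the function above (the sentence is illustrative only);
# it assumes a CoreNLP server is listening on localhost:9000.
reordered = reorder_syntactic_tokenized_sentence(
    logger=None,
    lst_sentence=['the', 'old', 'man', 'watched', 'the', 'horizon'],
    use_stanford_parser=True,
    verbose=False)
print(reordered)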
            o_tree = o_tree[0]
            res = res + o_tree.label() + '(' + ' '.join(subtree[i].leaves()) + ')'
        else:
            res = res + subtree[i].label() + '(' + ' '.join(subtree[i].leaves()) + ')'
    return res
    # if type(subtree) == ParentedTree and len(subtree) > 1:
    #     return subtree
    # else:
    #     return None


if __name__ == '__main__':
    res = []
    parser = CoreNLPParser(url='http://localhost:9000')
    # dataset = pd.read_csv("Result.csv", sep=',', header=None)
    sentence = [bala for bala in parser.parse("right with something thing white in his hand".split())]
    sentence = ParentedTree.convert(sentence[0])
    print(sentence)
    # res.append(result(sentence))
    # tree = parser.parse("old lady with glasses holding teddy bear")
    # sentence = ParentedTree.convert(tree)
    # print(result(sentence))
    # for i in range(0, dataset.shape[0]):
    #     sentence = [bala for bala in parser.parse(dataset[1][i].split())]
    #     sentence = ParentedTree.convert(sentence[0])
    #     res.append(result(sentence))
    #     print(i)
    # dataset[3] = res
    # print(dataset)
class custom_parse_handler:
    corenlp_host = 'http://localhost:9000'  # CoreNLP server host
    main_categories = ['geography', 'music', 'movies']  # Categories used in the project

    def __init__(self, input_file, output_file, dbConnector):
        self.ip_file = input_file  # Input statements
        self.op_file = output_file  # Generated output is streamed to this file as well as to the command prompt
        self.parser = CoreNLPParser(url=self.corenlp_host)  # Initialize the connection with the CoreNLP parser
        self.testParserConnection()

        # Set up the word2vec model
        corpusFilePath = os.path.dirname(os.path.realpath(__file__)) + os.path.sep + "tools" + os.path.sep + "word2vec"
        corpusFileName = "GoogleNews-vectors-negative300.bin"
        self.filePath = corpusFilePath  # directory of the model
        self.fileName = corpusFilePath + os.path.sep + corpusFileName  # full path of the model file
        self.model = KeyedVectors.load_word2vec_format(self.fileName, binary=True)

        self.stopWords = nltk.corpus.stopwords.words('english')
        self.fileNewLine = "\n"
        self.dbConnector = dbConnector

    def testParserConnection(self):
        test_statement = "This is a test statement"
        try:
            list(self.parser.parse(test_statement.split()))
        except Exception:
            print("Error while connecting to CoreNLP server. Exiting.")
            sys.exit()

    def getParseTree(self, sentence):
        return list(self.parser.parse(sentence.split()))

    def displayConstructedParseTree(self, parseTree, fileObj=None):
        for entry in parseTree:
            if fileObj is None:
                entry.pretty_print()
            else:
                entry.pretty_print(stream=fileObj)

    def updatePredictedCategoryForWord(self, entry, category_sum):
        for i in range(len(self.main_categories)):
            try:
                sim_val = self.model.similarity(entry, self.main_categories[i])
                category_sum[self.main_categories[i]] += sim_val
            except KeyError:
                pass
        return category_sum

    def getCategoryWithMaxVoting(self, categoryMap):
        max_val = None
        max_category = None
        for entry in categoryMap:
            val = categoryMap[entry]
            if max_val is None or val > max_val:
                max_val = val
                max_category = entry
        return max_category

    def assignCategory(self, statement):
        # Special case: statements about geography are misclassified when beginning with "where is"
        if statement.lower().startswith('where is'):
            return 'geography'
        tokens = list(self.parser.tokenize(statement))
        filtered_words = [w for w in tokens if w not in self.stopWords]
        filtered_words_lower = [w.lower() for w in filtered_words]
        category_sum = {}
        for entry in self.main_categories:
            # Exclude 'geography' when birth-related words appear, and the other
            # categories when 'capital' appears in the sentence
            if (entry != 'geography' and 'capital' not in filtered_words_lower) or (
                    entry == 'geography' and not ('born' in filtered_words_lower
                                                  or 'birth' in filtered_words_lower)):
                category_sum[entry] = 0
        for entry in filtered_words:
            category_sum = self.updatePredictedCategoryForWord(entry, category_sum)
        return self.getCategoryWithMaxVoting(category_sum)

    # Direct the output to the default output stream and an output file.
    def outputGenerator(self, statement, query, answer, opFileObj=None):
        if opFileObj is not None:
            opFileObj.write("<QUESTION> " + statement)
            opFileObj.write(self.fileNewLine)
            opFileObj.write(self.fileNewLine)
            if query is not None:
                opFileObj.write("<QUERY> " + query)
            else:
                opFileObj.write("<QUERY> ")
            opFileObj.write(self.fileNewLine)
            opFileObj.write(self.fileNewLine)
            opFileObj.write("<ANSWER> " + answer)
            opFileObj.write(self.fileNewLine)
            opFileObj.write(self.fileNewLine)
            opFileObj.write(self.fileNewLine)
        print("<QUESTION> ", statement, "\n")
        if query is not None:
            print("<QUERY> ", query, "\n")
        else:
            print("<QUERY>\n")
        print("<ANSWER> ", answer, "\n\n")

    # Process the parse tree to extract projections and perform translation into SQL queries
    def extractProjections(self, parseTree, queryObj, category):
        # Start off the tree recursion for every parse in the list
        for entry in parseTree:
            # entry.pretty_print()
            self.processRecurse(None, entry, queryObj, category)

    # Recursively traverse the parse tree using DFS and generate transitions for each parent/child node pair
    def processRecurse(self, parent, treeObj, queryObj, category):
        if type(treeObj) != nltk.tree.Tree:  # leaf node
            transition_obj = transition(parent, treeObj, None, queryObj, category)
            return treeObj, transition_obj
        if "." == treeObj.label() or "DT" == treeObj.label():
            # do not handle determiners or punctuation
            return "", None
        str_transition = treeObj.label() + " " + "->"
        current_children = []
        for i in range(len(treeObj)):
            label, transition_obj_inter = self.processRecurse(treeObj.label(), treeObj[i], queryObj, category)
            if transition_obj_inter is not None:
                current_children.append(transition_obj_inter)
            str_transition += " " + label
        if treeObj.label() != 'ROOT':
            transition_obj_fin = transition(parent, str_transition, current_children, queryObj, category)
        else:
            # The root of the tree has been reached
            transition_obj_fin = None
        return treeObj.label(), transition_obj_fin

    # Parse the statements in the input file sequentially and perform the semantic transformation
    def parseInputFile(self):
        ipFileObj = open(self.ip_file, "r")
        opFileObj = open(self.op_file, "w")
        try:
            for entry in ipFileObj:
                question = entry.strip()
                if not question.startswith('--'):
                    queryObj = queryForm()
                    parseTree = self.getParseTree(question)  # Generate the parse tree
                    category = self.assignCategory(question)  # Assign the most probable category
                    # Extract the projections and populate the query object
                    self.extractProjections(parseTree, queryObj, category)
                    # queryObj.printComponents()
                    queryObj.constructQuery()  # Construct the final query
                    # Execute the query in the database and collect the results
                    results = self.dbConnector.getResults(queryObj, category)
                    self.outputGenerator(question, queryObj.getQueryStr(), results, opFileObj)
        except Exception as e:
            print("Error while processing.")
            print(e)
        finally:
            ipFileObj.close()
            opFileObj.close()
def __init__(self, text):
    self.text = text
    self.tokens = nltk.word_tokenize(text)
    parser = CoreNLPParser(url='http://localhost:8999')
    self.syntax_tree = list(parser.parse(nltk.word_tokenize(self.text)))[0]
# TODO: make this test print messages saying whether the output is correct or not.
from nltk.parse import CoreNLPParser

# Lexical Parser
parser = CoreNLPParser(url='http://localhost:9000')

# Parse tokenized text.
print(list(parser.parse('What is the airspeed of an unladen swallow ?'.split())))
print(
    "\nExpected: [Tree('ROOT', [Tree('SBARQ', [Tree('WHNP', [Tree('WP', ['What'])]), Tree('SQ', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['airspeed'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('DT', ['an']), Tree('JJ', ['unladen'])])]), Tree('S', [Tree('VP', [Tree('VB', ['swallow'])])])])]), Tree('.', ['?'])])])]\n"
)

# Parse raw string.
print(list(parser.raw_parse('What is the airspeed of an unladen swallow ?')))
print(
    "\nExpected: [Tree('ROOT', [Tree('SBARQ', [Tree('WHNP', [Tree('WP', ['What'])]), Tree('SQ', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['airspeed'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('DT', ['an']), Tree('JJ', ['unladen'])])]), Tree('S', [Tree('VP', [Tree('VB', ['swallow'])])])])]), Tree('.', ['?'])])])]\n"
)

# Neural Dependency Parser
from nltk.parse.corenlp import CoreNLPDependencyParser

dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
parses = dep_parser.parse('What is the airspeed of an unladen swallow ?'.split())
print([[(governor, dep, dependent) for governor, dep, dependent in parse.triples()]
       for parse in parses])
print(
    "\nExpected: [[(('What', 'WP'), 'cop', ('is', 'VBZ')), (('What', 'WP'), 'nsubj', ('airspeed', 'NN')), (('airspeed', 'NN'), 'det', ('the', 'DT')), (('airspeed', 'NN'), 'nmod', ('swallow', 'VB')), (('swallow', 'VB'), 'case', ('of', 'IN')), (('swallow', 'VB'), 'det', ('an', 'DT')), (('swallow', 'VB'), 'amod', ('unladen', 'JJ')), (('What', 'WP'), 'punct', ('?', '.'))]]\n"
)
    return len(rhs) == 1 and isinstance(rhs[0], str)


parser = CoreNLPParser(url="http://localhost:9000")
sentences = brown.sents()

# FILTER SHORT AND LONG SENTENCES
filter_sentences = []
for sentence in tqdm(sentences):
    nb_words = number_of_words(sentence)
    if nb_words >= 5 and nb_words <= 10:
        filter_sentences.append(sentence)

# PARSE SENTENCES
productions = []
for sentence in tqdm(filter_sentences):
    parse_tree = next(iter(parser.parse(sentence)))
    productions += parse_tree.productions()

unique_productions = list(set(productions))

# REMOVE TERMINAL SYMBOLS
productions_wo_term = []
for prod in unique_productions:
    if not is_rhs_terminal(prod):
        productions_wo_term.append(prod)

grammar = CFG(start=Nonterminal("ROOT"), productions=productions_wo_term)
pickle.dump(grammar, open("brown_grammar.pickle", "wb"))
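# A minimal sketch (not in the original) of reading the pickled grammar back and
# inspecting it. Because the lexical (terminal) rules were filtered out above, the
# grammar describes constituent skeletons rather than word-level parses.
import pickle

from nltk import Nonterminal

grammar = pickle.load(open("brown_grammar.pickle", "rb"))
print(grammar.start())                                    # ROOT
print(len(grammar.productions()))                         # number of unique non-lexical rules
print(grammar.productions(lhs=Nonterminal("ROOT"))[:5])   # a few ROOT expansions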
    trees = ne_chunk(sent)
    for tree in trees:
        if hasattr(tree, 'label'):
            if tree.label() in labels:
                entities.append(' '.join([child[0].lower() for child in tree]))
    return entities


# To run this you have to connect to the CoreNLP API first:
# go to the directory stanford-corenlp-full-2018-02-27 and type the two lines
# below in a terminal as a single command:
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer
#     -preload tokenize,ssplit,pos,lemma,ner,parse,depparse -status_port 9000 -port 9000 -timeout 15000 &
from nltk.parse import CoreNLPParser

parser = CoreNLPParser(url='http://localhost:9000')
list(parser.parse(doc))      # for a sentence-tokenized doc
list(parser.raw_parse(doc))  # for non-tokenized docs

# on a tokenized list of words
pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
list(pos_tagger.tag(doc))

ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
list(ner_tagger.tag(doc))

from nltk.parse.corenlp import CoreNLPDependencyParser

dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
list(dep_parser.parse(doc))
    senid += 1
    # stopline = 1899
    # if senid != stopline:
    #     continue
    # senid = stopline
    fe_alignment, ef_alignment = get_alignments(fe_phrase, ef_phrase)
    alignment = do_alignment(fe_alignment, ef_alignment, len(ef_phrase[0]), len(fe_phrase[0]))
    # fe_phrase = fe_phrases[id]
    # ef_phrase = ef_phrases[id]
    BP, BP_pos = phrase_extraction(fe_phrase[0], ef_phrase[0], alignment)
    # fe_phrase[0] is the e (English) sentence
    f_sen = ' '.join(ef_phrase[0])
    try:
        p_parse_trees = list(parser.parse(parser.tokenize(f_sen)))
    except ValueError:
        print('parsing fail')
        exception_sen.append(senid)
        p_parse_trees = [Tree.fromstring('(S (NULL ERROR))')]  # we simply give a dummy tree
    # create a dict to keep all phrases, grouped by category
    p_phrase_dict = {}
    for tag in phrase_tag:
        p_phrase_dict[tag] = []
    for one_tree in p_parse_trees:
        # print(one_tree)
        traverse(one_tree, p_phrase_dict, phrase_tag)
parser = CoreNLPParser(url='http://localhost:9001')
print('parser generated!')

exception_sen = []
tree_list = []
p_phrase_trees = None

f_input = open(args.input, mode='r', encoding='utf-8')
f_output = open(args.output, mode='w', encoding='utf-8')
# f_output = open(args.output, 'wt')

for senid, line in enumerate(f_input):
    print(senid)
    # if senid > 100:
    #     break
    try:
        p_parse_trees = list(parser.parse(parser.tokenize(line)))
    except ValueError:
        print('parsing fail')
        exception_sen.append(senid)
        p_parse_trees = [Tree.fromstring('(S (NULL ERROR))')]  # we simply give a dummy tree
    f_output.write('%d\n' % len(p_parse_trees))
    for sub_tree in p_parse_trees:
        f_output.write(str(sub_tree))
        f_output.write('\n|||\n')
    # str_tree = ' '.join(p_parse_trees)
    # f_output.write(str_tree)
    # f_output.write('\n')
    # tree_list.append(p_parse_trees)

f_input.close()
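# A small sketch (not part of the original) of how the file written above could be
# read back: each sentence starts with a line giving its number of trees, and each
# tree is followed by a '|||' separator line.
from nltk import Tree


def read_parse_file(path):
    """Yield one list of nltk.Tree objects per input sentence."""
    with open(path, mode='r', encoding='utf-8') as f:
        lines = f.read().split('\n')
    i = 0
    while i < len(lines) and lines[i].strip():
        n_trees = int(lines[i])
        i += 1
        trees = []
        for _ in range(n_trees):
            block = []
            while lines[i] != '|||':
                block.append(lines[i])
                i += 1
            i += 1  # skip the '|||' separator
            trees.append(Tree.fromstring('\n'.join(block)))
        yield trees


for trees in read_parse_file(args.output):
    print(len(trees), trees[0].label())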
for sentence in sentences:
    l += len(tokenize(sentence))
print("part:{}, label:{}, mean token count: {}".format(part, label, str(l / len(df))))

l = set()
for i in premise:
    l |= set(tokenize(i))

for part in parts:
    if part == "all":
        continue
    else:
        sentences = data[part].to_list()
        f = open("{}_{}".format(data_file, part), "a")
        for sentence in sentences:
            p = list(parser.parse(sentence.split()))
            for w in p:
                f.write(' '.join(str(w).split()))
                f.write("\n")

hypothesis = []
f = open("{}_hypothesis".format(data_file), 'r')
for i in f:
    hypothesis.append(i)

premise = []
f = open("{}_premise".format(data_file), 'r')
for i in f:
    premise.append(i)

df = pd.DataFrame([], index=list(tags_to_results.keys()))
tags_to_results = defaultdict(list)


def log(tag, is_correct, label):
    tags_to_results[tag].append((is_correct, label))