def parser_nltk(word_lists, filename):
    """Parse sentences with the Stanford parser (via NLTK) and cache the trees.

    Args:
        word_lists: iterable of whitespace-tokenized sentence strings;
            blank/whitespace-only entries are skipped.
        filename: path of the shelve database in which the raw parse result
            of each non-empty sentence is persisted, keyed by its ordinal
            ("0", "1", ...).

    Returns:
        list of formatted parse strings, one per non-empty sentence, as
        produced by return_str_tofile().
    """
    # Point NLTK's Stanford wrapper at the local Java/parser installs.
    os.environ['JAVAHOME'] = JAVA_PATH
    os.environ["STANFORD_PARSER"] = STANFORD_PARSER_PATH
    os.environ["STANFORD_MODELS"] = STANFORD_PARSER_MODELS
    chinese_parser = StanfordParser(model_path=nltk_parse_model_path)
    # Rebuild the classpath from every jar living beside the parser jar.
    stanford_dir = chinese_parser._classpath[0].rpartition('/')[0]
    chinese_parser._classpath = tuple(find_jars_within_path(stanford_dir))
    chinese_parser.java_options = '-mx15000m'  # 15 GB JVM heap for big inputs

    all_parser_sentence = []
    parsed_count = 0
    # Renamed from 'file' (shadowed the builtin); the original also never
    # closed the shelve DB, so cached parses could be left unflushed.
    db = shelve.open(filename)
    try:
        for sentence in word_lists:
            sentence = sentence.strip()
            if not sentence:
                continue  # counter only advances on real sentences
            res = list(chinese_parser.parse(sentence.split()))
            db[str(parsed_count)] = res  # key is the pre-increment ordinal
            all_parser_sentence.append(
                return_str_tofile(sentence_parse=str(res[0])))
            parsed_count += 1
    finally:
        db.close()  # flush and release the shelve DB even if parsing raises
    print("###### NLTK Dependency Parser Have finished "
          + str(parsed_count) + " sentences ###")
    return all_parser_sentence
# --- Stanford parser setup for parsing a held-out corpus ---
# Python 2 script (note the print statement at the bottom).
# Resolve the parser install directory from the project's SETTINGS config.
stanford_parser_dir = os.path.join(os.getcwd(), SETTINGS.get('paths', 'stanfordParser'))
my_path_to_jar = os.path.join(stanford_parser_dir, 'stanford-parser.jar')
my_path_to_models_jar = os.path.join(stanford_parser_dir, 'stanford-parser-3.6.0-models.jar')
eng_model_path = os.path.join(
    stanford_parser_dir, 'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
# -mx5000m: give the parser JVM a 5 GB heap.
parser = StanfordParser(model_path=eng_model_path,
                        path_to_models_jar=my_path_to_models_jar,
                        path_to_jar=my_path_to_jar,
                        java_options='-mx5000m')
# Append the slf4j jars so the parser's logging dependency resolves at runtime.
parser._classpath = tuple([j for j in parser._classpath] + [
    stanford_parser_dir + '/slf4j-api.jar',
    stanford_parser_dir + '/slf4j-simple.jar'
])
# Walk the raw held-out corpus; sorting in place makes the traversal
# deterministic (os.walk descends subdirectories in ds order).
for r, ds, fs in os.walk(heldout_raw_path):
    ds.sort()
    fs.sort()
    # NOTE(review): reset per directory, so the progress ratio printed below
    # is per-directory, not global — confirm that is intended.
    file_counter = 0
    # Skip files that already have output in the parse directory.
    already_parsed = os.listdir(heldout_parse_path)
    # Only filenames starting with E, F or G are candidates — presumably a
    # corpus-section naming scheme; verify against the dataset layout.
    files = [
        f for f in fs if f[:1] in ('E', 'F', 'G') and f not in already_parsed
    ]
    files_count = len(files)
    for f in files:
        file_counter += 1
        # Progress line: filename and fraction done within this directory.
        # (The loop body appears to continue beyond this excerpt.)
        print f, file_counter / float(files_count)
#!/bin/env python3.5 #Author: Saurabh Pathak from nltk.internals import find_jars_within_path from nltk.parse.stanford import StanfordParser from nltk.tokenize import sent_tokenize from nltk import download from nltk.tree import ParentedTree import os #download('punkt', quiet=True) #download('names', quiet=True) os.environ['CLASSPATH'] = os.getenv('CLASSPATH', '') + os.getcwd() + 'data/stanford-parser-full-2015-12-09/stanford-parser.jar:' + os.getcwd() + 'data/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar' parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz') parser._classpath = find_jars_within_path(os.getcwd() + 'data/stanford-parser-full-2015-12-09') text = input('Enter some text:') tlist = [ParentedTree.fromstring(str(list(parsetree)[0])) for parsetree in parser.raw_parse_sents(sent_tokenize(text))] tlist2 = [tree.copy(True) for tree in tlist] from hobbs import * from lappinleasse import * print('Input text was:\n', text) def resolve(ls, algo): print('\nResolving with', algo) i = -1 for parsetree in ls: i += 1
# -*- coding: utf-8 -*- # export CLASSPATH=$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser.jar:$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar from __future__ import unicode_literals import os import sys import io import copy import nltk from nltk.internals import find_jars_within_path from nltk.parse.stanford import StanfordParser parser = StanfordParser( model_path= "stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz" ) stanford_dir = parser._classpath[0].rpartition('/')[0] parser._classpath = tuple(find_jars_within_path(stanford_dir)) # from set_parser import parse_it class Node(object): """ A generic representation of a tree node. Includes a string label and a list of a children. """ def __init__(self, label): """ Creates a node with the given label. The label must be a string for use with the PQ-Gram algorithm. """ self.label = label self.children = list()
def preprocess(flist, folder_path):
    """ (file open for reading, str) -> Nonetype

    flist contains one filename per line and folder_path represents a
    directory. Do preprocessing on each file from flist in folder_path:
    for every input file write a Porter-stemmed copy (.stem), a POS-tagged
    copy (.pos) and a Stanford CFG parse (.pars) under folder_path.
    Undecodable, empty, or unparsable files are recorded in
    folder_path/error_log instead.
    """
    import os  # local import: this snippet's module header is not in view

    error_log = []
    # Build the (expensive) stemmer and Stanford parser once instead of once
    # per input file — nothing per-file is stored on them.
    stemmer = PorterStemmer()
    parser = StanfordParser(
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
        verbose=True)
    stanford_dir = parser._classpath[0].rpartition('/')[0]
    parser._classpath = tuple(find_jars_within_path(stanford_dir))

    for path in flist:
        # Output files share the input's base name with the extension
        # stripped (the original split on the FIRST '.', which mangled
        # paths containing dots in directory names).
        base = os.path.join(folder_path,
                            os.path.splitext(os.path.basename(path))[0])
        with open(path, 'r') as rf:
            try:
                sent = [line.strip('\n ') for line in rf]
            except UnicodeDecodeError:
                error_log.append('Unicode Decode Error:\t' + path + '\n')
                continue
        if not sent:
            error_log.append('Empty File Error:\t' + path + '\n')
            continue

        # Stemming with Porter Stemmer
        pars_stem = stemmer.stem(' '.join(sent))
        with open(base + '.stem', 'w') as wf:
            wf.write('\n'.join(sent))

        # POS tagging after tokenizing and stemming
        with open(base + '.pos', 'w') as wf:
            wf.write(str(nltk.pos_tag(pars_stem.split())))

        # CFG parse. On failure, skip the .pars file entirely — the original
        # fell through and wrote a stale or unbound 'parsed' from a previous
        # iteration (NameError on the very first failure).
        try:
            parsed = parser.raw_parse(pars_stem)
        except (TypeError, IndexError, NameError):
            # Fixed escapes: the original wrote literal '/t' and '/n'.
            error_log.append('Unparsable Error:\t' + path + '\n')
            continue
        s_pars = " ".join(str(x) for x in parsed)
        # Strip Tree-repr noise so only the bracketed parse text remains.
        for junk in ("Tree", "[", "]", "\'"):
            s_pars = s_pars.replace(junk, "")
        with open(base + '.pars', 'w') as wf:
            wf.write(s_pars)

    # Print file paths with errors. Text mode: the entries are str, so the
    # original 'wb' mode raised TypeError on Python 3.
    if error_log:
        with open(os.path.join(folder_path, 'error_log'), 'w') as wf:
            wf.writelines(error_log)
# -*- coding: utf-8 -*- # export CLASSPATH=$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser.jar:$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar from __future__ import unicode_literals import os import sys import io import copy import nltk from nltk.internals import find_jars_within_path from nltk.parse.stanford import StanfordParser parser=StanfordParser(model_path="stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz") stanford_dir = parser._classpath[0].rpartition('/')[0] parser._classpath = tuple(find_jars_within_path(stanford_dir)) # from set_parser import parse_it class Node(object): """ A generic representation of a tree node. Includes a string label and a list of a children. """ def __init__(self, label): """ Creates a node with the given label. The label must be a string for use with the PQ-Gram algorithm. """ self.label = label self.children = list() def addkid(self, node, before=False): """ Adds a child node. When the before flag is true, the child node will be inserted at the beginning of the list of children, otherwise the child node is appended.