def callStanfordNER(self, sentence):
    """Run the Stanford NER CRFClassifier over *sentence* and return its output.

    Writes the sentence to a temporary file, invokes the classifier as a
    java subprocess, and returns the decoded stdout.  Supports getEvent().

    :param sentence: text to tag; encoded to UTF-8 when it is a text string.
    :return: classifier output as a ``str``.
    """
    encoding = "utf8"
    default_options = ' '.join(_java_options)
    with tempfile.NamedTemporaryFile(mode='wb', delete=False) as input_file:
        # Write the actual sentences to the temporary input file.
        if isinstance(sentence, compat.text_type) and encoding:
            input_ = sentence.encode(encoding)
        input_file.write(input_)
        input_file.flush()
        input_file.seek(0)
        cmd = [
            "java",
            "-cp", ner_loc + "/stanford-ner.jar:" + ner_loc + "/lib/*",
            "-Xmx20g",
            "edu.stanford.nlp.ie.crf.CRFClassifier",
            "-loadClassifier",
            ner_loc + "/classifiers/english.all.3class.distsim.crf.ser.gz",
            '-encoding', encoding,
            '-textFile', input_file.name,
            "-ner.useSUTime", "false",
        ]
        # BUGFIX: the devnull handle used to be opened and never closed — a
        # file-descriptor leak on every call.  Close it deterministically.
        with open(os.devnull, 'w') as devnull:
            out = subprocess.check_output(cmd, stderr=devnull)
        # Normalise non-breaking-space byte sequences before decoding.
        out = out.replace(b'\xc2\xa0', b' ')
        out = out.replace(b'\xa0', b' ')
        out = out.decode(encoding)
    os.unlink(input_file.name)
    # Return java configurations to their default values.
    config_java(options=default_options, verbose=False)
    return out
def callStanford(self, sentence):
    """Run the CoreNLP pipeline (tokenize,ssplit,pos,lemma,depparse) on *sentence*.

    The sentence goes to a temp file that is fed to CoreNLP on stdin; the
    decoded JSON output is returned.

    :param sentence: text to annotate; encoded to UTF-8 when a text string.
    :return: CoreNLP output as a ``str``.
    """
    encoding = "utf8"
    cmd = [
        "java", "-cp", stanford_dir + "/*", "-Xmx20g",
        "edu.stanford.nlp.pipeline.StanfordCoreNLP",
        "-annotators", "tokenize,ssplit,pos,lemma,depparse",
        # '-printPCFGkBest', '10',
        '-outputFormat', 'json',
        "-parse.flags", "",
        '-encoding', encoding,
        '-model', models + '/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
    ]
    default_options = ' '.join(_java_options)
    with tempfile.NamedTemporaryFile(mode='wb', delete=False) as input_file:
        # Write the actual sentences to the temporary input file.
        if isinstance(sentence, compat.text_type) and encoding:
            input_ = sentence.encode(encoding)
        input_file.write(input_)
        input_file.flush()
        input_file.seek(0)
        # BUGFIX: close the devnull handle deterministically; it used to be
        # leaked on every call.
        with open(os.devnull, 'w') as devnull:
            out = subprocess.check_output(cmd, stdin=input_file, stderr=devnull)
        out = out.replace(b'\xc2\xa0', b' ')
        out = out.replace(b'\xa0', b' ')
        out = out.decode(encoding)
    os.unlink(input_file.name)
    # Return java configurations to their default values.
    config_java(options=default_options, verbose=False)
    return out
def config_weka(classpath=None):
    """Locate ``weka.jar`` and record its path in the module-global classpath.

    :param classpath: explicit path to use instead of searching; when None,
        the WEKAHOME environment variable and the module search path are tried.
    :raises LookupError: if no weka.jar can be found.
    """
    global _weka_classpath

    # Make sure java's configured first.
    config_java()

    if classpath is not None:
        _weka_classpath = classpath

    if _weka_classpath is None:
        # BUGFIX: copy the module-level search list before inserting, so
        # repeated calls do not keep prepending WEKAHOME onto the shared list.
        searchpath = list(_weka_search)
        if "WEKAHOME" in os.environ:
            searchpath.insert(0, os.environ["WEKAHOME"])
        for path in searchpath:
            if os.path.exists(os.path.join(path, "weka.jar")):
                _weka_classpath = os.path.join(path, "weka.jar")
                version = _check_weka_version(_weka_classpath)
                if version:
                    print(("[Found Weka: %s (version %s)]" % (_weka_classpath, version)))
                else:
                    print("[Found Weka: %s]" % _weka_classpath)
                _check_weka_version(_weka_classpath)

    if _weka_classpath is None:
        raise LookupError(
            "Unable to find weka.jar! Use config_weka() "
            "or set the WEKAHOME environment variable. "
            "For more information about Weka, please see "
            "http://www.cs.waikato.ac.nz/ml/weka/"
        )
def config_weka(classpath=None):
    """Find ``weka.jar`` and store its location in the module-global classpath.

    :param classpath: explicit jar path; when None, WEKAHOME and the module
        search path are consulted.
    :raises LookupError: if weka.jar cannot be located.
    """
    global _weka_classpath

    # Make sure java's configured first.
    config_java()

    if classpath is not None:
        _weka_classpath = classpath

    if _weka_classpath is None:
        # BUGFIX: take a copy — inserting into ``_weka_search`` directly
        # mutated the shared module-level list on every invocation.
        searchpath = list(_weka_search)
        if "WEKAHOME" in os.environ:
            searchpath.insert(0, os.environ["WEKAHOME"])
        for path in searchpath:
            if os.path.exists(os.path.join(path, "weka.jar")):
                _weka_classpath = os.path.join(path, "weka.jar")
                version = _check_weka_version(_weka_classpath)
                if version:
                    print("[Found Weka: %s (version %s)]" % (_weka_classpath, version))
                else:
                    print("[Found Weka: %s]" % _weka_classpath)
                _check_weka_version(_weka_classpath)

    if _weka_classpath is None:
        raise LookupError("Unable to find weka.jar! Use config_weka() "
                          "or set the WEKAHOME environment variable. "
                          "For more information about Weka, please see "
                          "http://www.cs.waikato.ac.nz/ml/weka/")
def _execute(self, cmd, input_, verbose=False):
    """Run *cmd* through java with *input_* staged in a temp file.

    :param cmd: java command list; extended with charset/options flags here.
    :param input_: text (encoded to the tagger's charset) or bytes.
    :param verbose: forwarded to ``config_java``.
    :return: decoded stdout of the java process.
    """
    encoding = self._encoding
    cmd.extend(["-charset", encoding])
    if self._options_cmd:
        cmd.extend(["-options", self._options_cmd])

    default_options = " ".join(_java_options)
    # Switch the JVM to this instance's options for the duration of the call.
    config_java(options=self.java_options, verbose=verbose)

    # delete=False: Windows cannot reopen a NamedTemporaryFile that is open.
    with tempfile.NamedTemporaryFile(mode="wb", delete=False) as handle:
        payload = input_
        if isinstance(payload, str) and encoding:
            payload = payload.encode(encoding)
        handle.write(payload)
        handle.flush()
        cmd.append(handle.name)

        # Run the tagger and capture its output.
        stdout, stderr = java(
            cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE
        )
        stdout = stdout.decode(encoding)

    os.unlink(handle.name)

    # Restore the default java configuration.
    config_java(options=default_options, verbose=False)
    return stdout
def _execute(self, cmd, input_, verbose=False):
    """Run *cmd* via java with *input_* staged in a temp file; return raw stdout.

    NOTE(review): unlike sibling ``_execute`` variants in this file, stdout is
    returned as bytes (never decoded) — confirm callers expect bytes.
    """
    encoding = self._encoding
    cmd.extend(['-encoding', encoding])
    default_options = ' '.join(_java_options)
    # Configure java.
    config_java(options=self.java_options, verbose=verbose)
    # Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
    with tempfile.NamedTemporaryFile(mode='wb', delete=False) as input_file:
        # Write the actual sentences to the temporary input file
        if isinstance(input_, compat.text_type) and encoding:
            input_ = input_.encode(encoding)
        input_file.write(input_)
        input_file.flush()
        cmd.append(input_file.name)
        # Run the tagger and get the output.
        stdout, stderr = java(cmd,
                              classpath=(self._stanford_jar, self._model_jar, self._ejml_jar),
                              stdout=PIPE, stderr=PIPE)
        os.unlink(input_file.name)
    # Return java configurations to their default values.
    config_java(options=default_options, verbose=False)
    return stdout
def tag_sents(self, sentences):
    """Tag a batch of tokenised sentences with the Stanford tagger.

    :param sentences: iterable of token lists, one list per sentence.
    :return: whatever ``self.parse_output`` produces for the tagger's stdout.
    """
    encoding = self._encoding
    default_options = " ".join(_java_options)
    config_java(options=self.java_options, verbose=False)

    # Stage the input in a temporary file.
    input_fd, self._input_file_path = tempfile.mkstemp(text=True)

    cmd = list(self._cmd)
    cmd.extend(["-encoding", encoding])

    # One space-joined sentence per line.
    text = "\n".join(" ".join(sent) for sent in sentences)
    if isinstance(text, str) and encoding:
        text = text.encode(encoding)
    with os.fdopen(input_fd, "wb") as input_handle:
        input_handle.write(text)

    # Run the tagger and collect its stdout.
    stanpos_output, _stderr = java(cmd, classpath=self._stanford_jar,
                                   stdout=PIPE, stderr=PIPE)
    stanpos_output = stanpos_output.decode(encoding)

    # Remove the temporary file.
    os.unlink(self._input_file_path)

    # Restore the default java configuration.
    config_java(options=default_options, verbose=False)

    return self.parse_output(stanpos_output, sentences)
def _execute(self, cmd, input_, verbose=False):
    """Execute *cmd* under java, feeding *input_* through a temporary file.

    :param cmd: command list, extended in place with charset/options flags.
    :param input_: text (encoded to the configured charset) or bytes.
    :param verbose: forwarded to ``config_java``.
    :return: decoded stdout from the java process.
    """
    encoding = self._encoding
    cmd += ['-charset', encoding]
    if self._options_cmd:
        cmd += ['-options', self._options_cmd]

    saved_options = ' '.join(_java_options)
    # Configure the JVM for this call.
    config_java(options=self.java_options, verbose=verbose)

    # delete=False keeps Windows happy with NamedTemporaryFile().
    with tempfile.NamedTemporaryFile(mode='wb', delete=False) as tmp:
        data = input_
        if isinstance(data, text_type) and encoding:
            data = data.encode(encoding)
        tmp.write(data)
        tmp.flush()
        cmd.append(tmp.name)

        # Run the tagger and grab what it prints.
        raw_out, _err = java(cmd, classpath=self._stanford_jar,
                             stdout=PIPE, stderr=PIPE)
        result = raw_out.decode(encoding)

    os.unlink(tmp.name)

    # Put the java configuration back the way it was.
    config_java(options=saved_options, verbose=False)
    return result
def callStanford(sentence):
    """Call the Stanford CoreNLP client and return its JSON output.

    Supports getEvent().  The sentence bytes are staged in a temp file fed
    to the client on stdin; the ``NLP>`` prompt is stripped from the output.

    :param sentence: raw bytes to annotate (written to the file unmodified).
    :return: decoded, cleaned CoreNLP output as a ``str``.
    """
    encoding = "utf8"
    cmd = ["java", "-cp", stanford_dir + "/*", "-Xmx20g",
           "edu.stanford.nlp.pipeline.StanfordCoreNLPClient",
           "-annotators", "tokenize,ssplit,parse,ner,pos,lemma,depparse",
           '-outputFormat', 'json',
           "-parse.flags", "",
           '-encoding', encoding,
           '-model', models + '/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
           "-backends", "localhost:9001,12"]
    input_ = ""
    default_options = ' '.join(_java_options)
    with tempfile.NamedTemporaryFile(mode='wb', delete=False) as temp_file:
        # Write the actual sentences to the temporary input file.
        temp_file.write(sentence)
        temp_file.flush()
        temp_file.seek(0)
        # BUGFIX: the devnull handle was opened and never closed — a
        # file-descriptor leak on every call.
        with open(os.devnull, 'w') as devnull:
            out = subprocess.check_output(cmd, stdin=temp_file, stderr=devnull)
        out = out.replace(b'\xc2\xa0', b' ')
        out = out.replace(b'\xa0', b' ')
        out = out.replace(b'NLP>', b'')
        out = out.decode(encoding)
    os.unlink(temp_file.name)
    # Return java configurations to their default values.
    config_java(options=default_options, verbose=False)
    return out
def config_weka(classpath=None):
    """Locate ``weka.jar`` and store its path in the module-global classpath.

    :param classpath: explicit jar path; when None, WEKAHOME and the module
        search path are tried in order.
    :raises LookupError: if weka.jar cannot be found anywhere.
    """
    global _weka_classpath

    # Make sure java's configured first.
    config_java()

    if classpath is not None:
        _weka_classpath = classpath

    if _weka_classpath is None:
        # BUGFIX: copy before inserting — the original mutated the shared
        # module-level ``_weka_search`` list on every call.
        searchpath = list(_weka_search)
        if 'WEKAHOME' in os.environ:
            searchpath.insert(0, os.environ['WEKAHOME'])
        for path in searchpath:
            if os.path.exists(os.path.join(path, 'weka.jar')):
                _weka_classpath = os.path.join(path, 'weka.jar')
                version = _check_weka_version(_weka_classpath)
                if version:
                    print('[Found Weka: %s (version %s)]' % (_weka_classpath, version))
                else:
                    # BUGFIX: this was a Python 2 print *statement*, a
                    # SyntaxError under Python 3; use the function form.
                    print('[Found Weka: %s]' % _weka_classpath)
                _check_weka_version(_weka_classpath)

    if _weka_classpath is None:
        raise LookupError('Unable to find weka.jar! Use config_weka() '
                          'or set the WEKAHOME environment variable. '
                          'For more information about Weka, please see '
                          'http://www.cs.waikato.ac.nz/ml/weka/')
def batch_tag(self, sentences):
    """Tag a batch of tokenised sentences with the Stanford tagger.

    :param sentences: iterable of token lists, one per sentence.
    :return: result of ``self.parse_output`` over the tagger's stdout.
    """
    encoding = self._encoding
    default_options = ' '.join(_java_options)
    config_java(options=self.java_options, verbose=False)

    # Create a temporary input file
    _input_fh, self._input_file_path = tempfile.mkstemp(text=True)

    # BUGFIX: the original extended ``self._cmd`` in place, so every call
    # appended another '-encoding <enc>' pair to the shared command list.
    # Build a per-call copy instead.
    cmd = list(self._cmd)
    if encoding:
        cmd.extend(['-encoding', encoding])

    # Write the actual sentences to the temporary input file
    # NOTE(review): encoded bytes are written to a text-mode ('w') handle —
    # a Python 2 idiom; under Python 3 this would raise. Confirm target runtime.
    _input_fh = os.fdopen(_input_fh, 'w')
    _input = '\n'.join(' '.join(x) for x in sentences)
    if isinstance(_input, compat.text_type) and encoding:
        _input = _input.encode(encoding)
    _input_fh.write(_input)
    _input_fh.close()

    # Run the tagger and get the output
    stanpos_output, _stderr = java(cmd, classpath=self._stanford_jar,
                                   stdout=PIPE, stderr=PIPE)
    if encoding:
        stanpos_output = stanpos_output.decode(encoding)

    # Delete the temporary file
    os.unlink(self._input_file_path)

    # Return java configurations to their default values
    config_java(options=default_options, verbose=False)

    return self.parse_output(stanpos_output)
def start(self, stdout="devnull", stderr="devnull"):
    """
    Starts the CoreNLP server

    :param stdout, stderr: Specifies where CoreNLP output is redirected. Valid values are 'devnull', 'stdout', 'pipe'
    :raises CoreNLPServerError: if the process dies at launch, never accepts
        connections, or never reports itself ready.
    """
    import requests

    cmd = ["edu.stanford.nlp.pipeline.StanfordCoreNLPServer"]

    if self.corenlp_options:
        cmd.extend(self.corenlp_options)

    # Configure java.
    default_options = " ".join(_java_options)
    config_java(options=self.java_options, verbose=self.verbose)

    try:
        # Launch without blocking so we can poll the process below.
        self.popen = java(
            cmd,
            classpath=self._classpath,
            blocking=False,
            stdout=stdout,
            stderr=stderr,
        )
    finally:
        # Return java configurations to their default values.
        config_java(options=default_options, verbose=self.verbose)

    # Check that the server is still running.
    returncode = self.popen.poll()
    if returncode is not None:
        _, stderrdata = self.popen.communicate()
        raise CoreNLPServerError(
            returncode,
            "Could not start the server. "
            "The error was: {}".format(stderrdata.decode("ascii")),
        )

    # Poll the /live endpoint for up to ~30s until the process answers HTTP.
    for i in range(30):
        try:
            response = requests.get(requests.compat.urljoin(self.url, "live"))
        except requests.exceptions.ConnectionError:
            time.sleep(1)
        else:
            if response.ok:
                break
    else:
        raise CoreNLPServerError("Could not connect to the server.")

    # Then poll /ready for up to ~60s until the annotators are loaded.
    for i in range(60):
        try:
            response = requests.get(requests.compat.urljoin(self.url, "ready"))
        except requests.exceptions.ConnectionError:
            time.sleep(1)
        else:
            if response.ok:
                break
    else:
        raise CoreNLPServerError("The server is not ready.")
def start(self):
    """Start the CoreNLP server subprocess and wait until it is ready.

    :raises CoreNLPServerError: if the process dies at launch, never accepts
        connections, or never reports itself ready.
    """
    cmd = ['edu.stanford.nlp.pipeline.StanfordCoreNLPServer']

    if self.corenlp_options:
        cmd.extend(self.corenlp_options)

    # Configure java.
    default_options = ' '.join(_java_options)
    config_java(options=self.java_options, verbose=self.verbose)

    try:
        # TODO: it's probably a bad idea to pipe stdout, as it will
        # accumulate when lots of text is being parsed.
        self.popen = java(
            cmd,
            classpath=self._classpath,
            blocking=False,
            stdout='pipe',
            stderr='pipe',
        )
    finally:
        # Return java configurations to their default values.
        config_java(options=default_options, verbose=self.verbose)

    # Check that the server is still running.
    returncode = self.popen.poll()
    if returncode is not None:
        _, stderrdata = self.popen.communicate()
        raise CoreNLPServerError(
            returncode,
            'Could not start the server. '
            'The error was: {}'.format(stderrdata.decode('ascii'))
        )

    # Poll /live for up to ~30s until the server answers HTTP at all.
    for i in range(30):
        try:
            response = requests.get(requests.compat.urljoin(self.url, 'live'))
        except requests.exceptions.ConnectionError:
            time.sleep(1)
        else:
            if response.ok:
                break
    else:
        raise CoreNLPServerError(
            'Could not connect to the server.'
        )

    # Then poll /ready for up to ~60s until annotators are loaded.
    for i in range(60):
        try:
            response = requests.get(requests.compat.urljoin(self.url, 'ready'))
        except requests.exceptions.ConnectionError:
            time.sleep(1)
        else:
            if response.ok:
                break
    else:
        raise CoreNLPServerError(
            'The server is not ready.'
        )
def __init__(self, model='stanford/models/english-bidirectional-distsim.tagger',
             libpath='stanford/', verbose=False):
    """Remember the model path, collect the Stanford jars, configure the JVM.

    :param model: path to the tagger model file.
    :param libpath: directory searched for the Stanford jars.
    :param verbose: forwarded to ``config_java``.
    """
    self._verbose = verbose
    self._model = model
    # Pattern for pulling (pos, lemma, word) triples out of the XML output.
    word_pattern = r' <word wid="[0-9]*" pos="([^"]*)" lemma="([^"]*)">(.*?)</word>'
    self._xml_regex = re.compile(word_pattern)
    self._libs = find_jars_within_path(libpath)
    config_java(verbose=verbose)
def __init__(self, mate_folder, path_to_model, java_options="-Xmx3G"):
    """Wrap the Mate-tools 'anna' jar: record paths and configure the JVM.

    :param mate_folder: root directory of the Mate tools installation.
    :param path_to_model: path to the parsing/tagging model file.
    :param java_options: JVM options, e.g. heap size.
    """
    self._java_options = java_options
    self._model = path_to_model
    self._mate_root = mate_folder
    self._classpath = "anna-3.61.jar"
    # Apply the requested JVM options up front.
    config_java(options=self._java_options)
def __init__(self):
    # Annotator dependencies, see https://stanfordnlp.github.io/CoreNLP/dependencies.html
    self.additional_properties = {
        'tokenize.options': 'ptb3Escaping=false, unicodeQuotes=true, splitHyphenated=true, normalizeParentheses=false, normalizeOtherBrackets=false',
        'annotators': 'tokenize, ssplit, pos, lemma'
    }
    self.stanford_parser = CoreNLPParser()
    # Raise the JVM's maximum heap from the 512MB default to 4GB.
    # NOTE(review): JVM flags are normally spelled '-Xmx' (capital X) —
    # confirm '-xmx4G' is accepted by the configured java launcher.
    internals.config_java(options='-xmx4G')
def __init__(self, model='stanford/models/english-bidirectional-distsim.tagger',
             libpath='stanford/', verbose=False):
    """Set up the tagger wrapper: model path, jar collection, JVM config.

    :param model: path to the tagger model file.
    :param libpath: directory searched for Stanford jars.
    :param verbose: forwarded to ``config_java``.
    """
    self._model = model
    self._libs = find_jars_within_path(libpath)
    self._verbose = verbose
    # Extracts (pos, lemma, word) from each <word ...> element of the output.
    self._xml_regex = re.compile(
        r' <word wid="[0-9]*" pos="([^"]*)" lemma="([^"]*)">(.*?)</word>'
    )
    config_java(verbose=verbose)
def callStanford(sentence):
    """Call the Stanford CoreNLP client and return its JSON output.

    Supports getEvent().  *sentence* bytes are staged in a temp file fed to
    the client on stdin; the ``NLP>`` prompt is stripped from the output.

    :param sentence: raw bytes to annotate.
    :return: decoded output string, or None if the client process failed.
    """
    encoding = "utf8"
    cmd = [
        "java", "-cp", core_nlp_dir + "/*", "-mx20g",
        "edu.stanford.nlp.pipeline.StanfordCoreNLPClient",
        # Annotator timing notes from experimentation (times per call):
        #   tokenize ~1.3s; ssplit ~1.2s; parse unstable (crash/hang);
        #   ner 3-19s; pos 1.3-3s; lemma ~1.3s; depparse 2.5-28s;
        #   full pipeline below: ~36s, occasionally crashes or hangs.
        "-annotators", "tokenize,ssplit,parse,ner,pos,lemma,depparse",
        '-outputFormat', 'json',
        "-parse.flags", "",
        '-encoding', encoding,
        '-model', models_dir + '/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
        "-backends", "localhost:9000"
    ]
    default_options = ' '.join(_java_options)
    with tempfile.NamedTemporaryFile(mode='wb', delete=False) as temp_file:
        temp_file.write(sentence)
        temp_file.flush()
        temp_file.seek(0)
        err_out = sys.stderr
        # err_out = open(os.devnull, 'w')  # suppress client error noise
        try:
            out = subprocess.check_output(cmd, stdin=temp_file, stderr=err_out)
        except subprocess.CalledProcessError as err:
            # BUGFIX: CalledProcessError has no ``errorcode`` attribute — the
            # original raised AttributeError here instead of reporting; the
            # correct attribute is ``returncode``.
            print("openNLP CLIENT ERROR: ", err.returncode)
            # BUGFIX: don't leak the temp file on the error path.
            os.unlink(temp_file.name)
            return
    out = out.replace(b'\xc2\xa0', b' ')
    out = out.replace(b'\xa0', b' ')
    out = out.replace(b'NLP>', b'')
    out = out.decode(encoding)
    os.unlink(temp_file.name)
    config_java(options=default_options, verbose=False)  # Return java config to default values
    return out
def _execute(self, cmd, verbose=False):
    """Run *cmd* under java against the Stanford jar and return decoded stdout.

    :param cmd: java command list; extended in place with -options if set.
    :param verbose: forwarded to both ``config_java`` calls.
    :return: decoded stdout of the java process.
    """
    encoding = self._encoding
    # cmd.extend(['-inputEncoding', encoding])
    if self._options_cmd:
        cmd.extend(['-options', self._options_cmd])

    saved_options = ' '.join(_java_options)
    # Configure java.
    config_java(options=self.java_options, verbose=verbose)

    raw_out, _stderr = java(cmd, classpath=self._stanford_jar,
                            stdout=PIPE, stderr=PIPE)
    decoded = raw_out.decode(encoding)

    # Return java configurations to their default values.
    config_java(options=saved_options, verbose=verbose)
    return decoded
def _execute(self, cmd, input_, verbose=False):
    """Feed *input_* to the CoreNLP command (stdin or file arg) and return
    the cleaned, decoded output.

    :param cmd: java command list, extended in place with -encoding/options.
    :param input_: text (encoded to the configured encoding) or bytes.
    :param verbose: forwarded to ``config_java``.
    :return: decoded stdout with non-breaking-space bytes normalised.
    """
    encoding = self._encoding
    cmd.extend(['-encoding', encoding])
    if self.corenlp_options:
        cmd.append(self.corenlp_options)

    default_options = ' '.join(_java_options)
    # Configure the JVM for this call.
    config_java(options=self.java_options, verbose=verbose)

    # delete=False: Windows cannot reopen an open NamedTemporaryFile().
    with tempfile.NamedTemporaryFile(mode='wb', delete=False) as tmp:
        data = input_
        if isinstance(data, text_type) and encoding:
            data = data.encode(encoding)
        tmp.write(data)
        tmp.flush()

        if self._USE_STDIN:
            # Stream the staged file through the child's stdin.
            tmp.seek(0)
            stdout, stderr = java(
                cmd,
                classpath=self._classpath,
                stdin=tmp,
                stdout=PIPE,
                stderr=PIPE,
            )
        else:
            # Otherwise hand the file path over as the final argument.
            cmd.append(tmp.name)
            stdout, stderr = java(cmd, classpath=self._classpath,
                                  stdout=PIPE, stderr=PIPE)

    # Normalise non-breaking-space byte sequences, then decode.
    stdout = stdout.replace(b'\xc2\xa0', b' ').replace(b'\x00\xa0', b' ')
    stdout = stdout.decode(encoding)

    os.unlink(tmp.name)

    # Restore the default java configuration.
    config_java(options=default_options, verbose=False)
    return stdout
def start(self, stdout='devnull', stderr='devnull'):
    """
    Starts the CoreNLP server

    :param stdout, stderr: Specifies where CoreNLP output is redirected. Valid values are 'devnull', 'stdout', 'pipe'
    :raises CoreNLPServerError: if the process dies at launch or never
        accepts a TCP connection.
    """
    cmd = ['edu.stanford.nlp.parser.server.LexicalizedParserServer']

    if self.corenlp_options:
        cmd.extend(self.corenlp_options)

    # Configure java.
    # default_options = ' '.join(_java_options)
    # NOTE(review): the defaults are deliberately overridden with an empty
    # string here (see commented line above) — confirm this is intended.
    default_options = ''
    config_java(options=self.java_options, verbose=self.verbose)

    try:
        # Launch without blocking so the process can be polled below.
        self.popen = java(
            cmd,
            classpath=self._classpath,
            blocking=False,
            stdout=stdout,
            stderr=stderr,
        )
    finally:
        # Return java configurations to their default values.
        config_java(options=default_options, verbose=self.verbose)

    # Check that the server is still running.
    returncode = self.popen.poll()
    if returncode is not None:
        _, stderrdata = self.popen.communicate()
        raise CoreNLPServerError(
            returncode,
            'Could not start the server. '
            'The error was: {}'.format(stderrdata.decode('ascii')),
        )

    # Retry a plain TCP connect for up to ~5s until the server listens.
    for i in range(5):
        try:
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.connect((self.host, self.port))
        except ConnectionRefusedError:
            time.sleep(1)
        else:
            break
    else:
        raise CoreNLPServerError('Could not connect to the server.')
def _execute(self, cmd, input_, verbose=False):
    """Run the CoreNLP command over *input_* and return the cleaned output.

    Depending on ``self._USE_STDIN`` the staged temp file is either piped to
    the child's stdin or passed as a trailing path argument.

    :param cmd: java command list, extended in place.
    :param input_: text (encoded to the configured encoding) or bytes.
    :param verbose: forwarded to ``config_java``.
    :return: decoded stdout with non-breaking-space bytes replaced.
    """
    encoding = self._encoding
    cmd.extend(['-encoding', encoding])
    if self.corenlp_options:
        cmd.append(self.corenlp_options)

    previous_options = ' '.join(_java_options)
    # Switch the JVM over to this instance's options.
    config_java(options=self.java_options, verbose=verbose)

    # Windows requires delete=False for NamedTemporaryFile().
    with tempfile.NamedTemporaryFile(mode='wb', delete=False) as staged:
        payload = input_.encode(encoding) if isinstance(input_, text_type) and encoding else input_
        staged.write(payload)
        staged.flush()

        if self._USE_STDIN:
            staged.seek(0)
            out, err = java(
                cmd,
                classpath=self._classpath,
                stdin=staged,
                stdout=PIPE,
                stderr=PIPE,
            )
        else:
            cmd.append(staged.name)
            out, err = java(
                cmd, classpath=self._classpath, stdout=PIPE, stderr=PIPE
            )

    # Clean up non-breaking-space byte sequences and decode.
    for bad in (b'\xc2\xa0', b'\x00\xa0'):
        out = out.replace(bad, b' ')
    out = out.decode(encoding)

    os.unlink(staged.name)

    # Put the default java configuration back.
    config_java(options=previous_options, verbose=False)
    return out
def _execute(self, cmd, verbose=False):
    """Run *cmd* with java and hand back its decoded stdout.

    :param cmd: java command list; extended in place with encoding/options.
    :param verbose: forwarded to the first ``config_java`` call.
    :return: decoded stdout of the java process.
    """
    encoding = self._encoding
    cmd.extend(['-inputEncoding', encoding])
    if self._options_cmd:
        cmd.extend(['-options', self._options_cmd])

    saved = ' '.join(_java_options)
    # Configure java.
    config_java(options=self.java_options, verbose=verbose)

    out, _err = java(cmd, classpath=self._stanford_jar,
                     stdout=PIPE, stderr=PIPE)
    out = out.decode(encoding)

    # Return java configurations to their default values.
    config_java(options=saved, verbose=False)
    return out
def _classify_using_weka(self, test_comments, feature_extractor):
    """Write test features to an ARFF file, run WEKA via java, parse predictions.

    :param test_comments: items to classify.
    :param feature_extractor: object whose ``extract`` maps an item to features.
    :return: parsed predictions from ``self.parse_weka_output``.
    """
    test_set = nltk.classify.util.apply_features(feature_extractor.extract,
                                                 test_comments)

    # Stage the test set as an ARFF file in a fresh temp directory.
    temp_dir = tempfile.mkdtemp()
    self.test_filename = os.path.join(temp_dir, 'test.arff')
    logger.info('Writing Test WEKA File: ' + self.test_filename)
    self._write_ARFF_file(self.test_filename, test_set)

    # '-p 0' asks WEKA to print predictions for every instance.
    cmd = [self.javaclass, '-t', self.train_filename,
           '-T', self.test_filename, '-p', '0']
    logger.info('Executing WEKA: ' + str(cmd))
    config_java(options='-Xmx2000M')
    stdout, stderr = java(cmd, classpath=weka_classpath,
                          stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # NOTE(review): splitting with '\n' assumes stdout is str here — under
    # Python 3 a PIPE capture is bytes; confirm the target runtime.
    return self.parse_weka_output(stdout.split('\n'))
def tag(self, tokens):
    """Tag *tokens* through a long-lived Stanford tagger subprocess.

    Lazily spawns the java process plus a daemon reader thread on first use,
    then writes one space-joined line per call and waits up to 120s for the
    tagger's reply.

    :param tokens: sequence of token strings for one sentence.
    :return: ``self.parse_output`` result, or [] on empty input or timeout.
    """
    # Collapse all embedded newlines so the sentence crosses the pipe as one line.
    _input = ' '.join(tokens).replace('\n', ' ').replace('\r', ' ').replace(
        '\r\n', ' ').strip()
    if len(_input) == 0:
        return []
    # Create pipe if not already opened
    if not self._thread:
        encoding = self._encoding
        default_options = ' '.join(_java_options)
        config_java(options=self.java_options, verbose=False)
        cmd = list(self._cmd)
        cmd.extend(['-encoding', encoding])
        # Non-blocking java() hands back the Popen object with pipes attached.
        self._child = java(cmd, classpath=self._stanford_jar, stdin='pipe',
                           stdout='pipe', stderr='pipe', blocking=False)
        # A daemon thread drains the child's stdout into a queue so the read
        # below can time out instead of blocking forever.
        self._queue = Queue()
        self._thread = Thread(target=_enqueue_output,
                              args=(self._child.stdout, self._queue))
        self._thread.daemon = True
        self._thread.start()
    # clear all newlines, only append one at last for java
    _input += '\n'
    self._child.stdin.write(_input.encode('utf-8'))
    self._child.stdin.flush()
    try:
        return self.parse_output(
            self._queue.get(timeout=120)
        )  # wait for 2m, usually should return in less than 100ms
    except Empty:
        print('stanford postagger timeout, return empty tuple instead',
              file=sys.stderr)
        return []
def _classify_using_weka(self, test_comments, feature_extractor):
    """Dump test features to ARFF, invoke WEKA through java, parse the result.

    :param test_comments: items to classify.
    :param feature_extractor: object whose ``extract`` maps an item to features.
    :return: parsed predictions from ``self.parse_weka_output``.
    """
    test_set = nltk.classify.util.apply_features(
        feature_extractor.extract, test_comments
    )

    # Write the ARFF test file into a fresh temporary directory.
    arff_dir = tempfile.mkdtemp()
    self.test_filename = os.path.join(arff_dir, 'test.arff')
    logger.info('Writing Test WEKA File: ' + self.test_filename)
    self._write_ARFF_file(self.test_filename, test_set)

    # '-p 0' makes WEKA emit a prediction line per instance.
    weka_cmd = [self.javaclass, '-t', self.train_filename,
                '-T', self.test_filename, '-p', '0']
    logger.info('Executing WEKA: ' + str(weka_cmd))
    config_java(options='-Xmx2000M')
    stdout, stderr = java(weka_cmd, classpath=weka_classpath,
                          stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # NOTE(review): '\n'-splitting assumes a str stdout — under Python 3 a
    # PIPE capture is bytes; confirm target runtime.
    return self.parse_weka_output(stdout.split('\n'))
def __init__(self, libpath='stanford/', verbose=False):
    """Collect the Stanford jars under *libpath* and configure the JVM.

    :param libpath: directory searched for jar files.
    :param verbose: forwarded to ``config_java``.
    """
    self._libs = find_jars_within_path(libpath)
    self._verbose = verbose
    config_java(verbose=verbose)
__author__ = 'E440'

import Dir
from nltk.internals import java, config_java
import subprocess
from sys import stdin
import re

### weka cmd:
### java weka.classifiers.trees.J48 -p 9 -l directory-path\bank.model -T directory-path \bank-new.arff
weka_class_path = Dir.projectDir + "/resources/weka3-6-6.jar"
config_java()


### input
def weka_classify(arff_file, model_file):
    """Classify *arff_file* with a saved WEKA RandomForest *model_file*.

    :param arff_file: path to the ARFF file holding the instances to classify.
    :param model_file: path to the serialized WEKA model; empty string skips.
    :return: WEKA's prediction output decoded with the console encoding, or
        None when *model_file* is empty.
    :raises OSError: if the java subprocess writes anything to stderr.
    """
    if model_file == "":
        return None
    class_index = 1
    with open(arff_file, mode="r", encoding="utf-8") as file:
        lines = file.readlines()
    # Find the line number of the "@attribute class" declaration, which
    # WEKA's -p flag expects.
    for i, line in enumerate(lines):
        if "@attribute class" in line:
            class_index = i
            break
    cmd = ["weka.classifiers.trees.RandomForest", "-p", str(class_index),
           "-l", str(model_file), "-T", str(arff_file)]
    (stdout, stderr) = java(cmd, classpath=weka_class_path,
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    err_msg = stderr.decode("GBK")
    if err_msg != "":
        raise OSError('Java command failed : ' + str(err_msg))
    result = stdout.decode(stdin.encoding)
    # BUGFIX: the original computed ``result`` and silently dropped it;
    # return it to the caller.
    return result
import environment_settings # import nltk.classify # I'm using myutil in place of nltk.classify.util so I can customize the code for my purposes # import nltk.classify.util # for accuracy & log_likelihood if 'JAVAHOME' not in os.environ: os.environ['JAVAHOME'] = os.path.abspath(environment_settings.javahome) # javaw_path = os.path.abspath(os.path.join(os.environ['JAVAHOME'], 'javaw.exe')) # config_java(bin=javaw_path) if environment_settings.javaw_path is not None and os.path.exists( environment_settings.javaw_path): config_java(bin=environment_settings.javaw_path) else: config_java() if 'WEKAHOME' not in os.environ: os.environ['WEKAHOME'] = os.path.abspath(environment_settings.wekahome) import myweka # my modifications made directly to a local copy of the nltk.classify.weka source code import myutil # my modifications made directly to a local copy of the nltk.classify.util source code myweka.config_weka(classpath=os.environ['WEKAHOME']) numeric_classifier_configs = { "RandomSubSpace1": { "name": "RandomSubSpace1", "multifilter_cmd":
def start(self):
    """Launch the CoreNLP server subprocess and block until it is ready.

    :raises CoreNLPServerError: if the process dies at launch, never accepts
        connections, or never reports itself ready.
    """
    import requests

    cmd = ['edu.stanford.nlp.pipeline.StanfordCoreNLPServer']

    if self.corenlp_options:
        cmd.extend(self.corenlp_options)

    # Configure java.
    default_options = ' '.join(_java_options)
    config_java(options=self.java_options, verbose=self.verbose)

    try:
        # TODO: it's probably a bad idea to pipe stdout, as it will
        # accumulate when lots of text is being parsed.
        self.popen = java(
            cmd,
            classpath=self._classpath,
            blocking=False,
            stdout='pipe',
            stderr='pipe',
        )
    finally:
        # Return java configurations to their default values.
        config_java(options=default_options, verbose=self.verbose)

    # Check that the server is still running.
    returncode = self.popen.poll()
    if returncode is not None:
        _, stderrdata = self.popen.communicate()
        raise CoreNLPServerError(
            returncode,
            'Could not start the server. '
            'The error was: {}'.format(stderrdata.decode('ascii'))
        )

    # Poll /live for up to ~30s until the server answers HTTP at all.
    for i in range(30):
        try:
            response = requests.get(requests.compat.urljoin(self.url, 'live'))
        except requests.exceptions.ConnectionError:
            time.sleep(1)
        else:
            if response.ok:
                break
    else:
        raise CoreNLPServerError(
            'Could not connect to the server.'
        )

    # Then poll /ready for up to ~60s until the annotators are loaded.
    for i in range(60):
        try:
            response = requests.get(requests.compat.urljoin(self.url, 'ready'))
        except requests.exceptions.ConnectionError:
            time.sleep(1)
        else:
            if response.ok:
                break
    else:
        raise CoreNLPServerError(
            'The server is not ready.'
        )
import random
import subprocess

import nltk
from nltk.tag.stanford import StanfordPOSTagger as POSTagger
from nltk.internals import find_file, find_jar, config_java, java, _java_options

config_java(options='-xmx1G')


def main():
    """Tag each line of good_data.txt with the Stanford POS tagger and write
    the space-joined tag sequence per line to good_lines_tags_1.txt."""
    english_postagger = POSTagger(
        '../postagger/models/english-bidirectional-distsim.tagger',
        '../postagger/stanford-postagger.jar')

    # FIX: use context managers so both files are closed even on error
    # (the originals were opened and closed manually, leaking on exceptions).
    with open("../data/good_data.txt", "r") as data_file:
        lines = data_file.readlines()

    with open("../data/good_lines_tags_1.txt", "w") as out_file:
        for line_count, line in enumerate(lines):
            words = line.split('\n')[0].split(' ')
            # Keep only the tag of each (token, tag) pair.
            tag_list = [tag for _, tag in english_postagger.tag(words)]
            out_file.write(" ".join(tag_list))
            out_file.write("\n")
            # BUGFIX: this was a Python 2 print *statement* — a SyntaxError
            # under Python 3; use the print() function.
            print("completed line" + str(line_count))


if __name__ == '__main__':
    main()
def tag(sents, java_options='-Xmx1g -XX:ParallelGCThreads=2'):
    """Tags a sentence using the CMU twitter tokenizer.

    :param sents: List of sentences to be tagged. The list should contain
        each sentence as a string.
    :type sents: list of str
    :param java_options: JVM options handed to ``config_java``.
    :return: the tagger's CoNLL-format output read back from a temp file.
    """
    _root = os.path.join(susx._sussex_root, 'CMU')
    _cp = ''
    jars = [
        os.path.join(_root, jar) for jar in os.listdir(_root)
        if jar.endswith('.jar')
    ]
    # NOTE(review): ';' is the Windows classpath separator — POSIX java
    # expects ':'. Confirm this module is only used on Windows.
    _cp += ';'.join(jars)

    # write the sentences to the temp file
    _input_fh, _input_file_path = tempfile.mkstemp(text=True)
    _input_fh = os.fdopen(_input_fh, 'w')
    _input = '\n'.join(x.strip() for x in sents if x.strip())
    _input_fh.write(_input)
    _input_fh.close()

    _output_fh, _output_file_path = tempfile.mkstemp(text=True)

    # if we're on windows and java hasn't been configured yet
    if platform.platform().startswith('Windows'):
        if nltk.internals._java_bin is None:
            found_java = False
            # Probe the well-known JRE install locations listed in _paths.
            for jre_path in _paths:
                if os.path.exists(jre_path):
                    found_java = True
                    break
            if found_java:
                config_java(jre_path, options=java_options, verbose=False)
            else:
                raise RuntimeError(
                    'Can\'t find an installed Java Runtime Environment (JRE).'
                    'If you have installed java in a non standard location '
                    'please call nltk.internals.config_java with the correct '
                    'JRE path and options=\'-Xmx1g -XX:ParallelGCThreads=2\' '
                    'before calling sussex_nltk.cmu.tag')
    else:
        config_java(options=java_options, verbose=False)

    _cmd = [
        'cmu.arktweetnlp.RunTagger', '--no-confidence', '--output-format',
        'conll', _input_file_path
    ]
    # Presumably RunTagger resolves its resources relative to _root, hence
    # the chdir around the call — TODO confirm.
    _dir = os.getcwd()
    os.chdir(_root)
    java(_cmd, classpath=_cp, stdout=_output_fh, stderr=subprocess.PIPE)
    os.chdir(_dir)

    # Read the tagger's output back, then clean up both temp files.
    _output_file = open(_output_file_path, 'r')
    _output_data = _output_file.read()
    _output_file.close()

    os.fdopen(_output_fh).close()
    os.unlink(_input_file_path)
    os.unlink(_output_file_path)

    return _output_data
def java(cmd, classpath=None, stdin=None, stdout=None, stderr=None, blocking=True):
    """
    Execute the given java command, by opening a subprocess that calls `java`.
    If `java` has not yet been configured, it will be configured by calling
    `config_java()` with no arguments.

    :param cmd: The Java command that should be called, formatted as a list
        of strings.  Typically, the first string will be the name of the java
        class; and the remaining strings will be arguments for that class.
    :type cmd: list of str

    :param classpath: A colon `:` separated list of directories, JAR archives,
        and ZIP archives to search for class files, or None to add no -cp flag.
    :type classpath: str

    :param str stdin, stdout, stderr: Specify the executed programs' standard
        input, output and error file handles.  Valid values are
        `subprocess.PIPE`, the string `'pipe'` (mapped to PIPE), an existing
        file descriptor, an existing file object, and `None`.

    :param bool blocking: If `False`, return immediately after spawning the
        subprocess; the return value is then the `Popen` object.

    :return: `(stdout, stderr)` when `blocking=True`, otherwise the
        `subprocess.Popen` object.

    :raise OSError: If the java command returns a nonzero return code.
    """
    if stdin == 'pipe':
        stdin = subprocess.PIPE
    if stdout == 'pipe':
        stdout = subprocess.PIPE
    if stderr == 'pipe':
        stderr = subprocess.PIPE
    if isinstance(cmd, str):
        raise TypeError('cmd should be a list of strings')

    # Make sure we know where a java binary is.
    if nltk.internals._java_bin is None:
        config_java()

    # Construct the full command string.
    cmd = list(cmd)
    # BUGFIX: only emit a -cp flag when a classpath was actually supplied;
    # the original always prepended ['-cp', classpath], so classpath=None
    # crashed Popen with a non-string argument.  (Also removed a dead
    # platform-dependent `_java_cp_sep` computation that was never used.)
    if classpath is not None:
        cmd = ['-cp', classpath] + cmd
    cmd = [nltk.internals._java_bin] + nltk.internals._java_options + cmd

    # Call java via a subprocess
    p = subprocess.Popen(cmd, stdin=stdin, stdout=stdout, stderr=stderr)
    if not blocking:
        return p
    (stdout, stderr) = p.communicate()

    # Check the return code.
    if p.returncode != 0:
        print(stderr)
        raise OSError('Java command failed!')

    return (stdout, stderr)
def __init__(self, libpath='stanford/', verbose=False):
    """Gather the Stanford jars from *libpath* and set up the JVM.

    :param libpath: directory searched for jar files.
    :param verbose: forwarded to ``config_java``.
    """
    self._verbose = verbose
    config_java(verbose=verbose)
    self._libs = find_jars_within_path(libpath)
# pre-processing utilities from myutils import preprocessor, tagsToString, constructData, debug # Standford POS Tagger # CLASSPATH env-var contains path to JAR from nltk.tag import StanfordPOSTagger from nltk.internals import config_java config = json.load(open('config.json', 'r')) postagpath = config['POS_TAG']['path'] classpath = postagpath + config['POS_TAG']['jar'] modelpath = postagpath + config['POS_TAG']['model'] tagger = StanfordPOSTagger(modelpath, classpath) config_java(options='-Xms4096M -Xmx4096M', verbose=False) tagger_cache = {} unique_tags = [] def findUniqueTags(tags): global unique_tags for t in tags: if t[1] not in unique_tags: unique_tags.append(t[1]) def addToCache(id, wl): global tagger, tagger_cache if tagger_cache.get(id) is not None: return tags = tagger.tag(wl)