Beispiel #1
0
 def callStanfordNER(self, sentence):
     """Run the Stanford Named Entity Recognizer on ``sentence``.

     The sentence is written to a temporary file that is passed to the
     ``edu.stanford.nlp.ie.crf.CRFClassifier`` command line through
     ``-textFile``. This supports the getEvent function.

     :param sentence: text to tag. Text strings are encoded as UTF-8;
         any other value is written to the file unchanged, so it must
         already be bytes.
     :return: the classifier's standard output decoded as UTF-8, with
         non-breaking spaces replaced by ordinary spaces.
     :raises subprocess.CalledProcessError: if java exits with a
         non-zero status.
     """
     encoding = "utf8"
     default_options = ' '.join(_java_options)

     # Fall back to the raw value so that already-encoded input does not
     # leave ``input_`` unbound.
     if isinstance(sentence, compat.text_type) and encoding:
         input_ = sentence.encode(encoding)
     else:
         input_ = sentence

     input_file = tempfile.NamedTemporaryFile(mode='wb', delete=False)
     try:
         # Write the actual sentences to the temporary input file. The
         # file is closed before java runs because java opens it by name.
         with input_file:
             input_file.write(input_)
         cmd = [
             "java", "-cp",
             ner_loc + "/stanford-ner.jar:" + ner_loc + "/lib/*", "-Xmx20g",
             "edu.stanford.nlp.ie.crf.CRFClassifier", "-loadClassifier",
             ner_loc + "/classifiers/english.all.3class.distsim.crf.ser.gz",
             '-encoding', encoding, '-textFile', input_file.name,
             "-ner.useSUTime", "false"
         ]
         # The context manager closes the devnull handle again.
         with open(os.devnull, 'w') as devnull:
             out = subprocess.check_output(cmd, stderr=devnull)
     finally:
         # Always remove the temporary file, even when java fails.
         os.unlink(input_file.name)
         # Return java configurations to their default values.
         config_java(options=default_options, verbose=False)

     # Decode first, then replace non-breaking spaces: stripping the raw
     # byte 0xA0 would corrupt other UTF-8 sequences such as b'\xc3\xa0'.
     return out.decode(encoding).replace(u'\xa0', u' ')
Beispiel #2
0
	def callStanford(self, sentence):
		"""Run the Stanford CoreNLP pipeline on ``sentence`` and return its JSON output.

		The sentence is written to a temporary file that is connected to
		the java process as standard input.

		:param sentence: text to annotate. Text strings are encoded as
			UTF-8; any other value is written unchanged, so it must
			already be bytes.
		:return: the process output decoded as UTF-8, with non-breaking
			spaces replaced by ordinary spaces.
		:raises subprocess.CalledProcessError: if java exits with a
			non-zero status.
		"""
		encoding = "utf8"
		cmd = ["java", "-cp", stanford_dir+"/*","-Xmx20g",
			"edu.stanford.nlp.pipeline.StanfordCoreNLP",
			"-annotators", "tokenize,ssplit,pos,lemma,depparse",
			#'-printPCFGkBest', '10',
			'-outputFormat', 'json',
			"-parse.flags", "",
			'-encoding', encoding,
			'-model', models+'/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
		]
		default_options = ' '.join(_java_options)

		# Fall back to the raw value so that already-encoded input does not
		# leave ``input_`` unbound.
		if isinstance(sentence, compat.text_type) and encoding:
			input_ = sentence.encode(encoding)
		else:
			input_ = sentence

		input_file = tempfile.NamedTemporaryFile(mode='wb', delete=False)
		try:
			with input_file:
				# Write the actual sentences to the temporary input file
				input_file.write(input_)
				input_file.flush()
				# Rewind so the child process reads from the beginning.
				input_file.seek(0)
				with open(os.devnull, 'w') as devnull:
					out = subprocess.check_output(cmd, stdin=input_file, stderr=devnull)
		finally:
			# Always remove the temporary file, even when java fails.
			os.unlink(input_file.name)
			# Return java configurations to their default values.
			config_java(options=default_options, verbose=False)

		# Decode first, then replace non-breaking spaces: stripping the raw
		# byte 0xA0 would corrupt other UTF-8 sequences such as b'\xc3\xa0'.
		return out.decode(encoding).replace(u'\xa0', u' ')
Beispiel #3
0
Datei: weka.py Projekt: xim/nltk
def config_weka(classpath=None):
    """Locate ``weka.jar`` and store its path in the module-level ``_weka_classpath``.

    :param classpath: explicit classpath to use. When ``None``, an
        already configured value is kept; otherwise the directories in
        ``_weka_search`` (preceded by ``$WEKAHOME`` if set) are scanned.
    :raises LookupError: if no classpath is configured and no
        ``weka.jar`` can be found.
    """
    global _weka_classpath

    # Make sure java's configured first.
    config_java()

    if classpath is not None:
        _weka_classpath = classpath

    if _weka_classpath is None:
        # Work on a copy so repeated calls do not keep prepending WEKAHOME
        # to the shared module-level search list.
        searchpath = list(_weka_search)
        if "WEKAHOME" in os.environ:
            searchpath.insert(0, os.environ["WEKAHOME"])

        for path in searchpath:
            candidate = os.path.join(path, "weka.jar")
            if os.path.exists(candidate):
                _weka_classpath = candidate
                version = _check_weka_version(_weka_classpath)
                if version:
                    print("[Found Weka: %s (version %s)]" % (_weka_classpath, version))
                else:
                    print("[Found Weka: %s]" % _weka_classpath)
                # The first hit wins, so WEKAHOME (inserted at the front)
                # actually takes priority over the default locations.
                break

    if _weka_classpath is None:
        raise LookupError(
            "Unable to find weka.jar!  Use config_weka() "
            "or set the WEKAHOME environment variable. "
            "For more information about Weka, please see "
            "http://www.cs.waikato.ac.nz/ml/weka/"
        )
Beispiel #4
0
def config_weka(classpath=None):
    """Find the Weka jar and remember it in the global ``_weka_classpath``.

    :param classpath: path to use as the Weka classpath. If omitted and
        nothing has been configured yet, ``$WEKAHOME`` (when set) and then
        each entry of ``_weka_search`` is checked for a ``weka.jar``.
    :raises LookupError: when no classpath is known after the search.
    """
    global _weka_classpath

    # Make sure java's configured first.
    config_java()

    if classpath is not None:
        _weka_classpath = classpath

    if _weka_classpath is None:
        # Build a fresh list: inserting into ``_weka_search`` itself would
        # grow the module-level list by one entry on every call.
        searchpath = list(_weka_search)
        if "WEKAHOME" in os.environ:
            searchpath.insert(0, os.environ["WEKAHOME"])

        for path in searchpath:
            jar_path = os.path.join(path, "weka.jar")
            if not os.path.exists(jar_path):
                continue
            _weka_classpath = jar_path
            version = _check_weka_version(_weka_classpath)
            if version:
                print("[Found Weka: %s (version %s)]" %
                      (_weka_classpath, version))
            else:
                print("[Found Weka: %s]" % _weka_classpath)
            # Stop at the first match so that earlier search entries
            # (WEKAHOME first) are not overridden by later ones.
            break

    if _weka_classpath is None:
        raise LookupError("Unable to find weka.jar!  Use config_weka() "
                          "or set the WEKAHOME environment variable. "
                          "For more information about Weka, please see "
                          "http://www.cs.waikato.ac.nz/ml/weka/")
Beispiel #5
0
    def _execute(self, cmd, input_, verbose=False):
        """Run a Stanford java command on ``input_`` and return its decoded output.

        ``input_`` is written to a temporary file whose name is appended
        to ``cmd``. Note that ``cmd`` is extended in place.

        :param cmd: list of java command-line arguments (modified in place).
        :param input_: text or bytes to process; text is encoded with
            ``self._encoding``.
        :param verbose: forwarded to ``config_java`` when applying
            ``self.java_options``.
        :return: standard output of the java process, decoded with
            ``self._encoding``.
        """
        encoding = self._encoding
        cmd.extend(["-charset", encoding])
        _options_cmd = self._options_cmd
        if _options_cmd:
            cmd.extend(["-options", self._options_cmd])

        default_options = " ".join(_java_options)

        # Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
        input_file = tempfile.NamedTemporaryFile(mode="wb", delete=False)
        try:
            # Configure java.
            config_java(options=self.java_options, verbose=verbose)

            with input_file:
                # Write the actual sentences to the temporary input file
                if isinstance(input_, str) and encoding:
                    input_ = input_.encode(encoding)
                input_file.write(input_)
                input_file.flush()

                cmd.append(input_file.name)

                # Run the tagger and get the output.
                stdout, stderr = java(
                    cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE
                )
        finally:
            # Clean up even when java fails, so that neither the temporary
            # file nor the modified java options are left behind.
            os.unlink(input_file.name)
            # Return java configurations to their default values.
            config_java(options=default_options, verbose=False)

        return stdout.decode(encoding)
Beispiel #6
0
    def _execute(self, cmd, input_, verbose=False):
        """Run a Stanford java command on ``input_`` and return its raw output.

        ``input_`` is written to a temporary file whose name is appended
        to ``cmd``. Note that ``cmd`` is extended in place.

        :param cmd: list of java command-line arguments (modified in place).
        :param input_: text or bytes to process; text is encoded with
            ``self._encoding``.
        :param verbose: forwarded to ``config_java`` when applying
            ``self.java_options``.
        :return: standard output of the java process exactly as returned
            by ``java`` (it is not decoded here).
        """
        encoding = self._encoding
        cmd.extend(['-encoding', encoding])

        default_options = ' '.join(_java_options)

        # Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
        input_file = tempfile.NamedTemporaryFile(mode='wb', delete=False)
        try:
            # Configure java.
            config_java(options=self.java_options, verbose=verbose)

            with input_file:
                # Write the actual sentences to the temporary input file
                if isinstance(input_, compat.text_type) and encoding:
                    input_ = input_.encode(encoding)
                input_file.write(input_)
                input_file.flush()

                cmd.append(input_file.name)

                # Run the tagger and get the output.
                stdout, stderr = java(
                    cmd,
                    classpath=(self._stanford_jar, self._model_jar, self._ejml_jar),
                    stdout=PIPE,
                    stderr=PIPE,
                )
        finally:
            # Clean up even when java fails, so that neither the temporary
            # file nor the modified java options are left behind.
            os.unlink(input_file.name)
            # Return java configurations to their default values.
            config_java(options=default_options, verbose=False)

        return stdout
Beispiel #7
0
    def tag_sents(self, sentences):
        """Tag several tokenized sentences with one java invocation.

        The sentences are joined (tokens by spaces, sentences by
        newlines), written to a temporary file and handed to the java
        command built from ``self._cmd``.

        :param sentences: iterable of token sequences.
        :return: whatever ``self.parse_output`` produces for the decoded
            tagger output and ``sentences``.
        """
        encoding = self._encoding
        default_options = " ".join(_java_options)
        config_java(options=self.java_options, verbose=False)

        try:
            # Create a temporary input file. The path is stored on the
            # instance before ``self._cmd`` is read, as in the original order.
            _input_fh, self._input_file_path = tempfile.mkstemp(text=True)
            try:
                cmd = list(self._cmd)
                cmd.extend(["-encoding", encoding])

                # Write the actual sentences to the temporary input file.
                # The ``with`` closes the descriptor even if joining or
                # encoding the sentences fails.
                with os.fdopen(_input_fh, "wb") as input_file:
                    _input = "\n".join(" ".join(x) for x in sentences)
                    if isinstance(_input, str) and encoding:
                        _input = _input.encode(encoding)
                    input_file.write(_input)

                # Run the tagger and get the output
                stanpos_output, _stderr = java(cmd,
                                               classpath=self._stanford_jar,
                                               stdout=PIPE,
                                               stderr=PIPE)
            finally:
                # Delete the temporary file
                os.unlink(self._input_file_path)
        finally:
            # Return java configurations to their default values
            config_java(options=default_options, verbose=False)

        stanpos_output = stanpos_output.decode(encoding)
        return self.parse_output(stanpos_output, sentences)
Beispiel #8
0
    def _execute(self, cmd, input_, verbose=False):
        """Execute a Stanford java command against ``input_``.

        The input is stored in a temporary file; the file name is appended
        to ``cmd`` (which is therefore modified in place) before java runs.

        :param cmd: list of java command-line arguments (modified in place).
        :param input_: text or bytes to process; text is encoded with
            ``self._encoding``.
        :param verbose: forwarded to ``config_java`` when applying
            ``self.java_options``.
        :return: the java process's standard output decoded with
            ``self._encoding``.
        """
        encoding = self._encoding
        cmd.extend(['-charset', encoding])
        _options_cmd = self._options_cmd
        if _options_cmd:
            cmd.extend(['-options', self._options_cmd])

        default_options = ' '.join(_java_options)

        # Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
        input_file = tempfile.NamedTemporaryFile(mode='wb', delete=False)
        try:
            # Configure java.
            config_java(options=self.java_options, verbose=verbose)

            with input_file:
                # Write the actual sentences to the temporary input file
                if isinstance(input_, text_type) and encoding:
                    input_ = input_.encode(encoding)
                input_file.write(input_)
                input_file.flush()

                cmd.append(input_file.name)

                # Run the tagger and get the output.
                stdout, stderr = java(cmd, classpath=self._stanford_jar,
                                      stdout=PIPE, stderr=PIPE)
        finally:
            # Runs on success and on failure, so a crashing java call
            # cannot leak the temporary file or the altered java options.
            os.unlink(input_file.name)
            # Return java configurations to their default values.
            config_java(options=default_options, verbose=False)

        return stdout.decode(encoding)
def callStanford(sentence):
	"""Send ``sentence`` through the Stanford CoreNLP client and return its JSON output.

	This supports the getEvent function. The sentence is written to a
	temporary file that is connected to the java client as standard input.

	:param sentence: text to annotate, as bytes or as text (text is
		encoded as UTF-8 before being written).
	:return: the client's output decoded as UTF-8, with non-breaking
		spaces replaced by ordinary spaces and the ``NLP>`` prompt removed.
	:raises subprocess.CalledProcessError: if java exits with a
		non-zero status.
	"""
	encoding = "utf8"
	cmd = ["java", "-cp", stanford_dir+"/*","-Xmx20g", "edu.stanford.nlp.pipeline.StanfordCoreNLPClient",
		"-annotators", "tokenize,ssplit,parse,ner,pos,lemma,depparse",
		'-outputFormat','json',
		"-parse.flags", "",
		'-encoding', encoding,
		'-model', models+'/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',"-backends","localhost:9001,12"]
	default_options = ' '.join(_java_options)

	# The temporary file is opened in binary mode, so text must be encoded.
	if not isinstance(sentence, bytes):
		sentence = sentence.encode(encoding)

	temp_file = tempfile.NamedTemporaryFile(mode='wb', delete=False)
	try:
		with temp_file:
			# Write the actual sentences to the temporary input file
			temp_file.write(sentence)
			temp_file.flush()
			# Rewind so the child process reads from the beginning.
			temp_file.seek(0)
			with open(os.devnull, 'w') as devnull:
				out = subprocess.check_output(cmd, stdin=temp_file, stderr=devnull)
	finally:
		# Always remove the temporary file, even when java fails.
		os.unlink(temp_file.name)
		# Return java configurations to their default values.
		config_java(options=default_options, verbose=False)

	# Decode first, then clean up: stripping the raw byte 0xA0 would
	# corrupt other UTF-8 sequences such as b'\xc3\xa0'.
	out = out.decode(encoding)
	out = out.replace(u'\xa0', u' ')
	out = out.replace(u'NLP>', u'')
	return out
Beispiel #10
0
def config_weka(classpath=None):
    """Locate ``weka.jar`` and record it in the global ``_weka_classpath``.

    :param classpath: explicit classpath to use. When ``None`` and no
        classpath is configured yet, ``$WEKAHOME`` (if set) and the
        entries of ``_weka_search`` are checked for a ``weka.jar``.
    :raises LookupError: if no classpath is known after the search.
    """
    global _weka_classpath

    # Make sure java's configured first.
    config_java()

    if classpath is not None:
        _weka_classpath = classpath

    if _weka_classpath is None:
        # Copy the list: inserting into ``_weka_search`` directly would
        # prepend WEKAHOME again on every call.
        searchpath = list(_weka_search)
        if 'WEKAHOME' in os.environ:
            searchpath.insert(0, os.environ['WEKAHOME'])

        for path in searchpath:
            jar_path = os.path.join(path, 'weka.jar')
            if os.path.exists(jar_path):
                _weka_classpath = jar_path
                version = _check_weka_version(_weka_classpath)
                # Parenthesised single-argument print works both as the
                # Python 2 statement and as the Python 3 function.
                if version:
                    print('[Found Weka: %s (version %s)]' %
                          (_weka_classpath, version))
                else:
                    print('[Found Weka: %s]' % _weka_classpath)
                # Stop at the first match so WEKAHOME keeps its priority.
                break

    if _weka_classpath is None:
        raise LookupError('Unable to find weka.jar!  Use config_weka() '
                          'or set the WEKAHOME environment variable. '
                          'For more information about Weka, please see '
                          'http://www.cs.waikato.ac.nz/ml/weka/')
Beispiel #11
0
    def batch_tag(self, sentences):
        """Tag several tokenized sentences with one java invocation.

        :param sentences: iterable of token sequences; tokens are joined
            by spaces and sentences by newlines.
        :return: whatever ``self.parse_output`` produces for the tagger
            output (decoded when ``self._encoding`` is set).
        """
        encoding = self._encoding
        default_options = ' '.join(_java_options)
        config_java(options=self.java_options, verbose=False)

        try:
            _input = '\n'.join(' '.join(x) for x in sentences)
            if isinstance(_input, compat.text_type) and encoding:
                _input = _input.encode(encoding)
            # Encoded data needs a binary handle; writing bytes to a
            # text-mode file fails on Python 3.
            mode = 'wb' if isinstance(_input, bytes) else 'w'

            # Create a temporary input file
            _input_fh, self._input_file_path = tempfile.mkstemp(text=True)
            try:
                # Extend a local copy: mutating ``self._cmd`` would either
                # pile up flags across calls or be lost entirely.
                cmd = list(self._cmd)
                if encoding:
                    cmd.extend(['-encoding', encoding])

                # Write the actual sentences to the temporary input file
                with os.fdopen(_input_fh, mode) as input_file:
                    input_file.write(_input)

                # Run the tagger and get the output
                stanpos_output, _stderr = java(cmd, classpath=self._stanford_jar,
                                               stdout=PIPE, stderr=PIPE)
            finally:
                # Delete the temporary file
                os.unlink(self._input_file_path)
        finally:
            # Return java configurations to their default values
            config_java(options=default_options, verbose=False)

        if encoding:
            stanpos_output = stanpos_output.decode(encoding)

        return self.parse_output(stanpos_output)
Beispiel #12
0
    def start(self, stdout="devnull", stderr="devnull"):
        """ Starts the CoreNLP server

        After launching java without blocking, the method polls the
        ``live`` and then the ``ready`` URL below ``self.url`` until each
        returns an OK response.

        :param stdout, stderr: Specifies where CoreNLP output is redirected. Valid values are 'devnull', 'stdout', 'pipe'
        :raises CoreNLPServerError: if the process exits immediately, or
            if the server does not answer on ``live`` (30 attempts) or
            ``ready`` (60 attempts).
        """
        import requests

        cmd = ["edu.stanford.nlp.pipeline.StanfordCoreNLPServer"]

        if self.corenlp_options:
            cmd.extend(self.corenlp_options)

        # Configure java.
        default_options = " ".join(_java_options)
        config_java(options=self.java_options, verbose=self.verbose)

        try:
            self.popen = java(
                cmd,
                classpath=self._classpath,
                blocking=False,
                stdout=stdout,
                stderr=stderr,
            )
        finally:
            # Return java configurations to their default values.
            config_java(options=default_options, verbose=self.verbose)

        # Check that the server is still running.
        returncode = self.popen.poll()
        if returncode is not None:
            _, stderrdata = self.popen.communicate()
            # stderr is only available when it was piped; otherwise
            # communicate() is expected to hand back None. Decode leniently
            # so a non-ASCII message cannot mask the real startup error.
            error_text = (
                stderrdata.decode("ascii", errors="replace") if stderrdata else ""
            )
            raise CoreNLPServerError(
                returncode,
                "Could not start the server. "
                "The error was: {}".format(error_text),
            )

        for i in range(30):
            try:
                response = requests.get(requests.compat.urljoin(self.url, "live"))
            except requests.exceptions.ConnectionError:
                pass
            else:
                if response.ok:
                    break
            # Wait after a refused connection and after a non-OK reply, so
            # the attempts are never burned in a tight loop.
            time.sleep(1)
        else:
            raise CoreNLPServerError("Could not connect to the server.")

        for i in range(60):
            try:
                response = requests.get(requests.compat.urljoin(self.url, "ready"))
            except requests.exceptions.ConnectionError:
                pass
            else:
                if response.ok:
                    break
            time.sleep(1)
        else:
            raise CoreNLPServerError("The server is not ready.")
Beispiel #13
0
    def start(self):
        """Start the CoreNLP server and wait until it reports ready.

        Java is launched without blocking, with stdout and stderr piped.
        The method then polls the ``live`` and ``ready`` URLs below
        ``self.url`` until each returns an OK response.

        :raises CoreNLPServerError: if the process exits immediately, or
            if the server does not answer on ``live`` (30 attempts) or
            ``ready`` (60 attempts).
        """
        cmd = ['edu.stanford.nlp.pipeline.StanfordCoreNLPServer']

        if self.corenlp_options:
            cmd.extend(self.corenlp_options)

        # Configure java.
        default_options = ' '.join(_java_options)
        config_java(options=self.java_options, verbose=self.verbose)

        try:
            # TODO: it's probably a bad idea to pipe stdout, as it will
            #       accumulate when lots of text is being parsed.
            self.popen = java(
                cmd,
                classpath=self._classpath,
                blocking=False,
                stdout='pipe',
                stderr='pipe',
            )
        finally:
            # Return java configurations to their default values.
            config_java(options=default_options, verbose=self.verbose)

        # Check that the server is still running.
        returncode = self.popen.poll()
        if returncode is not None:
            _, stderrdata = self.popen.communicate()
            # Decode leniently so a non-ASCII message cannot mask the real
            # startup error with a UnicodeDecodeError.
            error_text = (
                stderrdata.decode('ascii', errors='replace') if stderrdata else ''
            )
            raise CoreNLPServerError(
                returncode,
                'Could not start the server. '
                'The error was: {}'.format(error_text)
            )

        for i in range(30):
            try:
                response = requests.get(requests.compat.urljoin(self.url, 'live'))
            except requests.exceptions.ConnectionError:
                pass
            else:
                if response.ok:
                    break
            # Wait after a refused connection and after a non-OK reply, so
            # the attempts are never burned in a tight loop.
            time.sleep(1)
        else:
            raise CoreNLPServerError(
                'Could not connect to the server.'
            )

        for i in range(60):
            try:
                response = requests.get(requests.compat.urljoin(self.url, 'ready'))
            except requests.exceptions.ConnectionError:
                pass
            else:
                if response.ok:
                    break
            time.sleep(1)
        else:
            raise CoreNLPServerError(
                'The server is not ready.'
            )
Beispiel #14
0
    def __init__(self, model='stanford/models/english-bidirectional-distsim.tagger', libpath='stanford/', verbose=False):
        """Store the tagger settings, collect the jars and configure java.

        :param model: path of the tagger model file.
        :param libpath: directory handed to ``find_jars_within_path``.
        :param verbose: kept on the instance and forwarded to ``config_java``.
        """
        # Matches one <word> element and captures its pos attribute, its
        # lemma attribute and its text content.
        word_pattern = r'  <word wid="[0-9]*" pos="([^"]*)" lemma="([^"]*)">(.*?)</word>'

        self._model, self._verbose = model, verbose
        self._libs = find_jars_within_path(libpath)
        self._xml_regex = re.compile(word_pattern)

        config_java(verbose=verbose)
Beispiel #15
0
    def __init__(self, mate_folder, path_to_model, java_options="-Xmx3G"):
        """Remember the tool locations and apply the java options.

        :param mate_folder: stored as ``self._mate_root``.
        :param path_to_model: stored as ``self._model``.
        :param java_options: option string passed to ``config_java``.
        """
        self._java_options, self._mate_root, self._model = (
            java_options,
            mate_folder,
            path_to_model,
        )
        self._classpath = "anna-3.61.jar"

        # Configure java.
        config_java(options=java_options)
Beispiel #16
0
 def __init__(self):
     """Create the CoreNLP parser client and set the java heap option."""
     # Annotator dependencies, see https://stanfordnlp.github.io/CoreNLP/dependencies.html
     self.additional_properties = {
         'tokenize.options':
         'ptb3Escaping=false, unicodeQuotes=true, splitHyphenated=true, normalizeParentheses=false, normalizeOtherBrackets=false',
         'annotators': 'tokenize, ssplit, pos, lemma'
     }
     self.stanford_parser = CoreNLPParser()
     # '-Xmx4G' sets the maximum java heap size to 4GB. JVM options are
     # case-sensitive, so the previous lowercase '-xmx4G' is not the
     # heap-size option.
     internals.config_java(options='-Xmx4G')
Beispiel #17
0
    def __init__(self,
                 model='stanford/models/english-bidirectional-distsim.tagger',
                 libpath='stanford/',
                 verbose=False):
        """Keep the tagger configuration, gather the jars and configure java.

        :param model: path of the tagger model file.
        :param libpath: directory handed to ``find_jars_within_path``.
        :param verbose: kept on the instance and forwarded to ``config_java``.
        """
        self._model = model
        self._verbose = verbose
        self._libs = find_jars_within_path(libpath)

        # One <word> element per match; the groups capture the pos
        # attribute, the lemma attribute and the element text.
        pattern_text = r'  <word wid="[0-9]*" pos="([^"]*)" lemma="([^"]*)">(.*?)</word>'
        self._xml_regex = re.compile(pattern_text)

        config_java(verbose=verbose)
Beispiel #18
0
def callStanford(sentence):
    """Send ``sentence`` through the Stanford CoreNLP client and return its JSON output.

    This supports the getEvent function. The sentence is written to a
    temporary file that is connected to the java client as standard input.

    :param sentence: text to annotate, as bytes or as text (text is
        encoded as UTF-8 before being written).
    :return: the client's output decoded as UTF-8, with non-breaking
        spaces replaced by ordinary spaces and the ``NLP>`` prompt removed;
        ``None`` if the client exits with a non-zero status.
    """
    encoding = "utf8"
    cmd = [
        "java",
        "-cp",
        core_nlp_dir + "/*",
        "-mx20g",
        "edu.stanford.nlp.pipeline.StanfordCoreNLPClient",
        # "-annotators", "tokenize", # 1.34s
        # "-annotators", "ssplit",   # 1.16s
        # "-annotators", "parse",    # (3.65s empty parse), (2.1s empty parse), crash, hang, hang...
        # "-annotators", "ner",      # 14.70, 19.11, 3.31, 6.239s
        # "-annotators", "pos",      # 3.05, 1.4, 1.3s
        # "-annotators", "lemma",    # 1.29s
        # "-annotators", "depparse", # 28.01, 2.55, 18.57s
        # "-annotators", "ssplit,tokenize,ner,pos,lemma,depparse", # 6.386, 4.77s
        # "-annotators", "lemma,ssplit,tokenize,ner,pos,depparse", # 6.386s
        "-annotators",
        "tokenize,ssplit,parse,ner,pos,lemma,depparse",  # crash, 36, hang,..., 37s
        '-outputFormat',
        'json',
        "-parse.flags",
        "",
        '-encoding',
        encoding,
        '-model',
        models_dir + '/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
        "-backends",
        "localhost:9000"
    ]
    default_options = ' '.join(_java_options)

    # The temporary file is opened in binary mode, so text must be encoded.
    if not isinstance(sentence, bytes):
        sentence = sentence.encode(encoding)

    temp_file = tempfile.NamedTemporaryFile(mode='wb', delete=False)
    try:
        with temp_file:
            temp_file.write(sentence)
            temp_file.flush()
            # Rewind so the child process reads from the beginning.
            temp_file.seek(0)
            err_out = sys.stderr
            # err_out = open(os.devnull, 'w') # suppress client error noise
            try:
                out = subprocess.check_output(cmd, stdin=temp_file, stderr=err_out)
            except subprocess.CalledProcessError as err:
                # CalledProcessError exposes the exit status as
                # ``returncode``; there is no ``errorcode`` attribute.
                print("openNLP CLIENT ERROR: ", err.returncode)
                return None
    finally:
        # Runs on the error path too, so the temporary file is always
        # removed and the java config is always restored.
        os.unlink(temp_file.name)
        config_java(options=default_options,
                    verbose=False)  # Return java config to default values

    # Decode first, then clean up: stripping the raw byte 0xA0 would
    # corrupt other UTF-8 sequences such as b'\xc3\xa0'.
    out = out.decode(encoding)
    out = out.replace(u'\xa0', u' ')
    out = out.replace(u'NLP>', u'')
    return out
    def _execute(self, cmd, verbose=False):
        """Run a Stanford java command and return its decoded standard output.

        :param cmd: list of java command-line arguments; ``-options`` is
            appended in place when ``self._options_cmd`` is set.
        :param verbose: forwarded to ``config_java``.
        :return: standard output decoded with ``self._encoding``.
        """
        encoding = self._encoding
        #cmd.extend(['-inputEncoding', encoding])
        _options_cmd = self._options_cmd
        if _options_cmd:
            cmd.extend(['-options', self._options_cmd])

        default_options = ' '.join(_java_options)

        config_java(options=self.java_options, verbose=verbose)     # Configure java.
        try:
            stdout, _stderr = java(cmd,classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE)
        finally:
            # Restore the defaults even when java fails, so one failed
            # call does not leave this instance's options active.
            config_java(options=default_options, verbose=verbose)   # Return java configurations to their default values.

        return stdout.decode(encoding)
Beispiel #20
0
    def _execute(self, cmd, input_, verbose=False):
        """Run a Stanford java command on ``input_`` and return its decoded output.

        The input is written to a temporary file. Depending on
        ``self._USE_STDIN`` the file is either connected to java's
        standard input or its name is appended to ``cmd``. Note that
        ``cmd`` is extended in place.

        :param cmd: list of java command-line arguments (modified in place).
        :param input_: text or bytes to process; text is encoded with
            ``self._encoding``.
        :param verbose: forwarded to ``config_java`` when applying
            ``self.java_options``.
        :return: standard output decoded with ``self._encoding``, after
            the non-breaking-space byte sequences were replaced by spaces.
        """
        encoding = self._encoding
        cmd.extend(['-encoding', encoding])
        if self.corenlp_options:
            cmd.append(self.corenlp_options)

        default_options = ' '.join(_java_options)

        # Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
        input_file = tempfile.NamedTemporaryFile(mode='wb', delete=False)
        try:
            # Configure java.
            config_java(options=self.java_options, verbose=verbose)

            with input_file:
                # Write the actual sentences to the temporary input file
                if isinstance(input_, text_type) and encoding:
                    input_ = input_.encode(encoding)
                input_file.write(input_)
                input_file.flush()

                # Run the tagger and get the output.
                if self._USE_STDIN:
                    # Rewind so java reads the data from the beginning.
                    input_file.seek(0)
                    stdout, stderr = java(
                        cmd,
                        classpath=self._classpath,
                        stdin=input_file,
                        stdout=PIPE,
                        stderr=PIPE,
                    )
                else:
                    cmd.append(input_file.name)
                    stdout, stderr = java(cmd,
                                          classpath=self._classpath,
                                          stdout=PIPE,
                                          stderr=PIPE)
        finally:
            # Clean up even when java fails, so that neither the temporary
            # file nor the modified java options are left behind.
            os.unlink(input_file.name)
            # Return java configurations to their default values.
            config_java(options=default_options, verbose=False)

        stdout = stdout.replace(b'\xc2\xa0', b' ')
        stdout = stdout.replace(b'\x00\xa0', b' ')
        return stdout.decode(encoding)
Beispiel #21
0
    def start(self, stdout='devnull', stderr='devnull'):
        """ Starts the CoreNLP server

        After launching java without blocking, up to five TCP connection
        attempts to ``(self.host, self.port)`` are made, one second apart.

        :param stdout, stderr: Specifies where CoreNLP output is redirected. Valid values are 'devnull', 'stdout', 'pipe'
        :raises CoreNLPServerError: if the process exits immediately or
            every connection attempt is refused.
        """

        cmd = ['edu.stanford.nlp.parser.server.LexicalizedParserServer']

        if self.corenlp_options:
            cmd.extend(self.corenlp_options)

        # Configure java.
        # default_options = ' '.join(_java_options)
        # NOTE(review): the options are reset to an empty string rather
        # than to the previous values -- confirm this is intended.
        default_options = ''
        config_java(options=self.java_options, verbose=self.verbose)
        try:
            self.popen = java(
                cmd,
                classpath=self._classpath,
                blocking=False,
                stdout=stdout,
                stderr=stderr,
            )
        finally:
            # Return java configurations to their default values.
            config_java(options=default_options, verbose=self.verbose)

        # Check that the server is still running.
        returncode = self.popen.poll()
        if returncode is not None:
            _, stderrdata = self.popen.communicate()
            # stderr is only available when it was piped; otherwise
            # communicate() is expected to hand back None. Decode leniently
            # so a non-ASCII message cannot mask the real startup error.
            error_text = (
                stderrdata.decode('ascii', errors='replace') if stderrdata else ''
            )
            raise CoreNLPServerError(
                returncode,
                'Could not start the server. '
                'The error was: {}'.format(error_text),
            )

        for i in range(5):
            try:
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                    s.connect((self.host, self.port))
            except ConnectionRefusedError:
                time.sleep(1)
            else:
                break
        else:
            raise CoreNLPServerError('Could not connect to the server.')
Beispiel #22
0
    def _execute(self, cmd, input_, verbose=False):
        """Execute a Stanford java command against ``input_``.

        ``input_`` goes into a temporary file. With ``self._USE_STDIN``
        the file is piped to java's standard input; otherwise its name is
        appended to ``cmd``. ``cmd`` is modified in place.

        :param cmd: list of java command-line arguments (modified in place).
        :param input_: text or bytes to process; text is encoded with
            ``self._encoding``.
        :param verbose: forwarded to ``config_java`` when applying
            ``self.java_options``.
        :return: standard output decoded with ``self._encoding``, after
            the non-breaking-space byte sequences were replaced by spaces.
        """
        encoding = self._encoding
        cmd.extend(['-encoding', encoding])
        if self.corenlp_options:
            cmd.append(self.corenlp_options)

        default_options = ' '.join(_java_options)

        # Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
        input_file = tempfile.NamedTemporaryFile(mode='wb', delete=False)
        try:
            # Configure java.
            config_java(options=self.java_options, verbose=verbose)

            with input_file:
                # Write the actual sentences to the temporary input file
                if isinstance(input_, text_type) and encoding:
                    input_ = input_.encode(encoding)
                input_file.write(input_)
                input_file.flush()

                # Run the tagger and get the output.
                if self._USE_STDIN:
                    # Rewind so java reads the data from the beginning.
                    input_file.seek(0)
                    stdout, stderr = java(
                        cmd,
                        classpath=self._classpath,
                        stdin=input_file,
                        stdout=PIPE,
                        stderr=PIPE,
                    )
                else:
                    cmd.append(input_file.name)
                    stdout, stderr = java(
                        cmd, classpath=self._classpath, stdout=PIPE, stderr=PIPE
                    )
        finally:
            # Runs on success and on failure, so a crashing java call
            # cannot leak the temporary file or the altered java options.
            os.unlink(input_file.name)
            # Return java configurations to their default values.
            config_java(options=default_options, verbose=False)

        stdout = stdout.replace(b'\xc2\xa0', b' ')
        stdout = stdout.replace(b'\x00\xa0', b' ')
        return stdout.decode(encoding)
Beispiel #23
0
    def _execute(self, cmd, verbose=False):
        """Run a Stanford java command and return its decoded standard output.

        :param cmd: list of java command-line arguments; ``-inputEncoding``
            and, when ``self._options_cmd`` is set, ``-options`` are
            appended in place.
        :param verbose: forwarded to ``config_java`` when applying
            ``self.java_options``.
        :return: standard output decoded with ``self._encoding``.
        """
        encoding = self._encoding
        cmd.extend(['-inputEncoding', encoding])
        _options_cmd = self._options_cmd
        if _options_cmd:
            cmd.extend(['-options', self._options_cmd])

        default_options = ' '.join(_java_options)

        # Configure java.
        config_java(options=self.java_options, verbose=verbose)

        try:
            stdout, _stderr = java(cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE)
        finally:
            # Return java configurations to their default values, even
            # when the java call fails.
            config_java(options=default_options, verbose=False)

        return stdout.decode(encoding)
Beispiel #24
0
    def _classify_using_weka(self, test_comments, feature_extractor):
        """Classify ``test_comments`` by running Weka on a generated ARFF file.

        The extracted features are written to ``test.arff`` in a fresh
        temporary directory (the path is kept in ``self.test_filename``),
        then ``self.javaclass`` is run against the training and test files.

        :param test_comments: items to classify.
        :param feature_extractor: object whose ``extract`` method is
            applied to every item.
        :return: result of ``self.parse_weka_output`` on the output lines.
        """
        test_set = nltk.classify.util.apply_features(feature_extractor.extract, test_comments)

        temp_dir = tempfile.mkdtemp()
        self.test_filename = os.path.join(temp_dir, 'test.arff')

        logger.info('Writing Test WEKA File: ' + self.test_filename)
        self._write_ARFF_file(self.test_filename, test_set)

        cmd = [self.javaclass, '-t', self.train_filename, '-T', self.test_filename] + ['-p', '0']

        logger.info('Executing WEKA: ' + str(cmd))

        config_java(options='-Xmx2000M')
        (stdout, stderr) = java(cmd, classpath=weka_classpath,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)

        # A piped process yields bytes on Python 3, and bytes cannot be
        # split on a text newline. Python 2 ``str`` output is left as is.
        if not isinstance(stdout, str):
            stdout = stdout.decode()

        return self.parse_weka_output(stdout.split('\n'))
Beispiel #25
0
    def tag(self, tokens):
        """Tag one sentence through a long-running java tagger process.

        The child process and its reader thread are created on first use
        and reused afterwards. Each call writes one line to the child's
        standard input and waits for one item from the output queue.

        :param tokens: iterable of token strings.
        :return: ``self.parse_output`` of the tagger reply, or ``[]`` for
            empty input or when no reply arrives within 120 seconds.
        """
        # Replacing '\n' and '\r' separately already covers '\r\n'.
        _input = ' '.join(tokens).replace('\n', ' ').replace('\r', ' ').strip()
        if len(_input) == 0:
            return []

        encoding = self._encoding
        # Create pipe if not already opened
        if not self._thread:
            default_options = ' '.join(_java_options)
            config_java(options=self.java_options, verbose=False)

            cmd = list(self._cmd)
            cmd.extend(['-encoding', encoding])

            try:
                self._child = java(cmd,
                                   classpath=self._stanford_jar,
                                   stdin='pipe',
                                   stdout='pipe',
                                   stderr='pipe',
                                   blocking=False)
            finally:
                # Return java configurations to their default values once
                # the child process has been launched.
                config_java(options=default_options, verbose=False)
            self._queue = Queue()
            self._thread = Thread(target=_enqueue_output,
                                  args=(self._child.stdout, self._queue))
            self._thread.daemon = True
            self._thread.start()

        # clear all newlines, only append one at last for java
        _input += '\n'
        # Encode with the same encoding that was announced to java via
        # '-encoding', instead of a hard-coded one.
        self._child.stdin.write(_input.encode(encoding))
        self._child.stdin.flush()
        try:
            return self.parse_output(
                self._queue.get(timeout=120)
            )  # wait for 2m, usually should return in less than 100ms
        except Empty:
            print('stanford postagger timeout, return empty tuple instead',
                  file=sys.stderr)
            return []
Beispiel #26
0
    def _classify_using_weka(self, test_comments, feature_extractor):
        """Run the configured Weka classifier over ``test_comments``.

        Features are extracted and written to ``test.arff`` inside a new
        temporary directory; the path is stored in ``self.test_filename``.
        ``self.javaclass`` is then run with the training and test files.

        :param test_comments: items to classify.
        :param feature_extractor: object whose ``extract`` method is
            applied to every item.
        :return: result of ``self.parse_weka_output`` on the output lines.
        """
        test_set = nltk.classify.util.apply_features(feature_extractor.extract,
                                                     test_comments)

        temp_dir = tempfile.mkdtemp()
        self.test_filename = os.path.join(temp_dir, 'test.arff')

        logger.info('Writing Test WEKA File: ' + self.test_filename)
        self._write_ARFF_file(self.test_filename, test_set)

        cmd = [
            self.javaclass, '-t', self.train_filename, '-T', self.test_filename
        ] + ['-p', '0']

        logger.info('Executing WEKA: ' + str(cmd))

        config_java(options='-Xmx2000M')
        (stdout, stderr) = java(cmd,
                                classpath=weka_classpath,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)

        # Piped output arrives as bytes on Python 3; decode it so the split
        # on a text newline works. Python 2 ``str`` output is left as is.
        if not isinstance(stdout, str):
            stdout = stdout.decode()

        return self.parse_weka_output(stdout.split('\n'))
Beispiel #27
0
    def __init__(self, libpath='stanford/', verbose=False):
        """Collect the jars below ``libpath`` and configure java.

        :param libpath: directory handed to ``find_jars_within_path``.
        :param verbose: kept on the instance and forwarded to ``config_java``.
        """
        self._verbose = verbose
        jar_files = find_jars_within_path(libpath)
        self._libs = jar_files

        config_java(verbose=verbose)
Beispiel #28
0
__author__ = 'E440'
import Dir
from nltk.internals import java,config_java
import subprocess
from sys import stdin
import re

### Example weka command line for classifying with a saved model:
### java weka.classifiers.trees.J48 -p 9 -l directory-path\bank.model -T directory-path \bank-new.arff
# Weka jar inside the project; weka_classify() passes it to java() as the classpath.
weka_class_path = Dir.projectDir+"/resources/weka3-6-6.jar"
# Called with no arguments at import time, before any java() call below.
config_java()


### input
def weka_classify(arff_file, model_file):
    """Run WEKA's RandomForest on ``arff_file`` using a saved model.

    The ``-p`` argument passed to WEKA is the zero-based index of the first
    line of ``arff_file`` containing ``"@attribute class"``, or ``1`` when no
    such line exists.

    :param arff_file: path of the ARFF file to classify (read as UTF-8).
    :param model_file: path of the serialized model; an empty string makes
        the function return ``None`` without doing any work.
    :return: ``None``.  The decoded standard output is bound to a local
        name only; nothing is returned by the code visible here.
    :raise OSError: if the Java process wrote anything to standard error.
    """
    if model_file == "":
        return None

    with open(arff_file, mode="r", encoding="utf-8") as arff:
        arff_lines = arff.readlines()

    # First line mentioning the class attribute, falling back to 1.
    class_index = next(
        (line_no for line_no, text in enumerate(arff_lines)
         if "@attribute class" in text),
        1,
    )

    weka_args = [
        "weka.classifiers.trees.RandomForest",
        "-p", str(class_index),
        "-l", str(model_file),
        "-T", str(arff_file),
    ]
    (stdout, stderr) = java(
        weka_args,
        classpath=weka_class_path,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    err_msg = stderr.decode("GBK")
    if err_msg:
        raise OSError('Java command failed : ' + str(err_msg))
    result = stdout.decode(stdin.encoding)
Beispiel #29
0
import environment_settings

# import nltk.classify

# I'm using myutil in place of nltk.classify.util so I can customize the code for my purposes
# import nltk.classify.util # for accuracy & log_likelihood

# Fall back to the JAVAHOME from the local settings module when the
# environment does not already define one.
if 'JAVAHOME' not in os.environ:
    os.environ['JAVAHOME'] = os.path.abspath(environment_settings.javahome)

# javaw_path = os.path.abspath(os.path.join(os.environ['JAVAHOME'], 'javaw.exe'))
# config_java(bin=javaw_path)

# Use the explicitly configured javaw binary when it exists on disk;
# otherwise call config_java() with its defaults.
if environment_settings.javaw_path is not None and os.path.exists(
        environment_settings.javaw_path):
    config_java(bin=environment_settings.javaw_path)
else:
    config_java()

# Same fallback for WEKAHOME as for JAVAHOME above.
if 'WEKAHOME' not in os.environ:
    os.environ['WEKAHOME'] = os.path.abspath(environment_settings.wekahome)

# These imports run after the environment has been prepared.
# NOTE(review): presumably the modules rely on JAVAHOME/WEKAHOME being set
# at import time -- confirm before moving them to the top of the file.
import myweka  # my modifications made directly to a local copy of the nltk.classify.weka source code
import myutil  # my modifications made directly to a local copy of the nltk.classify.util source code

# Hand the WEKA location to the local weka wrapper as its classpath.
myweka.config_weka(classpath=os.environ['WEKAHOME'])

numeric_classifier_configs = {
    "RandomSubSpace1": {
        "name": "RandomSubSpace1",
        "multifilter_cmd":
Beispiel #30
0
    def start(self):
        """Launch the CoreNLP server subprocess and wait until it is usable.

        The server class is started through ``java`` in non-blocking mode
        with ``self.java_options``; the Java options that were active before
        the call are restored afterwards, whether or not the launch
        succeeded.  The method then polls the ``live`` and ``ready`` URLs
        (relative to ``self.url``), pausing one second after every
        unsuccessful attempt, for at most 30 and 60 attempts respectively.

        :raise CoreNLPServerError: if the process has already exited right
            after being spawned, or if either URL never answers
            successfully within its attempts.
        """
        import requests

        cmd = ['edu.stanford.nlp.pipeline.StanfordCoreNLPServer']

        if self.corenlp_options:
            cmd.extend(self.corenlp_options)

        # Configure java, remembering the current options for restoration.
        default_options = ' '.join(_java_options)
        config_java(options=self.java_options, verbose=self.verbose)

        try:
            # TODO: it's probably a bad idea to pipe stdout, as it will
            #       accumulate when lots of text is being parsed.
            self.popen = java(
                cmd,
                classpath=self._classpath,
                blocking=False,
                stdout='pipe',
                stderr='pipe',
            )
        finally:
            # Return java configurations to their default values.
            config_java(options=default_options, verbose=self.verbose)

        # Check that the server is still running.
        returncode = self.popen.poll()
        if returncode is not None:
            _, stderrdata = self.popen.communicate()
            # 'replace' keeps a non-ASCII byte in the JVM's output from
            # turning the real failure into a UnicodeDecodeError.
            raise CoreNLPServerError(
                returncode,
                'Could not start the server. '
                'The error was: {}'.format(
                    stderrdata.decode('ascii', 'replace'))
            )

        live_url = requests.compat.urljoin(self.url, 'live')
        for _ in range(30):
            try:
                response = requests.get(live_url)
            except requests.exceptions.ConnectionError:
                pass
            else:
                if response.ok:
                    break
            # Pause after *every* failed attempt.  Sleeping only on a
            # connection error would let a non-OK HTTP answer consume all
            # attempts within milliseconds.
            time.sleep(1)
        else:
            raise CoreNLPServerError(
                'Could not connect to the server.'
            )

        ready_url = requests.compat.urljoin(self.url, 'ready')
        for _ in range(60):
            try:
                response = requests.get(ready_url)
            except requests.exceptions.ConnectionError:
                pass
            else:
                if response.ok:
                    break
            # Same reasoning as above: give the server time between probes.
            time.sleep(1)
        else:
            raise CoreNLPServerError(
                'The server is not ready.'
            )
Beispiel #31
0
import random
import subprocess
import nltk
from nltk.tag.stanford import StanfordPOSTagger as POSTagger
from nltk.internals import find_file, find_jar, config_java, java, _java_options
# JVM options are case-sensitive: the maximum-heap flag is -Xmx.  The
# lowercase spelling -xmx1G is not a recognised option, so it would not set
# the heap size and can prevent the JVM from starting at all.
config_java(options='-Xmx1G')


def main():
    """POS-tag every line of the input file and write the tags out.

    Reads ``../data/good_data.txt``, tags the space-separated tokens of each
    line with the Stanford POS tagger, and writes one line of
    space-separated tags per input line to
    ``../data/good_lines_tags_1.txt``.  A progress message with the
    zero-based line number is printed after each line.
    """
    # Context managers close both files even if tagging raises midway.
    with open("../data/good_data.txt", "r") as data_file:
        lines = data_file.readlines()

    english_postagger = POSTagger(
        '../postagger/models/english-bidirectional-distsim.tagger',
        '../postagger/stanford-postagger.jar')

    with open("../data/good_lines_tags_1.txt", "w") as out_file:
        for line_count, line in enumerate(lines):
            # Drop the trailing newline before splitting into tokens.
            tokens = line.split('\n')[0].split(' ')
            tag_list = [pair[1] for pair in english_postagger.tag(tokens)]
            out_file.write(" ".join(tag_list))
            out_file.write("\n")
            # Parenthesised single-argument form prints the same text on
            # both Python 2 and Python 3.
            print("completed line" + str(line_count))


if __name__ == '__main__':
    main()
Beispiel #32
0
def tag(sents, java_options='-Xmx1g -XX:ParallelGCThreads=2'):
    """Tags sentences using the CMU tweet tagger (``cmu.arktweetnlp.RunTagger``).

    The non-empty, stripped sentences are written one per line to a
    temporary file, the tagger is run from the ``CMU`` directory under the
    sussex root with every ``.jar`` of that directory on the classpath, and
    its standard output is captured through a second temporary file.  Both
    temporary files are removed and the working directory is restored even
    when the Java call fails.

    :param sents: List of sentences to be tagged. The list should
        contain each sentence as a string.
    :type sents: list of str
    :param java_options: JVM options handed to ``config_java``.  On Windows
        they are only applied when Java has not been configured yet.
    :type java_options: str
    :return: the tagger's output (requested in ``conll`` format).
    :rtype: str
    :raise RuntimeError: on Windows, when Java is not configured and none of
        the candidate paths in ``_paths`` exists.
    """
    _root = os.path.join(susx._sussex_root, 'CMU')

    # os.pathsep is ';' on Windows and ':' elsewhere.  A hard-coded ';'
    # produces an unusable classpath on POSIX as soon as there are two jars.
    _cp = os.pathsep.join(
        os.path.join(_root, jar) for jar in os.listdir(_root)
        if jar.endswith('.jar'))

    _input = '\n'.join(x.strip() for x in sents if x.strip())

    # write the sentences to the temp file
    _input_fd, _input_file_path = tempfile.mkstemp(text=True)
    try:
        with os.fdopen(_input_fd, 'w') as _input_fh:
            _input_fh.write(_input)

        _output_fd, _output_file_path = tempfile.mkstemp(text=True)
        try:
            # if we're on windows and java hasn't been configured yet
            if platform.platform().startswith('Windows'):
                if nltk.internals._java_bin is None:
                    jre_path = next(
                        (path for path in _paths if os.path.exists(path)),
                        None)
                    if jre_path is None:
                        raise RuntimeError(
                            'Can\'t find an installed Java Runtime Environment (JRE).'
                            'If you have installed java in a non standard location '
                            'please call nltk.internals.config_java with the correct '
                            'JRE path and options=\'-Xmx1g -XX:ParallelGCThreads=2\' '
                            'before calling sussex_nltk.cmu.tag')
                    config_java(jre_path, options=java_options, verbose=False)
            else:
                config_java(options=java_options, verbose=False)

            _cmd = [
                'cmu.arktweetnlp.RunTagger', '--no-confidence',
                '--output-format', 'conll', _input_file_path
            ]

            # The tagger is run from its own directory; always switch back.
            _dir = os.getcwd()
            os.chdir(_root)
            try:
                java(_cmd, classpath=_cp, stdout=_output_fd,
                     stderr=subprocess.PIPE)
            finally:
                os.chdir(_dir)

            with open(_output_file_path, 'r') as _output_file:
                _output_data = _output_file.read()
        finally:
            # Close the descriptor before unlinking: Windows refuses to
            # delete a file that is still open.
            os.close(_output_fd)
            os.unlink(_output_file_path)
    finally:
        os.unlink(_input_file_path)

    return _output_data
Beispiel #33
0
def java(cmd,
         classpath=None,
         stdin=None,
         stdout=None,
         stderr=None,
         blocking=True):
    """
    Execute the given java command, by opening a subprocess that calls
    `java`.  If `java` has not yet been configured, it will be configured
    by calling `config_java()` with no arguments.

    :param cmd: The Java command that should be called,
        formatted as
        a list of strings.  Typically, the first string will be the name
        of the java class; and the remaining strings will be arguments
        for that java class.
    :type cmd: list of str

    :param classpath: Directories, JAR archives, and ZIP archives to search
        for class files.  Either a ready-made classpath string, or a list
        or tuple of entries that is joined with the platform's classpath
        separator (`;` on Windows, `:` elsewhere).  With `None`, no `-cp`
        option is passed to `java`.
    :type classpath: str or list of str or None

    :param str stdin, stdout, stderr: Specify the executed programs'
        standard input, standard output and standard error file
        handles, respectively.  Valid values are `subprocess.PIPE`,
        the string `'pipe'` (an alias for `subprocess.PIPE`),
        an existing file descriptor (a positive integer), an existing
        file object, and `None`.  `subprocess.PIPE` indicates that a
        new pipe to the child should be created.  With `None`, no
        redirection will occur; the child's file handles will be
        inherited from the parent.  Additionally, `stderr` can be
        `subprocess.STDOUT`, which indicates that the `stderr` data
        from the applications should be captured into the same file
        handle as for `stdout`.

    :param bool blocking: If `False`, then return immediately after
        spawning the subprocess.  In this case, the return value is
        the `Popen` object, and not a `(stdout, stderr)` tuple.

    :return: If `blocking=True`, then return a tuple `(stdout,
        stderr)`, containing the `stdout` and `stderr` outputs generated
        by the `java` command; an entry is `None` when the matching
        parameter was not set to `subprocess.PIPE`.  If
        `blocking=False`, then return a `subprocess.Popen` object.

    :raise TypeError: If `cmd` is a string instead of a list of strings.
    :raise OSError: If the java command returns a nonzero return code.
    """
    if stdin == 'pipe': stdin = subprocess.PIPE
    if stdout == 'pipe': stdout = subprocess.PIPE
    if stderr == 'pipe': stderr = subprocess.PIPE
    if isinstance(cmd, str):
        raise TypeError('cmd should be a list of strings')

    # Make sure we know where a java binary is.
    if nltk.internals._java_bin is None:
        config_java()

    # Set up the classpath.
    if platform.platform().startswith('Windows'):
        _java_cp_sep = ';'
    else:
        _java_cp_sep = ':'
    if isinstance(classpath, (list, tuple)):
        classpath = _java_cp_sep.join(classpath)

    # Construct the full command string.  The -cp option is left out when
    # no classpath was given: a None entry in the argument list would make
    # Popen fail with a TypeError.
    cmd = list(cmd)
    if classpath is not None:
        cmd = ['-cp', classpath] + cmd
    cmd = [nltk.internals._java_bin] + nltk.internals._java_options + cmd

    # Call java via a subprocess
    p = subprocess.Popen(cmd, stdin=stdin, stdout=stdout, stderr=stderr)
    if not blocking: return p
    (stdout, stderr) = p.communicate()

    # Check the return code.
    if p.returncode != 0:
        print(stderr)
        raise OSError('Java command failed!')

    return (stdout, stderr)
Beispiel #34
0
    def __init__(self, libpath='stanford/', verbose=False):
        """Store the verbosity flag, look up the libraries and configure Java.

        :param libpath: path handed to ``find_jars_within_path``; its result
            is kept in ``self._libs``.
        :param verbose: kept in ``self._verbose`` and forwarded to
            ``config_java``.
        """
        self._verbose = verbose
        self._libs = find_jars_within_path(libpath)

        # Set up nltk's Java configuration once, at construction time.
        config_java(verbose=verbose)
Beispiel #35
0
# pre-processing utilities
from myutils import preprocessor, tagsToString, constructData, debug

# Stanford POS Tagger
# CLASSPATH env-var contains path to JAR
from nltk.tag import StanfordPOSTagger
from nltk.internals import config_java

# Load the run configuration.  The context manager closes the file handle
# right away instead of leaving it open until garbage collection.
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

# Jar and model locations are given relative to the configured tagger path.
postagpath = config['POS_TAG']['path']
classpath = postagpath + config['POS_TAG']['jar']
modelpath = postagpath + config['POS_TAG']['model']
tagger = StanfordPOSTagger(modelpath, classpath)
# Fixed 4 GB heap (initial and maximum) for the Java processes nltk spawns.
config_java(options='-Xms4096M -Xmx4096M', verbose=False)

tagger_cache = {}
unique_tags = []


def findUniqueTags(tags):
    """Record every tag label not seen before in the module-level list.

    :param tags: iterable of indexable items whose element at index 1 is the
        tag label (for example ``(word, tag)`` pairs).
    :return: ``None``; ``unique_tags`` is extended in place, keeping the
        order in which new labels are first encountered.
    """
    for pair in tags:
        label = pair[1]
        if label in unique_tags:
            continue
        unique_tags.append(label)

def addToCache(id, wl):
    global tagger, tagger_cache
    if tagger_cache.get(id) is not None:
        return
    tags = tagger.tag(wl)