Example #1
    def train(cls, model_filename, featuresets,
              classifier='naivebayes', options=[], quiet=True):
        # Make sure we can find java & weka.
        config_weka()
        
        # Build an ARFF formatter.
        formatter = ARFF_Formatter.from_train(featuresets)
    
        temp_dir = tempfile.mkdtemp()
        try:
            # Write the training data file.
            train_filename = os.path.join(temp_dir, 'train.arff')
            formatter.write(train_filename, featuresets)

            if classifier in cls._CLASSIFIER_CLASS:
                javaclass = cls._CLASSIFIER_CLASS[classifier]
            elif classifier in cls._CLASSIFIER_CLASS.values():
                javaclass = classifier
            else:
                raise ValueError('Unknown classifier %s' % classifier)
    
            # Train the weka model.
            cmd = [javaclass, '-d', model_filename, '-t', train_filename]
            cmd += list(options)
            if quiet: stdout = subprocess.PIPE
            else: stdout = None
            java(cmd, classpath=_weka_classpath, stdout=stdout)

            # Return the new classifier.
            return WekaClassifier(formatter, model_filename)
        
        finally:
            for f in os.listdir(temp_dir):
                os.remove(os.path.join(temp_dir, f))
            os.rmdir(temp_dir)
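
A minimal usage sketch for the train() classmethod above, assuming NLTK's (feature_dict, label) training pairs and that config_weka() can locate both java and Weka; the feature names, labels, and model path are illustrative:

    from nltk.classify.weka import WekaClassifier

    # Each training item is (feature_dict, label).
    train_data = [
        ({'length': 5, 'ends_in_a': True}, 'female'),
        ({'length': 4, 'ends_in_k': True}, 'male'),
    ]

    # 'naivebayes' is looked up in _CLASSIFIER_CLASS and mapped to
    # weka.classifiers.bayes.NaiveBayes; the model is written to
    # names.model via Weka's -d option.
    classifier = WekaClassifier.train('names.model', train_data)
    print(classifier.classify({'length': 5, 'ends_in_a': True}))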
Example #2
    def train(model_filename, featuresets, quiet=True):
        # Make sure we can find java & weka.
        config_weka()
        
        # Build an ARFF formatter.
        formatter = ARFF_Formatter.from_train(featuresets)
    
        temp_dir = tempfile.mkdtemp()
        try:
            # Write the training data file.
            train_filename = os.path.join(temp_dir, 'train.arff')
            formatter.write(train_filename, featuresets)
    
            # Train the weka model.
            cmd = ['weka.classifiers.bayes.NaiveBayes',
                   '-d', model_filename, '-t', train_filename]
            if quiet: stdout = subprocess.PIPE
            else: stdout = None
            java(cmd, classpath=_weka_classpath, stdout=stdout)

            # Return the new classifier.
            return WekaClassifier(formatter, model_filename)
        
        finally:
            for f in os.listdir(temp_dir):
                os.remove(os.path.join(temp_dir, f))
            os.rmdir(temp_dir)
Example #3
    def _batch_classify(self, featuresets, options):
        # Make sure we can find java & weka.
        config_weka()
        
        temp_dir = tempfile.mkdtemp()
        try:
            # Write the test data file.
            test_filename = os.path.join(temp_dir, 'test.arff')
            self._formatter.write(test_filename, featuresets)
            
            # Call weka to classify the data.
            cmd = ['weka.classifiers.bayes.NaiveBayes', 
                   '-l', self._model, '-T', test_filename] + options
            (stdout, stderr) = java(cmd, classpath=_weka_classpath,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)

            # Check if something went wrong:
            if stderr and not stdout:
                if 'Illegal options: -distribution' in stderr:
                    raise ValueError('The installed version of weka does '
                                     'not support probability distribution '
                                     'output.')
                else:
                    raise ValueError('Weka failed to generate output:\n%s'
                                     % stderr)

            # Parse weka's output.
            return self.parse_weka_output(stdout.split('\n'))

        finally:
            for f in os.listdir(temp_dir):
                os.remove(os.path.join(temp_dir, f))
            os.rmdir(temp_dir)
Example #4
    def _batch_classify(self, featuresets, options):
        # Make sure we can find java & weka.
        config_weka()

        temp_dir = tempfile.mkdtemp()
        try:
            # Write the test data file.
            test_filename = os.path.join(temp_dir, 'test.arff')
            self._formatter.write(test_filename, featuresets)

            # Call weka to classify the data.
            cmd = [
                'weka.classifiers.bayes.NaiveBayes', '-l', self._model, '-T',
                test_filename
            ] + options
            (stdout, stderr) = java(cmd,
                                    classpath=_weka_classpath,
                                    stdout=subprocess.PIPE)

            # Parse weka's output.
            return self.parse_weka_output(stdout.split('\n'))

        finally:
            for f in os.listdir(temp_dir):
                os.remove(os.path.join(temp_dir, f))
            os.rmdir(temp_dir)
Example #5
def weka_classify(arff_file, model_file):
    class_index = 1
    if model_file == "":
        return None
    with open(arff_file, mode="r", encoding="utf-8") as file:
        lines = file.readlines()
        for i in range(len(lines)):
            if "@attribute class" in lines[i]:
                class_index = i
                break

    cmd = ["weka.classifiers.trees.RandomForest", "-p", str(class_index),
           "-l", str(model_file), "-T", str(arff_file)]
    (stdout, stderr) = java(cmd, classpath=weka_class_path,
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    err_msg = stderr.decode("GBK")
    if err_msg != "":
        raise OSError('Java command failed : ' + str(err_msg))
    result = stdout.decode(stdin.encoding)
    if "prediction ()" not in result:
        return None
    result = result[result.index("prediction ()") + len("prediction ()"):].strip()
    tmp = result.split("\n")
    final_result = {}
    for t in tmp:
        result_tmp = re.split(" +", t.strip())
        if result_tmp[0] not in final_result:
            predict = result_tmp[2].split(":")[1]
            final_result[result_tmp[0]] = [predict, float(result_tmp[-1])]
    return final_result
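
A hedged usage sketch for weka_classify() above; the ARFF and model file names are illustrative, and the decode choices ("GBK" for stderr, stdin.encoding for stdout) are carried over unchanged from the example:

    predictions = weka_classify('test.arff', 'rf.model')
    if predictions is not None:
        # Maps each instance number to [predicted_label, confidence],
        # as parsed from Weka's "-p" prediction output.
        for inst, (label, confidence) in predictions.items():
            print(inst, label, confidence)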
Example #6
    def _batch_classify(self, featuresets, options):
        # Make sure we can find java & weka.
        config_weka()
        
        temp_dir = tempfile.mkdtemp()
        try:
            # Write the test data file.
            test_filename = os.path.join(temp_dir, 'test.arff')
            self._formatter.write(test_filename, featuresets)
            
            # Call weka to classify the data.
            cmd = ['weka.classifiers.bayes.NaiveBayes', 
                   '-l', self._model, '-T', test_filename] + options
            (stdout, stderr) = java(cmd, classpath=_weka_classpath,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)

            # Check if something went wrong:
            if stderr and not stdout:
                if 'Illegal options: -distribution' in stderr:
                    raise ValueError('The installed version of weka does '
                                     'not support probability distribution '
                                     'output.')
                else:
                    raise ValueError('Weka failed to generate output:\n%s'
                                     % stderr)

            # Parse weka's output.
            return self.parse_weka_output(stdout.split('\n'))

        finally:
            for f in os.listdir(temp_dir):
                os.remove(os.path.join(temp_dir, f))
            os.rmdir(temp_dir)
Example #7
    def tag(self, text, options=['-mx2g']):
        command = ['edu.stanford.nlp.tagger.maxent.MaxentTagger']
        command.extend(['-model', self._model])
        command.extend(['-outputFormat', 'xml'])
        command.extend(['-outputFormatOptions', 'lemmatize'])
        command.extend(options)

        with tempfile.NamedTemporaryFile(mode='wb', delete=False) as text_file:
            text_file.write(text.encode('utf-8'))
            text_file.flush()

            command.extend(['-textFile', text_file.name])

            stderr = subprocess.DEVNULL if not self._verbose else None
            stdout, _ = java(command, classpath=self._libs,
                             stderr=stderr, stdout=subprocess.PIPE)
            output = stdout.decode('utf-8')

        tagged = []
        for line in output.splitlines():
            match = self._xml_regex.fullmatch(line)
            if match:
                tagged.append((match.group(3), match.group(2), match.group(1)))

        return tagged
Example #8
    def _execute(self, cmd, input_, verbose=False):
        encoding = self._encoding
        cmd.extend(['-encoding', encoding])

        default_options = ' '.join(_java_options)

        # Configure java.
        config_java(options=self.java_options, verbose=verbose)

        # Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
        with tempfile.NamedTemporaryFile(mode='wb',
                                         delete=False) as input_file:
            # Write the actual sentences to the temporary input file
            if isinstance(input_, compat.text_type) and encoding:
                input_ = input_.encode(encoding)
            input_file.write(input_)
            input_file.flush()

            cmd.append(input_file.name)

            # Run the tagger and get the output.
            stdout, stderr = java(cmd,
                                  classpath=(self._stanford_jar,
                                             self._model_jar),
                                  stdout=PIPE,
                                  stderr=PIPE)
            stdout = stdout.decode(encoding)

        os.unlink(input_file.name)

        # Return java configurations to their default values.
        config_java(options=default_options, verbose=False)

        return stdout
Example #9
    def batch_tag(self, sentences):
        encoding = self._encoding
        default_options = ' '.join(_java_options)
        config_java(options=self.java_options, verbose=False)

        # Create a temporary input file
        _input_fh, self._input_file_path = tempfile.mkstemp(text=True)

        if encoding:
            self._cmd.extend(['-encoding', encoding])

        # Write the actual sentences to the temporary input file
        _input_fh = os.fdopen(_input_fh, 'w')
        _input = '\n'.join((' '.join(x) for x in sentences))
        if isinstance(_input, compat.text_type) and encoding:
            _input = _input.encode(encoding)
        _input_fh.write(_input)
        _input_fh.close()

        # Run the tagger and get the output
        stanpos_output, _stderr = java(self._cmd, classpath=self._stanford_jar,
                                       stdout=PIPE, stderr=PIPE)
        if encoding:
            stanpos_output = stanpos_output.decode(encoding)

        # Delete the temporary file
        os.unlink(self._input_file_path)

        # Return java configurations to their default values
        config_java(options=default_options, verbose=False)

        return self.parse_output(stanpos_output)
Example #10
    def batch_tag(self, sentences):
        encoding = self._encoding
        default_options = ' '.join(_java_options)
        config_java(options=self.java_options, verbose=False)

        # Create a temporary input file
        _input_fh, self._input_file_path = tempfile.mkstemp(text=True)

        if encoding:
            self._cmd.extend(['-encoding', encoding])

        # Write the actual sentences to the temporary input file
        _input_fh = os.fdopen(_input_fh, 'w')
        _input = '\n'.join((' '.join(x) for x in sentences))
        if isinstance(_input, unicode) and encoding:
            _input = _input.encode(encoding)
        _input_fh.write(_input)
        _input_fh.close()

        # Run the tagger and get the output
        stanpos_output, _stderr = java(self._cmd, classpath=self._stanford_jar,
                                       stdout=PIPE, stderr=PIPE)
        if encoding:
            stanpos_output = stanpos_output.decode(encoding)

        # Delete the temporary file
        os.unlink(self._input_file_path)

        # Return java configurations to their default values
        config_java(options=default_options, verbose=False)

        return self.parse_output(stanpos_output)
Example #11
    def tag_sents(self, sentences):
        encoding = self._encoding
        default_options = " ".join(_java_options)
        config_java(options=self.java_options, verbose=False)

        # Create a temporary input file
        _input_fh, self._input_file_path = tempfile.mkstemp(text=True)

        cmd = list(self._cmd)
        cmd.extend(["-encoding", encoding])

        # Write the actual sentences to the temporary input file
        _input_fh = os.fdopen(_input_fh, "wb")
        _input = "\n".join((" ".join(x) for x in sentences))
        if isinstance(_input, str) and encoding:
            _input = _input.encode(encoding)
        _input_fh.write(_input)
        _input_fh.close()

        # Run the tagger and get the output
        stanpos_output, _stderr = java(cmd,
                                       classpath=self._stanford_jar,
                                       stdout=PIPE,
                                       stderr=PIPE)
        stanpos_output = stanpos_output.decode(encoding)

        # Delete the temporary file
        os.unlink(self._input_file_path)

        # Return java configurations to their default values
        config_java(options=default_options, verbose=False)

        return self.parse_output(stanpos_output, sentences)
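
tag_sents() above is NLTK's batch entry point for the Stanford POS tagger; a usage sketch, with the model and jar paths as placeholders for a local Stanford tagger installation:

    from nltk.tag import StanfordPOSTagger

    tagger = StanfordPOSTagger(
        'models/english-bidirectional-distsim.tagger',  # tagger model path
        'stanford-postagger.jar',                       # tagger jar path
    )
    tagged = tagger.tag_sents([['What', 'is', 'the', 'airspeed'],
                               ['of', 'an', 'unladen', 'swallow', '?']])
    print(tagged)  # one list of (token, tag) pairs per input sentence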
Example #12
File: weka.py Project: xim/nltk
    def _batch_classify(self, featuresets, options):
        # Make sure we can find java & weka.
        config_weka()

        temp_dir = tempfile.mkdtemp()
        try:
            # Write the test data file.
            test_filename = os.path.join(temp_dir, "test.arff")
            self._formatter.write(test_filename, featuresets)

            # Call weka to classify the data.
            cmd = ["weka.classifiers.bayes.NaiveBayes", "-l", self._model, "-T", test_filename] + options
            (stdout, stderr) = java(cmd, classpath=_weka_classpath, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

            # Check if something went wrong:
            if stderr and not stdout:
                if "Illegal options: -distribution" in stderr:
                    raise ValueError(
                        "The installed version of weka does " "not support probability distribution " "output."
                    )
                else:
                    raise ValueError("Weka failed to generate output:\n%s" % stderr)

            # Parse weka's output.
            return self.parse_weka_output(stdout.decode(stdin.encoding).split("\n"))

        finally:
            for f in os.listdir(temp_dir):
                os.remove(os.path.join(temp_dir, f))
            os.rmdir(temp_dir)
Example #13
    def _execute(self, cmd, input_, verbose=False):
        encoding = self._encoding
        cmd.extend(['-charset', encoding])
        _options_cmd = self._options_cmd
        if _options_cmd:
            cmd.extend(['-options', self._options_cmd])

        default_options = ' '.join(_java_options)

        # Configure java.
        config_java(options=self.java_options, verbose=verbose)

        # Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
        with tempfile.NamedTemporaryFile(mode='wb', delete=False) as input_file:
            # Write the actual sentences to the temporary input file
            if isinstance(input_, text_type) and encoding:
                input_ = input_.encode(encoding)
            input_file.write(input_)
            input_file.flush()

            cmd.append(input_file.name)

            # Run the tagger and get the output.
            stdout, stderr = java(cmd, classpath=self._stanford_jar,
                                  stdout=PIPE, stderr=PIPE)
            stdout = stdout.decode(encoding)

        os.unlink(input_file.name)

        # Return java configurations to their default values.
        config_java(options=default_options, verbose=False)

        return stdout
Example #14
    def _execute(self, cmd, input_, verbose=False):
        encoding = self._encoding
        cmd.extend(['-encoding', encoding])
        if self.corenlp_options:
            cmd.append(self.corenlp_options)

        default_options = ' '.join(_java_options)

        # Configure java.
        config_java(options=self.java_options, verbose=verbose)

        # Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
        with tempfile.NamedTemporaryFile(mode='wb',
                                         delete=False) as input_file:
            # Write the actual sentences to the temporary input file
            if isinstance(input_, text_type) and encoding:
                input_ = input_.encode(encoding)
            input_file.write(input_)
            input_file.flush()

            # Run the tagger and get the output.
            if self._USE_STDIN:
                input_file.seek(0)
                stdout, stderr = java(
                    cmd,
                    classpath=self._classpath,
                    stdin=input_file,
                    stdout=PIPE,
                    stderr=PIPE,
                )
            else:
                cmd.append(input_file.name)
                stdout, stderr = java(cmd,
                                      classpath=self._classpath,
                                      stdout=PIPE,
                                      stderr=PIPE)

            stdout = stdout.replace(b'\xc2\xa0', b' ')
            stdout = stdout.replace(b'\x00\xa0', b' ')
            stdout = stdout.decode(encoding)

        os.unlink(input_file.name)

        # Return java configurations to their default values.
        config_java(options=default_options, verbose=False)

        return stdout
Example #15
    def start(self):
        cmd = ['edu.stanford.nlp.pipeline.StanfordCoreNLPServer']

        if self.corenlp_options:
            cmd.extend(self.corenlp_options)

        # Configure java.
        default_options = ' '.join(_java_options)
        config_java(options=self.java_options, verbose=self.verbose)

        try:
            # TODO: it's probably a bad idea to pipe stdout, as it will
            #       accumulate when lots of text is being parsed.
            self.popen = java(
                cmd,
                classpath=self._classpath,
                blocking=False,
                stdout='pipe',
                stderr='pipe',
            )
        finally:
            # Return java configurations to their default values.
            config_java(options=default_options, verbose=self.verbose)

        # Check that the server is still running.
        returncode = self.popen.poll()
        if returncode is not None:
            _, stderrdata = self.popen.communicate()
            raise CoreNLPServerError(
                returncode,
                'Could not start the server. '
                'The error was: {}'.format(stderrdata.decode('ascii'))
            )

        for i in range(30):
            try:
                response = requests.get(requests.compat.urljoin(self.url, 'live'))
            except requests.exceptions.ConnectionError:
                time.sleep(1)
            else:
                if response.ok:
                    break
        else:
            raise CoreNLPServerError(
                'Could not connect to the server.'
            )

        for i in range(60):
            try:
                response = requests.get(requests.compat.urljoin(self.url, 'ready'))
            except requests.exceptions.ConnectionError:
                time.sleep(1)
            else:
                if response.ok:
                    break
        else:
            raise CoreNLPServerError(
                'The server is not ready.'
            )
Example #16
    def start(self, stdout="devnull", stderr="devnull"):
        """ Starts the CoreNLP server

        :param stdout, stderr: Specifies where CoreNLP output is redirected. Valid values are 'devnull', 'stdout', 'pipe'
        """
        import requests

        cmd = ["edu.stanford.nlp.pipeline.StanfordCoreNLPServer"]

        if self.corenlp_options:
            cmd.extend(self.corenlp_options)

        # Configure java.
        default_options = " ".join(_java_options)
        config_java(options=self.java_options, verbose=self.verbose)

        try:
            self.popen = java(
                cmd,
                classpath=self._classpath,
                blocking=False,
                stdout=stdout,
                stderr=stderr,
            )
        finally:
            # Return java configurations to their default values.
            config_java(options=default_options, verbose=self.verbose)

        # Check that the server is still running.
        returncode = self.popen.poll()
        if returncode is not None:
            _, stderrdata = self.popen.communicate()
            raise CoreNLPServerError(
                returncode,
                "Could not start the server. "
                "The error was: {}".format(stderrdata.decode("ascii")),
            )

        for i in range(30):
            try:
                response = requests.get(requests.compat.urljoin(self.url, "live"))
            except requests.exceptions.ConnectionError:
                time.sleep(1)
            else:
                if response.ok:
                    break
        else:
            raise CoreNLPServerError("Could not connect to the server.")

        for i in range(60):
            try:
                response = requests.get(requests.compat.urljoin(self.url, "ready"))
            except requests.exceptions.ConnectionError:
                time.sleep(1)
            else:
                if response.ok:
                    break
        else:
            raise CoreNLPServerError("The server is not ready.")
Example #17
    def _execute(self, cmd, input_, verbose=False):
        encoding = self._encoding
        cmd.extend(['-encoding', encoding])
        if self.corenlp_options:
            cmd.append(self.corenlp_options)

        default_options = ' '.join(_java_options)

        # Configure java.
        config_java(options=self.java_options, verbose=verbose)

        # Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
        with tempfile.NamedTemporaryFile(mode='wb', delete=False) as input_file:
            # Write the actual sentences to the temporary input file
            if isinstance(input_, text_type) and encoding:
                input_ = input_.encode(encoding)
            input_file.write(input_)
            input_file.flush()

            # Run the tagger and get the output.
            if self._USE_STDIN:
                input_file.seek(0)
                stdout, stderr = java(
                    cmd,
                    classpath=self._classpath,
                    stdin=input_file,
                    stdout=PIPE,
                    stderr=PIPE,
                )
            else:
                cmd.append(input_file.name)
                stdout, stderr = java(
                    cmd, classpath=self._classpath, stdout=PIPE, stderr=PIPE
                )

            stdout = stdout.replace(b'\xc2\xa0', b' ')
            stdout = stdout.replace(b'\x00\xa0', b' ')
            stdout = stdout.decode(encoding)

        os.unlink(input_file.name)

        # Return java configurations to their default values.
        config_java(options=default_options, verbose=False)

        return stdout
Example #18
    def train(self, file=''):
        if not file:
            file = self.train_file
        command = self.command_line + ' -prop ' + self.prop_file
        cmd = shlex.split(command)
        sout, serr = java(cmd, classpath=self.path_to_jar, stdout=PIPE, stderr=PIPE)
        print('--------------------TRAIN------------------------')
        print(serr)
        return sout, serr
Example #19
    def verify(self, file=''):
        if not file:
            file = self.test_file
        command = self.command_line + ' -loadClassifier ' + self.model_filename + ' -testFile ' + file
        cmd = shlex.split(command)
        sout, serr = java(cmd, classpath=self.path_to_jar, stdout=PIPE, stderr=PIPE)
        print('--------------------TEST------------------------')
        print(serr)
        return sout, serr
Example #20
    def train(
        cls,
        model_filename,
        featuresets,
        classifier='naivebayes',
        options=[],
        quiet=True,
    ):
        # Make sure we can find java & weka.
        config_weka()

        # Build an ARFF formatter.
        formatter = ARFF_Formatter.from_train(featuresets)

        temp_dir = tempfile.mkdtemp()
        try:
            # Write the training data file.
            train_filename = os.path.join(temp_dir, 'train.arff')
            formatter.write(train_filename, featuresets)

            if classifier in cls._CLASSIFIER_CLASS:
                javaclass = cls._CLASSIFIER_CLASS[classifier]
            elif classifier in cls._CLASSIFIER_CLASS.values():
                javaclass = classifier
            else:
                raise ValueError('Unknown classifier %s' % classifier)

            # Train the weka model.
            cmd = [javaclass, '-d', model_filename, '-t', train_filename]
            cmd += list(options)
            if quiet:
                stdout = subprocess.PIPE
            else:
                stdout = None
            java(cmd, classpath=_weka_classpath, stdout=stdout)

            # Return the new classifier.
            return WekaClassifier(formatter, model_filename)

        finally:
            for f in os.listdir(temp_dir):
                os.remove(os.path.join(temp_dir, f))
            os.rmdir(temp_dir)
Example #21
    def detokenize(self, text, options=['-mx2g']):
        command = ['edu.stanford.nlp.process.PTBTokenizer', '-untok']
        command.extend(options)

        stderr = subprocess.DEVNULL if not self._verbose else None
        jproc = java(command, classpath=self._libs, blocking=False,
                         stderr=stderr, stdout=subprocess.PIPE, stdin=subprocess.PIPE)
        stdout, _ = jproc.communicate(text.encode('utf-8'))
        output = stdout.decode('utf-8')

        return output
Example #22
def call_mxpost(classpath=None, stdin=None, stdout=None, stderr=None,
                blocking=False):
    if not classpath:
        config_mxpost()
    
    if not classpath:
        classpath = _mxpost_classpath
    elif 'mxpost.jar' not in classpath:
        classpath += ':%s' % _mxpost_classpath
    
    cmd = ['tagger.TestTagger', '%s/%s' % (_mxpost_home, 'wsj-02-21.mxpost')]
    return java(cmd, classpath, stdin, stdout, stderr, blocking)
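
A hedged usage sketch for call_mxpost(); MXPOST reads tokenized text on standard input, and the input file name here is illustrative:

    import subprocess

    with open('tokens.txt', 'rb') as fin:
        stdout, stderr = call_mxpost(stdin=fin,
                                     stdout=subprocess.PIPE,
                                     blocking=True)
    print(stdout.decode('utf-8'))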
Example #23
    def _execute(self, java_class, infile, outfile):
        cwd = os.getcwd()
        os.chdir(self._mate_root)
        cmd = [
            java_class, "-model", self._model, "-test", infile, "-out", outfile
        ]
        stdout, stderr = java(cmd,
                              classpath=self._classpath,
                              stdout=PIPE,
                              stderr=PIPE)
        os.chdir(cwd)

        return stdout, stderr
Example #24
    def detokenize(self, text, options=['-mx2g']):
        command = ['edu.stanford.nlp.process.PTBTokenizer', '-untok']
        command.extend(options)

        stderr = subprocess.DEVNULL if not self._verbose else None
        jproc = java(command,
                     classpath=self._libs,
                     blocking=False,
                     stderr=stderr,
                     stdout=subprocess.PIPE,
                     stdin=subprocess.PIPE)
        stdout, _ = jproc.communicate(text.encode('utf-8'))
        output = stdout.decode('utf-8')

        return output
Example #25
    def _execute(self, cmd, verbose=False):
        encoding = self._encoding
        #cmd.extend(['-inputEncoding', encoding])
        _options_cmd = self._options_cmd
        if _options_cmd:
            cmd.extend(['-options', self._options_cmd])

        default_options = ' '.join(_java_options)

        config_java(options=self.java_options, verbose=verbose)     # Configure java.
        stdout, _stderr = java(cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE)
        stdout = stdout.decode(encoding)
        config_java(options=default_options, verbose=verbose)       # Return java configurations to their default values.

        return stdout
Example #26
def call_mxpost(classpath=None,
                stdin=None,
                stdout=None,
                stderr=None,
                blocking=False):
    if not classpath:
        config_mxpost()

    if not classpath:
        classpath = _mxpost_classpath
    elif 'mxpost.jar' not in classpath:
        classpath += ':%s' % _mxpost_classpath

    cmd = ['tagger.TestTagger', '%s/%s' % (_mxpost_home, 'wsj-02-21.mxpost')]
    return java(cmd, classpath, stdin, stdout, stderr, blocking)
Example #27
    def tag(self, infile, outfile):
        cwd = os.getcwd()
        os.chdir(self._mate_root)
        cmd = [
            "is2.tag.Tagger", "-model", self._model, "-test", infile, "-out",
            outfile
        ]
        stdout, stderr = java(cmd,
                              classpath=self._classpath,
                              stdout=PIPE,
                              stderr=PIPE)
        os.chdir(cwd)

        return stdout, stderr
Example #28
    def start(self, stdout='devnull', stderr='devnull'):
        """ Starts the CoreNLP server

        :param stdout, stderr: Specifies where CoreNLP output is redirected. Valid values are 'devnull', 'stdout', 'pipe'
        """

        cmd = ['edu.stanford.nlp.parser.server.LexicalizedParserServer']

        if self.corenlp_options:
            cmd.extend(self.corenlp_options)

        # Configure java.
        # default_options = ' '.join(_java_options)
        default_options = ''
        config_java(options=self.java_options, verbose=self.verbose)
        try:
            self.popen = java(
                cmd,
                classpath=self._classpath,
                blocking=False,
                stdout=stdout,
                stderr=stderr,
            )
        finally:
            # Return java configurations to their default values.
            config_java(options=default_options, verbose=self.verbose)

        # Check that the server is still running.
        returncode = self.popen.poll()
        if returncode is not None:
            _, stderrdata = self.popen.communicate()
            raise CoreNLPServerError(
                returncode,
                'Could not start the server. '
                'The error was: {}'.format(stderrdata.decode('ascii')),
            )

        for i in range(5):
            try:
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                    s.connect((self.host, self.port))
            except ConnectionRefusedError:
                time.sleep(1)
            else:
                break
        else:
            raise CoreNLPServerError('Could not connect to the server.')
Example #29
def call_mallet(cmd, classpath=None, stdin=None, stdout=None, stderr=None, blocking=True):
    """
    Call `nltk.internals.java` with the given command, and with the classpath
    modified to include both ``nltk.jar`` and all the ``.jar`` files defined by
    Mallet.

    See `nltk.internals.java` for parameter and return value descriptions.
    """
    if _mallet_classpath is None:
        config_mallet()

    # Set up the classpath
    if classpath is None:
        classpath = _mallet_classpath
    else:
        classpath += os.path.pathsep + _mallet_classpath
    # Delegate to java()
    return java(cmd, classpath, stdin, stdout, stderr, blocking)
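
A usage sketch for call_mallet(), assuming config_mallet() can locate a Mallet installation; the Mallet class and file names are illustrative:

    import subprocess

    # Convert a CSV of labeled instances into Mallet's vector format.
    cmd = ['cc.mallet.classify.tui.Csv2Vectors',
           '--input', 'train.csv', '--output', 'train.vectors']
    stdout, stderr = call_mallet(cmd,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)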
Example #30
    def _execute(self, cmd, verbose=False):
        encoding = self._encoding
        cmd.extend(['-inputEncoding', encoding])
        _options_cmd = self._options_cmd
        if _options_cmd:
            cmd.extend(['-options', self._options_cmd])

        default_options = ' '.join(_java_options)

        # Configure java.
        config_java(options=self.java_options, verbose=verbose)

        stdout, _stderr = java(cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE)
        stdout = stdout.decode(encoding)

        # Return java configurations to their default values.
        config_java(options=default_options, verbose=False)

        return stdout
Example #31
    def _classify_many(self, featuresets, options):
        # Make sure we can find java & weka.
        config_weka()

        temp_dir = tempfile.mkdtemp()
        try:
            # Write the test data file.
            test_filename = os.path.join(temp_dir, "test.arff")
            self._formatter.write(test_filename, featuresets)

            # Call weka to classify the data.
            cmd = [
                "weka.classifiers.bayes.NaiveBayes",
                "-l",
                self._model,
                "-T",
                test_filename,
            ] + options
            (stdout, stderr) = java(
                cmd,
                classpath=_weka_classpath,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )

            # Check if something went wrong:
            if stderr and not stdout:
                if "Illegal options: -distribution" in stderr:
                    raise ValueError("The installed version of weka does "
                                     "not support probability distribution "
                                     "output.")
                else:
                    raise ValueError("Weka failed to generate output:\n%s" %
                                     stderr)

            # Parse weka's output.
            return self.parse_weka_output(
                stdout.decode(stdin.encoding).split("\n"))

        finally:
            for f in os.listdir(temp_dir):
                os.remove(os.path.join(temp_dir, f))
            os.rmdir(temp_dir)
Example #32
    def _classify_using_weka(self, test_comments, feature_extractor):
        test_set = nltk.classify.util.apply_features(feature_extractor.extract, test_comments)
        
        temp_dir = tempfile.mkdtemp()
        self.test_filename = os.path.join(temp_dir, 'test.arff')               
        
        logger.info('Writing Test WEKA File: ' + self.test_filename)
        self._write_ARFF_file(self.test_filename, test_set)

        cmd = [self.javaclass, '-t', self.train_filename, '-T', self.test_filename] + ['-p', '0']
        
        logger.info('Executing WEKA: ' + str(cmd))
        
        config_java(options='-Xmx2000M')
        (stdout, stderr) = java(cmd, classpath=weka_classpath,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
        
        return self.parse_weka_output(stdout.split('\n'))
Example #33
def call_mallet(cmd, classpath=None, stdin=None, stdout=None, stderr=None,
                blocking=True):
    """
    Call L{nltk.internals.java()} with the given command, and with the
    classpath modified to include both C{nltk.jar} and all the C{.jar}
    files defined by Mallet.

    See L{nltk.internals.java()} for parameter and return value
    descriptions.
    """
    if _mallet_classpath is None:
        config_mallet()
    
    # Set up the classpath
    if classpath is None:
        classpath = _mallet_classpath
    else:
        classpath += ':' + _mallet_classpath
    # Delegate to java()
    return java(cmd, classpath, stdin, stdout, stderr, blocking)
Example #34
    def tag(self, tokens):
        _input = ' '.join(tokens).replace('\n',
                                          ' ').replace('\r', ' ').replace(
                                              '\r\n', ' ').strip()
        if len(_input) == 0:
            return []
        # Create pipe if not already opened
        if not self._thread:
            encoding = self._encoding
            default_options = ' '.join(_java_options)
            config_java(options=self.java_options, verbose=False)

            cmd = list(self._cmd)
            cmd.extend(['-encoding', encoding])

            self._child = java(cmd,
                               classpath=self._stanford_jar,
                               stdin='pipe',
                               stdout='pipe',
                               stderr='pipe',
                               blocking=False)
            self._queue = Queue()
            self._thread = Thread(target=_enqueue_output,
                                  args=(self._child.stdout, self._queue))
            self._thread.daemon = True
            self._thread.start()

        # clear all newlines, only append one at last for java
        _input += '\n'
        self._child.stdin.write(_input.encode('utf-8'))
        self._child.stdin.flush()
        try:
            return self.parse_output(
                self._queue.get(timeout=120)
            )  # wait for 2m, usually should return in less than 100ms
        except Empty:
            print('stanford postagger timeout, return empty tuple instead',
                  file=sys.stderr)
            return []
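
Example #34 depends on an _enqueue_output helper (not shown) that pumps the child's stdout into the Queue from the daemon thread; a plausible minimal implementation, assuming parse_output() expects one decoded chunk per get():

    def _enqueue_output(out, queue):
        # Read lines from the tagger's stdout and hand each one,
        # decoded, to the consuming thread via the queue.
        for line in iter(out.readline, b''):
            queue.put(line.decode('utf-8'))
        out.close()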
Example #35
    def _classify_using_weka(self, test_comments, feature_extractor):
        test_set = nltk.classify.util.apply_features(feature_extractor.extract,
                                                     test_comments)

        temp_dir = tempfile.mkdtemp()
        self.test_filename = os.path.join(temp_dir, 'test.arff')

        logger.info('Writing Test WEKA File: ' + self.test_filename)
        self._write_ARFF_file(self.test_filename, test_set)

        cmd = [
            self.javaclass, '-t', self.train_filename, '-T', self.test_filename
        ] + ['-p', '0']

        logger.info('Executing WEKA: ' + str(cmd))

        config_java(options='-Xmx2000M')
        (stdout, stderr) = java(cmd,
                                classpath=weka_classpath,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)

        return self.parse_weka_output(stdout.split('\n'))
Example #36
    def _batch_classify(self, featuresets, options):
        # Make sure we can find java & weka.
        config_weka()
        
        temp_dir = tempfile.mkdtemp()
        try:
            # Write the test data file.
            test_filename = os.path.join(temp_dir, 'test.arff')
            self._formatter.write(test_filename, featuresets)
            
            # Call weka to classify the data.
            cmd = ['weka.classifiers.bayes.NaiveBayes', 
                   '-l', self._model, '-T', test_filename] + options
            (stdout, stderr) = java(cmd, classpath=_weka_classpath,
                                    stdout=subprocess.PIPE)

            # Parse weka's output.
            return self.parse_weka_output(stdout.split('\n'))

        finally:
            for f in os.listdir(temp_dir):
                os.remove(os.path.join(temp_dir, f))
            os.rmdir(temp_dir)
Example #37
    def _classify_many(self,
                       test_instances_filepath,
                       options,
                       show_stdout=False):
        # Make sure we can find java & weka.
        # config_weka()

        try:
            # We have already written the test instances to a file, whose name was passed in as test_instances_filepath

            # Example (WORKING) cmd for InputMappedClassifier, where the loaded model is a FilteredClassifier
            # weka.classifiers.misc.InputMappedClassifier -I -M -L .\shared\weka_en_conscientious_Filtered.model -T en_test_1094.arff -t en_test_1094.arff -classifications "weka.classifiers.evaluation.output.prediction.CSV -p 1"
            prediction_filepath = test_instances_filepath[:-5] + ".csv"
            options = '-classifications "weka.classifiers.evaluation.output.prediction.CSV -p 1 -file {path}"'.format(
                path=prediction_filepath)
            options = options.split(' ')
            ## INPUTMAPPEDCLASSIFIER
            cmd = [
                "weka.classifiers.misc.InputMappedClassifier", '-I', '-M',
                '-L', self._model, '-T', test_instances_filepath, '-t',
                test_instances_filepath, '-p', '1'
            ]
            # if len(options) > 0:
            # 	cmd = cmd + options
            """
			# Call weka to classify the data.
			if len(options) > 0:
				cmd = [self.classifier, '-l', self._model, '-T', test_instances_filepath, '-p', '0'] + options
			else:
				cmd = [self.classifier, '-l', self._model, '-T', test_instances_filepath, '-p', '0']"""

            if self.classifier in ('weka.classifiers.functions.LibLINEAR',
                                   'weka.classifiers.meta.RotationForest'):
                cmd = [
                    'weka.Run',
                ] + cmd

            # print(cmd)
            print(" ".join(cmd))

            (stdout, stderr) = java(cmd,
                                    classpath=_weka_classpath,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)

            # Check if something went wrong:
            if stderr and not stdout:
                print("STDERR was:")
                for line in stderr.decode(stdin.encoding).split('\n'):
                    print(line)
                # if 'Illegal options: -distribution' in stderr:
                # raise ValueError('The installed version of weka does '
                # 'not support probability distribution '
                # 'output.\nSTDERR:\n%s' % stderr)
                # else:
                # raise ValueError('Weka failed to generate output:\n%s'
                # % stderr)
                return None

            stdout_lines = stdout.decode(stdin.encoding).split('\n')
            if show_stdout:
                if len(stdout_lines) <= 20:
                    for line in stdout_lines:
                        print(line)
                else:
                    print("\nFirst 20 lines of stdout were:")
                    for line in stdout_lines[:20]:
                        print(line)
                    print(
                        "  ({0} more lines...)".format(len(stdout_lines) - 20))

            # If we are using the -classifications argument with an output prediction class like weka.classifiers.evaluation.output.prediction.CSV
            # and sending prediction output to a file rather than stdout, don't bother parsing the input here
            if '-classifications' in options and '-file' in options:
                # Strip unwanted text from stdout
                for i, line in enumerate(stdout_lines):
                    if line.strip().startswith("inst#"):
                        stdout_lines = stdout_lines[i:]
                        break
                return [
                    line.strip() for line in stdout_lines
                    if len(line.strip()) > 0 and line[0] != "="
                ]

            # Parse weka's output.
            try:
                output_text = self.parse_weka_output(stdout_lines)
            except ValueError:
                output_text = stdout_lines
            #return self.parse_weka_output(stdout.decode(stdin.encoding).split('\n'))
            return output_text

        finally:
            #for f in os.listdir(temp_dir):
            #	os.remove(os.path.join(temp_dir, f))
            #os.rmdir(temp_dir)
            pass
Example #38
    def start(self):
        import requests

        cmd = ['edu.stanford.nlp.pipeline.StanfordCoreNLPServer']

        if self.corenlp_options:
            cmd.extend(self.corenlp_options)

        # Configure java.
        default_options = ' '.join(_java_options)
        config_java(options=self.java_options, verbose=self.verbose)

        try:
            # TODO: it's probably a bad idea to pipe stdout, as it will
            #       accumulate when lots of text is being parsed.
            self.popen = java(
                cmd,
                classpath=self._classpath,
                blocking=False,
                stdout='pipe',
                stderr='pipe',
            )
        finally:
            # Return java configurations to their default values.
            config_java(options=default_options, verbose=self.verbose)

        # Check that the server is still running.
        returncode = self.popen.poll()
        if returncode is not None:
            _, stderrdata = self.popen.communicate()
            raise CoreNLPServerError(
                returncode,
                'Could not start the server. '
                'The error was: {}'.format(stderrdata.decode('ascii'))
            )

        for i in range(30):
            try:
                response = requests.get(requests.compat.urljoin(self.url, 'live'))
            except requests.exceptions.ConnectionError:
                time.sleep(1)
            else:
                if response.ok:
                    break
        else:
            raise CoreNLPServerError(
                'Could not connect to the server.'
            )

        for i in range(60):
            try:
                response = requests.get(requests.compat.urljoin(self.url, 'ready'))
            except requests.exceptions.ConnectionError:
                time.sleep(1)
            else:
                if response.ok:
                    break
        else:
            raise CoreNLPServerError(
                'The server is not ready.'
            )
Example #39
def tag(sents, java_options='-Xmx1g -XX:ParallelGCThreads=2'):
    """Tags a sentence using the CMU twitter tokenizer.

    :param sents: List of sentences to be tagged. The list should
        contain each sentence as a string.
    :type sents: list of str
    """

    _root = os.path.join(susx._sussex_root, 'CMU')
    _cp = ''

    jars = [
        os.path.join(_root, jar) for jar in os.listdir(_root)
        if jar.endswith('.jar')
    ]
    _cp += ';'.join(jars)

    # write the sentences to the temp file
    _input_fh, _input_file_path = tempfile.mkstemp(text=True)
    _input_fh = os.fdopen(_input_fh, 'w')
    _input = '\n'.join(x.strip() for x in sents if x.strip())
    _input_fh.write(_input)
    _input_fh.close()

    _output_fh, _output_file_path = tempfile.mkstemp(text=True)
    # if we're on windows and java hasn't been configured yet
    if platform.platform().startswith('Windows'):
        if nltk.internals._java_bin is None:
            found_java = False
            for jre_path in _paths:
                if os.path.exists(jre_path):
                    found_java = True
                    break
            if found_java:
                config_java(jre_path, options=java_options, verbose=False)
            else:
                raise RuntimeError(
                    'Can\'t find an installed Java Runtime Environment (JRE). '
                    'If you have installed java in a non standard location '
                    'please call nltk.internals.config_java with the correct '
                    'JRE path and options=\'-Xmx1g -XX:ParallelGCThreads=2\' '
                    'before calling sussex_nltk.cmu.tag')
    else:
        config_java(options=java_options, verbose=False)

    _cmd = [
        'cmu.arktweetnlp.RunTagger', '--no-confidence', '--output-format',
        'conll', _input_file_path
    ]

    _dir = os.getcwd()
    os.chdir(_root)
    java(_cmd, classpath=_cp, stdout=_output_fh, stderr=subprocess.PIPE)
    os.chdir(_dir)

    _output_file = open(_output_file_path, 'r')
    _output_data = _output_file.read()
    _output_file.close()
    os.fdopen(_output_fh).close()
    os.unlink(_input_file_path)
    os.unlink(_output_file_path)

    return _output_data
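
A sketch of calling the tag() helper above, assuming the sussex_nltk package layout it belongs to; since the command passes '--output-format conll', the return value is the tagger's CoNLL-style text:

    from sussex_nltk.cmu import tag

    conll = tag(['I love the new twitter layout !',
                 'this phone is sooo slow'])
    print(conll)  # token/tag lines, sentences separated by blank lines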
Example #40
    def train(cls,
              formatter,
              model_filename,
              featuresets,
              all_labels,
              classifier='',
              options=[],
              quiet=True,
              fileprefix=''):
        # Make sure we can find java & weka.
        config_weka()

        #temp_dir = tempfile.mkdtemp()
        try:
            # Write the training data file.
            train_filename = os.path.join(os.curdir, fileprefix + 'train.arff')
            formatter.write(train_filename, featuresets)
            #print("\nWekaClassifier.train wrote training data file:", train_filename)

            # MINE MINE MINE
            if classifier == 'filtered':
                #print('requested filtered classifier, will filter and use mnbtext')
                classifier = 'mnbtext'
                # Run a weka filter on the training data set first
                filter_output = train_filename[:-5] + "_filtered.arff"

                filter_class = "weka.filters.unsupervised.attribute.AddValues"
                filter_cmd = [
                    filter_class,
                    '-C',
                    'last',
                    '-S',
                    '-L',
                    ",".join(all_labels),
                    '-i',
                    train_filename,  # input to the filter
                    '-o',
                    filter_output
                ]
                #print("\nWekaClassifier.train filter_cmd is:", filter_cmd)
                #print(r" ".join(filter_cmd))

                train_filename = filter_output  # use the output of the filter as the input to the classifier

                if quiet: stdout = subprocess.PIPE
                else: stdout = None
                java(filter_cmd, classpath=_weka_classpath, stdout=stdout)

                if classifier in cls._CLASSIFIER_CLASS:
                    javaclass = cls._CLASSIFIER_CLASS[classifier]
                elif classifier in cls._CLASSIFIER_CLASS.values():
                    javaclass = classifier
                else:
                    raise ValueError('Unknown classifier %s' % classifier)

                # Train the weka model.
                options = []
                cmd = [
                    javaclass,
                    '-d',
                    model_filename,  # Sets model output file.
                    '-t',
                    train_filename  # Sets training input data file
                ]
                cmd += list(options)

                #print("\nWekaClassifier.train cmd is:\n", cmd, "\n")
                if quiet: stdout = subprocess.PIPE
                else: stdout = None
                java(cmd, classpath=_weka_classpath, stdout=stdout)

                # Return the new classifier.
                return WekaClassifier(
                    formatter,
                    model_filename,
                    classifier=
                    javaclass  # I added this parameter so the _classify_many function will call the same class of classifier as we trained the model with (otherwise it always used NaiveBayes for some reason)
                )
            else:
                #print('requested classifier', classifier)
                if classifier in cls._CLASSIFIER_CLASS:
                    javaclass = cls._CLASSIFIER_CLASS[classifier]
                elif classifier in cls._CLASSIFIER_CLASS.values():
                    javaclass = classifier
                else:
                    raise ValueError('Unknown classifier %s' % classifier)

                # Train the weka model.
                cmd = [
                    javaclass,
                    '-d',
                    model_filename,  # Sets model output file.
                    '-t',
                    train_filename  # Sets training input data file
                ]
                cmd += list(options)

                #print("\nWekaClassifier.train cmd is:\n", " ".join(cmd), "\n")
                if quiet: stdout = subprocess.PIPE
                else: stdout = None
                java(cmd, classpath=_weka_classpath, stdout=stdout)

                # Return the new classifier.
                return WekaClassifier(
                    formatter,
                    model_filename,
                    classifier=
                    javaclass,  # I added this parameter so the _classify_many function will call the same class of classifier as we trained the model with (otherwise it always used NaiveBayes for some reason)
                    fileprefix=fileprefix)
        finally:
            # for f in os.listdir(temp_dir):
            # os.remove(os.path.join(temp_dir, f))
            # os.rmdir(temp_dir)
            pass