Ejemplo n.º 1
0
    def __init__(
        self,
        path_to_jar=None,
        path_to_slf4j=None,
        java_class=None,
        path_to_model=None,
        path_to_dict=None,
        path_to_sihan_corpora_dict=None,
        sihan_post_processing='false',
        keep_whitespaces='false',
        encoding='UTF-8',
        options=None,
        verbose=False,
        java_options='-mx2g',
    ):
        """Warn about deprecation, resolve the jar(s) and store the settings.

        ``path_to_jar`` and (only when given) ``path_to_slf4j`` are handed to
        ``find_jar``; the jars found are joined with ``os.pathsep`` into
        ``self._stanford_jar``.  ``options`` may be None or a mapping; it is
        rendered as comma-separated ``key=<json value>`` pairs in
        ``self._options_cmd``.  All other arguments are stored unchanged.
        """
        # NOTE: both simplefilter() calls edit the process-wide warning
        # filters; DeprecationWarning is left on 'ignore' afterwards.
        warnings.simplefilter('always', DeprecationWarning)
        message = str(
            "\nThe StanfordTokenizer will "
            "be deprecated in version 3.2.5.\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPTokenizer\033[0m instead.'"
        )
        warnings.warn(message, DeprecationWarning, stacklevel=2)
        warnings.simplefilter('ignore', DeprecationWarning)

        segmenter_jar = find_jar(
            self._JAR,
            path_to_jar,
            env_vars=('STANFORD_SEGMENTER',),
            searchpath=(),
            url=_stanford_url,
            verbose=verbose,
        )

        # slf4j is optional: it is only looked up when a path was supplied.
        slf4j_jar = None
        if path_to_slf4j is not None:
            slf4j_jar = find_jar(
                'slf4j-api.jar',
                path_to_slf4j,
                env_vars=('SLF4J', 'STANFORD_SEGMENTER'),
                searchpath=(),
                url=_stanford_url,
                verbose=verbose,
            )

        # Value for java's -cp option: the old segmenter needs slf4j, the
        # stanford-segmenter-2016-10-31 release does not.
        classpath_entries = [
            jar for jar in (segmenter_jar, slf4j_jar) if jar is not None
        ]
        self._stanford_jar = os.pathsep.join(classpath_entries)

        self._java_class = java_class
        self._model = path_to_model
        self._dict = path_to_dict
        self._sihan_corpora_dict = path_to_sihan_corpora_dict
        self._sihan_post_processing = sihan_post_processing
        self._keep_whitespaces = keep_whitespaces

        self._encoding = encoding
        self.java_options = java_options

        if options is None:
            options = {}
        rendered = [
            '{0}={1}'.format(name, json.dumps(value))
            for name, value in options.items()
        ]
        self._options_cmd = ','.join(rendered)
Ejemplo n.º 2
0
    def __init__(
        self,
        path_to_jar=None,
        path_to_slf4j=None,
        path_to_sihan_corpora_dict=None,
        path_to_model=None,
        path_to_dict=None,
        encoding="UTF-8",
        options=None,
        verbose=False,
        java_options="-mx2g",
    ):
        """Resolve the segmenter and slf4j jars and store the settings.

        ``path_to_jar`` and ``path_to_slf4j`` are handed to ``find_jar``
        together with the ``STANFORD_SEGMENTER`` / ``SLF4J`` environment
        variable names.  ``options`` may be None or a mapping; it is rendered
        as comma-separated ``key=<json value>`` pairs in
        ``self._options_cmd``.  The remaining arguments are stored unchanged.
        """
        stanford_segmenter = find_jar(
            self._JAR, path_to_jar, env_vars=("STANFORD_SEGMENTER",), searchpath=(), url=_stanford_url, verbose=verbose
        )
        slf4j = find_jar(
            self._SLF4J, path_to_slf4j, env_vars=("SLF4J",), searchpath=(), url=_stanford_url, verbose=verbose
        )

        # This is passed to java as the -cp option, the segmenter needs slf4j.
        # os.pathsep is the platform's search-path separator (';' on Windows,
        # ':' on POSIX), so no hand-written os.name check is needed.
        self._stanford_jar = os.pathsep.join(
            jar for jar in (stanford_segmenter, slf4j) if jar is not None
        )

        self._sihan_corpora_dict = path_to_sihan_corpora_dict
        self._model = path_to_model
        self._dict = path_to_dict

        self._encoding = encoding
        self.java_options = java_options
        options = {} if options is None else options
        self._options_cmd = ",".join("{0}={1}".format(key, json.dumps(val)) for key, val in options.items())
    def __init__(self, path_to_jar=None, path_to_slf4j=None,
            path_to_sihan_corpora_dict=None,
            path_to_model=None, path_to_dict=None,
            encoding='UTF-8', options=None,
            verbose=False, java_options='-mx2g'):
        """Resolve the segmenter and slf4j jars and store the settings.

        ``path_to_jar`` and ``path_to_slf4j`` are handed to ``find_jar``
        together with the ``STANFORD_SEGMENTER`` / ``SLF4J`` environment
        variable names.  ``options`` may be None or a mapping; it is rendered
        as comma-separated ``key=<json value>`` pairs in
        ``self._options_cmd``.  The remaining arguments are stored unchanged.
        """
        # Imported locally so the block does not depend on a module-level
        # ``import os`` that may not exist in this file.
        import os

        stanford_segmenter = find_jar(
                self._JAR, path_to_jar,
                env_vars=('STANFORD_SEGMENTER',),
                searchpath=(), url=_stanford_url,
                verbose=verbose)
        slf4j = find_jar(
                self._SLF4J, path_to_slf4j,
                env_vars=('SLF4J',),
                searchpath=(), url=_stanford_url,
                verbose=verbose)

        # This is passed to java as the -cp option, the segmenter needs slf4j.
        # The classpath separator is platform dependent (';' on Windows,
        # ':' on POSIX); a hard-coded ':' yields an unusable -cp on Windows.
        self._stanford_jar = os.pathsep.join(
            [jar for jar in [stanford_segmenter, slf4j] if jar is not None])

        self._sihan_corpora_dict = path_to_sihan_corpora_dict
        self._model = path_to_model
        self._dict = path_to_dict

        self._encoding = encoding
        self.java_options = java_options
        options = {} if options is None else options
        self._options_cmd = ','.join('{0}={1}'.format(key, json.dumps(val)) for key, val in options.items())
    def __init__(
        self,
        path_to_jar=None,
        path_to_slf4j=None,
        path_to_sihan_corpora_dict=None,
        path_to_model=None,
        path_to_dict=None,
        encoding='UTF-8',
        options=None,
        verbose=False,
        java_options='-mx2g',
    ):
        """Resolve the segmenter and slf4j jars and store the settings.

        Both jar paths go through ``find_jar``; whatever it returns that is
        not None is joined with ``os.pathsep`` into ``self._stanford_jar``.
        ``options`` (None or a mapping) becomes the comma-separated
        ``key=<json value>`` string ``self._options_cmd``.
        """
        segmenter_jar = find_jar(
            self._JAR,
            path_to_jar,
            env_vars=('STANFORD_SEGMENTER',),
            searchpath=(),
            url=_stanford_url,
            verbose=verbose,
        )
        slf4j_jar = find_jar(
            self._SLF4J,
            path_to_slf4j,
            env_vars=('SLF4J',),
            searchpath=(),
            url=_stanford_url,
            verbose=verbose,
        )

        # Value for java's -cp option; the segmenter needs slf4j.
        classpath_entries = [
            jar for jar in (segmenter_jar, slf4j_jar) if jar is not None
        ]
        self._stanford_jar = os.pathsep.join(classpath_entries)

        self._model = path_to_model
        self._dict = path_to_dict
        self._sihan_corpora_dict = path_to_sihan_corpora_dict

        self._encoding = encoding
        self.java_options = java_options

        if options is None:
            options = {}
        rendered = [
            '{0}={1}'.format(name, json.dumps(value))
            for name, value in options.items()
        ]
        self._options_cmd = ','.join(rendered)
Ejemplo n.º 5
0
    def __init__(
        self,
        path_to_jar=None,
        path_to_slf4j=None,
        java_class=None,
        path_to_model=None,
        path_to_dict=None,
        path_to_sihan_corpora_dict=None,
        sihan_post_processing='false',
        keep_whitespaces='false',
        encoding='UTF-8',
        options=None,
        verbose=False,
        java_options='-mx2g',
    ):
        """Warn about deprecation, then locate the jars and keep the settings.

        The segmenter jar is always looked up with ``find_jar``; the slf4j
        jar only when ``path_to_slf4j`` is not None.  ``options`` (None or a
        mapping) is turned into the ``key=<json value>`` list stored in
        ``self._options_cmd``; everything else is stored as passed.
        """
        # NOTE: simplefilter() changes the process-wide warning filters, and
        # DeprecationWarning stays on 'ignore' once this method returns.
        warnings.simplefilter('always', DeprecationWarning)
        deprecation_text = str(
            "\nThe StanfordTokenizer will "
            "be deprecated in version 3.2.5.\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPTokenizer\033[0m instead.'"
        )
        warnings.warn(deprecation_text, DeprecationWarning, stacklevel=2)
        warnings.simplefilter('ignore', DeprecationWarning)

        main_jar = find_jar(
            self._JAR,
            path_to_jar,
            env_vars=('STANFORD_SEGMENTER',),
            searchpath=(),
            url=_stanford_url,
            verbose=verbose,
        )

        logging_jar = None
        if path_to_slf4j is not None:
            logging_jar = find_jar(
                'slf4j-api.jar',
                path_to_slf4j,
                env_vars=('SLF4J', 'STANFORD_SEGMENTER'),
                searchpath=(),
                url=_stanford_url,
                verbose=verbose,
            )

        # Passed to java as -cp.  The old segmenter needs slf4j; the
        # stanford-segmenter-2016-10-31 release doesn't.
        present_jars = [jar for jar in (main_jar, logging_jar) if jar is not None]
        self._stanford_jar = os.pathsep.join(present_jars)

        self._java_class = java_class
        self._model = path_to_model
        self._dict = path_to_dict
        self._sihan_corpora_dict = path_to_sihan_corpora_dict
        self._sihan_post_processing = sihan_post_processing
        self._keep_whitespaces = keep_whitespaces

        self._encoding = encoding
        self.java_options = java_options

        if options is None:
            options = {}
        option_pairs = [
            '{0}={1}'.format(name, json.dumps(value))
            for name, value in options.items()
        ]
        self._options_cmd = ','.join(option_pairs)
Ejemplo n.º 6
0
    def __init__(
        self,
        path_to_jar=None,
        encoding='utf8',
        options=None,
        verbose=False,
        java_options='-mx1000m',
    ):
        """Warn about deprecation, locate the jar and store the settings.

        ``path_to_jar`` is resolved through ``find_jar`` with the
        ``STANFORD_POSTAGGER`` environment variable name.  ``options`` (None
        or a mapping) is formatted as comma-separated ``key=value`` pairs in
        ``self._options_cmd``.
        """
        # NOTE: simplefilter() changes the process-wide warning filters, and
        # DeprecationWarning stays on 'ignore' once this method returns.
        warnings.simplefilter('always', DeprecationWarning)
        deprecation_text = str(
            "\nThe StanfordTokenizer will "
            "be deprecated in version 3.2.5.\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPTokenizer\033[0m instead.'"
        )
        warnings.warn(deprecation_text, DeprecationWarning, stacklevel=2)
        warnings.simplefilter('ignore', DeprecationWarning)

        self._stanford_jar = find_jar(
            self._JAR,
            path_to_jar,
            env_vars=('STANFORD_POSTAGGER',),
            searchpath=(),
            url=_stanford_url,
            verbose=verbose,
        )

        self._encoding = encoding
        self.java_options = java_options

        if options is None:
            options = {}
        option_pairs = [
            '{0}={1}'.format(name, value) for name, value in options.items()
        ]
        self._options_cmd = ','.join(option_pairs)
Ejemplo n.º 7
0
    def __init__(
        self,
        model_filename,
        path_to_jar=None,
        encoding="utf8",
        verbose=False,
        java_options="-mx1000m",
    ):
        """Warn about deprecation, then locate the jar and the model file.

        ``path_to_jar`` is resolved with ``find_jar`` and ``model_filename``
        with ``find_file`` (using the ``STANFORD_MODELS`` environment
        variable name).  A second warning is issued when ``self._JAR`` is
        falsy.  ``encoding`` and ``java_options`` are stored unchanged.
        """
        deprecation_text = str(
            "\nThe StanfordTokenizer will "
            "be deprecated in version 3.2.6.\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead."
        )
        warnings.warn(deprecation_text, DeprecationWarning, stacklevel=2)

        # A falsy _JAR triggers the "not meant to be instantiated directly"
        # warning below.
        if not self._JAR:
            warnings.warn(
                "The StanfordTagger class is not meant to be "
                "instantiated directly. Did you mean "
                "StanfordPOSTagger or StanfordNERTagger?"
            )

        self._stanford_jar = find_jar(
            self._JAR, path_to_jar, searchpath=(), url=_stanford_url, verbose=verbose
        )
        self._stanford_model = find_file(
            model_filename, env_vars=("STANFORD_MODELS",), verbose=verbose
        )

        self._encoding = encoding
        self.java_options = java_options
Ejemplo n.º 8
0
    def __init__(self, path_to_jar=None, encoding='utf8', options=None,
                 verbose=False, java_options='-mx1000m'):
        """Warn about deprecation, locate the jar and store the settings.

        ``path_to_jar`` is resolved through ``find_jar`` with the
        ``STANFORD_POSTAGGER`` environment variable name.  ``options`` (None
        or a mapping) is formatted as comma-separated ``key=value`` pairs in
        ``self._options_cmd``.
        """
        deprecation_text = str(
            "\nThe StanfordTokenizer will "
            "be deprecated in version 3.2.5.\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead.'"
        )
        warnings.warn(deprecation_text, DeprecationWarning, stacklevel=2)

        self._stanford_jar = find_jar(self._JAR,
                                      path_to_jar,
                                      env_vars=('STANFORD_POSTAGGER',),
                                      searchpath=(),
                                      url=_stanford_url,
                                      verbose=verbose)

        self._encoding = encoding
        self.java_options = java_options

        if options is None:
            options = {}
        option_pairs = ['{0}={1}'.format(name, value)
                        for name, value in options.items()]
        self._options_cmd = ','.join(option_pairs)
Ejemplo n.º 9
0
    def __init__(self, path_to_jar=None, encoding='UTF-8', options=None, verbose=False, java_options='-mx1000m'):
        """Locate the jar and parse a whitespace-separated option string.

        :param path_to_jar: handed to ``find_jar`` together with the
            ``STANFORD_POSTAGGER`` environment variable name.
        :param encoding: stored as ``self._encoding``.
        :param options: None, or a string of alternating argument/value
            tokens such as ``"-tokenizeNLs true"``.  One leading ``-`` is
            stripped from each argument name.
        :param verbose: forwarded to ``find_jar``.
        :param java_options: stored as ``self.java_options``.
        :raises ValueError: if ``options`` has an odd number of tokens.
        """
        self._stanford_jar = find_jar(
            self._JAR, path_to_jar,
            env_vars=('STANFORD_POSTAGGER',),
            searchpath=(), url=_stanford_url,
            verbose=verbose
        )

        self._encoding = encoding
        self.java_options = java_options

        # Long Duong : fix bug #735
        parsed_options = {}
        if options is not None:
            tokens = options.split()
            if len(tokens) % 2 != 0:
                raise ValueError("Must be in set of (argument,value) pair")

            # Pair even-indexed tokens (names) with odd-indexed ones (values).
            # The previous range(len(tokens)/2) passes a float to range() on
            # Python 3 and raises TypeError.
            for key, value in zip(tokens[0::2], tokens[1::2]):
                # The key might start with '-', as in -tokenizeNLs.
                if key.startswith('-'):
                    key = key[1:]
                parsed_options[key] = value

        self._options_cmd = ','.join('{0}={1}'.format(key, val) for key, val in parsed_options.items())
Ejemplo n.º 10
0
    def __init__(
        self,
        model_filename,
        path_to_jar=None,
        encoding='utf8',
        verbose=False,
        java_options='-mx1000m',
    ):
        """Warn about deprecation, then locate the jar and the model file.

        ``path_to_jar`` is resolved with ``find_jar`` and ``model_filename``
        with ``find_file`` (using the ``STANFORD_MODELS`` environment
        variable name).  A second warning is issued when ``self._JAR`` is
        falsy.  ``encoding`` and ``java_options`` are stored unchanged.
        """
        # NOTE: simplefilter() changes the process-wide warning filters, and
        # DeprecationWarning stays on 'ignore' once this method returns.
        warnings.simplefilter('always', DeprecationWarning)
        deprecation_text = str(
            "\nThe StanfordTokenizer will "
            "be deprecated in version 3.2.5.\n"
            "Please use \033[91mnltk.tag.stanford.CoreNLPPOSTagger\033[0m "
            "or \033[91mnltk.tag.stanford.CoreNLPNERTagger\033[0m instead."
        )
        warnings.warn(deprecation_text, DeprecationWarning, stacklevel=2)
        warnings.simplefilter('ignore', DeprecationWarning)

        if not self._JAR:
            warnings.warn(
                'The StanfordTagger class is not meant to be '
                'instantiated directly. Did you mean '
                'StanfordPOSTagger or StanfordNERTagger?'
            )

        self._stanford_jar = find_jar(
            self._JAR, path_to_jar, searchpath=(), url=_stanford_url, verbose=verbose
        )
        self._stanford_model = find_file(
            model_filename, env_vars=('STANFORD_MODELS',), verbose=verbose
        )

        self._encoding = encoding
        self.java_options = java_options
Ejemplo n.º 11
0
    def __init__(self, path_to_jar=None, encoding="utf8", options=None,
                 verbose=False, java_options="-mx1000m"):
        """Warn about deprecation, locate the jar and store the settings.

        ``path_to_jar`` is resolved through ``find_jar`` with the
        ``STANFORD_POSTAGGER`` environment variable name.  ``options`` (None
        or a mapping) is formatted as comma-separated ``key=value`` pairs in
        ``self._options_cmd``.
        """
        deprecation_text = str(
            "\nThe StanfordTokenizer will "
            "be deprecated in version 3.2.5.\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead.'"
        )
        warnings.warn(deprecation_text, DeprecationWarning, stacklevel=2)

        self._stanford_jar = find_jar(self._JAR,
                                      path_to_jar,
                                      env_vars=("STANFORD_POSTAGGER",),
                                      searchpath=(),
                                      url=_stanford_url,
                                      verbose=verbose)

        self._encoding = encoding
        self.java_options = java_options

        if options is None:
            options = {}
        option_pairs = ["{0}={1}".format(name, value)
                        for name, value in options.items()]
        self._options_cmd = ",".join(option_pairs)
Ejemplo n.º 12
0
    def __init__(self, model_filename, path_to_jar=None, encoding='utf8',
                 verbose=False, java_options='-mx1000m'):
        """Warn about deprecation, then locate the jar and the model file.

        ``path_to_jar`` is resolved with ``find_jar`` and ``model_filename``
        with ``find_file`` (using the ``STANFORD_MODELS`` environment
        variable name).  A second warning is issued when ``self._JAR`` is
        falsy.  ``encoding`` and ``java_options`` are stored unchanged.
        """
        deprecation_text = str(
            "\nThe StanfordTokenizer will "
            "be deprecated in version 3.2.6.\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead."
        )
        warnings.warn(deprecation_text, DeprecationWarning, stacklevel=2)

        if not self._JAR:
            direct_use_text = (
                'The StanfordTagger class is not meant to be '
                'instantiated directly. Did you mean '
                'StanfordPOSTagger or StanfordNERTagger?'
            )
            warnings.warn(direct_use_text)

        self._stanford_jar = find_jar(self._JAR,
                                      path_to_jar,
                                      searchpath=(),
                                      url=_stanford_url,
                                      verbose=verbose)
        self._stanford_model = find_file(model_filename,
                                         env_vars=('STANFORD_MODELS',),
                                         verbose=verbose)

        self._encoding = encoding
        self.java_options = java_options
Ejemplo n.º 13
0
    def __init__(
        self,
        path_to_jar=None,
        path_to_models_jar=None,
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
        encoding='utf8',
        verbose=False,
        java_options='-mx1000m',
    ):
        """Locate the parser jar and a models jar, and store the settings.

        The parser jar comes from ``find_jar``; the models jar is the
        ``max()`` of the candidates yielded by ``find_jar_iter`` for
        ``self._MODEL_JAR_PATTERN``.  ``model_path``, ``encoding`` and
        ``java_options`` are stored unchanged.
        """
        self._stanford_jar = find_jar(
            self._JAR,
            path_to_jar,
            env_vars=('STANFORD_PARSER',),
            searchpath=(),
            url=_stanford_url,
            verbose=verbose,
        )

        # Intended to pick the most recent model.
        # NOTE(review): candidates are ranked by the re.match() result for
        # each name; match objects (and None) define no ordering on Python 3,
        # so this presumably only works when one jar is found -- confirm.
        def _pattern_match(model_name):
            return re.match(self._MODEL_JAR_PATTERN, model_name)

        model_candidates = find_jar_iter(
            self._MODEL_JAR_PATTERN,
            path_to_models_jar,
            env_vars=('STANFORD_MODELS',),
            searchpath=(),
            url=_stanford_url,
            verbose=verbose,
            is_regex=True,
        )
        self._model_jar = max(model_candidates, key=_pattern_match)

        self.model_path = model_path
        self._encoding = encoding
        self.java_options = java_options
Ejemplo n.º 14
0
    def __init__(
        self,
        model_filename,
        path_to_jar=None,
        encoding='utf8',
        verbose=False,
        java_options='-mx1000m',
    ):
        """Locate the jar and model file and build the jar tuple.

        A warning is issued when ``self._JAR`` is falsy.  After the jar is
        found, ``self._stanford_jar`` is replaced by a tuple of whatever
        ``find_jars_within_path`` yields for the jar's directory.
        """
        if not self._JAR:
            direct_use_text = (
                'The StanfordTagger class is not meant to be '
                'instantiated directly. Did you mean StanfordPOSTagger or StanfordNERTagger?'
            )
            warnings.warn(direct_use_text)

        self._stanford_jar = find_jar(
            self._JAR, path_to_jar, searchpath=(), url=_stanford_url, verbose=verbose
        )
        self._stanford_model = find_file(
            model_filename, env_vars=('STANFORD_MODELS',), verbose=verbose
        )

        # Adding logging jar files to classpath: every jar reported for the
        # directory that holds the main jar replaces the single path.
        jar_directory = os.path.dirname(self._stanford_jar)
        self._stanford_jar = tuple(find_jars_within_path(jar_directory))

        self._encoding = encoding
        self.java_options = java_options
Ejemplo n.º 15
0
    def __init__(
        self,
        path_to_jar=None,
        path_to_models_jar=None,
        path_to_ejml_jar=None,
        model_path='edu/stanford/nlp/models/parser/lexparser/englishPCFG.ser.gz',
        encoding='utf8',
        verbose=False,
        java_options='-mx3G',
    ):
        """Locate the parser, models and ejml jars, and store the settings.

        The parser jar comes from ``find_jar``.  The models and ejml jars are
        each the ``max()`` of the candidates yielded by ``find_jar_iter`` for
        the corresponding class-level pattern.
        """
        self._stanford_jar = find_jar(
            self._JAR,
            path_to_jar,
            env_vars=('STANFORD_PARSER',),
            searchpath=(),
            url=_stanford_url,
            verbose=verbose,
        )

        # NOTE(review): both max() calls rank candidates by the re.match()
        # result; match objects (and None) define no ordering on Python 3, so
        # this presumably only works when one jar is found -- confirm.
        def _model_match(model_name):
            return re.match(self._MODEL_JAR_PATTERN, model_name)

        def _ejml_match(ejml_name):
            return re.match(self._EJML_JAR_PATTERN, ejml_name)

        # Intended to pick the most recent model.
        model_candidates = find_jar_iter(
            self._MODEL_JAR_PATTERN,
            path_to_models_jar,
            env_vars=('STANFORD_MODELS',),
            searchpath=(),
            url=_stanford_url,
            verbose=verbose,
            is_regex=True,
        )
        self._model_jar = max(model_candidates, key=_model_match)

        # Intended to pick the most recent ejml.
        ejml_candidates = find_jar_iter(
            self._EJML_JAR_PATTERN,
            path_to_ejml_jar,
            env_vars=('STANFORD_EJML',),
            searchpath=(),
            url=_stanford_url,
            verbose=verbose,
            is_regex=True,
        )
        self._ejml_jar = max(ejml_candidates, key=_ejml_match)

        self.model_path = model_path
        self._encoding = encoding
        self.java_options = java_options
Ejemplo n.º 16
0
    def __init__(
        self,
        path_to_model,
        path_to_jar=None,
        encoding=None,
        verbose=False,
        java_options="-mx1000m",
    ):
        """Locate the jar, check that the model file exists, store settings.

        :raises IOError: if ``path_to_model`` is not an existing file.
        """
        self._stanford_jar = find_jar(
            self._JAR,
            path_to_jar,
            searchpath=(),
            url=_stanford_url,
            verbose=verbose,
        )

        model_exists = os.path.isfile(path_to_model)
        if not model_exists:
            raise IOError("Stanford tagger model file not found: %s" % path_to_model)

        self._stanford_model = path_to_model
        self._encoding = encoding
        self.java_options = java_options
Ejemplo n.º 17
0
    def __init__(
        self,
        path_to_jar=None,
        path_to_slf4j=None,
        java_class=None,
        path_to_model=None,
        path_to_dict=None,
        path_to_sihan_corpora_dict=None,
        sihan_post_processing='false',
        keep_whitespaces='false',
        encoding='UTF-8',
        options=None,
        verbose=False,
        java_options='-mx2g',
    ):
        """Resolve the jar(s) and store the segmenter settings.

        The segmenter jar is always looked up with ``find_jar``; the slf4j
        jar only when ``path_to_slf4j`` is not None.  ``options`` (None or a
        mapping) is turned into the ``key=<json value>`` list stored in
        ``self._options_cmd``; everything else is stored as passed.
        """
        segmenter_jar = find_jar(
            self._JAR,
            path_to_jar,
            env_vars=('STANFORD_SEGMENTER',),
            searchpath=(),
            url=_stanford_url,
            verbose=verbose,
        )

        slf4j_jar = None
        if path_to_slf4j is not None:
            slf4j_jar = find_jar(
                'slf4j-api.jar',
                path_to_slf4j,
                env_vars=('SLF4J', 'STANFORD_SEGMENTER'),
                searchpath=(),
                url=_stanford_url,
                verbose=verbose,
            )

        # Passed to java as -cp.  The old segmenter needs slf4j; the
        # stanford-segmenter-2016-10-31 release doesn't.
        present_jars = [jar for jar in (segmenter_jar, slf4j_jar) if jar is not None]
        self._stanford_jar = os.pathsep.join(present_jars)

        self._java_class = java_class
        self._model = path_to_model
        self._dict = path_to_dict
        self._sihan_corpora_dict = path_to_sihan_corpora_dict
        self._sihan_post_processing = sihan_post_processing
        self._keep_whitespaces = keep_whitespaces

        self._encoding = encoding
        self.java_options = java_options

        if options is None:
            options = {}
        option_pairs = [
            '{0}={1}'.format(name, json.dumps(value))
            for name, value in options.items()
        ]
        self._options_cmd = ','.join(option_pairs)
Ejemplo n.º 18
0
    def __init__(
        self,
        path_to_jar=None,
        encoding='UTF-8',
        options=None,
        verbose=False,
        java_options='-mx1000m',
    ):
        """Locate the jar and store the settings.

        ``options`` (None or a mapping) is rendered as comma-separated
        ``key=<json value>`` pairs in ``self._options_cmd``.
        """
        self._stanford_jar = find_jar(
            self._JAR,
            path_to_jar,
            env_vars=('STANFORD_POSTAGGER',),
            searchpath=(),
            url=_stanford_url,
            verbose=verbose,
        )

        self._encoding = encoding
        self.java_options = java_options

        if options is None:
            options = {}
        option_pairs = [
            '{}={}'.format(name, json.dumps(value))
            for name, value in options.items()
        ]
        self._options_cmd = ','.join(option_pairs)
Ejemplo n.º 19
0
    def __init__(self, path_to_model, path_to_jar=None, encoding=None, verbose=False, java_options='-mx1000m'):
        """Locate the jar and the model file, and store the settings.

        :param path_to_model: handed to ``find_file`` together with the
            ``STANFORD_MODELS`` environment variable name.
        :param path_to_jar: handed to ``find_jar``.
        :param encoding: stored as ``self._encoding``.
        :param verbose: forwarded to ``find_jar`` and ``find_file``.
        :param java_options: stored as ``self.java_options``.
        """
        self._stanford_jar = find_jar(
                self._JAR, path_to_jar,
                searchpath=(), url=_stanford_url,
                verbose=verbose)

        # env_vars must be a tuple of names.  ('STANFORD_MODELS') without the
        # trailing comma is a plain string, which iterates character by
        # character instead of yielding the variable name.
        self._stanford_model = find_file(path_to_model,
                env_vars=('STANFORD_MODELS',), verbose=verbose)
        self._encoding = encoding
        self.java_options = java_options
Ejemplo n.º 20
0
    def __init__(self, path_to_model, path_to_jar=None, encoding=None, verbose=False, java_options='-mx1000m'):
        """Locate the jar and the model file, and store the settings.

        A warning is issued when ``self._JAR`` is falsy.

        :param path_to_model: handed to ``find_file`` together with the
            ``STANFORD_MODELS`` environment variable name.
        :param path_to_jar: handed to ``find_jar``.
        :param encoding: stored as ``self._encoding``.
        :param verbose: forwarded to ``find_jar`` and ``find_file``.
        :param java_options: stored as ``self.java_options``.
        """
        if not self._JAR:
            warnings.warn('The StanfordTagger class is not meant to be '
                    'instanciated directly. Did you mean POS- or NERTagger?')
        self._stanford_jar = find_jar(
                self._JAR, path_to_jar,
                searchpath=(), url=_stanford_url,
                verbose=verbose)

        # env_vars must be a tuple of names.  ('STANFORD_MODELS') without the
        # trailing comma is a plain string, which iterates character by
        # character instead of yielding the variable name.
        self._stanford_model = find_file(path_to_model,
                env_vars=('STANFORD_MODELS',), verbose=verbose)
        self._encoding = encoding
        self.java_options = java_options
Ejemplo n.º 21
0
    def __init__(self, path_to_jar=None, path_to_models_jar=None,
                 path_to_ejml_jar=None,
                 model_path="edu/stanford/nlp/models/parser/lexparser/englishPCFG.ser.gz",
                 encoding="utf8", verbose=False, java_options="-mx3G"):
        """Locate the parser, models and ejml jars, and store the settings.

        The parser jar comes from ``find_jar``.  The models and ejml jars are
        each the ``max()`` of the candidates yielded by ``find_jar_iter`` for
        the corresponding class-level pattern.
        """
        self._stanford_jar = find_jar(self._JAR,
                                      path_to_jar,
                                      env_vars=("STANFORD_PARSER",),
                                      searchpath=(),
                                      url=_stanford_url,
                                      verbose=verbose)

        # NOTE(review): both max() calls rank candidates by the re.match()
        # result; match objects (and None) define no ordering on Python 3, so
        # this presumably only works when one jar is found -- confirm.
        def _model_match(model_name):
            return re.match(self._MODEL_JAR_PATTERN, model_name)

        def _ejml_match(ejml_name):
            return re.match(self._EJML_JAR_PATTERN, ejml_name)

        # Intended to pick the most recent model.
        model_candidates = find_jar_iter(self._MODEL_JAR_PATTERN,
                                         path_to_models_jar,
                                         env_vars=("STANFORD_MODELS",),
                                         searchpath=(),
                                         url=_stanford_url,
                                         verbose=verbose,
                                         is_regex=True)
        self._model_jar = max(model_candidates, key=_model_match)

        # Intended to pick the most recent ejml.
        ejml_candidates = find_jar_iter(self._EJML_JAR_PATTERN,
                                        path_to_ejml_jar,
                                        env_vars=("STANFORD_EJML",),
                                        searchpath=(),
                                        url=_stanford_url,
                                        verbose=verbose,
                                        is_regex=True)
        self._ejml_jar = max(ejml_candidates, key=_ejml_match)

        self.model_path = model_path
        self._encoding = encoding
        self.java_options = java_options
Ejemplo n.º 22
0
    def __init__(
        self,
        path_to_jar=None,
        encoding='utf8',
        options=None,
        verbose=False,
        java_options='-mx1000m',
    ):
        """Locate the jar, collect the jars beside it and store the settings.

        After the jar is found, ``self._stanford_jar`` is replaced by a tuple
        of whatever ``find_jars_within_path`` yields for the jar's directory.
        ``options`` (None or a mapping) becomes the comma-separated
        ``key=value`` string ``self._options_cmd``.
        """
        self._stanford_jar = find_jar(
            self._JAR,
            path_to_jar,
            env_vars=('STANFORD_POSTAGGER',),
            searchpath=(),
            url=_stanford_url,
            verbose=verbose,
        )

        # Adding logging jar files to classpath
        jar_directory = os.path.dirname(self._stanford_jar)
        self._stanford_jar = tuple(find_jars_within_path(jar_directory))

        self._encoding = encoding
        self.java_options = java_options

        if options is None:
            options = {}
        option_pairs = [
            '{0}={1}'.format(name, value) for name, value in options.items()
        ]
        self._options_cmd = ','.join(option_pairs)
Ejemplo n.º 23
0
    def __init__(self, path_to_jar=None, encoding='utf8', options=None,
                 verbose=False, java_options='-mx1000m'):
        """Locate the jar, collect the jars beside it and store the settings.

        After the jar is found, ``self._stanford_jar`` is replaced by a tuple
        of whatever ``find_jars_within_path`` yields for the jar's directory.
        ``options`` (None or a mapping) becomes the comma-separated
        ``key=value`` string ``self._options_cmd``.
        """
        self._stanford_jar = find_jar(self._JAR,
                                      path_to_jar,
                                      env_vars=('STANFORD_POSTAGGER',),
                                      searchpath=(),
                                      url=_stanford_url,
                                      verbose=verbose)

        # Adding logging jar files to classpath
        containing_dir = os.path.dirname(self._stanford_jar)
        self._stanford_jar = tuple(find_jars_within_path(containing_dir))

        self._encoding = encoding
        self.java_options = java_options

        if options is None:
            options = {}
        formatted = ['{0}={1}'.format(name, value)
                     for name, value in options.items()]
        self._options_cmd = ','.join(formatted)
Ejemplo n.º 24
0
    def __init__(self):
        """Configure the tagger from ``config`` and set ``JAVAHOME``.

        The jar is located with ``find_jar`` under
        ``config.STANFORD_PATH + config.POSTAG``; the model path, Java
        options and encoding are stored, and the ``JAVAHOME`` environment
        variable is overwritten with a fixed path.
        """
        base_dir = config.STANFORD_PATH + config.POSTAG
        verbose = True  # NOTE(review): assigned but never used in this method
        jar_name = 'stanford-postagger.jar'

        jar_location = base_dir + '/' + config.POSTAG_JN
        self._stanford_jar = find_jar(
            jar_name,
            jar_location,
            env_vars=(),
            searchpath=(),
        )

        self._model = '%s/models/chinese-distsim.tagger' % base_dir
        self.java_options = '-mx2g'
        self._encoding = config.ENCODE_TYPE

        # Hard-coded Java location written into the process environment.
        os.environ['JAVAHOME'] = "/usr/lib/jvm/java-8/bin/java"
Ejemplo n.º 25
0
    def __init__(
        self,
        path_to_jar=None,
        encoding='UTF-8',
        options=None,
        verbose=False,
        java_options='-mx1000m',
    ):
        """Locate the jar and store the settings.

        ``options`` (None or a mapping) is rendered as comma-separated
        ``key=<json value>`` pairs in ``self._options_cmd``.
        """
        self._stanford_jar = find_jar(
            self._JAR,
            path_to_jar,
            env_vars=('STANFORD_POSTAGGER',),
            searchpath=(),
            url=_stanford_url,
            verbose=verbose,
        )

        self._encoding = encoding
        self.java_options = java_options

        if options is None:
            options = {}
        formatted = [
            '{}={}'.format(name, json.dumps(value))
            for name, value in options.items()
        ]
        self._options_cmd = ','.join(formatted)
Ejemplo n.º 26
0
    def __init__(
        self,
        path_to_jar=None,
        path_to_model='data/arabic-segmenter-atb+bn+arztrain.ser.gz',
        encoding='UTF-8',
        options=None,
        verbose=False,
        java_options='-mx300m',
    ):
        """Locate the segmenter jar and store the settings.

        The ``STANFORD_SEGMENTER`` environment variable is given a default
        when unset or empty.  The model path is the module-level
        ``STANFORD_SEGMENTER`` value joined to ``path_to_model`` with '/'.
        ``options`` (None or a mapping) becomes the ``key=<json value>``
        string ``self._options_cmd``.
        """
        # Only fills in the variable when it is missing or empty.
        if not os.environ.get('STANFORD_SEGMENTER'):
            os.environ['STANFORD_SEGMENTER'] = 'stanford_segmenter'

        self._stanford_jar = find_jar(
            self._JAR,
            path_to_jar,
            env_vars=('STANFORD_SEGMENTER',),
            searchpath=(STANFORD_SEGMENTER,),
            verbose=verbose,
        )

        model_suffix = '/%s' % path_to_model
        self._model = STANFORD_SEGMENTER + model_suffix
        self._encoding = encoding
        self.java_options = java_options

        if options is None:
            options = {}
        formatted = [
            '{0}={1}'.format(name, json.dumps(value))
            for name, value in options.items()
        ]
        self._options_cmd = ','.join(formatted)
Ejemplo n.º 27
0
    def __init__(
        self,
        model_filename,
        path_to_jar=None,
        encoding='utf8',
        verbose=False,
        java_options='-mx1000m',
    ):
        """Locate the jar and model file and build the jar tuple.

        A warning is issued when ``self._JAR`` is falsy.  After the jar is
        found, ``self._stanford_jar`` is replaced by a tuple of whatever
        ``find_jars_within_path`` yields for the jar's directory.
        """
        if not self._JAR:
            direct_use_text = (
                'The StanfordTagger class is not meant to be '
                'instantiated directly. Did you mean StanfordPOSTagger or StanfordNERTagger?'
            )
            warnings.warn(direct_use_text)

        self._stanford_jar = find_jar(
            self._JAR,
            path_to_jar,
            searchpath=(),
            url=_stanford_url,
            verbose=verbose,
        )
        self._stanford_model = find_file(
            model_filename,
            env_vars=('STANFORD_MODELS',),
            verbose=verbose,
        )

        # Adding logging jar files to classpath
        containing_dir = os.path.dirname(self._stanford_jar)
        self._stanford_jar = tuple(find_jars_within_path(containing_dir))

        self._encoding = encoding
        self.java_options = java_options
Ejemplo n.º 28
0
    def __init__(
        self,
        path_to_jar=None,
        path_to_sihan_corpora_dict=None,
        path_to_model=None,
        path_to_dict=None,
        encoding='UTF-8',
        options=None,
        verbose=False,
        java_options='-mx2g',
    ):
        """Locate the segmenter jar and store the settings.

        ``options`` (None or a mapping) is rendered as comma-separated
        ``key=<json value>`` pairs in ``self._options_cmd``; the other
        arguments are stored unchanged.
        """
        self._stanford_jar = find_jar(
            self._JAR,
            path_to_jar,
            env_vars=('STANFORD_SEGMENTER',),
            searchpath=(),
            verbose=verbose,
        )

        self._model = path_to_model
        self._dict = path_to_dict
        self._sihan_corpora_dict = path_to_sihan_corpora_dict

        self._encoding = encoding
        self.java_options = java_options

        if options is None:
            options = {}
        formatted = [
            '{0}={1}'.format(name, json.dumps(value))
            for name, value in options.items()
        ]
        self._options_cmd = ','.join(formatted)
Ejemplo n.º 29
0
    def __init__(self):
        """Configure the segmenter from ``config`` and set ``JAVAHOME``.

        The jar is located with ``find_jar`` under
        ``config.STANFORD_PATH + config.SEGMENT``; the model, dictionary and
        corpora paths all point into that directory's ``data`` folder.  The
        ``JAVAHOME`` environment variable is overwritten with a fixed path.
        """
        base_dir = config.STANFORD_PATH + config.SEGMENT
        data_dir = '%s/data' % base_dir
        classifier_path = '%s/pku.gz' % data_dir
        dictionary_path = '%s/dict-chris6.ser.gz' % data_dir
        verbose = True  # NOTE(review): assigned but never used in this method
        jar_name = 'stanford-segmenter.jar'

        jar_location = base_dir + '/' + config.SEGMENT_JN
        self._stanford_jar = find_jar(
            jar_name,
            jar_location,
            env_vars=(),
            searchpath=(),
        )

        self._sihan_corpora_dict = data_dir
        self._model = classifier_path
        self._dict = dictionary_path
        self._encoding = config.ENCODE_TYPE
        self.java_options = '-mx2g'

        # Hard-coded Java location written into the process environment.
        os.environ['JAVAHOME'] = "/usr/lib/jvm/java-8/bin/java"
Ejemplo n.º 30
0
    def __init__(self, path_to_jar=None,
                 path_to_model='data/arabic-segmenter-atb+bn+arztrain.ser.gz',
                 encoding='UTF-8', options=None,
                 verbose=False, java_options='-mx300m'):
        """Locate the segmenter jar and store the settings.

        The ``STANFORD_SEGMENTER`` environment variable is given a default
        when unset or empty.  The model path is the module-level
        ``STANFORD_SEGMENTER`` value joined to ``path_to_model`` with '/'.
        ``options`` (None or a mapping) becomes the ``key=<json value>``
        string ``self._options_cmd``.
        """
        # Only fills in the variable when it is missing or empty.
        if not os.environ.get('STANFORD_SEGMENTER'):
            os.environ['STANFORD_SEGMENTER'] = 'stanford_segmenter'

        self._stanford_jar = find_jar(
            self._JAR,
            path_to_jar,
            env_vars=('STANFORD_SEGMENTER',),
            searchpath=(STANFORD_SEGMENTER,),
            verbose=verbose,
        )

        relative_model = '/%s' % path_to_model
        self._model = STANFORD_SEGMENTER + relative_model
        self._encoding = encoding
        self.java_options = java_options

        if options is None:
            options = {}
        option_pairs = ['{0}={1}'.format(name, json.dumps(value))
                        for name, value in options.items()]
        self._options_cmd = ','.join(option_pairs)
    def __init__(
        self,
        model_filename=MODEL_FILE_NAME,
        path_to_jar=PATH_TO_JAR,
        encoding='utf8',
        verbose=False,
        java_options='-mx1000m',
    ):
        """Locate the jar and the model file, and store the settings.

        ``model_filename`` and ``path_to_jar`` default to the module-level
        ``MODEL_FILE_NAME`` and ``PATH_TO_JAR``.  A warning is issued when
        ``self._JAR`` is falsy.
        """
        if not self._JAR:
            direct_use_text = (
                'The StanfordTagger class is not meant to be '
                'instantiated directly. Did you mean '
                'StanfordPOSTagger or StanfordNERTagger?'
            )
            warnings.warn(direct_use_text)

        self._stanford_jar = find_jar(
            self._JAR, path_to_jar, searchpath=(), url=_stanford_url, verbose=verbose
        )
        self._stanford_model = find_file(
            model_filename, env_vars=('STANFORD_MODELS',), verbose=verbose
        )

        self._encoding = encoding
        self.java_options = java_options
    def __init__(self, path_to_jar=None, path_to_sihan_corpora_dict=None,
                 path_to_model=None, path_to_dict=None,
                 encoding='UTF-8', options=None,
                 verbose=False, java_options='-mx2g'):
        """Locate the segmenter jar and store the settings.

        ``options`` (None or a mapping) is rendered as comma-separated
        ``key=<json value>`` pairs in ``self._options_cmd``; the other
        arguments are stored unchanged.
        """
        self._stanford_jar = find_jar(
            self._JAR,
            path_to_jar,
            env_vars=('STANFORD_SEGMENTER',),
            searchpath=(),
            verbose=verbose,
        )

        self._model = path_to_model
        self._dict = path_to_dict
        self._sihan_corpora_dict = path_to_sihan_corpora_dict

        self._encoding = encoding
        self.java_options = java_options

        if options is None:
            options = {}
        option_pairs = ['{0}={1}'.format(name, json.dumps(value))
                        for name, value in options.items()]
        self._options_cmd = ','.join(option_pairs)