def __init__(self, path_to_jar=None, path_to_slf4j=None, java_class=None,
             path_to_model=None, path_to_dict=None,
             path_to_sihan_corpora_dict=None, sihan_post_processing='false',
             keep_whitespaces='false', encoding='UTF-8', options=None,
             verbose=False, java_options='-mx2g'):
    """Locate the Stanford Segmenter jars and record the run configuration.

    :param path_to_jar: path to the segmenter jar; falls back to the
        ``STANFORD_SEGMENTER`` environment variable.
    :param path_to_slf4j: optional path to ``slf4j-api.jar``; only older
        segmenter releases need it on the classpath.
    :param options: mapping of extra segmenter options, serialized into
        a ``key=value`` command string.
    """
    # Raise deprecation warning (fixed: the message previously ended with
    # a stray apostrophe after "instead.").
    warnings.simplefilter('always', DeprecationWarning)
    warnings.warn(str(
        "\nThe StanfordTokenizer will "
        "be deprecated in version 3.2.5.\n"
        "Please use \033[91mnltk.parse.corenlp.CoreNLPTokenizer\033[0m instead."
        ), DeprecationWarning, stacklevel=2)
    warnings.simplefilter('ignore', DeprecationWarning)

    stanford_segmenter = find_jar(
        self._JAR, path_to_jar,
        env_vars=('STANFORD_SEGMENTER',),
        searchpath=(), url=_stanford_url,
        verbose=verbose)
    if path_to_slf4j is not None:
        slf4j = find_jar(
            'slf4j-api.jar', path_to_slf4j,
            env_vars=('SLF4J', 'STANFORD_SEGMENTER',),
            searchpath=(), url=_stanford_url,
            verbose=verbose)
    else:
        slf4j = None

    # This is passed to java as the -cp option, the old version of segmenter needs slf4j.
    # The new version of stanford-segmenter-2016-10-31 doesn't need slf4j
    self._stanford_jar = os.pathsep.join(
        _ for _ in [stanford_segmenter, slf4j] if _ is not None)

    self._java_class = java_class
    self._model = path_to_model
    self._sihan_corpora_dict = path_to_sihan_corpora_dict
    self._sihan_post_processing = sihan_post_processing
    self._keep_whitespaces = keep_whitespaces
    self._dict = path_to_dict
    self._encoding = encoding
    self.java_options = java_options

    options = {} if options is None else options
    self._options_cmd = ','.join(
        '{0}={1}'.format(key, json.dumps(val)) for key, val in options.items())
def __init__(
    self,
    path_to_jar=None,
    path_to_slf4j=None,
    path_to_sihan_corpora_dict=None,
    path_to_model=None,
    path_to_dict=None,
    encoding="UTF-8",
    options=None,
    verbose=False,
    java_options="-mx2g",
):
    """Locate the segmenter and slf4j jars and record the run configuration.

    :param path_to_jar: path to the segmenter jar; falls back to the
        ``STANFORD_SEGMENTER`` environment variable.
    :param options: mapping of extra segmenter options, serialized into
        a ``key=value`` command string.
    """
    stanford_segmenter = find_jar(
        self._JAR, path_to_jar,
        env_vars=("STANFORD_SEGMENTER",),
        searchpath=(), url=_stanford_url, verbose=verbose
    )
    slf4j = find_jar(
        self._SLF4J, path_to_slf4j,
        env_vars=("SLF4J",),
        searchpath=(), url=_stanford_url, verbose=verbose
    )

    # This is passed to java as the -cp option, the segmenter needs slf4j.
    # os.pathsep is already ";" on Windows and ":" elsewhere, so the manual
    # os.name check the original carried is unnecessary.
    self._stanford_jar = os.pathsep.join(
        jar for jar in (stanford_segmenter, slf4j) if jar is not None
    )

    self._sihan_corpora_dict = path_to_sihan_corpora_dict
    self._model = path_to_model
    self._dict = path_to_dict
    self._encoding = encoding
    self.java_options = java_options

    options = {} if options is None else options
    self._options_cmd = ",".join(
        "{0}={1}".format(key, json.dumps(val)) for key, val in options.items()
    )
def __init__(self, path_to_jar=None, path_to_slf4j=None,
             path_to_sihan_corpora_dict=None, path_to_model=None,
             path_to_dict=None, encoding='UTF-8', options=None,
             verbose=False, java_options='-mx2g'):
    """Locate the segmenter and slf4j jars and record the run configuration.

    :param path_to_jar: path to the segmenter jar; falls back to the
        ``STANFORD_SEGMENTER`` environment variable.
    :param options: mapping of extra segmenter options, serialized into
        a ``key=value`` command string.
    """
    stanford_segmenter = find_jar(
        self._JAR, path_to_jar,
        env_vars=('STANFORD_SEGMENTER',),
        searchpath=(), url=_stanford_url,
        verbose=verbose)
    slf4j = find_jar(
        self._SLF4J, path_to_slf4j,
        env_vars=('SLF4J',),
        searchpath=(), url=_stanford_url,
        verbose=verbose)

    # This is passed to java as the -cp option, the segmenter needs slf4j.
    # Use os.pathsep rather than a hard-coded ':' so the classpath is also
    # valid on Windows (where the separator is ';').
    self._stanford_jar = os.pathsep.join(
        [jar for jar in [stanford_segmenter, slf4j] if jar is not None])

    self._sihan_corpora_dict = path_to_sihan_corpora_dict
    self._model = path_to_model
    self._dict = path_to_dict
    self._encoding = encoding
    self.java_options = java_options

    options = {} if options is None else options
    self._options_cmd = ','.join(
        '{0}={1}'.format(key, json.dumps(val)) for key, val in options.items())
def __init__(self, path_to_jar=None, path_to_slf4j=None,
             path_to_sihan_corpora_dict=None, path_to_model=None,
             path_to_dict=None, encoding='UTF-8', options=None,
             verbose=False, java_options='-mx2g'):
    """Locate the segmenter and slf4j jars and record the run configuration.

    :param path_to_jar: path to the segmenter jar; falls back to the
        ``STANFORD_SEGMENTER`` environment variable.
    :param options: mapping of extra segmenter options, serialized into
        a ``key=value`` command string.
    """
    stanford_segmenter = find_jar(
        self._JAR, path_to_jar,
        env_vars=('STANFORD_SEGMENTER',),
        searchpath=(), url=_stanford_url,
        verbose=verbose)
    slf4j = find_jar(
        self._SLF4J, path_to_slf4j,
        env_vars=('SLF4J',),
        searchpath=(), url=_stanford_url,
        verbose=verbose)

    # This is passed to java as the -cp option, the segmenter needs slf4j.
    # ('x is not None' replaces the non-idiomatic 'not x is None'.)
    self._stanford_jar = os.pathsep.join(
        [jar for jar in [stanford_segmenter, slf4j] if jar is not None])

    self._sihan_corpora_dict = path_to_sihan_corpora_dict
    self._model = path_to_model
    self._dict = path_to_dict
    self._encoding = encoding
    self.java_options = java_options

    options = {} if options is None else options
    self._options_cmd = ','.join(
        '{0}={1}'.format(key, json.dumps(val)) for key, val in options.items())
def __init__(self, path_to_jar=None, path_to_slf4j=None, java_class=None,
             path_to_model=None, path_to_dict=None,
             path_to_sihan_corpora_dict=None, sihan_post_processing='false',
             keep_whitespaces='false', encoding='UTF-8', options=None,
             verbose=False, java_options='-mx2g'):
    """Locate the Stanford Segmenter jars and record the run configuration.

    :param path_to_slf4j: optional path to ``slf4j-api.jar``; only older
        segmenter releases need it on the classpath.
    :param options: mapping of extra segmenter options, serialized into
        a ``key=value`` command string.
    """
    # Raise deprecation warning (fixed: the message previously ended with
    # a stray apostrophe after "instead.").
    warnings.simplefilter('always', DeprecationWarning)
    warnings.warn(str(
        "\nThe StanfordTokenizer will "
        "be deprecated in version 3.2.5.\n"
        "Please use \033[91mnltk.parse.corenlp.CoreNLPTokenizer\033[0m instead."
        ), DeprecationWarning, stacklevel=2)
    warnings.simplefilter('ignore', DeprecationWarning)

    stanford_segmenter = find_jar(
        self._JAR, path_to_jar,
        env_vars=('STANFORD_SEGMENTER',),
        searchpath=(), url=_stanford_url,
        verbose=verbose)
    if path_to_slf4j is not None:
        slf4j = find_jar(
            'slf4j-api.jar', path_to_slf4j,
            env_vars=('SLF4J', 'STANFORD_SEGMENTER',),
            searchpath=(), url=_stanford_url,
            verbose=verbose)
    else:
        slf4j = None

    # This is passed to java as the -cp option, the old version of segmenter needs slf4j.
    # The new version of stanford-segmenter-2016-10-31 doesn't need slf4j
    self._stanford_jar = os.pathsep.join(
        _ for _ in [stanford_segmenter, slf4j] if _ is not None
    )

    self._java_class = java_class
    self._model = path_to_model
    self._sihan_corpora_dict = path_to_sihan_corpora_dict
    self._sihan_post_processing = sihan_post_processing
    self._keep_whitespaces = keep_whitespaces
    self._dict = path_to_dict
    self._encoding = encoding
    self.java_options = java_options

    options = {} if options is None else options
    self._options_cmd = ','.join(
        '{0}={1}'.format(key, json.dumps(val)) for key, val in options.items())
def __init__(self, path_to_jar=None, encoding='utf8', options=None,
             verbose=False, java_options='-mx1000m'):
    """Locate the Stanford tokenizer jar and record the run configuration.

    :param path_to_jar: path to the jar; falls back to the
        ``STANFORD_POSTAGGER`` environment variable.
    :param options: mapping of extra tokenizer options, serialized into
        a ``key=value`` command string.
    """
    # Raise deprecation warning (fixed: the message previously ended with
    # a stray apostrophe after "instead.").
    warnings.simplefilter('always', DeprecationWarning)
    warnings.warn(str(
        "\nThe StanfordTokenizer will "
        "be deprecated in version 3.2.5.\n"
        "Please use \033[91mnltk.parse.corenlp.CoreNLPTokenizer\033[0m instead."
        ), DeprecationWarning, stacklevel=2)
    warnings.simplefilter('ignore', DeprecationWarning)

    self._stanford_jar = find_jar(
        self._JAR, path_to_jar,
        env_vars=('STANFORD_POSTAGGER',),
        searchpath=(), url=_stanford_url,
        verbose=verbose)

    self._encoding = encoding
    self.java_options = java_options
    options = {} if options is None else options
    self._options_cmd = ','.join(
        '{0}={1}'.format(key, val) for key, val in options.items())
def __init__(
    self,
    model_filename,
    path_to_jar=None,
    encoding="utf8",
    verbose=False,
    java_options="-mx1000m",
):
    """Resolve the tagger jar and model file and record the run settings.

    :param model_filename: tagger model file name, resolved via the
        ``STANFORD_MODELS`` environment variable.
    """
    # Emit the deprecation notice before doing any work.
    deprecation_msg = str(
        "\nThe StanfordTokenizer will "
        "be deprecated in version 3.2.6.\n"
        "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead."
    )
    warnings.warn(deprecation_msg, DeprecationWarning, stacklevel=2)

    # Subclasses must define _JAR; warn when the base class is used directly.
    if not self._JAR:
        warnings.warn(
            "The StanfordTagger class is not meant to be "
            "instantiated directly. Did you mean "
            "StanfordPOSTagger or StanfordNERTagger?"
        )

    self._stanford_jar = find_jar(
        self._JAR,
        path_to_jar,
        searchpath=(),
        url=_stanford_url,
        verbose=verbose,
    )
    self._stanford_model = find_file(
        model_filename, env_vars=("STANFORD_MODELS",), verbose=verbose
    )
    self._encoding = encoding
    self.java_options = java_options
def __init__(
    self,
    path_to_jar=None,
    encoding='utf8',
    options=None,
    verbose=False,
    java_options='-mx1000m',
):
    """Locate the tokenizer jar and record the run configuration.

    :param options: mapping of extra tokenizer options, serialized into
        a ``key=value`` command string.
    """
    # Raise deprecation warning (fixed: the message previously ended with
    # a stray apostrophe after "instead.").
    warnings.warn(
        str(
            "\nThe StanfordTokenizer will "
            "be deprecated in version 3.2.5.\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead."
        ),
        DeprecationWarning,
        stacklevel=2,
    )

    self._stanford_jar = find_jar(
        self._JAR,
        path_to_jar,
        env_vars=('STANFORD_POSTAGGER',),
        searchpath=(),
        url=_stanford_url,
        verbose=verbose,
    )

    self._encoding = encoding
    self.java_options = java_options
    options = {} if options is None else options
    self._options_cmd = ','.join(
        '{0}={1}'.format(key, val) for key, val in options.items()
    )
def __init__(self, path_to_jar=None, encoding='UTF-8', options=None,
             verbose=False, java_options='-mx1000m'):
    """Locate the tokenizer jar and parse the option string.

    :param options: whitespace-separated string of (flag, value) pairs,
        e.g. ``"-tokenizeNLs true"``; each flag may carry one leading dash.
    :raises ValueError: if the option string has an odd number of tokens.
    """
    self._stanford_jar = find_jar(
        self._JAR, path_to_jar,
        env_vars=('STANFORD_POSTAGGER',),
        searchpath=(), url=_stanford_url,
        verbose=verbose
    )
    self._encoding = encoding
    self.java_options = java_options

    # Long Duong : fix bug #735 -- options arrives as a command-line style
    # string rather than a dict, so parse it into key/value pairs.
    options_str = options
    options = {}
    if options_str is not None:
        tokens = options_str.split()
        if len(tokens) % 2 != 0:
            raise ValueError("Must be in set of (argument,value) pair")
        # The original used range(len(tokens)/2), which raises TypeError on
        # Python 3 (float argument to range); a stride-2 range avoids the
        # division entirely.
        for i in range(0, len(tokens), 2):
            key = tokens[i]
            # Strip a single leading '-' as in -tokenizeNLs.
            if key.startswith('-'):
                key = key[1:]
            options[key] = tokens[i + 1]
    self._options_cmd = ','.join(
        '{0}={1}'.format(key, val) for key, val in options.items())
def __init__(self, model_filename, path_to_jar=None, encoding='utf8',
             verbose=False, java_options='-mx1000m'):
    """Resolve the tagger jar and model file and record the run settings.

    :param model_filename: tagger model file name, resolved via the
        ``STANFORD_MODELS`` environment variable.
    """
    # Raise deprecation warning, forcing it to display exactly once.
    warnings.simplefilter('always', DeprecationWarning)
    deprecation_msg = str(
        "\nThe StanfordTokenizer will "
        "be deprecated in version 3.2.5.\n"
        "Please use \033[91mnltk.tag.stanford.CoreNLPPOSTagger\033[0m "
        "or \033[91mnltk.tag.stanford.CoreNLPNERTagger\033[0m instead.")
    warnings.warn(deprecation_msg, DeprecationWarning, stacklevel=2)
    warnings.simplefilter('ignore', DeprecationWarning)

    # Subclasses must define _JAR; warn when the base class is used directly.
    if not self._JAR:
        warnings.warn('The StanfordTagger class is not meant to be '
                      'instantiated directly. Did you mean '
                      'StanfordPOSTagger or StanfordNERTagger?')

    self._stanford_jar = find_jar(self._JAR, path_to_jar, searchpath=(),
                                  url=_stanford_url, verbose=verbose)
    self._stanford_model = find_file(model_filename,
                                     env_vars=('STANFORD_MODELS',),
                                     verbose=verbose)
    self._encoding = encoding
    self.java_options = java_options
def __init__(
    self,
    path_to_jar=None,
    encoding="utf8",
    options=None,
    verbose=False,
    java_options="-mx1000m",
):
    """Locate the tokenizer jar and record the run configuration.

    :param options: mapping of extra tokenizer options, serialized into
        a ``key=value`` command string.
    """
    # Raise deprecation warning (fixed: the message previously ended with
    # a stray apostrophe after "instead.").
    warnings.warn(
        str(
            "\nThe StanfordTokenizer will "
            "be deprecated in version 3.2.5.\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead."
        ),
        DeprecationWarning,
        stacklevel=2,
    )

    self._stanford_jar = find_jar(
        self._JAR,
        path_to_jar,
        env_vars=("STANFORD_POSTAGGER",),
        searchpath=(),
        url=_stanford_url,
        verbose=verbose,
    )

    self._encoding = encoding
    self.java_options = java_options
    options = {} if options is None else options
    self._options_cmd = ",".join(
        "{0}={1}".format(key, val) for key, val in options.items()
    )
def __init__(
    self,
    model_filename,
    path_to_jar=None,
    encoding='utf8',
    verbose=False,
    java_options='-mx1000m',
):
    """Resolve the tagger jar and model file and record the run settings.

    :param model_filename: tagger model file name, resolved via the
        ``STANFORD_MODELS`` environment variable.
    """
    # Emit the deprecation notice before doing any work.
    deprecation_msg = str(
        "\nThe StanfordTokenizer will "
        "be deprecated in version 3.2.6.\n"
        "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead."
    )
    warnings.warn(deprecation_msg, DeprecationWarning, stacklevel=2)

    # Subclasses must define _JAR; warn when the base class is used directly.
    if not self._JAR:
        warnings.warn(
            'The StanfordTagger class is not meant to be '
            'instantiated directly. Did you mean '
            'StanfordPOSTagger or StanfordNERTagger?'
        )

    self._stanford_jar = find_jar(
        self._JAR, path_to_jar, searchpath=(), url=_stanford_url,
        verbose=verbose
    )
    self._stanford_model = find_file(
        model_filename, env_vars=('STANFORD_MODELS',), verbose=verbose
    )
    self._encoding = encoding
    self.java_options = java_options
def __init__(self, path_to_jar=None, path_to_models_jar=None,
             model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
             encoding='utf8', verbose=False, java_options='-mx1000m'):
    """Resolve the parser jar and the newest models jar.

    :param model_path: path of the serialized grammar inside the models jar.
    """
    self._stanford_jar = find_jar(
        self._JAR, path_to_jar,
        env_vars=('STANFORD_PARSER',),
        searchpath=(), url=_stanford_url,
        verbose=verbose
    )

    # find the most recent model: enumerate every jar matching the model
    # pattern and keep the one whose name sorts highest under re.match.
    model_candidates = find_jar_iter(
        self._MODEL_JAR_PATTERN,
        path_to_models_jar,
        env_vars=('STANFORD_MODELS',),
        searchpath=(),
        url=_stanford_url,
        verbose=verbose,
        is_regex=True,
    )
    self._model_jar = max(
        model_candidates,
        key=lambda model_name: re.match(self._MODEL_JAR_PATTERN, model_name),
    )

    self.model_path = model_path
    self._encoding = encoding
    self.java_options = java_options
def __init__(self, model_filename, path_to_jar=None, encoding='utf8',
             verbose=False, java_options='-mx1000m'):
    """Resolve the tagger jar plus its sibling jars and the model file.

    :param model_filename: tagger model file name, resolved via the
        ``STANFORD_MODELS`` environment variable.
    """
    # Subclasses must define _JAR; warn when the base class is used directly.
    if not self._JAR:
        warnings.warn('The StanfordTagger class is not meant to be '
                      'instantiated directly. Did you mean '
                      'StanfordPOSTagger or StanfordNERTagger?')

    main_jar = find_jar(self._JAR, path_to_jar, searchpath=(),
                        url=_stanford_url, verbose=verbose)
    self._stanford_jar = main_jar
    self._stanford_model = find_file(model_filename,
                                     env_vars=('STANFORD_MODELS',),
                                     verbose=verbose)

    # Adding logging jar files to classpath: replace the single jar path
    # with every jar found next to it.
    stanford_dir = os.path.split(main_jar)[0]
    self._stanford_jar = tuple(find_jars_within_path(stanford_dir))

    self._encoding = encoding
    self.java_options = java_options
def __init__(self, path_to_jar=None, path_to_models_jar=None,
             path_to_ejml_jar=None,
             model_path='edu/stanford/nlp/models/parser/lexparser/englishPCFG.ser.gz',
             encoding='utf8', verbose=False, java_options='-mx3G'):
    """Resolve the parser jar, the newest models jar, and the newest ejml jar.

    :param model_path: path of the serialized grammar inside the models jar.
    """
    self._stanford_jar = find_jar(
        self._JAR, path_to_jar,
        env_vars=('STANFORD_PARSER',),
        searchpath=(), url=_stanford_url,
        verbose=verbose)

    # find the most recent model jar matching the expected name pattern
    model_candidates = find_jar_iter(
        self._MODEL_JAR_PATTERN, path_to_models_jar,
        env_vars=('STANFORD_MODELS',),
        searchpath=(), url=_stanford_url,
        verbose=verbose, is_regex=True)
    self._model_jar = max(
        model_candidates,
        key=lambda model_name: re.match(self._MODEL_JAR_PATTERN, model_name))

    # find the most recent ejml (linear-algebra support) jar the same way
    ejml_candidates = find_jar_iter(
        self._EJML_JAR_PATTERN, path_to_ejml_jar,
        env_vars=('STANFORD_EJML',),
        searchpath=(), url=_stanford_url,
        verbose=verbose, is_regex=True)
    self._ejml_jar = max(
        ejml_candidates,
        key=lambda ejml_name: re.match(self._EJML_JAR_PATTERN, ejml_name))

    self.model_path = model_path
    self._encoding = encoding
    self.java_options = java_options
def __init__(self, path_to_model, path_to_jar=None, encoding=None,
             verbose=False, java_options="-mx1000m"):
    """Resolve the tagger jar and validate the model file path.

    :param path_to_model: filesystem path to the tagger model.
    :raises IOError: if ``path_to_model`` does not point at a file.
    """
    self._stanford_jar = find_jar(self._JAR, path_to_jar, searchpath=(),
                                  url=_stanford_url, verbose=verbose)

    # Fail fast on a missing model rather than at java invocation time.
    if not os.path.isfile(path_to_model):
        raise IOError("Stanford tagger model file not found: %s" % path_to_model)
    self._stanford_model = path_to_model

    self._encoding = encoding
    self.java_options = java_options
def __init__(self, path_to_jar=None, path_to_slf4j=None, java_class=None,
             path_to_model=None, path_to_dict=None,
             path_to_sihan_corpora_dict=None, sihan_post_processing='false',
             keep_whitespaces='false', encoding='UTF-8', options=None,
             verbose=False, java_options='-mx2g'):
    """Locate the Stanford Segmenter jars and record the run configuration.

    :param path_to_slf4j: optional path to ``slf4j-api.jar``; only older
        segmenter releases need it on the classpath.
    :param options: mapping of extra segmenter options, serialized into
        a ``key=value`` command string.
    """
    segmenter_jar = find_jar(
        self._JAR, path_to_jar,
        env_vars=('STANFORD_SEGMENTER',),
        searchpath=(), url=_stanford_url,
        verbose=verbose)

    slf4j_jar = None
    if path_to_slf4j is not None:
        slf4j_jar = find_jar(
            'slf4j-api.jar', path_to_slf4j,
            env_vars=('SLF4J', 'STANFORD_SEGMENTER',),
            searchpath=(), url=_stanford_url,
            verbose=verbose)

    # This is passed to java as the -cp option, the old version of segmenter needs slf4j.
    # The new version of stanford-segmenter-2016-10-31 doesn't need slf4j
    classpath_parts = [jar for jar in (segmenter_jar, slf4j_jar)
                       if jar is not None]
    self._stanford_jar = os.pathsep.join(classpath_parts)

    self._java_class = java_class
    self._model = path_to_model
    self._sihan_corpora_dict = path_to_sihan_corpora_dict
    self._sihan_post_processing = sihan_post_processing
    self._keep_whitespaces = keep_whitespaces
    self._dict = path_to_dict
    self._encoding = encoding
    self.java_options = java_options

    if options is None:
        options = {}
    self._options_cmd = ','.join(
        '{0}={1}'.format(key, json.dumps(val)) for key, val in options.items())
def __init__(self, path_to_jar=None, encoding='UTF-8', options=None,
             verbose=False, java_options='-mx1000m'):
    """Locate the tokenizer jar and record the run configuration.

    :param options: mapping of extra options; values are JSON-encoded into
        a ``key=value`` command string.
    """
    self._stanford_jar = find_jar(
        self._JAR, path_to_jar,
        env_vars=('STANFORD_POSTAGGER',),
        searchpath=(), url=_stanford_url,
        verbose=verbose)

    self._encoding = encoding
    self.java_options = java_options

    if options is None:
        options = {}
    option_pairs = ['{}={}'.format(key, json.dumps(val))
                    for key, val in options.items()]
    self._options_cmd = ','.join(option_pairs)
def __init__(self, path_to_model, path_to_jar=None, encoding=None,
             verbose=False, java_options='-mx1000m'):
    """Resolve the tagger jar and model file and record the run settings.

    :param path_to_model: tagger model file name, resolved via the
        ``STANFORD_MODELS`` environment variable.
    """
    self._stanford_jar = find_jar(
        self._JAR, path_to_jar,
        searchpath=(), url=_stanford_url,
        verbose=verbose)
    # Fixed: env_vars previously was the bare string ('STANFORD_MODELS')
    # (missing trailing comma), which find_file would iterate character by
    # character instead of as a one-element tuple of variable names.
    self._stanford_model = find_file(path_to_model,
            env_vars=('STANFORD_MODELS',), verbose=verbose)
    self._encoding = encoding
    self.java_options = java_options
def __init__(self, path_to_model, path_to_jar=None, encoding=None,
             verbose=False, java_options='-mx1000m'):
    """Resolve the tagger jar and model file and record the run settings.

    :param path_to_model: tagger model file name, resolved via the
        ``STANFORD_MODELS`` environment variable.
    """
    # Subclasses must define _JAR; warn when the base class is used directly.
    # (Fixed the 'instanciated' spelling in the user-facing message.)
    if not self._JAR:
        warnings.warn('The StanfordTagger class is not meant to be '
                      'instantiated directly. Did you mean POS- or NERTagger?')
    self._stanford_jar = find_jar(
        self._JAR, path_to_jar,
        searchpath=(), url=_stanford_url,
        verbose=verbose)
    # Fixed: env_vars previously was the bare string ('STANFORD_MODELS')
    # (missing trailing comma), which find_file would iterate character by
    # character instead of as a one-element tuple of variable names.
    self._stanford_model = find_file(path_to_model,
            env_vars=('STANFORD_MODELS',), verbose=verbose)
    self._encoding = encoding
    self.java_options = java_options
def __init__(
    self,
    path_to_jar=None,
    path_to_models_jar=None,
    path_to_ejml_jar=None,
    model_path="edu/stanford/nlp/models/parser/lexparser/englishPCFG.ser.gz",
    encoding="utf8",
    verbose=False,
    java_options="-mx3G",
):
    """Resolve the parser jar, the newest models jar, and the newest ejml jar.

    :param model_path: path of the serialized grammar inside the models jar.
    """
    self._stanford_jar = find_jar(
        self._JAR, path_to_jar,
        env_vars=("STANFORD_PARSER",),
        searchpath=(), url=_stanford_url, verbose=verbose
    )

    # find the most recent model jar matching the expected name pattern
    model_candidates = find_jar_iter(
        self._MODEL_JAR_PATTERN,
        path_to_models_jar,
        env_vars=("STANFORD_MODELS",),
        searchpath=(),
        url=_stanford_url,
        verbose=verbose,
        is_regex=True,
    )
    self._model_jar = max(
        model_candidates,
        key=lambda model_name: re.match(self._MODEL_JAR_PATTERN, model_name),
    )

    # find the most recent ejml (linear-algebra support) jar the same way
    ejml_candidates = find_jar_iter(
        self._EJML_JAR_PATTERN,
        path_to_ejml_jar,
        env_vars=("STANFORD_EJML",),
        searchpath=(),
        url=_stanford_url,
        verbose=verbose,
        is_regex=True,
    )
    self._ejml_jar = max(
        ejml_candidates,
        key=lambda ejml_name: re.match(self._EJML_JAR_PATTERN, ejml_name),
    )

    self.model_path = model_path
    self._encoding = encoding
    self.java_options = java_options
def __init__(self, path_to_jar=None, encoding='utf8', options=None,
             verbose=False, java_options='-mx1000m'):
    """Resolve the tagger jar plus its sibling jars and record run settings.

    :param options: mapping of extra options, serialized into a
        ``key=value`` command string.
    """
    main_jar = find_jar(
        self._JAR, path_to_jar,
        env_vars=('STANFORD_POSTAGGER',),
        searchpath=(), url=_stanford_url,
        verbose=verbose
    )

    # Adding logging jar files to classpath: replace the single jar path
    # with every jar found next to it.
    stanford_dir = os.path.split(main_jar)[0]
    self._stanford_jar = tuple(find_jars_within_path(stanford_dir))

    self._encoding = encoding
    self.java_options = java_options

    if options is None:
        options = {}
    self._options_cmd = ','.join(
        '{0}={1}'.format(key, val) for key, val in options.items())
def __init__(self):
    """Configure the Chinese POS tagger from project-level ``config`` values.

    NOTE(review): paths (tagger jar, model, JAVAHOME) are hard-coded from
    ``config``; verify they match the deployment environment.
    """
    base_dir = config.STANFORD_PATH + config.POSTAG
    verbose = True
    jar_name = 'stanford-postagger.jar'

    self._stanford_jar = find_jar(
        jar_name,
        base_dir + '/' + config.POSTAG_JN,
        env_vars=(),
        searchpath=()
    )

    # Chinese distributional-similarity tagger model shipped with the jar.
    self._model = '%s/models/chinese-distsim.tagger' % base_dir
    self._encoding = config.ENCODE_TYPE
    self.java_options = '-mx2g'

    # Point the JVM launcher at the expected Java 8 install.
    os.environ['JAVAHOME'] = "/usr/lib/jvm/java-8/bin/java"
def __init__(self, path_to_jar=None, encoding='UTF-8', options=None,
             verbose=False, java_options='-mx1000m'):
    """Locate the tokenizer jar and record the run configuration.

    :param options: mapping of extra options; values are JSON-encoded into
        a ``key=value`` command string.
    """
    self._stanford_jar = find_jar(
        self._JAR, path_to_jar,
        env_vars=('STANFORD_POSTAGGER',),
        searchpath=(), url=_stanford_url,
        verbose=verbose)

    self._encoding = encoding
    self.java_options = java_options

    if options is None:
        options = {}
    option_pairs = ['{}={}'.format(key, json.dumps(val))
                    for key, val in options.items()]
    self._options_cmd = ','.join(option_pairs)
def __init__(self, path_to_jar=None,
             path_to_model='data/arabic-segmenter-atb+bn+arztrain.ser.gz',
             encoding='UTF-8', options=None, verbose=False,
             java_options='-mx300m'):
    """Locate the Arabic segmenter jar and model under STANFORD_SEGMENTER.

    :param path_to_model: model path relative to the segmenter directory.
    """
    # Default the env var so find_jar's environment lookup always succeeds.
    if not os.environ.get('STANFORD_SEGMENTER'):
        os.environ['STANFORD_SEGMENTER'] = 'stanford_segmenter'

    self._stanford_jar = find_jar(
        self._JAR, path_to_jar,
        env_vars=('STANFORD_SEGMENTER',),
        searchpath=(STANFORD_SEGMENTER,),
        verbose=verbose
    )
    self._model = STANFORD_SEGMENTER + '/%s' % path_to_model
    self._encoding = encoding
    self.java_options = java_options

    if options is None:
        options = {}
    self._options_cmd = ','.join(
        '{0}={1}'.format(key, json.dumps(val)) for key, val in options.items())
def __init__(self, model_filename, path_to_jar=None, encoding='utf8',
             verbose=False, java_options='-mx1000m'):
    """Resolve the tagger jar plus its sibling jars and the model file.

    :param model_filename: tagger model file name, resolved via the
        ``STANFORD_MODELS`` environment variable.
    """
    # Subclasses must define _JAR; warn when the base class is used directly.
    if not self._JAR:
        warnings.warn('The StanfordTagger class is not meant to be '
                      'instantiated directly. Did you mean '
                      'StanfordPOSTagger or StanfordNERTagger?')

    main_jar = find_jar(
        self._JAR, path_to_jar,
        searchpath=(), url=_stanford_url,
        verbose=verbose)
    self._stanford_jar = main_jar
    self._stanford_model = find_file(
        model_filename, env_vars=('STANFORD_MODELS',), verbose=verbose)

    # Adding logging jar files to classpath: replace the single jar path
    # with every jar found next to it.
    stanford_dir = os.path.split(main_jar)[0]
    self._stanford_jar = tuple(find_jars_within_path(stanford_dir))

    self._encoding = encoding
    self.java_options = java_options
def __init__(self, path_to_jar=None, path_to_sihan_corpora_dict=None,
             path_to_model=None, path_to_dict=None, encoding='UTF-8',
             options=None, verbose=False, java_options='-mx2g'):
    """Locate the segmenter jar and record the run configuration.

    :param options: mapping of extra segmenter options, serialized into
        a ``key=value`` command string.
    """
    self._stanford_jar = find_jar(
        self._JAR, path_to_jar,
        env_vars=('STANFORD_SEGMENTER',),
        searchpath=(),
        verbose=verbose
    )

    self._sihan_corpora_dict = path_to_sihan_corpora_dict
    self._model = path_to_model
    self._dict = path_to_dict
    self._encoding = encoding
    self.java_options = java_options

    if options is None:
        options = {}
    option_pairs = ['{0}={1}'.format(key, json.dumps(val))
                    for key, val in options.items()]
    self._options_cmd = ','.join(option_pairs)
def __init__(self):
    """Configure the Chinese segmenter from project-level ``config`` values.

    NOTE(review): paths (segmenter jar, PKU model, JAVAHOME) are hard-coded
    from ``config``; verify they match the deployment environment.
    """
    base_dir = config.STANFORD_PATH + config.SEGMENT
    data_dir = '%s/data' % base_dir
    classifier = '%s/pku.gz' % data_dir
    dicts = '%s/dict-chris6.ser.gz' % data_dir
    verbose = True
    jar_name = 'stanford-segmenter.jar'

    self._stanford_jar = find_jar(
        jar_name,
        base_dir + '/' + config.SEGMENT_JN,
        env_vars=(),
        searchpath=()
    )

    self._sihan_corpora_dict = data_dir
    self._model = classifier
    self._dict = dicts
    self._encoding = config.ENCODE_TYPE
    self.java_options = '-mx2g'

    # Point the JVM launcher at the expected Java 8 install.
    os.environ['JAVAHOME'] = "/usr/lib/jvm/java-8/bin/java"
def __init__(self, path_to_jar=None,
             path_to_model='data/arabic-segmenter-atb+bn+arztrain.ser.gz',
             encoding='UTF-8', options=None, verbose=False,
             java_options='-mx300m'):
    """Locate the Arabic segmenter jar and model under STANFORD_SEGMENTER.

    :param path_to_model: model path relative to the segmenter directory.
    """
    # Default the env var so find_jar's environment lookup always succeeds.
    if not os.environ.get('STANFORD_SEGMENTER'):
        os.environ['STANFORD_SEGMENTER'] = 'stanford_segmenter'

    self._stanford_jar = find_jar(
        self._JAR, path_to_jar,
        env_vars=('STANFORD_SEGMENTER',),
        searchpath=(STANFORD_SEGMENTER,),
        verbose=verbose)
    self._model = STANFORD_SEGMENTER + '/%s' % path_to_model
    self._encoding = encoding
    self.java_options = java_options

    if options is None:
        options = {}
    option_pairs = ['{0}={1}'.format(key, json.dumps(val))
                    for key, val in options.items()]
    self._options_cmd = ','.join(option_pairs)
def __init__(self, model_filename=MODEL_FILE_NAME, path_to_jar=PATH_TO_JAR,
             encoding='utf8', verbose=False, java_options='-mx1000m'):
    """Resolve the tagger jar and model file and record the run settings.

    :param model_filename: tagger model file name, resolved via the
        ``STANFORD_MODELS`` environment variable.
    """
    # Subclasses must define _JAR; warn when the base class is used directly.
    if not self._JAR:
        warnings.warn('The StanfordTagger class is not meant to be '
                      'instantiated directly. Did you mean '
                      'StanfordPOSTagger or StanfordNERTagger?')

    self._stanford_jar = find_jar(self._JAR, path_to_jar, searchpath=(),
                                  url=_stanford_url, verbose=verbose)
    self._stanford_model = find_file(model_filename,
                                     env_vars=('STANFORD_MODELS',),
                                     verbose=verbose)
    self._encoding = encoding
    self.java_options = java_options
def __init__(self, path_to_jar=None, path_to_sihan_corpora_dict=None,
             path_to_model=None, path_to_dict=None, encoding='UTF-8',
             options=None, verbose=False, java_options='-mx2g'):
    """Locate the segmenter jar and record the run configuration.

    :param options: mapping of extra segmenter options, serialized into
        a ``key=value`` command string.
    """
    self._stanford_jar = find_jar(
        self._JAR, path_to_jar,
        env_vars=('STANFORD_SEGMENTER',),
        searchpath=(),
        verbose=verbose)

    self._sihan_corpora_dict = path_to_sihan_corpora_dict
    self._model = path_to_model
    self._dict = path_to_dict
    self._encoding = encoding
    self.java_options = java_options

    if options is None:
        options = {}
    option_pairs = ['{0}={1}'.format(key, json.dumps(val))
                    for key, val in options.items()]
    self._options_cmd = ','.join(option_pairs)