def default_config(self, lang):
    """
    Attempt to initialize Stanford Word Segmenter for the specified
    language using the STANFORD_SEGMENTER and STANFORD_MODELS
    environment variables.

    :param lang: language code; 'ar' (Arabic) and 'zh' (Chinese) are supported
    :raises LookupError: if the language is unsupported, or a required
        model/dictionary file cannot be located
    """
    search_path = ()
    if os.environ.get('STANFORD_SEGMENTER'):
        search_path = {os.path.join(os.environ.get('STANFORD_SEGMENTER'), 'data')}

    # init for Chinese-specific files
    self._dict = None
    self._sihan_corpora_dict = None
    self._sihan_post_processing = 'false'

    if lang == 'ar':
        self._java_class = 'edu.stanford.nlp.international.arabic.process.ArabicSegmenter'
        model = 'arabic-segmenter-atb+bn+arztrain.ser.gz'
    elif lang == 'zh':
        self._java_class = 'edu.stanford.nlp.ie.crf.CRFClassifier'
        model = 'pku.gz'
        self._sihan_post_processing = 'true'

        path_to_dict = 'dict-chris6.ser.gz'
        try:
            self._dict = find_file(path_to_dict, searchpath=search_path,
                                   url=_stanford_url, verbose=False,
                                   env_vars=('STANFORD_MODELS',))
        except LookupError:
            raise LookupError("Could not find '%s' (tried using env. "
                              "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)"
                              % path_to_dict)

        sihan_dir = './data/'
        try:
            path_to_sihan_dir = find_dir(sihan_dir, url=_stanford_url,
                                         verbose=False,
                                         env_vars=('STANFORD_SEGMENTER',))
            self._sihan_corpora_dict = os.path.join(path_to_sihan_dir, sihan_dir)
        except LookupError:
            raise LookupError("Could not find '%s' (tried using the "
                              "STANFORD_SEGMENTER environment variable)" % sihan_dir)
    else:
        # BUG FIX: original used "'%'" (no conversion type after %), which
        # raises ValueError at runtime instead of the intended LookupError.
        raise LookupError("Unsupported language '%s'" % lang)

    try:
        self._model = find_file(model, searchpath=search_path,
                                url=_stanford_url, verbose=False,
                                env_vars=('STANFORD_MODELS', 'STANFORD_SEGMENTER',))
    except LookupError:
        raise LookupError("Could not find '%s' (tried using env. "
                          "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)"
                          % model)
def __init__(
    self,
    model_filename,
    path_to_jar=None,
    encoding="utf8",
    verbose=False,
    java_options="-mx1000m",
):
    """Locate the Stanford jar and model file, and remember run options."""
    # Emit the deprecation notice for this tokenizer family.
    deprecation_msg = (
        "\nThe StanfordTokenizer will "
        "be deprecated in version 3.2.6.\n"
        "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead."
    )
    warnings.warn(str(deprecation_msg), DeprecationWarning, stacklevel=2)

    # _JAR is expected to be supplied by a concrete subclass.
    if not self._JAR:
        warnings.warn(
            "The StanfordTagger class is not meant to be "
            "instantiated directly. Did you mean "
            "StanfordPOSTagger or StanfordNERTagger?"
        )

    self._stanford_jar = find_jar(
        self._JAR, path_to_jar, searchpath=(), url=_stanford_url, verbose=verbose
    )
    self._stanford_model = find_file(
        model_filename, env_vars=("STANFORD_MODELS",), verbose=verbose
    )
    self._encoding = encoding
    self.java_options = java_options
def __init__(self, path_to_model, path_to_bin=None,
             encoding=_hunpos_charset, verbose=False):
    """
    Launch the hunpos-tag executable and open pipes to it.

    :param path_to_model: The model file.
    :param path_to_bin: The hunpos-tag binary.
    :param encoding: The encoding used by the model.  Unicode tokens
        passed to the tag() and tag_sents() methods are converted to
        this charset when they are sent to hunpos-tag.
        The default is ISO-8859-1 (Latin-1).

        This parameter is ignored for str tokens, which are sent as-is.
        The caller must ensure that tokens are encoded in the right
        charset.
    """
    # Mark the tagger closed until the subprocess is actually running,
    # so cleanup paths stay safe if locating the binary fails below.
    self._closed = True
    candidate_dirs = [
        '.', '/usr/bin', '/usr/local/bin', '/opt/local/bin',
        '/Applications/bin', '~/bin', '~/Applications/bin',
    ]
    candidate_dirs = [os.path.expanduser(d) for d in candidate_dirs]
    self._hunpos_bin = find_binary(
        'hunpos-tag',
        path_to_bin,
        env_vars=('HUNPOS_TAGGER',),
        searchpath=candidate_dirs,
        url=_hunpos_url,
        verbose=verbose,
    )
    self._hunpos_model = find_file(
        path_to_model, env_vars=('HUNPOS_TAGGER',), verbose=verbose
    )
    self._encoding = encoding
    self._hunpos = Popen(
        [self._hunpos_bin, self._hunpos_model],
        shell=False, stdin=PIPE, stdout=PIPE, stderr=PIPE,
    )
    self._closed = False
def __init__(
    self,
    model_filename,
    path_to_jar=None,
    encoding='utf8',
    verbose=False,
    java_options='-mx1000m',
):
    """Resolve the Stanford jar and model paths and store run settings."""
    # Warn callers that this interface is on its way out.
    warnings.warn(
        str(
            "\nThe StanfordTokenizer will "
            "be deprecated in version 3.2.6.\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead."
        ),
        DeprecationWarning,
        stacklevel=2,
    )

    # Direct instantiation is unsupported; subclasses define _JAR.
    if not self._JAR:
        warnings.warn(
            'The StanfordTagger class is not meant to be '
            'instantiated directly. Did you mean '
            'StanfordPOSTagger or StanfordNERTagger?'
        )

    jar = find_jar(
        self._JAR, path_to_jar, searchpath=(), url=_stanford_url, verbose=verbose
    )
    model = find_file(
        model_filename, env_vars=('STANFORD_MODELS',), verbose=verbose
    )
    self._stanford_jar = jar
    self._stanford_model = model
    self._encoding = encoding
    self.java_options = java_options
def __init__(self, model_filename, path_to_jar=None, encoding='utf8',
             verbose=False, java_options='-mx1000m'):
    """Locate the Stanford jar/model and remember the run options."""
    # Force the deprecation warning to display once, then silence the
    # category again so downstream code is not flooded.
    warnings.simplefilter('always', DeprecationWarning)
    warnings.warn(
        str(
            "\nThe StanfordTokenizer will "
            "be deprecated in version 3.2.5.\n"
            "Please use \033[91mnltk.tag.stanford.CoreNLPPOSTagger\033[0m "
            "or \033[91mnltk.tag.stanford.CoreNLPNERTagger\033[0m instead."
        ),
        DeprecationWarning,
        stacklevel=2,
    )
    warnings.simplefilter('ignore', DeprecationWarning)

    # Subclasses are expected to provide _JAR.
    if not self._JAR:
        warnings.warn(
            'The StanfordTagger class is not meant to be '
            'instantiated directly. Did you mean '
            'StanfordPOSTagger or StanfordNERTagger?'
        )

    self._stanford_jar = find_jar(
        self._JAR, path_to_jar, searchpath=(), url=_stanford_url, verbose=verbose
    )
    self._stanford_model = find_file(
        model_filename, env_vars=('STANFORD_MODELS',), verbose=verbose
    )
    self._encoding = encoding
    self.java_options = java_options
def __init__(self, path_to_model, path_to_bin=None,
             encoding=_hunpos_charset, verbose=False):
    """
    Start the hunpos-tag executable and establish a connection with it.

    :param path_to_model: The model file.
    :param path_to_bin: The hunpos-tag binary.
    :param encoding: The encoding used by the model.  Unicode tokens
        passed to the tag() and batch_tag() methods are converted to
        this charset when they are sent to hunpos-tag.  The default is
        ISO-8859-1 (Latin-1).  This parameter is ignored for str tokens,
        which are sent as-is.  The caller must ensure that tokens are
        encoded in the right charset.
    """
    # NOTE(review): the original passes a map object (on Python 3) rather
    # than a list; preserved as-is since find_binary is expected to
    # consume it once.
    search_dirs = map(os.path.expanduser,
                      ['.', '/usr/bin', '/usr/local/bin', '/opt/local/bin',
                       '/Applications/bin', '~/bin', '~/Applications/bin'])
    self._hunpos_bin = find_binary(
        'hunpos-tag', path_to_bin,
        env_vars=('HUNPOS', 'HUNPOS_HOME'),
        searchpath=search_dirs,
        url=_hunpos_url, verbose=verbose)
    self._hunpos_model = find_file(
        path_to_model, env_vars=('HUNPOS', 'HUNPOS_HOME'), verbose=verbose)
    self._encoding = encoding
    # Keep all three streams piped so hunpos I/O stays under our control.
    self._hunpos = Popen([self._hunpos_bin, self._hunpos_model],
                         shell=False, stdin=PIPE, stdout=PIPE, stderr=PIPE)
    self._closed = False
def __init__(self, model_filename, path_to_jar=None, encoding='utf8',
             verbose=False, java_options='-mx1000m'):
    """Locate the Stanford tagger jar(s) and model, and store options."""
    # Direct use of this base class is unsupported; subclasses set _JAR.
    if not self._JAR:
        warnings.warn(
            'The StanfordTagger class is not meant to be '
            'instantiated directly. Did you mean StanfordPOSTagger or StanfordNERTagger?'
        )
    jar_path = find_jar(self._JAR, path_to_jar, searchpath=(),
                        url=_stanford_url, verbose=verbose)
    self._stanford_jar = jar_path
    self._stanford_model = find_file(model_filename,
                                     env_vars=('STANFORD_MODELS', ),
                                     verbose=verbose)
    # Adding logging jar files to classpath
    stanford_dir = os.path.dirname(jar_path)
    self._stanford_jar = tuple(find_jars_within_path(stanford_dir))
    self._encoding = encoding
    self.java_options = java_options
def find_malt_model(model_filename):
    """
    Find a pre-trained MaltParser model.

    :param model_filename: name or path of the model, or None for the
        default temporary model name.
    :return: the resolved model path (or the bare default name).
    """
    # BUG FIX: compare against None with `is`, not `==`.
    if model_filename is None:
        return 'malt_temp.mco'
    if os.path.exists(model_filename):  # If a full path is given.
        return model_filename
    # Otherwise look the model up via the MALT_MODEL environment variable.
    return find_file(model_filename, env_vars=('MALT_MODEL',), verbose=False)
def __init__(self, path_to_model, path_to_jar=None, encoding=None,
             verbose=False, java_options='-mx1000m'):
    """
    Locate the Stanford tagger jar and model file.

    :param path_to_model: path or name of the tagger model file.
    :param path_to_jar: explicit path to the Stanford jar; if None it is
        searched for via find_jar.
    :param encoding: encoding used when exchanging text with the tagger,
        or None.
    :param verbose: pass-through verbosity flag for the finder helpers.
    :param java_options: options forwarded to the JVM (default heap 1000m).
    """
    self._stanford_jar = find_jar(
        self._JAR, path_to_jar,
        searchpath=(), url=_stanford_url,
        verbose=verbose)
    # BUG FIX: env_vars=('STANFORD_MODELS') was a plain string (missing
    # trailing comma), so find_file would treat each character as an
    # environment-variable name.  Use a real 1-tuple.
    self._stanford_model = find_file(
        path_to_model, env_vars=('STANFORD_MODELS',), verbose=verbose)
    self._encoding = encoding
    self.java_options = java_options
def __init__(self, path_to_model, path_to_jar=None, encoding=None,
             verbose=False, java_options='-mx1000m'):
    """
    Locate the Stanford tagger jar and model file.

    :param path_to_model: path or name of the tagger model file.
    :param path_to_jar: explicit path to the Stanford jar; if None it is
        searched for via find_jar.
    :param encoding: encoding used when exchanging text with the tagger,
        or None.
    :param verbose: pass-through verbosity flag for the finder helpers.
    :param java_options: options forwarded to the JVM (default heap 1000m).
    """
    # Subclasses must define _JAR; warn on direct instantiation.
    # (Typo fix: "instanciated" -> "instantiated".)
    if not self._JAR:
        warnings.warn('The StanfordTagger class is not meant to be '
                      'instantiated directly. Did you mean POS- or NERTagger?')
    self._stanford_jar = find_jar(
        self._JAR, path_to_jar,
        searchpath=(), url=_stanford_url,
        verbose=verbose)
    # BUG FIX: env_vars=('STANFORD_MODELS') was a plain string (missing
    # trailing comma), so find_file would treat each character as an
    # environment-variable name.  Use a real 1-tuple.
    self._stanford_model = find_file(
        path_to_model, env_vars=('STANFORD_MODELS',), verbose=verbose)
    self._encoding = encoding
    self.java_options = java_options
def __init__(self, model_filename, path_to_jar=None, encoding='utf8',
             verbose=False, java_options='-mx1000m'):
    """Resolve the Stanford jar(s) and model file, and keep run options."""
    if not self._JAR:
        # Base class guard: concrete subclasses supply _JAR.
        warnings.warn('The StanfordTagger class is not meant to be '
                      'instantiated directly. Did you mean StanfordPOSTagger or StanfordNERTagger?')
    self._stanford_jar = find_jar(self._JAR,
                                  path_to_jar,
                                  searchpath=(),
                                  url=_stanford_url,
                                  verbose=verbose)
    self._stanford_model = find_file(model_filename,
                                     env_vars=('STANFORD_MODELS',),
                                     verbose=verbose)
    # Adding logging jar files to classpath
    stanford_dir = os.path.split(self._stanford_jar)[0]
    self._stanford_jar = tuple(find_jars_within_path(stanford_dir))
    self._encoding = encoding
    self.java_options = java_options
def __init__(self, model_filename=MODEL_FILE_NAME, path_to_jar=PATH_TO_JAR,
             encoding='utf8', verbose=False, java_options='-mx1000m'):
    """Locate the Stanford jar and model, then record run options."""
    # Concrete subclasses are expected to define _JAR.
    if not self._JAR:
        warnings.warn('The StanfordTagger class is not meant to be '
                      'instantiated directly. Did you mean '
                      'StanfordPOSTagger or StanfordNERTagger?')
    jar = find_jar(self._JAR, path_to_jar, searchpath=(),
                   url=_stanford_url, verbose=verbose)
    model = find_file(model_filename, env_vars=('STANFORD_MODELS', ),
                      verbose=verbose)
    self._stanford_jar = jar
    self._stanford_model = model
    self._encoding = encoding
    self.java_options = java_options
def default_config(self, lang):
    """
    Attempt to initialize the Stanford Word Segmenter for the given
    language using the STANFORD_SEGMENTER and STANFORD_MODELS
    environment variables.
    """
    segmenter_home = os.environ.get("STANFORD_SEGMENTER")
    search_path = {os.path.join(segmenter_home, "data")} if segmenter_home else ()

    # Chinese-specific state; reset for every language.
    self._dict = None
    self._sihan_corpora_dict = None
    self._sihan_post_processing = "false"

    if lang == "ar":
        self._java_class = (
            "edu.stanford.nlp.international.arabic.process.ArabicSegmenter"
        )
        model = "arabic-segmenter-atb+bn+arztrain.ser.gz"
    elif lang == "zh":
        self._java_class = "edu.stanford.nlp.ie.crf.CRFClassifier"
        model = "pku.gz"
        self._sihan_post_processing = "true"

        path_to_dict = "dict-chris6.ser.gz"
        try:
            self._dict = find_file(
                path_to_dict,
                searchpath=search_path,
                url=_stanford_url,
                verbose=False,
                env_vars=("STANFORD_MODELS",),
            )
        except LookupError as err:
            raise LookupError(
                "Could not find '%s' (tried using env. "
                "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)"
                % path_to_dict
            ) from err

        sihan_dir = "./data/"
        try:
            sihan_parent = find_dir(
                sihan_dir,
                url=_stanford_url,
                verbose=False,
                env_vars=("STANFORD_SEGMENTER",),
            )
            self._sihan_corpora_dict = os.path.join(sihan_parent, sihan_dir)
        except LookupError as err:
            raise LookupError(
                "Could not find '%s' (tried using the "
                "STANFORD_SEGMENTER environment variable)" % sihan_dir
            ) from err
    else:
        raise LookupError("Unsupported language {}".format(lang))

    try:
        self._model = find_file(
            model,
            searchpath=search_path,
            url=_stanford_url,
            verbose=False,
            env_vars=("STANFORD_MODELS", "STANFORD_SEGMENTER"),
        )
    except LookupError as err:
        raise LookupError(
            "Could not find '%s' (tried using env. "
            "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % model
        ) from err