Example #1
    def default_config(self, lang):
        """
        Attempt to initialize the Stanford Word Segmenter for the specified language,
        using the STANFORD_SEGMENTER and STANFORD_MODELS environment variables.
        """

        search_path = ()
        if os.environ.get('STANFORD_SEGMENTER'):
            search_path = {os.path.join(os.environ.get('STANFORD_SEGMENTER'), 'data')}

        # init for Chinese-specific files
        self._dict = None
        self._sihan_corpora_dict = None
        self._sihan_post_processing = 'false'

        if lang == 'ar':
            self._java_class = 'edu.stanford.nlp.international.arabic.process.ArabicSegmenter'
            model = 'arabic-segmenter-atb+bn+arztrain.ser.gz'

        elif lang == 'zh':
            self._java_class = 'edu.stanford.nlp.ie.crf.CRFClassifier'
            model = 'pku.gz'
            self._sihan_post_processing = 'true'

            path_to_dict = 'dict-chris6.ser.gz'
            try:
                self._dict = find_file(path_to_dict, searchpath=search_path,
                                       url=_stanford_url, verbose=False,
                                       env_vars=('STANFORD_MODELS',))
            except LookupError:
                raise LookupError("Could not find '%s' (tried using env. "
                    "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % path_to_dict)

            sihan_dir = './data/'
            try:
                path_to_sihan_dir = find_dir(sihan_dir,
                                             url=_stanford_url, verbose=False,
                                             env_vars=('STANFORD_SEGMENTER',))
                self._sihan_corpora_dict = os.path.join(path_to_sihan_dir, sihan_dir)
            except LookupError:
                raise LookupError("Could not find '%s' (tried using the "
                    "STANFORD_SEGMENTER environment variable)" % sihan_dir)
        else:
            raise LookupError("Unsupported language '%s'" % lang)

        try:
            self._model = find_file(model, searchpath=search_path,
                                    url=_stanford_url, verbose=False,
                                    env_vars=('STANFORD_MODELS', 'STANFORD_SEGMENTER',))
        except LookupError:
            raise LookupError("Could not find '%s' (tried using env. "
                "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % model)
Example #3
    def __init__(
        self,
        model_filename,
        path_to_jar=None,
        encoding="utf8",
        verbose=False,
        java_options="-mx1000m",
    ):
        # Raise deprecation warning.
        warnings.warn(
            str("\nThe StanfordTokenizer will "
                "be deprecated in version 3.2.6.\n"
                "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead."
                ),
            DeprecationWarning,
            stacklevel=2,
        )

        if not self._JAR:
            warnings.warn("The StanfordTagger class is not meant to be "
                          "instantiated directly. Did you mean "
                          "StanfordPOSTagger or StanfordNERTagger?")
        self._stanford_jar = find_jar(self._JAR,
                                      path_to_jar,
                                      searchpath=(),
                                      url=_stanford_url,
                                      verbose=verbose)

        self._stanford_model = find_file(model_filename,
                                         env_vars=("STANFORD_MODELS", ),
                                         verbose=verbose)

        self._encoding = encoding
        self.java_options = java_options
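
This constructor is normally reached through a concrete subclass rather than directly, as the warning above suggests. A minimal usage sketch, assuming nltk.tag.StanfordPOSTagger and placeholder paths to a local Stanford POS Tagger download:

from nltk.tag import StanfordPOSTagger

# Placeholder paths: adjust to a local Stanford POS Tagger download.
jar = '/opt/stanford-postagger/stanford-postagger.jar'
model = '/opt/stanford-postagger/models/english-bidirectional-distsim.tagger'

st = StanfordPOSTagger(model, path_to_jar=jar)
print(st.tag('What is the airspeed of an unladen swallow ?'.split()))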
Example #4
    def __init__(self, path_to_model, path_to_bin=None,
                 encoding=_hunpos_charset, verbose=False):
        """
        Starts the hunpos-tag executable and establishes a connection with it.

        :param path_to_model: The model file.
        :param path_to_bin: The hunpos-tag binary.
        :param encoding: The encoding used by the model. Unicode tokens
            passed to the tag() and tag_sents() methods are converted to
            this charset when they are sent to hunpos-tag.
            The default is ISO-8859-1 (Latin-1).

            This parameter is ignored for str tokens, which are sent as-is.
            The caller must ensure that tokens are encoded in the right charset.
        """
        self._closed = True
        hunpos_paths = ['.', '/usr/bin', '/usr/local/bin', '/opt/local/bin',
                        '/Applications/bin', '~/bin', '~/Applications/bin']
        hunpos_paths = list(map(os.path.expanduser, hunpos_paths))

        self._hunpos_bin = find_binary(
            'hunpos-tag', path_to_bin,
            env_vars=('HUNPOS_TAGGER',),
            searchpath=hunpos_paths,
            url=_hunpos_url,
            verbose=verbose
        )

        self._hunpos_model = find_file(
            path_to_model, env_vars=('HUNPOS_TAGGER',), verbose=verbose)
        self._encoding = encoding
        self._hunpos = Popen([self._hunpos_bin, self._hunpos_model],
                             shell=False, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        self._closed = False
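
A minimal usage sketch, assuming the class is nltk.tag.HunposTagger and that the binary and model paths below are placeholders for a local hunpos installation:

from nltk.tag import HunposTagger

# Placeholder paths: point these at a local hunpos binary and English model.
ht = HunposTagger('/opt/hunpos/en_wsj.model',
                  path_to_bin='/opt/hunpos/hunpos-tag')
print(ht.tag('What is the airspeed of an unladen swallow ?'.split()))
ht.close()   # terminate the hunpos-tag subprocess started by __init__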
Example #5
    def __init__(
        self,
        model_filename,
        path_to_jar=None,
        encoding='utf8',
        verbose=False,
        java_options='-mx1000m',
    ):
        # Raise deprecation warning.
        warnings.warn(
            str(
                "\nThe StanfordTokenizer will "
                "be deprecated in version 3.2.6.\n"
                "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead."
            ),
            DeprecationWarning,
            stacklevel=2,
        )

        if not self._JAR:
            warnings.warn(
                'The StanfordTagger class is not meant to be '
                'instantiated directly. Did you mean '
                'StanfordPOSTagger or StanfordNERTagger?'
            )
        self._stanford_jar = find_jar(
            self._JAR, path_to_jar, searchpath=(), url=_stanford_url, verbose=verbose
        )

        self._stanford_model = find_file(
            model_filename, env_vars=('STANFORD_MODELS',), verbose=verbose
        )

        self._encoding = encoding
        self.java_options = java_options
Example #6
    def __init__(self,
                 model_filename,
                 path_to_jar=None,
                 encoding='utf8',
                 verbose=False,
                 java_options='-mx1000m'):
        # Raise deprecation warning.
        warnings.simplefilter('always', DeprecationWarning)
        warnings.warn(str(
            "\nThe StanfordTokenizer will "
            "be deprecated in version 3.2.5.\n"
            "Please use \033[91mnltk.tag.stanford.CoreNLPPOSTagger\033[0m "
            "or \033[91mnltk.tag.stanford.CoreNLPNERTagger\033[0m instead."),
                      DeprecationWarning,
                      stacklevel=2)
        warnings.simplefilter('ignore', DeprecationWarning)
        if not self._JAR:
            warnings.warn('The StanfordTagger class is not meant to be '
                          'instantiated directly. Did you mean '
                          'StanfordPOSTagger or StanfordNERTagger?')
        self._stanford_jar = find_jar(self._JAR,
                                      path_to_jar,
                                      searchpath=(),
                                      url=_stanford_url,
                                      verbose=verbose)

        self._stanford_model = find_file(model_filename,
                                         env_vars=('STANFORD_MODELS', ),
                                         verbose=verbose)

        self._encoding = encoding
        self.java_options = java_options
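
The simplefilter calls around the warning are what make it visible: DeprecationWarning is usually filtered out by default, so the code temporarily forces it to display and then mutes it again. A standalone illustration of the same pattern:

import warnings

warnings.simplefilter('always', DeprecationWarning)   # make the next warning display
warnings.warn('this API is going away', DeprecationWarning, stacklevel=2)
warnings.simplefilter('ignore', DeprecationWarning)   # silence further occurrences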
Example #7
    def __init__(self, path_to_model, path_to_bin=None,
                 encoding=_hunpos_charset, verbose=False):
        """
        Starts the hunpos-tag executable and establishes a connection with it.

        :param path_to_model: The model file.
        :param path_to_bin: The hunpos-tag binary.
        :param encoding: The encoding used by the model. Unicode tokens
            passed to the tag() and batch_tag() methods are converted to
            this charset when they are sent to hunpos-tag.
            The default is ISO-8859-1 (Latin-1).

            This parameter is ignored for str tokens, which are sent as-is.
            The caller must ensure that tokens are encoded in the right charset.
        """
        hunpos_paths = ['.', '/usr/bin', '/usr/local/bin', '/opt/local/bin',
                        '/Applications/bin', '~/bin', '~/Applications/bin']
        hunpos_paths = map(os.path.expanduser, hunpos_paths)

        self._hunpos_bin = find_binary(
                'hunpos-tag', path_to_bin,
                env_vars=('HUNPOS', 'HUNPOS_HOME'),
                searchpath=hunpos_paths,
                url=_hunpos_url,
                verbose=verbose)

        self._hunpos_model = find_file(path_to_model,
                env_vars=('HUNPOS', 'HUNPOS_HOME'), verbose=verbose)
        self._encoding = encoding
        self._hunpos = Popen([self._hunpos_bin, self._hunpos_model],
                             shell=False, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        self._closed = False
Example #8
    def __init__(self,
                 model_filename,
                 path_to_jar=None,
                 encoding='utf8',
                 verbose=False,
                 java_options='-mx1000m'):

        if not self._JAR:
            warnings.warn(
                'The StanfordTagger class is not meant to be '
                'instantiated directly. Did you mean StanfordPOSTagger or StanfordNERTagger?'
            )
        self._stanford_jar = find_jar(self._JAR,
                                      path_to_jar,
                                      searchpath=(),
                                      url=_stanford_url,
                                      verbose=verbose)

        self._stanford_model = find_file(model_filename,
                                         env_vars=('STANFORD_MODELS', ),
                                         verbose=verbose)

        # Adding logging jar files to classpath
        stanford_dir = os.path.split(self._stanford_jar)[0]
        self._stanford_jar = tuple(find_jars_within_path(stanford_dir))

        self._encoding = encoding
        self.java_options = java_options
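
The two assignments under the comment replace the single jar with every jar found under the same directory, so the logging jars end up on the classpath as well. A rough sketch of what that tuple becomes when a -cp argument is built from it, using a placeholder install directory:

import os
from nltk.internals import find_jars_within_path

stanford_dir = '/opt/stanford-postagger'           # placeholder install directory
jars = tuple(find_jars_within_path(stanford_dir))  # every *.jar under that directory
classpath = os.pathsep.join(jars)                  # ':'-separated on Unix, ';' on Windows
print(classpath)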
Example #9
def find_malt_model(model_filename):
    """
    A helper function to find a pre-trained MaltParser model.
    """
    if model_filename is None:
        return 'malt_temp.mco'
    elif os.path.exists(model_filename): # If a full path is given.
        return model_filename
    else: # Try to find path to malt model in environment variables.
        return find_file(model_filename, env_vars=('MALT_MODEL',), verbose=False)
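
A quick sketch of the three branches, assuming find_malt_model (and the find_file it uses from nltk.internals) is defined as above, and that MALT_MODEL points at a placeholder directory of pre-trained .mco files:

import os

os.environ['MALT_MODEL'] = '/opt/maltparser/models'    # placeholder directory

print(find_malt_model(None))                           # 'malt_temp.mco'
print(find_malt_model('/tmp/engmalt.linear-1.7.mco'))  # returned as-is if that path exists
print(find_malt_model('engmalt.linear-1.7.mco'))       # otherwise resolved via MALT_MODEL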
Example #11
    def __init__(self, path_to_model, path_to_jar=None, encoding=None, verbose=False, java_options='-mx1000m'):

        self._stanford_jar = find_jar(
                self._JAR, path_to_jar,
                searchpath=(), url=_stanford_url,
                verbose=verbose)

        self._stanford_model = find_file(path_to_model,
                env_vars=('STANFORD_MODELS',), verbose=verbose)
        self._encoding = encoding
        self.java_options = java_options
Example #12
    def __init__(self, path_to_model, path_to_jar=None, encoding=None, verbose=False, java_options='-mx1000m'):

        if not self._JAR:
            warnings.warn('The StanfordTagger class is not meant to be '
                    'instantiated directly. Did you mean POS- or NERTagger?')
        self._stanford_jar = find_jar(
                self._JAR, path_to_jar,
                searchpath=(), url=_stanford_url,
                verbose=verbose)

        self._stanford_model = find_file(path_to_model,
                env_vars=('STANFORD_MODELS',), verbose=verbose)
        self._encoding = encoding
        self.java_options = java_options
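
One detail worth noting: env_vars has to be a real tuple. Without the trailing comma the parentheses are just grouping, the value is a plain string, and iterating over it yields single characters instead of variable names:

print(type(('STANFORD_MODELS')))    # <class 'str'>, parentheses alone do not make a tuple
print(type(('STANFORD_MODELS',)))   # <class 'tuple'>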
Example #13
    def __init__(self, model_filename, path_to_jar=None, encoding='utf8', verbose=False, java_options='-mx1000m'):

        if not self._JAR:
            warnings.warn('The StanfordTagger class is not meant to be '
                    'instantiated directly. Did you mean StanfordPOSTagger or StanfordNERTagger?')
        self._stanford_jar = find_jar(
                self._JAR, path_to_jar,
                searchpath=(), url=_stanford_url,
                verbose=verbose)

        self._stanford_model = find_file(model_filename,
                env_vars=('STANFORD_MODELS',), verbose=verbose)
        
        # Adding logging jar files to classpath 
        stanford_dir = os.path.split(self._stanford_jar)[0]
        self._stanford_jar = tuple(find_jars_within_path(stanford_dir))
        
        self._encoding = encoding
        self.java_options = java_options
Example #14
    def __init__(self,
                 model_filename=MODEL_FILE_NAME,
                 path_to_jar=PATH_TO_JAR,
                 encoding='utf8',
                 verbose=False,
                 java_options='-mx1000m'):

        if not self._JAR:
            warnings.warn('The StanfordTagger class is not meant to be '
                          'instantiated directly. Did you mean '
                          'StanfordPOSTagger or StanfordNERTagger?')
        self._stanford_jar = find_jar(self._JAR,
                                      path_to_jar,
                                      searchpath=(),
                                      url=_stanford_url,
                                      verbose=verbose)

        self._stanford_model = find_file(model_filename,
                                         env_vars=('STANFORD_MODELS', ),
                                         verbose=verbose)

        self._encoding = encoding
        self.java_options = java_options
Example #15
    def default_config(self, lang):
        """
        Attempt to initialize the Stanford Word Segmenter for the specified language,
        using the STANFORD_SEGMENTER and STANFORD_MODELS environment variables.
        """

        search_path = ()
        if os.environ.get("STANFORD_SEGMENTER"):
            search_path = {os.path.join(os.environ.get("STANFORD_SEGMENTER"), "data")}

        # init for Chinese-specific files
        self._dict = None
        self._sihan_corpora_dict = None
        self._sihan_post_processing = "false"

        if lang == "ar":
            self._java_class = (
                "edu.stanford.nlp.international.arabic.process.ArabicSegmenter"
            )
            model = "arabic-segmenter-atb+bn+arztrain.ser.gz"

        elif lang == "zh":
            self._java_class = "edu.stanford.nlp.ie.crf.CRFClassifier"
            model = "pku.gz"
            self._sihan_post_processing = "true"

            path_to_dict = "dict-chris6.ser.gz"
            try:
                self._dict = find_file(
                    path_to_dict,
                    searchpath=search_path,
                    url=_stanford_url,
                    verbose=False,
                    env_vars=("STANFORD_MODELS",),
                )
            except LookupError as e:
                raise LookupError(
                    "Could not find '%s' (tried using env. "
                    "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)"
                    % path_to_dict
                ) from e

            sihan_dir = "./data/"
            try:
                path_to_sihan_dir = find_dir(
                    sihan_dir,
                    url=_stanford_url,
                    verbose=False,
                    env_vars=("STANFORD_SEGMENTER",),
                )
                self._sihan_corpora_dict = os.path.join(path_to_sihan_dir, sihan_dir)
            except LookupError as e:
                raise LookupError(
                    "Could not find '%s' (tried using the "
                    "STANFORD_SEGMENTER environment variable)" % sihan_dir
                ) from e
        else:
            raise LookupError("Unsupported language {}".format(lang))

        try:
            self._model = find_file(
                model,
                searchpath=search_path,
                url=_stanford_url,
                verbose=False,
                env_vars=("STANFORD_MODELS", "STANFORD_SEGMENTER"),
            )
        except LookupError as e:
            raise LookupError(
                "Could not find '%s' (tried using env. "
                "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % model
            ) from e
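
Unlike Example #1, this version re-raises with raise ... from e, which keeps the original LookupError attached to the new one as its cause. A standalone illustration of the difference:

def lookup():
    raise LookupError('low-level: file not found')

try:
    try:
        lookup()
    except LookupError as e:
        # chained re-raise: the original error is preserved as __cause__
        raise LookupError('high-level: could not locate model') from e
except LookupError as err:
    print(err)            # high-level: could not locate model
    print(err.__cause__)  # low-level: file not found

For reference, a minimal sketch of how default_config is typically driven, assuming it belongs to nltk.tokenize.stanford_segmenter.StanfordSegmenter and that the paths below are placeholders for a local Stanford Segmenter download:

import os
from nltk.tokenize.stanford_segmenter import StanfordSegmenter

# Placeholder paths: point these at a local Stanford Segmenter installation.
os.environ['STANFORD_SEGMENTER'] = '/opt/stanford-segmenter'
os.environ['STANFORD_MODELS'] = '/opt/stanford-segmenter/data'

seg = StanfordSegmenter()
seg.default_config('zh')   # selects pku.gz plus the Sihan corpora dictionary
print(seg.segment('这是斯坦福中文分词器测试'))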