Esempio n. 1
0
File: repp.py Progetto: DrDub/nltk
 def find_repptokenizer(self, repp_dirname):
     """
     Find the REPP tokenizer binary and its *repp.set* config file.

     If *repp_dirname* is an existing path it is used as-is; otherwise it is
     handed to ``find_dir`` together with the ``REPP_TOKENIZER`` environment
     variable name.

     :param repp_dirname: path to (or name of) the REPP directory.
     :return: the REPP directory, which contains ``src/repp`` and
         ``erg/repp.set``.
     :raises AssertionError: if the binary or the config file is missing.
     """
     if os.path.exists(repp_dirname):  # If a full path is given.
         _repp_dir = repp_dirname
     else:  # Try to find path to REPP directory in environment variables.
         _repp_dir = find_dir(repp_dirname, env_vars=('REPP_TOKENIZER',))
     # Check for the REPP binary and the erg/repp.set config file.
     # AssertionError is raised explicitly (rather than via ``assert``) so the
     # check is not stripped under ``python -O`` while callers that catch
     # AssertionError keep working.
     for required in ('/src/repp', '/erg/repp.set'):
         if not os.path.exists(_repp_dir + required):
             raise AssertionError(
                 "Required REPP file not found: %s" % (_repp_dir + required)
             )
     return _repp_dir
Esempio n. 2
0
 def find_repptokenizer(self, repp_dirname):
     """
     Find the REPP tokenizer binary and its *repp.set* config file.

     If *repp_dirname* is an existing path it is used as-is; otherwise it is
     handed to ``find_dir`` together with the ``REPP_TOKENIZER`` environment
     variable name.

     :param repp_dirname: path to (or name of) the REPP directory.
     :return: the REPP directory, which contains ``src/repp`` and
         ``erg/repp.set``.
     :raises AssertionError: if the binary or the config file is missing.
     """
     if os.path.exists(repp_dirname):  # If a full path is given.
         _repp_dir = repp_dirname
     else:  # Try to find path to REPP directory in environment variables.
         _repp_dir = find_dir(repp_dirname, env_vars=("REPP_TOKENIZER",))
     # Check for the REPP binary and the erg/repp.set config file.
     # AssertionError is raised explicitly (rather than via ``assert``) so the
     # check is not stripped under ``python -O`` while callers that catch
     # AssertionError keep working.
     for required in ("/src/repp", "/erg/repp.set"):
         if not os.path.exists(_repp_dir + required):
             raise AssertionError(
                 "Required REPP file not found: %s" % (_repp_dir + required)
             )
     return _repp_dir
    def default_config(self, lang):
        """
        Attempt to initialize Stanford Word Segmenter for the specified language
        using the STANFORD_SEGMENTER and STANFORD_MODELS environment variables.

        Sets ``self._java_class`` and ``self._model``; for Chinese it also sets
        ``self._dict``, ``self._sihan_corpora_dict`` and switches
        ``self._sihan_post_processing`` to ``'true'``.

        :param lang: language code; ``'ar'`` and ``'zh'`` are supported.
        :raises LookupError: if *lang* is not supported, or if a required
            file or directory cannot be located.
        """

        # Extra location to search: <STANFORD_SEGMENTER>/data, when the
        # environment variable is set.
        search_path = ()
        if os.environ.get('STANFORD_SEGMENTER'):
            search_path = {os.path.join(os.environ.get('STANFORD_SEGMENTER'), 'data')}

        # init for Chinese-specific files
        self._dict = None
        self._sihan_corpora_dict = None
        self._sihan_post_processing = 'false'

        if lang == 'ar':
            self._java_class = 'edu.stanford.nlp.international.arabic.process.ArabicSegmenter'
            model = 'arabic-segmenter-atb+bn+arztrain.ser.gz'

        elif lang == 'zh':
            self._java_class = 'edu.stanford.nlp.ie.crf.CRFClassifier'
            model = 'pku.gz'
            self._sihan_post_processing = 'true'

            path_to_dict = 'dict-chris6.ser.gz'
            try:
                self._dict = find_file(path_to_dict, searchpath=search_path,
                                       url=_stanford_url, verbose=False,
                                       env_vars=('STANFORD_MODELS',))
            except LookupError:
                raise LookupError("Could not find '%s' (tried using env. "
                    "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % path_to_dict)

            sihan_dir = './data/'
            try:
                path_to_sihan_dir = find_dir(sihan_dir,
                                             url=_stanford_url, verbose=False,
                                             env_vars=('STANFORD_SEGMENTER',))
                self._sihan_corpora_dict = os.path.join(path_to_sihan_dir, sihan_dir)
            except LookupError:
                raise LookupError("Could not find '%s' (tried using the "
                    "STANFORD_SEGMENTER environment variable)" % sihan_dir)
        else:
            # The placeholder must be '%s': a bare '%' followed by a quote is an
            # invalid conversion and made this line raise ValueError instead
            # of the intended LookupError.
            raise LookupError("Unsupported language '%s'" % (lang,))

        try:
            self._model = find_file(model, searchpath=search_path,
                                    url=_stanford_url, verbose=False,
                                    env_vars=('STANFORD_MODELS', 'STANFORD_SEGMENTER',))
        except LookupError:
            raise LookupError("Could not find '%s' (tried using env. "
                "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % model)
Esempio n. 4
0
    def default_config(self, lang):
        """
        Attempt to initialize Stanford Word Segmenter for the specified language
        using the STANFORD_SEGMENTER and STANFORD_MODELS environment variables.

        Sets ``self._java_class`` and ``self._model``; for Chinese it also sets
        ``self._dict``, ``self._sihan_corpora_dict`` and switches
        ``self._sihan_post_processing`` to ``'true'``.

        :param lang: language code; ``'ar'`` and ``'zh'`` are supported.
        :raises LookupError: if *lang* is not supported, or if a required
            file or directory cannot be located.
        """

        # Extra location to search: <STANFORD_SEGMENTER>/data, when the
        # environment variable is set.
        search_path = ()
        if os.environ.get('STANFORD_SEGMENTER'):
            search_path = {os.path.join(os.environ.get('STANFORD_SEGMENTER'), 'data')}

        # init for Chinese-specific files
        self._dict = None
        self._sihan_corpora_dict = None
        self._sihan_post_processing = 'false'

        if lang == 'ar':
            self._java_class = 'edu.stanford.nlp.international.arabic.process.ArabicSegmenter'
            model = 'arabic-segmenter-atb+bn+arztrain.ser.gz'

        elif lang == 'zh':
            self._java_class = 'edu.stanford.nlp.ie.crf.CRFClassifier'
            model = 'pku.gz'
            self._sihan_post_processing = 'true'

            path_to_dict = 'dict-chris6.ser.gz'
            try:
                self._dict = find_file(path_to_dict, searchpath=search_path,
                                       url=_stanford_url, verbose=False,
                                       env_vars=('STANFORD_MODELS',))
            except LookupError:
                raise LookupError("Could not find '%s' (tried using env. "
                    "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % path_to_dict)

            sihan_dir = './data/'
            try:
                path_to_sihan_dir = find_dir(sihan_dir,
                                             url=_stanford_url, verbose=False,
                                             env_vars=('STANFORD_SEGMENTER',))
                self._sihan_corpora_dict = os.path.join(path_to_sihan_dir, sihan_dir)
            except LookupError:
                raise LookupError("Could not find '%s' (tried using the "
                    "STANFORD_SEGMENTER environment variable)" % sihan_dir)
        else:
            # The placeholder must be '%s': a bare '%' followed by a quote is an
            # invalid conversion and made this line raise ValueError instead
            # of the intended LookupError.
            raise LookupError("Unsupported language '%s'" % (lang,))

        try:
            self._model = find_file(model, searchpath=search_path,
                                    url=_stanford_url, verbose=False,
                                    env_vars=('STANFORD_MODELS', 'STANFORD_SEGMENTER',))
        except LookupError:
            raise LookupError("Could not find '%s' (tried using env. "
                "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % model)
Esempio n. 5
0
def find_maltparser(parser_dirname):
    """
    Find the MaltParser .jar file and its dependencies.

    If *parser_dirname* is an existing path it is used as-is; otherwise it is
    handed to ``find_dir`` together with the ``MALT_PARSER`` environment
    variable name.

    :param parser_dirname: path to (or name of) the MaltParser directory.
    :return: list of the jar paths reported by ``find_jars_within_path``.
    :raises AssertionError: if a required dependency jar or the
        ``maltparser-*.jar`` itself is not among the jars found.
    """
    if os.path.exists(parser_dirname):  # If a full path is given.
        _malt_dir = parser_dirname
    else:  # Try to find path to maltparser directory in environment variables.
        _malt_dir = find_dir(parser_dirname, env_vars=('MALT_PARSER',))
    # Check that the found directory contains all the necessary .jar files.
    _malt_jars = set(find_jars_within_path(_malt_dir))
    _jars = {os.path.split(jar)[1] for jar in _malt_jars}
    malt_dependencies = {'log4j.jar', 'libsvm.jar', 'liblinear-1.8.jar'}

    # AssertionError is raised explicitly (rather than via ``assert``) so the
    # checks survive ``python -O`` while keeping the exception type callers
    # may already handle.
    missing = malt_dependencies - _jars
    if missing:
        raise AssertionError(
            "MaltParser dependencies not found in %s: %s"
            % (_malt_dir, ', '.join(sorted(missing)))
        )
    if not any(name.startswith('maltparser-') and name.endswith('.jar')
               for name in _jars):
        raise AssertionError("No maltparser-*.jar found in %s" % _malt_dir)
    return list(_malt_jars)
Esempio n. 6
0
def find_maltparser(parser_dirname):
    """
    Find the MaltParser .jar file and its dependencies.

    If *parser_dirname* is an existing path it is used as-is; otherwise it is
    handed to ``find_dir`` together with the ``MALT_PARSER`` environment
    variable name.

    :param parser_dirname: path to (or name of) the MaltParser directory.
    :return: list of the jar paths reported by ``find_jars_within_path``.
    :raises AssertionError: if a required dependency jar or the
        ``maltparser-*.jar`` itself is not among the jars found.
    """
    if os.path.exists(parser_dirname):  # If a full path is given.
        _malt_dir = parser_dirname
    else:  # Try to find path to maltparser directory in environment variables.
        _malt_dir = find_dir(parser_dirname, env_vars=('MALT_PARSER',))
    # Check that the found directory contains all the necessary .jar files.
    _malt_jars = set(find_jars_within_path(_malt_dir))
    _jars = {os.path.split(jar)[1] for jar in _malt_jars}
    malt_dependencies = {'log4j.jar', 'libsvm.jar', 'liblinear-1.8.jar'}

    # AssertionError is raised explicitly (rather than via ``assert``) so the
    # checks survive ``python -O`` while keeping the exception type callers
    # may already handle.
    missing = malt_dependencies - _jars
    if missing:
        raise AssertionError(
            "MaltParser dependencies not found in %s: %s"
            % (_malt_dir, ', '.join(sorted(missing)))
        )
    if not any(name.startswith('maltparser-') and name.endswith('.jar')
               for name in _jars):
        raise AssertionError("No maltparser-*.jar found in %s" % _malt_dir)
    return list(_malt_jars)
Esempio n. 7
0
    def default_config(self, lang):
        """
        Set up the Stanford Word Segmenter for *lang* using the
        STANFORD_SEGMENTER and STANFORD_MODELS environment variables.

        Assigns ``self._java_class`` and ``self._model``; for ``"zh"`` it
        additionally assigns ``self._dict`` and ``self._sihan_corpora_dict``
        and turns ``self._sihan_post_processing`` to ``"true"``.

        :param lang: language code, either ``"ar"`` or ``"zh"``.
        :raises LookupError: for any other language, or when a required file
            or directory cannot be located.
        """
        # Message shared by the two model/dictionary lookups below.
        missing_file_msg = (
            "Could not find '%s' (tried using env. "
            "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)"
        )

        segmenter_home = os.environ.get("STANFORD_SEGMENTER")
        if segmenter_home:
            search_path = {os.path.join(segmenter_home, "data")}
        else:
            search_path = ()

        # Chinese-specific settings start out disabled.
        self._dict = None
        self._sihan_corpora_dict = None
        self._sihan_post_processing = "false"

        # Reject anything but the two supported languages up front.
        if lang not in ("ar", "zh"):
            raise LookupError("Unsupported language {}".format(lang))

        if lang == "ar":
            self._java_class = (
                "edu.stanford.nlp.international.arabic.process.ArabicSegmenter"
            )
            model = "arabic-segmenter-atb+bn+arztrain.ser.gz"
        else:
            self._java_class = "edu.stanford.nlp.ie.crf.CRFClassifier"
            model = "pku.gz"
            self._sihan_post_processing = "true"

            dict_name = "dict-chris6.ser.gz"
            try:
                self._dict = find_file(
                    dict_name,
                    searchpath=search_path,
                    url=_stanford_url,
                    verbose=False,
                    env_vars=("STANFORD_MODELS",),
                )
            except LookupError as e:
                raise LookupError(missing_file_msg % dict_name) from e

            sihan_dir = "./data/"
            try:
                sihan_root = find_dir(
                    sihan_dir,
                    url=_stanford_url,
                    verbose=False,
                    env_vars=("STANFORD_SEGMENTER",),
                )
            except LookupError as e:
                raise LookupError(
                    "Could not find '%s' (tried using the "
                    "STANFORD_SEGMENTER environment variable)" % sihan_dir
                ) from e
            else:
                self._sihan_corpora_dict = os.path.join(sihan_root, sihan_dir)

        try:
            self._model = find_file(
                model,
                searchpath=search_path,
                url=_stanford_url,
                verbose=False,
                env_vars=("STANFORD_MODELS", "STANFORD_SEGMENTER"),
            )
        except LookupError as e:
            raise LookupError(missing_file_msg % model) from e