def find_repptokenizer(self, repp_dirname):
    """
    Locate the REPP tokenizer binary and its *repp.set* config file.

    :param repp_dirname: either the full path to an existing REPP
        installation directory, or a directory name to be resolved
        through the ``REPP_TOKENIZER`` environment variable.
    :return: path to the REPP directory.
    :raises AssertionError: if the ``src/repp`` binary or the
        ``erg/repp.set`` config file is missing from the directory.
    """
    if os.path.exists(repp_dirname):  # If a full path is given.
        _repp_dir = repp_dirname
    else:  # Try to find path to REPP directory in environment variables.
        _repp_dir = find_dir(repp_dirname, env_vars=('REPP_TOKENIZER',))
    # Checks for the REPP binary and erg/repp.set config file.
    # Use os.path.join instead of manual '/' concatenation so the
    # paths are built portably.
    assert os.path.exists(os.path.join(_repp_dir, 'src', 'repp'))
    assert os.path.exists(os.path.join(_repp_dir, 'erg', 'repp.set'))
    return _repp_dir
def find_repptokenizer(self, repp_dirname):
    """
    Locate the REPP tokenizer binary and its *repp.set* config file.

    Resolves *repp_dirname* either as a literal existing path or
    through the ``REPP_TOKENIZER`` environment variable, then verifies
    that the directory contains the expected binary and config file.
    """
    exists = os.path.exists
    if exists(repp_dirname):
        # A full path was supplied directly.
        _repp_dir = repp_dirname
    else:
        # Fall back to resolving via the environment variable.
        _repp_dir = find_dir(repp_dirname, env_vars=("REPP_TOKENIZER",))
    # Sanity-check the layout: the REPP binary and erg/repp.set config.
    for relpath in ("/src/repp", "/erg/repp.set"):
        assert exists(_repp_dir + relpath)
    return _repp_dir
def default_config(self, lang):
    """
    Attempt to initialize Stanford Word Segmenter for the specified
    language using the STANFORD_SEGMENTER and STANFORD_MODELS
    environment variables.

    :param lang: language code; only 'ar' (Arabic) and 'zh' (Chinese)
        are supported.
    :raises LookupError: if *lang* is unsupported, or a required model
        or data file cannot be located.
    """
    # Search <STANFORD_SEGMENTER>/data first when the env var is set.
    search_path = ()
    if os.environ.get('STANFORD_SEGMENTER'):
        search_path = {os.path.join(os.environ.get('STANFORD_SEGMENTER'), 'data')}

    # init for Chinese-specific files
    self._dict = None
    self._sihan_corpora_dict = None
    self._sihan_post_processing = 'false'

    if lang == 'ar':
        self._java_class = 'edu.stanford.nlp.international.arabic.process.ArabicSegmenter'
        model = 'arabic-segmenter-atb+bn+arztrain.ser.gz'
    elif lang == 'zh':
        self._java_class = 'edu.stanford.nlp.ie.crf.CRFClassifier'
        model = 'pku.gz'
        self._sihan_post_processing = 'true'

        # Chinese segmentation additionally needs a dictionary file.
        path_to_dict = 'dict-chris6.ser.gz'
        try:
            self._dict = find_file(path_to_dict, searchpath=search_path,
                                   url=_stanford_url, verbose=False,
                                   env_vars=('STANFORD_MODELS',))
        except LookupError as e:
            raise LookupError("Could not find '%s' (tried using env. "
                              "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)"
                              % path_to_dict) from e

        # ...and the Sihan corpora dictionaries shipped with the segmenter.
        sihan_dir = './data/'
        try:
            path_to_sihan_dir = find_dir(sihan_dir, url=_stanford_url,
                                         verbose=False,
                                         env_vars=('STANFORD_SEGMENTER',))
            self._sihan_corpora_dict = os.path.join(path_to_sihan_dir, sihan_dir)
        except LookupError as e:
            raise LookupError("Could not find '%s' (tried using the "
                              "STANFORD_SEGMENTER environment variable)"
                              % sihan_dir) from e
    else:
        # BUG FIX: the original format string was "'%'" — an invalid
        # printf spec that raised ValueError ("unsupported format
        # character") instead of the intended LookupError message.
        raise LookupError("Unsupported language '%s'" % lang)

    # Locate the language-specific segmentation model itself.
    try:
        self._model = find_file(model, searchpath=search_path,
                                url=_stanford_url, verbose=False,
                                env_vars=('STANFORD_MODELS', 'STANFORD_SEGMENTER',))
    except LookupError as e:
        raise LookupError("Could not find '%s' (tried using env. "
                          "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)"
                          % model) from e
def find_maltparser(parser_dirname):
    """
    Locate the MaltParser installation and collect its .jar files.

    :param parser_dirname: either the full path to an existing
        MaltParser directory, or a directory name to be resolved
        through the ``MALT_PARSER`` environment variable.
    :return: list of paths to the MaltParser jars and dependencies.
    :raises AssertionError: if a required dependency jar or the
        ``maltparser-*.jar`` itself is missing from the directory.
    """
    if os.path.exists(parser_dirname):  # If a full path is given.
        _malt_dir = parser_dirname
    else:  # Try to find path to maltparser directory in environment variables.
        _malt_dir = find_dir(parser_dirname, env_vars=('MALT_PARSER',))

    # Checks that the found directory contains all the necessary .jar
    # files.  (The original code first bound malt_dependencies to a
    # throwaway list of empty strings; that dead assignment is removed.)
    _malt_jars = set(find_jars_within_path(_malt_dir))
    _jars = set(os.path.split(jar)[1] for jar in _malt_jars)
    malt_dependencies = {'log4j.jar', 'libsvm.jar', 'liblinear-1.8.jar'}
    assert malt_dependencies.issubset(_jars)
    assert any(jar.startswith('maltparser-') and jar.endswith('.jar')
               for jar in _jars)
    return list(_malt_jars)
def default_config(self, lang):
    """
    Attempt to initialize Stanford Word Segmenter for the specified
    language using the STANFORD_SEGMENTER and STANFORD_MODELS
    environment variables.

    :param lang: language code; only 'ar' (Arabic) and 'zh' (Chinese)
        are supported.
    :raises LookupError: if *lang* is unsupported, or a required model
        or data file cannot be located.
    """
    # Search <STANFORD_SEGMENTER>/data first when the env var is set.
    search_path = ()
    if os.environ.get("STANFORD_SEGMENTER"):
        search_path = {os.path.join(os.environ.get("STANFORD_SEGMENTER"), "data")}

    # init for Chinese-specific files
    self._dict = None
    self._sihan_corpora_dict = None
    self._sihan_post_processing = "false"

    if lang == "ar":
        self._java_class = (
            "edu.stanford.nlp.international.arabic.process.ArabicSegmenter"
        )
        model = "arabic-segmenter-atb+bn+arztrain.ser.gz"
    elif lang == "zh":
        self._java_class = "edu.stanford.nlp.ie.crf.CRFClassifier"
        model = "pku.gz"
        self._sihan_post_processing = "true"

        # Chinese segmentation additionally needs a dictionary file.
        path_to_dict = "dict-chris6.ser.gz"
        try:
            self._dict = find_file(
                path_to_dict,
                searchpath=search_path,
                url=_stanford_url,
                verbose=False,
                env_vars=("STANFORD_MODELS",),
            )
        except LookupError as e:
            raise LookupError(
                "Could not find '%s' (tried using env. "
                "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)"
                % path_to_dict
            ) from e

        # ...and the Sihan corpora dictionaries shipped with the segmenter.
        sihan_dir = "./data/"
        try:
            path_to_sihan_dir = find_dir(
                sihan_dir,
                url=_stanford_url,
                verbose=False,
                env_vars=("STANFORD_SEGMENTER",),
            )
            self._sihan_corpora_dict = os.path.join(path_to_sihan_dir, sihan_dir)
        except LookupError as e:
            raise LookupError(
                "Could not find '%s' (tried using the "
                "STANFORD_SEGMENTER environment variable)" % sihan_dir
            ) from e
    else:
        raise LookupError("Unsupported language {}".format(lang))

    # Locate the language-specific segmentation model itself.
    try:
        self._model = find_file(
            model,
            searchpath=search_path,
            url=_stanford_url,
            verbose=False,
            env_vars=("STANFORD_MODELS", "STANFORD_SEGMENTER"),
        )
    except LookupError as e:
        raise LookupError(
            "Could not find '%s' (tried using env. "
            "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % model
        ) from e