def install_corenlp(dir=DEFAULT_CORENLP_DIR, url=DEFAULT_CORENLP_URL, logging_level=None, proxies=None):
    """
    A fully automatic way to install and set up the CoreNLP library to use the client functionality.

    Args:
        dir: the directory to download the CoreNLP model into; alternatively can be set up with
            the environment variable $CORENLP_HOME
        url: the link to download CoreNLP models
        logging_level: logging level to use during installation
        proxies: optional proxies to pass to the download request
    """
    dir = os.path.expanduser(dir)
    set_logging_level(logging_level=logging_level, verbose=None)

    if os.path.exists(dir):
        logger.warning(f"Directory {dir} already exists. "
                       f"Please install CoreNLP to a new directory.")
        return

    logger.info(f"Installing CoreNLP package into {dir}...")
    # First download the URL package
    logger.debug(f"Download to destination file: {os.path.join(dir, 'corenlp.zip')}")
    try:
        request_file(url + 'stanford-corenlp-latest.zip',
                     os.path.join(dir, 'corenlp.zip'), proxies)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as e:
        raise RuntimeError(
            "Downloading CoreNLP zip file failed. "
            "Please try manual installation: https://stanfordnlp.github.io/CoreNLP/."
        ) from e

    # Unzip CoreNLP into dir
    logger.debug("Unzipping downloaded zip file...")
    unzip(dir, 'corenlp.zip')

    # By default CoreNLP will be unzipped into a version-dependent folder,
    # e.g., stanford-corenlp-4.0.0. We need to work around that and move
    # the files back into our designated folder.
    logger.debug(f"Moving files into the designated folder at: {dir}")
    corenlp_dirname = get_root_from_zipfile(os.path.join(dir, 'corenlp.zip'))
    corenlp_dirname = os.path.join(dir, corenlp_dirname)
    for f in os.listdir(corenlp_dirname):
        shutil.move(os.path.join(corenlp_dirname, f), dir)

    # Remove the original zip and folder
    logger.debug("Removing downloaded zip file...")
    os.remove(os.path.join(dir, 'corenlp.zip'))
    shutil.rmtree(corenlp_dirname)

    # Warn the user to set up the environment variable
    if dir != DEFAULT_CORENLP_DIR:
        logger.warning(
            f"For a customized installation location, please set the `CORENLP_HOME` "
            f"environment variable to the location of the installation. "
            f"In Unix, this is done with `export CORENLP_HOME={dir}`.")
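# Usage sketch (added for illustration, not part of the library source): this
# assumes the function above is exposed as stanza.install_corenlp, as in recent
# Stanza releases; the target directory here is a hypothetical example.
if __name__ == '__main__':
    import stanza
    stanza.install_corenlp(dir='~/corenlp')
    # When installing outside DEFAULT_CORENLP_DIR, point the client at the
    # installation before use, e.g. in Unix: export CORENLP_HOME=~/corenlp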
def __init__(self, lang='en', dir=DEFAULT_MODEL_DIR, package='default',
             processors={}, logging_level='INFO', verbose=None, use_gpu=True,
             **kwargs):
    self.lang, self.dir, self.kwargs = lang, dir, kwargs

    # set global logging level
    set_logging_level(logging_level, verbose)
    self.logging_level = logging.getLevelName(logger.level)
    # process different pipeline parameters
    lang, dir, package, processors = process_pipeline_parameters(
        lang, dir, package, processors)

    # Load resources.json to obtain latest packages.
    logger.debug('Loading resource file...')
    resources_filepath = os.path.join(dir, 'resources.json')
    if not os.path.exists(resources_filepath):
        raise Exception(
            f"Resources file not found at: {resources_filepath}. Try to download the model again."
        )
    with open(resources_filepath) as infile:
        resources = json.load(infile)
    if lang in resources:
        if 'alias' in resources[lang]:
            logger.info(f'"{lang}" is an alias for "{resources[lang]["alias"]}"')
            lang = resources[lang]['alias']
        lang_name = resources[lang]['lang_name'] if 'lang_name' in resources[lang] else ''
    else:
        logger.warning(f'Unsupported language: {lang}.')

    # Maintain load list
    self.load_list = maintain_processor_list(
        resources, lang, package, processors) if lang in resources else []
    self.load_list = add_dependencies(
        resources, lang, self.load_list) if lang in resources else []
    self.load_list = self.update_kwargs(kwargs, self.load_list)
    if len(self.load_list) == 0:
        raise Exception(
            'No processor to load. Please check if your language or package is correctly set.'
        )
    load_table = make_table(['Processor', 'Package'],
                            [row[:2] for row in self.load_list])
    logger.info(
        f'Loading these models for language: {lang} ({lang_name}):\n{load_table}'
    )

    self.config = build_default_config(resources, lang, dir, self.load_list)
    self.config.update(kwargs)

    # Load processors
    self.processors = {}

    # configs that are the same for all processors
    pipeline_level_configs = {'lang': lang, 'mode': 'predict'}
    self.use_gpu = torch.cuda.is_available() and use_gpu
    logger.info("Use device: {}".format("gpu" if self.use_gpu else "cpu"))

    # set up processors
    pipeline_reqs_exceptions = []
    for item in self.load_list:
        processor_name, _, _ = item
        logger.info('Loading: ' + processor_name)
        curr_processor_config = self.filter_config(processor_name, self.config)
        curr_processor_config.update(pipeline_level_configs)
        logger.debug('With settings: ')
        logger.debug(curr_processor_config)
        try:
            # try to build the processor; throws an exception if there is a requirements issue
            self.processors[processor_name] = NAME_TO_PROCESSOR_CLASS[processor_name](
                config=curr_processor_config, pipeline=self, use_gpu=self.use_gpu)
        except ProcessorRequirementsException as e:
            # if there was a requirements issue, add it to the list which will be printed at the end
            pipeline_reqs_exceptions.append(e)
            # add the broken processor to the loaded processors for the sake of analyzing the validity
            # of the entire proposed pipeline, but at this point the pipeline will not be built successfully
            self.processors[processor_name] = e.err_processor

    # if there are any processor exceptions, throw an exception to indicate pipeline build failure
    if pipeline_reqs_exceptions:
        logger.info('\n')
        raise PipelineRequirementsException(pipeline_reqs_exceptions)

    logger.info("Done loading processors!")
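# Usage sketch for the constructor above (illustration only): it assumes this
# __init__ backs the public stanza.Pipeline entry point and that the English
# models have already been fetched with stanza.download('en').
if __name__ == '__main__':
    import stanza
    nlp = stanza.Pipeline(lang='en', processors='tokenize,pos', use_gpu=False)
    doc = nlp('The pipeline builds one processor per row of the load list.')
    print(doc.sentences[0].words[0].upos)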
def __init__(self, lang='en', dir=DEFAULT_MODEL_DIR, package='default',
             processors={}, logging_level=None, verbose=None, use_gpu=True,
             model_dir=None, **kwargs):
    self.lang, self.dir, self.kwargs = lang, dir, kwargs
    if model_dir is not None and dir == DEFAULT_MODEL_DIR:
        self.dir = model_dir

    # set global logging level
    set_logging_level(logging_level, verbose)
    # process different pipeline parameters
    lang, self.dir, package, processors = process_pipeline_parameters(
        lang, self.dir, package, processors)

    # Load resources.json to obtain latest packages.
    logger.debug('Loading resource file...')
    resources_filepath = os.path.join(self.dir, 'resources.json')
    if not os.path.exists(resources_filepath):
        raise ResourcesFileNotFoundError(resources_filepath)
    with open(resources_filepath) as infile:
        resources = json.load(infile)
    if lang in resources:
        if 'alias' in resources[lang]:
            logger.info(f'"{lang}" is an alias for "{resources[lang]["alias"]}"')
            lang = resources[lang]['alias']
        lang_name = resources[lang]['lang_name'] if 'lang_name' in resources[lang] else ''
    else:
        logger.warning(f'Unsupported language: {lang}.')

    # Maintain load list
    processors = self.maybe_add_mwt(kwargs, resources, lang, processors)
    self.load_list = maintain_processor_list(
        resources, lang, package, processors) if lang in resources else []
    self.load_list = add_dependencies(
        resources, lang, self.load_list) if lang in resources else []
    self.load_list = self.update_kwargs(kwargs, self.load_list)
    if len(self.load_list) == 0:
        raise ValueError(
            'No processors to load for language {}. Please check if your language or package is correctly set.'
            .format(lang))
    load_table = make_table(['Processor', 'Package'],
                            [row[:2] for row in self.load_list])
    logger.info(
        f'Loading these models for language: {lang} ({lang_name}):\n{load_table}'
    )

    self.config = build_default_config(resources, lang, self.dir, self.load_list)
    self.config.update(kwargs)

    # Load processors
    self.processors = {}

    # configs that are the same for all processors
    pipeline_level_configs = {'lang': lang, 'mode': 'predict'}
    self.use_gpu = torch.cuda.is_available() and use_gpu
    logger.info("Use device: {}".format("gpu" if self.use_gpu else "cpu"))

    # set up processors
    pipeline_reqs_exceptions = []
    for item in self.load_list:
        processor_name, _, _ = item
        logger.info('Loading: ' + processor_name)
        curr_processor_config = self.filter_config(processor_name, self.config)
        curr_processor_config.update(pipeline_level_configs)
        # TODO: this is obviously a hack
        # a better solution overall would be to make a pretagged version of the pos annotator
        # and then subsequent modules can use those tags without knowing where those tags came from
        if "pretagged" in self.config and "pretagged" not in curr_processor_config:
            curr_processor_config["pretagged"] = self.config["pretagged"]
        logger.debug('With settings: ')
        logger.debug(curr_processor_config)
        try:
            # try to build the processor; throws an exception if there is a requirements issue
            self.processors[processor_name] = NAME_TO_PROCESSOR_CLASS[processor_name](
                config=curr_processor_config, pipeline=self, use_gpu=self.use_gpu)
        except ProcessorRequirementsException as e:
            # if there was a requirements issue, add it to the list which will be printed at the end
            pipeline_reqs_exceptions.append(e)
            # add the broken processor to the loaded processors for the sake of analyzing the validity
            # of the entire proposed pipeline, but at this point the pipeline will not be built successfully
            self.processors[processor_name] = e.err_processor
        except FileNotFoundError as e:
            # For a FileNotFoundError, we try to guess if there's
            # a missing model directory or file. If so, we
            # suggest the user try to download the models
            if 'model_path' in curr_processor_config:
                model_path = curr_processor_config['model_path']
                model_dir, model_name = os.path.split(model_path)
                lang_dir = os.path.dirname(model_dir)
                if not os.path.exists(lang_dir):
                    # model files for this language can't be found in the expected directory
                    raise LanguageNotDownloadedError(lang, lang_dir, model_path) from e
                if processor_name not in resources[lang]:
                    # user asked for a model which doesn't exist for this language?
                    raise UnsupportedProcessorError(processor_name, lang)
                if not os.path.exists(model_path):
                    model_name, _ = os.path.splitext(model_name)
                    # TODO: before recommending this, check that such a thing exists in resources.json.
                    # currently that case is handled by ignoring the model, anyway
                    raise FileNotFoundError(
                        'Could not find model file %s, although there are other models downloaded for language %s. '
                        'Perhaps you need to download a specific model. '
                        'Try: stanza.download(lang="%s",package=None,processors={"%s":"%s"})'
                        % (model_path, lang, lang, processor_name, model_name)) from e
            # if we couldn't find a more suitable description of the
            # FileNotFoundError, just raise the old error
            raise

    # if there are any processor exceptions, throw an exception to indicate pipeline build failure
    if pipeline_reqs_exceptions:
        logger.info('\n')
        raise PipelineRequirementsException(pipeline_reqs_exceptions)

    logger.info("Done loading processors!")
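# Recovery sketch for the FileNotFoundError branch above (illustration only):
# when a specific model file is missing, the raised message suggests a targeted
# download of just that processor's package. The "conll03" NER package for
# English is an assumed example entry in resources.json, not a guaranteed one.
if __name__ == '__main__':
    import stanza
    # mirrors the suggestion embedded in the error message above
    stanza.download(lang='en', package=None, processors={'ner': 'conll03'})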