def get_spacy_model(spacy_model_name: str, pos_tags: bool, parse: bool, ner: bool) -> SpacyModelType:
    """Return a cached spaCy pipeline for the requested model/options combo.

    Loaded models are memoized in ``LOADED_SPACY_MODELS`` keyed by every
    option, so each configuration is only loaded from disk once per process.
    """
    cache_key = (spacy_model_name, pos_tags, parse, ner)
    cached = LOADED_SPACY_MODELS.get(cache_key)
    if cached is not None:
        return cached

    # Disable every pipeline component the caller did not ask for.
    disable = ['vectors', 'textcat']
    disable.extend(component for component, wanted
                   in (('tagger', pos_tags), ('parser', parse), ('ner', ner))
                   if not wanted)
    try:
        spacy_model = spacy.load(spacy_model_name, disable=disable)
    except OSError:
        logger.warning(f"Spacy models '{spacy_model_name}' not found. Downloading and installing.")
        spacy_download(spacy_model_name)
        # NOTE(mattg): The following four lines are a workaround suggested by Ines for spacy
        # 2.1.0, which removed the linking that was done in spacy 2.0. importlib doesn't find
        # packages that were installed in the same python session, so the way `spacy_download`
        # works in 2.1.0 is broken for this use case. These four lines can probably be removed
        # at some point in the future, once spacy has figured out a better way to handle this.
        # See https://github.com/explosion/spaCy/issues/3435.
        from spacy.cli import link
        from spacy.util import get_package_path
        package_path = get_package_path(spacy_model_name)
        link(spacy_model_name, spacy_model_name, model_path=package_path)
        spacy_model = spacy.load(spacy_model_name, disable=disable)

    LOADED_SPACY_MODELS[cache_key] = spacy_model
    return spacy_model
def load_spacy_model(model):
    """Load a spaCy model by name, downloading and linking it on first use.

    Russian is special-cased: the tokenizer-only ``Russian`` language class is
    used instead of a downloadable model package.

    :param model: spaCy model name (e.g. ``en_core_web_sm``) or ``"ru"``.
    :raises OSError: if the model cannot be loaded or downloaded.
    """
    # FIX: removed large slabs of commented-out stanfordnlp/stanza experiments
    # that obscured the live code path; behavior is unchanged.
    if model == "ru":
        try:
            from spacy.lang.ru import Russian
            return Russian()
        except OSError as e:
            raise OSError(
                "Failed to get spaCy Russian model. Install it using "
                "pip install git+https://github.com/aatimofeev/spacy_russian_tokenizer.git"
            ) from e
    import spacy
    try:
        return spacy.load(model)
    except OSError:
        spacy.cli.download(model)
        # Workaround from https://github.com/explosion/spaCy/issues/3435#issuecomment-474580269
        from spacy.cli import link
        from spacy.util import get_package_path
        link(model, model, force=True, model_path=get_package_path(model))
        try:
            return spacy.load(model)
        except OSError as e:
            raise OSError(
                "Failed to get spaCy model. Download it manually using "
                "`python -m spacy download %s`." % model) from e
def load_spacy_model(model):
    """Return a spaCy Language object for ``model``, downloading it on demand.

    :param model: spaCy model name, or ``"ru"`` for the bare Russian tokenizer.
    :raises OSError: if the model cannot be loaded even after downloading.
    """
    if model == "ru":
        # The Russian pipeline ships separately from the regular model packages.
        try:
            from spacy.lang.ru import Russian
            return Russian()
        except OSError as err:
            msg = ("Failed to get spaCy Russian model. Install it using "
                   "pip install git+https://github.com/aatimofeev/spacy_russian_tokenizer.git")
            raise OSError(msg) from err
    import spacy
    try:
        return spacy.load(model)
    except OSError:
        # First load failed: fetch the package, then link it under ``model``.
        spacy.cli.download(model)
        # Workaround from https://github.com/explosion/spaCy/issues/3435#issuecomment-474580269
        from spacy.cli import link
        from spacy.util import get_package_path
        package_path = get_package_path(model)
        link(model, model, force=True, model_path=package_path)
        try:
            return spacy.load(model)
        except OSError as err:
            raise OSError(
                "Failed to get spaCy model. Download it manually using "
                "`python -m spacy download %s`." % model) from err
def get_nlp(lang="en"):
    """Load spaCy model for a given language, determined by `models' dict or by MODEL_ENV_VAR"""
    # Module-level cache: one pipeline instance per language code.
    instance = nlp.get(lang)
    if instance is None:
        import spacy
        # Resolve the model name: explicit entry in `models`, else the
        # language-specific env var (MODEL_ENV_VAR + "_" + LANG), else the
        # generic env var, else the per-language default ("xx" as last resort).
        model = models.get(lang)
        if not model:
            models[lang] = model = os.environ.get("_".join((MODEL_ENV_VAR, lang.upper()))) or \
                                   os.environ.get(MODEL_ENV_VAR) or DEFAULT_MODEL.get(lang, "xx")
        started = time.time()
        with external_write_mode():
            print("Loading spaCy model '%s'... " % model, end="", flush=True)
            try:
                nlp[lang] = instance = spacy.load(model)
            except OSError:
                # Model package not installed: download it, then link it so
                # spacy.load(model) resolves in this same session.
                spacy.cli.download(model)
                # Workaround from https://github.com/explosion/spaCy/issues/3435#issuecomment-474580269
                from spacy.cli import link
                from spacy.util import get_package_path
                link(model, model, force=True, model_path=get_package_path(model))
                try:
                    nlp[lang] = instance = spacy.load(model)
                except OSError as e:
                    raise OSError(
                        "Failed to get spaCy model. Download it manually using "
                        "`python -m spacy download %s`." % model) from e
            # Keep the original tokenizer, then replace it with one that builds
            # a Doc straight from pre-tokenized words (input is already split).
            tokenizer[lang] = instance.tokenizer
            instance.tokenizer = lambda words: spacy.tokens.Doc(instance.vocab, words=words)
            print("Done (%.3fs)." % (time.time() - started))
    return instance
def train(self):
    """Train this snapshot's spaCy NER model, then persist it to disk.

    Acquires the snapshot's training lock for the whole operation, loads the
    base model named by the NERD_SPACY_MODEL env var (downloading it if
    missing), trains on the snapshot texts, and finally replaces the on-disk
    model directory.
    """
    with self.snapshot.training_lock():
        spacy_model_name = os.environ.get('NERD_SPACY_MODEL')
        with log_perf(f'{self.snapshot} TRAINING'):
            try:
                self._nlp = spacy.load(spacy_model_name)
            except OSError:
                logger.warning(
                    f"Spacy model '{spacy_model_name}' not found. Downloading and installing."
                )
                from spacy.cli.download import download as spacy_download
                spacy_download(spacy_model_name)
                # spaCy 2.1 no longer auto-links models installed in the same
                # session (https://github.com/explosion/spaCy/issues/3435).
                from spacy.cli import link
                from spacy.util import get_package_path
                package_path = get_package_path(spacy_model_name)
                # FIX: spacy.cli.link's keyword is `model_path`, not
                # `package_path` — the old call raised TypeError whenever the
                # download fallback actually ran.
                link(spacy_model_name, spacy_model_name, force=True,
                     model_path=package_path)
                self._nlp = spacy.load(spacy_model_name)
            self._add_types()
            self._train_snapshot_texts()

        # Training happens in memory above; this block only touches disk.
        # NOTE(review): assumed to sit inside training_lock() per the original
        # statement order — confirm against version control if possible.
        with log_perf(f'{self.snapshot} SAVING_TO_DISK'):
            if os.path.exists(self._path):
                shutil.rmtree(self._path)
            self._nlp.to_disk(self._path)
def load_lang_model(lang: str, disable: List[str]):
    """Load spaCy language model or download if model is available and not installed

    Arguments:
        lang {str} -- language
        disable {List[str]} -- If only using tokenizer, can disable ['parser', 'ner', 'textcat']

    Returns:
        [type] -- [description]
    """
    if 'coref' in lang:
        # Coref models extend a base model; if the coref package fails to
        # load, fall back to the plain base language (e.g. 'en_coref_md' -> 'en').
        # FIX: the except clause had been commented out, leaving a bare
        # `try:` with no handler (a SyntaxError); the fallback is restored.
        try:
            return spacy.load(lang, disable=disable)
        except Exception:
            return SpacyAnnotator.load_lang_model(lang.split('_')[0], disable=disable)
    try:
        return spacy.load(lang, disable=disable)
    except OSError:
        logger.warning(
            f"Spacy models '{lang}' not found. Downloading and installing."
        )
        spacy_download(lang)
        # NOTE(mattg): The following four lines are a workaround suggested by Ines for spacy
        # 2.1.0, which removed the linking that was done in spacy 2.0. importlib doesn't find
        # packages that were installed in the same python session, so the way `spacy_download`
        # works in 2.1.0 is broken for this use case. These four lines can probably be removed
        # at some point in the future, once spacy has figured out a better way to handle this.
        # See https://github.com/explosion/spaCy/issues/3435.
        from spacy.cli import link
        from spacy.util import get_package_path
        package_path = get_package_path(lang)
        link(lang, lang, model_path=package_path)
        return spacy.load(lang, disable=disable)
def spacy_downloader(spacy_model_name: str, pos_tags: bool, parse: bool, ner: bool) -> SpacyModelType:
    '''
    Copy of allennlp.common.util.get_spacy_model: downloads the requested
    spaCy model if needed and loads it with only the requested taggers
    (POS, parse, NER) enabled. Spacy can have multiple trained models per
    language based on size.

    :param spacy_model_name: Name of the Spacy model e.g. en_core_web_sm
    :param pos_tags: Whether or not the returned Spacy model should perform POS tagging.
    :param parse: Whether or not the returned Spacy model should perform Parsing.
    :param ner: Whether or not the returned Spacy model should perform NER.
    :returns: The relevant Spacy model.
    '''
    options = (spacy_model_name, pos_tags, parse, ner)
    if options in LOADED_SPACY_MODELS:
        return LOADED_SPACY_MODELS[options]

    # This needs manually updating each time Spacy is updated. Supported
    # languages can be found here: https://spacy.io/usage/models
    supported_codes = ['de', 'el', 'en', 'es', 'fr', 'it', 'nl', 'pt', 'xx']
    lang_code = spacy_model_name[:2]
    if lang_code not in supported_codes:
        raise ValueError('Spacy does not support the following language '
                         f'{lang_code}. These languages are supported '
                         f'{supported_codes}')

    # Turn off every component the caller did not request.
    disable = ['vectors', 'textcat']
    for component, keep in (('tagger', pos_tags), ('parser', parse), ('ner', ner)):
        if not keep:
            disable.append(component)

    try:
        spacy_model = spacy.load(spacy_model_name, disable=disable)
    except OSError:
        print(f"Spacy models '{spacy_model_name}' not found. "
              "Downloading and installing.")
        spacy_download(spacy_model_name)
        # Re-link the freshly installed package (spaCy 2.1 workaround,
        # https://github.com/explosion/spaCy/issues/3435).
        from spacy.cli import link
        from spacy.util import get_package_path
        link(spacy_model_name, spacy_model_name,
             model_path=get_package_path(spacy_model_name))
        spacy_model = spacy.load(spacy_model_name, disable=disable)

    LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options]
def download_models(languages=None, debug=False):
    """Download and link model packages for each configured language.

    Each entry looks like "<slug>" or "<slug>-<KIND>" where KIND selects the
    backend (SPACY or BERT); the concrete package is looked up in
    ``lang_to_model``. pip+ specs are installed via pip and then linked.
    """
    logging.basicConfig(
        format="%(name)s - %(levelname)s - %(message)s",
        level=logging.DEBUG if debug else logging.INFO,
    )
    if not languages:
        languages = cast_supported_languages(
            config("SUPPORTED_LANGUAGES", default="", cast=str))
    for entry in languages:
        parts = entry.split("-")
        lang_slug = parts[0]
        model = parts[1] if len(parts) > 1 else None
        value = lang_to_model.get(lang_slug, {}).get(model, None)
        if model == "SPACY":
            if value.startswith("pip+"):
                # "pip+<model_name>:<pip_package>" — install via pip, then link.
                model_name, pip_package = value[4:].split(":", 1)
                logger.debug("model name: {}".format(model_name))
                logger.debug("pip package: {}".format(pip_package))
                cmd = [sys.executable, "-m", "pip", "install",
                       "--no-deps", "--no-cache-dir", pip_package]
                logger.debug(" ".join(cmd))
                if subprocess.call(cmd, env=os.environ.copy()) != 0:
                    raise Exception("Error to download {}".format(lang_slug))
                logger.debug("linking: {} to {}".format(model_name, lang_slug))
                link(model_name, lang_slug, force=True,
                     model_path=get_package_path(model_name))
            elif lang_slug != value:
                # Package name differs from the slug: download, then alias it.
                logger.debug("downloading {}".format(value))
                download(value)
                logger.debug("linking: {} to {}".format(value, lang_slug))
                link(value, lang_slug, force=True,
                     model_path=get_package_path(value))
            else:
                logger.debug("downloading {}".format(value))
                download(value)
        elif model == "BERT":
            download_bert(value)
def download_model(model_name: str):
    """
    Downloads and links language trained model.

    >>> from nerd import ner
    >>> ner.download_model(model_name='en_core_web_sm')
    >>> supported languages 'en_core_web_sm', 'de_core_news_sm', 'fr_core_news_sm',
    >>> 'es_core_news_sm', 'pt_core_news_sm', 'it_core_news_sm',
    >>> 'nl_core_news_sm', 'el_core_news_sm', 'xx_ent_wiki_sm'

    :param model_name: Model package name.
    :type model_name: str
    """
    download(model_name)
    # Link the package under its own name so spacy.load() resolves it
    # in the same session (spaCy 2.1 workaround).
    link(model_name, model_name, force=True,
         model_path=get_package_path(model_name))
def spacy_model(model: str = 'en_core_web_md') -> None:
    """
    Ensure the given spaCy model is installed, downloading it if missing.

    Parameters
    ----------
    model
        Model to be downloaded
    """
    try:
        spacy.load(model)
    except OSError:
        # Not installed: fetch the package and create a shortcut link so the
        # name resolves (https://github.com/explosion/spaCy/issues/3435).
        download(model)
        link(model, model, force=True, model_path=get_package_path(model))
def download_spacy_models(languages=None, debug=False):
    """Download and link a spaCy model for every supported language.

    ``languages`` maps a language slug to either a model package name or a
    "pip+<model_name>:<pip_package>" spec that is installed via pip.
    """
    logging.basicConfig(
        format="%(name)s - %(levelname)s - %(message)s",
        level=logging.DEBUG if debug else logging.INFO,
    )
    if not languages:
        raw = config("SUPPORTED_LANGUAGES", default="", cast=str)
        languages = cast_supported_languages(raw)
    logger.info("Importing langs: {}".format(", ".join(languages.keys())))
    for lang, spec in languages.items():
        if spec.startswith("pip+"):
            # pip-installed model: install the wheel, then link it to the slug.
            model_name, pip_package = spec[4:].split(":", 1)
            logger.debug("model name: {}".format(model_name))
            logger.debug("pip package: {}".format(pip_package))
            cmd = [sys.executable, "-m", "pip", "install",
                   "--no-deps", "--no-cache-dir", pip_package]
            logger.debug(" ".join(cmd))
            if subprocess.call(cmd, env=os.environ.copy()) != 0:
                raise Exception("Error to download {}".format(lang))
            logger.debug("linking: {} to {}".format(model_name, lang))
            link(model_name, lang, force=True,
                 model_path=get_package_path(model_name))
        elif lang != spec:
            # Package name differs from the slug: download, then alias it.
            logger.debug("downloading {}".format(spec))
            download(spec)
            logger.debug("linking: {} to {}".format(spec, lang))
            link(spec, lang, force=True, model_path=get_package_path(spec))
        else:
            logger.debug("downloading {}".format(spec))
            download(spec)
def download_spacy_models(languages=None, debug=False):
    """Download and link a spaCy model for every supported language.

    ``languages`` maps a language slug to either a model package name or a
    "pip+<model_name>:<pip_package>" spec that is installed via pip.

    FIX: replaced identity comparisons against literals (`... is 0`,
    `lang is not value`) with equality (`== 0`, `!=`). Identity checks on
    ints/strings rely on CPython interning and are not guaranteed to work
    (SyntaxWarning since Python 3.8).
    """
    logging.basicConfig(
        format='%(name)s - %(levelname)s - %(message)s',
        level=logging.DEBUG if debug else logging.INFO)
    if not languages:
        languages = config('SUPPORTED_LANGUAGES', default='', cast=str)
        languages = cast_supported_languages(languages)
    logger.info('Importing langs: {}'.format(', '.join(languages.keys())))
    for lang, value in languages.items():
        if value.startswith('pip+'):
            model_name, pip_package = value[4:].split(':', 1)
            logger.debug('model name: {}'.format(model_name))
            logger.debug('pip package: {}'.format(pip_package))
            cmd = [
                sys.executable,
                '-m',
                'pip',
                'install',
                '--no-deps',
                pip_package,
            ]
            logger.debug(' '.join(cmd))
            if subprocess.call(cmd, env=os.environ.copy()) == 0:
                logger.debug('linking: {} to {}'.format(model_name, lang))
                package_path = get_package_path(model_name)
                link(model_name, lang, force=True, model_path=package_path)
            else:
                raise Exception('Error to download {}'.format(lang))
        elif lang != value:
            logger.debug('downloading {}'.format(value))
            download(value)
            logger.debug('linking: {} to {}'.format(value, lang))
            package_path = get_package_path(value)
            link(value, lang, force=True, model_path=package_path)
        else:
            logger.debug('downloading {}'.format(value))
            download(value)
return "" return Topic[1] def get_sentiment(text): """ :param text: Input Text :return: Sentiment for Text """ return TextBlob(text).sentiment[0] # Load Spacy model_name = "de_core_news_sm" package_path = get_package_path(model_name) link(model_name, model_name, force=True, model_path=package_path) nlp = spacy.load("de_core_news_sm") # Load News news = pd.concat([pd.read_csv(f, sep=";", header="infer", encoding="UTF-8") for f in glob.glob(r"../out/Polizeiberichte_transformed*.csv")], sort=True) news.drop_duplicates() news = news.fillna("") news["Comb_all"] = news["Hauptartikel_lem_clean_no_stop"] + " " + news["Ueberschrift_kombi"] # Split words before further processing news["Ueberschrift_kombi"] = news["Ueberschrift_kombi"].apply(lambda x: [item for item in x.split()]) news["Hauptartikel_lem_clean_no_stop"] = news["Hauptartikel_lem_clean_no_stop"].apply( lambda x: [item for item in x.split()]) news["Comb_all"] = news["Comb_all"].apply(lambda x: [item for item in x.split()])