def download_models(languages=None, debug=False):
    """Fetch the model behind every configured ``LANG-MODEL`` entry.

    Each entry is split on ``-``: the first piece is the language slug and
    the second piece (when present) names the model family.  The concrete
    model identifier is looked up in ``lang_to_model``.

    * ``SPACY`` entries whose identifier looks like ``pip+NAME:PACKAGE`` are
      installed with pip and linked to the language slug.  Other ``SPACY``
      entries are fetched with ``download`` and linked only when the
      identifier differs from the language slug.
    * ``BERT`` entries are handed to ``download_bert``.
    * Entries with any other model family are ignored.

    :param languages: Language specification passed to
        ``cast_supported_languages``; when falsy, the ``SUPPORTED_LANGUAGES``
        configuration value is used instead.
    :param debug: Log at DEBUG level when true, INFO otherwise.
    :raises Exception: If pip exits with a non-zero status.
    """
    log_level = logging.DEBUG if debug else logging.INFO
    logging.basicConfig(
        format="%(name)s - %(levelname)s - %(message)s",
        level=log_level,
    )

    if not languages:
        languages = config("SUPPORTED_LANGUAGES", default="", cast=str)

    for entry in cast_supported_languages(languages):
        parts = entry.split("-")
        lang_slug = parts[0]
        model = parts[1] if len(parts) > 1 else None
        value = lang_to_model.get(lang_slug, {}).get(model, None)

        if model == "BERT":
            download_bert(value)
            continue
        if model != "SPACY":
            continue

        if value.startswith("pip+"):
            # Identifier layout after the "pip+" prefix: NAME:PACKAGE
            model_name, pip_package = value[4:].split(":", 1)
            logger.debug("model name: {}".format(model_name))
            logger.debug("pip package: {}".format(pip_package))
            cmd = [sys.executable, "-m", "pip", "install",
                   "--no-deps", "--no-cache-dir", pip_package]
            logger.debug(" ".join(cmd))
            exit_status = subprocess.call(cmd, env=os.environ.copy())
            if exit_status != 0:
                raise Exception("Error to download {}".format(lang_slug))
            logger.debug("linking: {} to {}".format(model_name, lang_slug))
            link(model_name,
                 lang_slug,
                 force=True,
                 model_path=get_package_path(model_name))
            continue

        # Plain spaCy model: always download, link only when the model name
        # is not already the language slug itself.
        logger.debug("downloading {}".format(value))
        download(value)
        if lang_slug != value:
            logger.debug("linking: {} to {}".format(value, lang_slug))
            link(value,
                 lang_slug,
                 force=True,
                 model_path=get_package_path(value))
Ejemplo n.º 2
0
    def train(self):
        """Load the spaCy model, train on the snapshot texts and save to disk.

        The model name is read from the ``NERD_SPACY_MODEL`` environment
        variable.  If ``spacy.load`` raises ``OSError`` the model is
        downloaded, linked under its own name and loaded again.  After
        ``_add_types`` and ``_train_snapshot_texts`` have run, any existing
        directory at ``self._path`` is removed and the pipeline is written
        there.  Everything happens while holding
        ``self.snapshot.training_lock()``.
        """
        with self.snapshot.training_lock():
            spacy_model_name = os.environ.get('NERD_SPACY_MODEL')
            with log_perf(f'{self.snapshot} TRAINING'):
                try:
                    self._nlp = spacy.load(spacy_model_name)
                except OSError:
                    logger.warning(
                        f"Spacy model '{spacy_model_name}' not found.  Downloading and installing."
                    )
                    from spacy.cli.download import download as spacy_download
                    spacy_download(spacy_model_name)
                    from spacy.cli import link
                    from spacy.util import get_package_path

                    package_path = get_package_path(spacy_model_name)
                    # spaCy's ``link`` accepts the package directory through
                    # the ``model_path`` keyword; any other keyword name
                    # raises TypeError.
                    link(spacy_model_name,
                         spacy_model_name,
                         force=True,
                         model_path=package_path)
                    self._nlp = spacy.load(spacy_model_name)
                self._add_types()
                self._train_snapshot_texts()
            # Original note: "Only locking when saving to disk after training
            # is done in memory".
            # NOTE(review): the training lock is in fact held for the whole
            # method, not just for the save below -- confirm which is intended.
            with log_perf(f'{self.snapshot} SAVING_TO_DISK'):
                if os.path.exists(self._path):
                    shutil.rmtree(self._path)
                self._nlp.to_disk(self._path)
Ejemplo n.º 3
0
def load_spacy_model(model):
    """Return a spaCy pipeline for `model`, downloading it when missing.

    For ``"ru"`` a blank ``spacy.lang.ru.Russian`` pipeline is returned.
    For any other name ``spacy.load`` is tried first; on ``OSError`` the
    model is downloaded, linked under its own name and loaded again.

    :param model: spaCy model name, or ``"ru"``.
    :raises OSError: If the model still cannot be loaded.
    """
    if model == "ru":
        try:
            from spacy.lang.ru import Russian
            return Russian()
        except OSError as err:
            raise OSError(
                "Failed to get spaCy Russian model. Install it using "
                "pip install git+https://github.com/aatimofeev/spacy_russian_tokenizer.git"
            ) from err

    import spacy

    try:
        return spacy.load(model)
    except OSError:
        spacy.cli.download(model)
        # A package installed during the running session is not picked up
        # by spacy.load until it is linked explicitly, see
        # https://github.com/explosion/spaCy/issues/3435#issuecomment-474580269
        from spacy.cli import link
        from spacy.util import get_package_path
        installed_at = get_package_path(model)
        link(model, model, force=True, model_path=installed_at)
        try:
            return spacy.load(model)
        except OSError as err:
            hint = ("Failed to get spaCy model. Download it manually using "
                    "`python -m spacy download %s`." % model)
            raise OSError(hint) from err
Ejemplo n.º 4
0
def get_nlp(lang="en"):
    """Return the spaCy pipeline cached for `lang`, loading it on first use.

    The model name comes from the ``models`` dict.  When no truthy entry
    exists there, it is taken from the language-specific environment variable
    (``MODEL_ENV_VAR`` joined with the upper-cased language by ``_``), then
    from ``MODEL_ENV_VAR`` itself, then from ``DEFAULT_MODEL`` (falling back
    to ``"xx"``), and the choice is stored back into ``models``.

    On first load the pipeline's own tokenizer is saved in ``tokenizer[lang]``
    and replaced by a callable that builds a ``Doc`` from a ready-made list of
    words.

    :param lang: Language code used as the cache key.
    :raises OSError: If the model cannot be loaded even after downloading it.
    """
    instance = nlp.get(lang)
    if instance is not None:
        return instance

    import spacy

    model = models.get(lang)
    if not model:
        lang_specific_var = "_".join((MODEL_ENV_VAR, lang.upper()))
        model = (os.environ.get(lang_specific_var)
                 or os.environ.get(MODEL_ENV_VAR)
                 or DEFAULT_MODEL.get(lang, "xx"))
        models[lang] = model

    started = time.time()
    with external_write_mode():
        print("Loading spaCy model '%s'... " % model, end="", flush=True)
        try:
            instance = spacy.load(model)
        except OSError:
            spacy.cli.download(model)
            # Workaround from https://github.com/explosion/spaCy/issues/3435#issuecomment-474580269
            from spacy.cli import link
            from spacy.util import get_package_path
            installed_at = get_package_path(model)
            link(model, model, force=True, model_path=installed_at)
            try:
                instance = spacy.load(model)
            except OSError as err:
                raise OSError(
                    "Failed to get spaCy model. Download it manually using "
                    "`python -m spacy download %s`." % model) from err
        nlp[lang] = instance
        # Keep the real tokenizer reachable before swapping in the
        # pre-tokenized-words adapter.
        tokenizer[lang] = instance.tokenizer
        instance.tokenizer = lambda words: spacy.tokens.Doc(instance.vocab,
                                                            words=words)
        print("Done (%.3fs)." % (time.time() - started))
    return instance
Ejemplo n.º 5
0
 def load_lang_model(lang: str, disable: List[str]):
     """Load a spaCy model, downloading and linking it when it is missing.

     Names containing ``'coref'`` are tried as given; if loading fails for
     any reason, the part of the name before the first ``_`` is loaded
     instead through ``SpacyAnnotator.load_lang_model``.

     Arguments:
         lang {str} -- model name passed to ``spacy.load``
         disable {List[str]} -- pipeline components to disable, e.g.
             ['parser', 'ner', 'textcat'] when only the tokenizer is needed

     Returns:
         The object returned by ``spacy.load``.
     """
     if 'coref' in lang:
         try:
             return spacy.load(lang, disable=disable)
         except Exception:
             base_lang = lang.split('_')[0]
             return SpacyAnnotator.load_lang_model(base_lang,
                                                   disable=disable)

     try:
         return spacy.load(lang, disable=disable)
     except OSError:
         logger.warning(
             f"Spacy models '{lang}' not found.  Downloading and installing."
         )
         spacy_download(lang)
         # Workaround for spaCy 2.1.0, where a package installed in the
         # running Python session is not found until it is linked manually.
         # See https://github.com/explosion/spaCy/issues/3435.
         from spacy.cli import link
         from spacy.util import get_package_path
         link(lang, lang, model_path=get_package_path(lang))
         return spacy.load(lang, disable=disable)
Ejemplo n.º 6
0
def get_spacy_model(spacy_model_name: str, pos_tags: bool, parse: bool, ner: bool) -> SpacyModelType:
    """
    Return a spaCy model for the given name and component flags, loading it at most once.

    Loaded models are kept in ``LOADED_SPACY_MODELS`` keyed by
    ``(spacy_model_name, pos_tags, parse, ner)``.  ``vectors`` and ``textcat`` are always
    disabled; ``tagger``, ``parser`` and ``ner`` are disabled when the matching flag is false.
    A model that cannot be loaded is downloaded, linked under its own name and loaded again.
    """
    options = (spacy_model_name, pos_tags, parse, ner)
    if options in LOADED_SPACY_MODELS:
        return LOADED_SPACY_MODELS[options]

    optional_components = (('tagger', pos_tags), ('parser', parse), ('ner', ner))
    disable = ['vectors', 'textcat']
    disable.extend(name for name, wanted in optional_components if not wanted)

    try:
        spacy_model = spacy.load(spacy_model_name, disable=disable)
    except OSError:
        logger.warning(f"Spacy models '{spacy_model_name}' not found.  Downloading and installing.")
        spacy_download(spacy_model_name)
        # Workaround for spaCy 2.1.0: a package installed in the running Python session is
        # not found until it is linked manually.
        # See https://github.com/explosion/spaCy/issues/3435.
        from spacy.cli import link
        from spacy.util import get_package_path
        link(spacy_model_name, spacy_model_name,
             model_path=get_package_path(spacy_model_name))
        spacy_model = spacy.load(spacy_model_name, disable=disable)

    LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options]
Ejemplo n.º 7
0
def load_spacy_model(model):
    """Load the spaCy pipeline named `model`, installing it on demand.

    ``"ru"`` yields a blank ``spacy.lang.ru.Russian`` pipeline.  Every other
    name goes through ``spacy.load``; when that raises ``OSError`` the model
    is downloaded, linked under its own name and loaded a second time.

    :param model: spaCy model name, or ``"ru"``.
    :raises OSError: If loading fails even after the download.
    """
    if model == "ru":
        try:
            from spacy.lang.ru import Russian
            # NOTE: StanfordNLP- and stanza-based Russian pipelines were
            # sketched here previously but were never enabled.
            return Russian()
        except OSError as exc:
            message = (
                "Failed to get spaCy Russian model. Install it using "
                "pip install git+https://github.com/aatimofeev/spacy_russian_tokenizer.git"
            )
            raise OSError(message) from exc

    import spacy

    try:
        return spacy.load(model)
    except OSError:
        spacy.cli.download(model)
        # Workaround from https://github.com/explosion/spaCy/issues/3435#issuecomment-474580269
        from spacy.cli import link
        from spacy.util import get_package_path
        model_dir = get_package_path(model)
        link(model, model, force=True, model_path=model_dir)
        try:
            return spacy.load(model)
        except OSError as exc:
            message = ("Failed to get spaCy model. Download it manually using "
                       "`python -m spacy download %s`." % model)
            raise OSError(message) from exc
Ejemplo n.º 8
0
def download_spacy_models(languages=None, debug=False):
    """Download every configured spaCy model and link it to its language.

    ``cast_supported_languages`` is expected to yield a mapping of language
    to model identifier.  Identifiers of the form ``pip+NAME:PACKAGE`` are
    installed with pip and linked to the language.  Any other identifier is
    fetched with ``download`` and linked only when it differs from the
    language key.

    :param languages: Language specification; when falsy, the
        ``SUPPORTED_LANGUAGES`` configuration value is used.
    :param debug: Log at DEBUG level when true, INFO otherwise.
    :raises Exception: If pip exits with a non-zero status.
    """
    log_level = logging.DEBUG if debug else logging.INFO
    logging.basicConfig(
        format="%(name)s - %(levelname)s - %(message)s",
        level=log_level,
    )
    if not languages:
        languages = config("SUPPORTED_LANGUAGES", default="", cast=str)
    languages = cast_supported_languages(languages)

    logger.info("Importing langs: {}".format(", ".join(languages.keys())))

    for lang, value in languages.items():
        if value.startswith("pip+"):
            # Identifier layout after the "pip+" prefix: NAME:PACKAGE
            model_name, pip_package = value[4:].split(":", 1)
            logger.debug("model name: {}".format(model_name))
            logger.debug("pip package: {}".format(pip_package))
            cmd = [sys.executable, "-m", "pip", "install",
                   "--no-deps", "--no-cache-dir", pip_package]
            logger.debug(" ".join(cmd))
            exit_status = subprocess.call(cmd, env=os.environ.copy())
            if exit_status != 0:
                raise Exception("Error to download {}".format(lang))
            logger.debug("linking: {} to {}".format(model_name, lang))
            link(model_name,
                 lang,
                 force=True,
                 model_path=get_package_path(model_name))
            continue

        # Plain model name: always download, link only when the name is not
        # already the language key.
        logger.debug("downloading {}".format(value))
        download(value)
        if lang != value:
            logger.debug("linking: {} to {}".format(value, lang))
            link(value, lang, force=True, model_path=get_package_path(value))
Ejemplo n.º 9
0
def spacy_downloader(spacy_model_name: str, pos_tags: bool, parse: bool,
                     ner: bool) -> SpacyModelType:
    """
    Load a spaCy model with the requested components, downloading it first
    when it is not installed.

    Results are cached in ``LOADED_SPACY_MODELS`` keyed by the four
    arguments, so each configuration is loaded once.  The first two
    characters of the model name are treated as the language code and must
    be one of the codes hard-coded below.

    :param spacy_model_name: Name of the spaCy model e.g. en_core_web_sm
    :param pos_tags: Whether the returned model should perform POS tagging.
    :param parse: Whether the returned model should perform parsing.
    :param ner: Whether the returned model should perform NER.
    :returns: The loaded spaCy model.
    :raises ValueError: If the language code is not in the supported list.
    """
    options = (spacy_model_name, pos_tags, parse, ner)
    if options in LOADED_SPACY_MODELS:
        return LOADED_SPACY_MODELS[options]

    # This list needs updating by hand whenever spaCy adds languages; see
    # https://spacy.io/usage/models
    supported_codes = [
        'de', 'el', 'en', 'es', 'fr', 'it', 'nl', 'pt', 'xx'
    ]
    lang_code = spacy_model_name[:2]
    if lang_code not in supported_codes:
        raise ValueError('Spacy does not support the following language '
                         f'{lang_code}. These languages are supported '
                         f'{supported_codes}')

    optional_components = (('tagger', pos_tags), ('parser', parse),
                           ('ner', ner))
    disable = ['vectors', 'textcat']
    disable.extend(name for name, wanted in optional_components if not wanted)

    try:
        spacy_model = spacy.load(spacy_model_name, disable=disable)
    except OSError:
        print(f"Spacy models '{spacy_model_name}' not found. "
              "Downloading and installing.")
        spacy_download(spacy_model_name)
        from spacy.cli import link
        from spacy.util import get_package_path
        link(spacy_model_name, spacy_model_name,
             model_path=get_package_path(spacy_model_name))
        spacy_model = spacy.load(spacy_model_name, disable=disable)

    LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options]
Ejemplo n.º 10
0
def download_spacy_models(languages=None, debug=False):
    """Download every configured spaCy model and link it to its language.

    ``cast_supported_languages`` is expected to yield a mapping of language
    to model identifier.  Identifiers of the form ``pip+NAME:PACKAGE`` are
    installed with pip and linked to the language.  Any other identifier is
    fetched with ``download`` and linked only when it differs from the
    language key.

    :param languages: Language specification; when falsy, the
        ``SUPPORTED_LANGUAGES`` configuration value is used.
    :param debug: Log at DEBUG level when true, INFO otherwise.
    :raises Exception: If pip exits with a non-zero status.
    """
    logging.basicConfig(
        format='%(name)s - %(levelname)s - %(message)s',
        level=logging.DEBUG if debug else logging.INFO)
    if not languages:
        languages = config('SUPPORTED_LANGUAGES', default='', cast=str)
    languages = cast_supported_languages(languages)

    logger.info('Importing langs: {}'.format(', '.join(languages.keys())))

    for lang, value in languages.items():
        if value.startswith('pip+'):
            # Identifier layout after the 'pip+' prefix: NAME:PACKAGE
            model_name, pip_package = value[4:].split(':', 1)
            logger.debug('model name: {}'.format(model_name))
            logger.debug('pip package: {}'.format(pip_package))
            cmd = [
                sys.executable,
                '-m',
                'pip',
                'install',
                '--no-deps',
                pip_package,
            ]
            logger.debug(' '.join(cmd))
            # Compare the exit status by value; identity comparison against
            # an int literal only works by accident of interpreter caching.
            if subprocess.call(cmd, env=os.environ.copy()) == 0:
                logger.debug('linking: {} to {}'.format(model_name, lang))
                package_path = get_package_path(model_name)
                link(model_name, lang, force=True, model_path=package_path)
            else:
                raise Exception('Error to download {}'.format(lang))
        elif lang != value:
            # Equality, not identity: two equal strings may be distinct
            # objects, which would trigger a pointless self-link.
            logger.debug('downloading {}'.format(value))
            download(value)
            logger.debug('linking: {} to {}'.format(value, lang))
            package_path = get_package_path(value)
            link(value, lang, force=True, model_path=package_path)
        else:
            logger.debug('downloading {}'.format(value))
            download(value)
Ejemplo n.º 11
0
def link_lang_spacy(lang, lang_path):
    """Symlink `lang_path` into spaCy's ``lang`` directory and import it.

    The link is created at ``lang/LANG`` under the path returned by
    ``get_package_path('spacy')``; afterwards ``spacy.lang.LANG`` is imported
    to check that the link works.  Progress is reported with ``print`` and
    any failure is re-raised.

    :param lang: Language code used as the sub-package name.
    :param lang_path: Directory the new link should point to.
    """
    spacy_root = get_package_path('spacy')
    origin_path = os.path.join(spacy_root, 'lang', lang)
    module_name = 'spacy.lang.{}'.format(lang)
    try:
        symlink_to(Path(origin_path), os.path.abspath(lang_path))
        try:
            importlib.import_module(module_name)
            print('link created')
        except Exception:
            print('link not created')
            raise
    except Exception:
        # Also reached when the import check above re-raises.
        print('error to create link to {} from {}'.format(lang, lang_path))
        raise
Ejemplo n.º 12
0
def link_lang_spacy(lang, lang_path):
    """Symlink `lang_path` into spaCy's ``lang`` directory and import it.

    The link is created at ``lang/LANG`` under the resolved path returned by
    ``get_package_path("spacy")``; afterwards ``spacy.lang.LANG`` is imported
    to check that the link works.  Progress is reported with ``print`` and
    any failure is re-raised.

    :param lang: Language code used as the sub-package name.
    :param lang_path: Directory the new link should point to.
    """
    spacy_root = str(get_package_path("spacy").resolve())
    origin_path = os.path.join(spacy_root, "lang", lang)
    module_name = "spacy.lang.{}".format(lang)
    try:
        symlink_to(Path(origin_path), os.path.abspath(lang_path))
        try:
            importlib.import_module(module_name)
            print("link created")
        except Exception:
            print("link not created")
            raise
    except Exception:
        # Also reached when the import check above re-raises.
        print("error to create link to {} from {}".format(lang, lang_path))
        raise
Ejemplo n.º 13
0
def spacy_model(model: str = 'en_core_web_md') -> None:
    """
    Make sure a spaCy model is installed, downloading it when missing.

    The model is test-loaded; only if that raises ``OSError`` is it
    downloaded and linked under its own name.

    Parameters
    ----------
    model
        Name of the model to check and, if necessary, download.
    """
    try:
        spacy.load(model)
        return
    except OSError:
        download(model)

        # A freshly installed package must be linked by hand, see
        # https://github.com/explosion/spaCy/issues/3435
        installed_at = get_package_path(model)
        link(model, model, force=True, model_path=installed_at)
Ejemplo n.º 14
0
def download_model(model_name: str):
    """
    Download a trained language model and link it under its own name.

    Example::

        from nerd import ner
        ner.download_model(model_name='en_core_web_sm')

    Model names listed by the original author as supported:
    'en_core_web_sm', 'de_core_news_sm', 'fr_core_news_sm',
    'es_core_news_sm', 'pt_core_news_sm', 'it_core_news_sm',
    'nl_core_news_sm', 'el_core_news_sm', 'xx_ent_wiki_sm'.

    :param model_name: Model package name.
    :type model_name: str
    """
    download(model_name)
    installed_at = get_package_path(model_name)
    link(model_name, model_name, force=True, model_path=installed_at)
Ejemplo n.º 15
0
def test_util_get_package_path(package):
    """Check that ``util.get_package_path`` returns a ``Path`` for a package name."""
    assert isinstance(util.get_package_path(package), Path)
Ejemplo n.º 16
0
def test_util_get_package_path(package):
    """The lookup of a package name must yield a ``Path`` instance."""
    resolved = util.get_package_path(package)
    is_path = isinstance(resolved, Path)
    assert is_path
Ejemplo n.º 17
0
    except:
        return ""
    return Topic[1]


def get_sentiment(text):
    """
    Return the first component of TextBlob's sentiment for `text`.

    :param text: Input Text
    :return: ``TextBlob(text).sentiment[0]``
    """
    sentiment = TextBlob(text).sentiment
    return sentiment[0]


# Load Spacy: link the installed German model under its own name so that
# spacy.load can resolve it, then load it.
model_name = "de_core_news_sm"
package_path = get_package_path(model_name)
link(model_name, model_name, force=True, model_path=package_path)
nlp = spacy.load(model_name)

# Load News: concatenate every transformed police-report CSV into one frame.
news = pd.concat([pd.read_csv(f, sep=";", header="infer", encoding="UTF-8") for f in
                  glob.glob(r"../out/Polizeiberichte_transformed*.csv")], sort=True)
# drop_duplicates returns a new frame; the result must be assigned back,
# otherwise the duplicates are silently kept.
news = news.drop_duplicates()
news = news.fillna("")

news["Comb_all"] = news["Hauptartikel_lem_clean_no_stop"] + " " + news["Ueberschrift_kombi"]

# Split words before further processing
news["Ueberschrift_kombi"] = news["Ueberschrift_kombi"].apply(lambda x: x.split())
news["Hauptartikel_lem_clean_no_stop"] = news["Hauptartikel_lem_clean_no_stop"].apply(
    lambda x: x.split())