Example 1
    def __init__(
        self,
        dataset: str,
        l1: str,
        l2: str,
        use_tokenizer: bool = True,
        max_tokens_per_doc: int = -1,
        max_chars_per_doc: int = -1,
        in_memory: bool = True,
        **corpusargs,
    ):
        """
        Instantiates a Parallel Corpus from OPUS (http://opus.nlpl.eu/)
        :param dataset: Name of the dataset (one of "tatoeba")
        :param l1: Language code of first language in pair ("en", "de", etc.)
        :param l2: Language code of second language in pair ("en", "de", etc.)
        :param use_tokenizer: Whether or not to use in-built tokenizer
        :param max_tokens_per_doc: If set, shortens sentences to this maximum number of tokens
        :param max_chars_per_doc: If set, shortens sentences to this maximum number of characters
        :param in_memory: If True, keeps dataset fully in memory
        """

        if l1 > l2:
            l1, l2 = l2, l1

        # check if dataset is supported
        supported_datasets = ["tatoeba", "subtitles"]
        if dataset not in supported_datasets:
            log.error(f"Dataset must be one of: {supported_datasets}")
            raise ValueError(f"Dataset must be one of: {supported_datasets}")

        # set file names
        if dataset == "tatoeba":
            link = f"https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/moses/{l1}-{l2}.txt.zip"

            l1_file = flair.cache_root / "datasets" / dataset / f"{l1}-{l2}" / f"Tatoeba.{l1}-{l2}.{l1}"
            l2_file = flair.cache_root / "datasets" / dataset / f"{l1}-{l2}" / f"Tatoeba.{l1}-{l2}.{l2}"

        elif dataset == "subtitles":
            link = f"https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/moses/{l1}-{l2}.txt.zip"

            l1_file = flair.cache_root / "datasets" / dataset / f"{l1}-{l2}" / f"OpenSubtitles.{l1}-{l2}.{l1}"
            l2_file = flair.cache_root / "datasets" / dataset / f"{l1}-{l2}" / f"OpenSubtitles.{l1}-{l2}.{l2}"

        # download and unzip in file structure if necessary
        if not l1_file.exists():
            path = cached_path(link, Path("datasets") / dataset / f"{l1}-{l2}")
            unzip_file(path, flair.cache_root / Path("datasets") / dataset / f"{l1}-{l2}")

        # instantiate corpus
        super(OpusParallelCorpus, self).__init__(
            l1_file,
            l2_file,
            name=f"{dataset}-{l1_file}-{l2_file}",
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            **corpusargs,
        )
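
A minimal usage sketch for this constructor. It assumes OpusParallelCorpus is exported from flair.datasets (the exact import path and the language pair below are illustrative, not taken from the snippet):

from flair.datasets import OpusParallelCorpus

# downloads and unzips the German-English Tatoeba bitext on first use,
# then builds a parallel corpus from the two aligned plain-text files
corpus = OpusParallelCorpus(dataset="tatoeba", l1="de", l2="en", in_memory=True)
print(corpus)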
Example 2
    def __init__(
        self,
        **corpusargs,
    ):
        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        data_folder = Path(flair.cache_root) / "datasets" / dataset_name

        # download data if necessary
        if not (data_folder / "train.txt").is_file():

            # download SentEval datasets if necessary and unzip
            senteval_path = "https://dl.fbaipublicfiles.com/senteval/senteval_data/datasmall_NB_ACL12.zip"
            cached_path(senteval_path, Path("datasets") / "senteval")
            senteval_folder = Path(flair.cache_root) / "datasets" / "senteval"
            unzip_file(senteval_folder / "datasmall_NB_ACL12.zip",
                       senteval_folder)

            # create dataset directory if necessary
            if not os.path.exists(data_folder):
                os.makedirs(data_folder)

            # create train.txt file by iterating over pos and neg file
            with open(data_folder / "train.txt", "a") as train_file:

                with open(senteval_folder / "data" / "customerr" /
                          "custrev.pos",
                          encoding="latin1") as file:
                    for line in file:
                        train_file.write(f"__label__POSITIVE {line}")

                with open(senteval_folder / "data" / "customerr" /
                          "custrev.neg",
                          encoding="latin1") as file:
                    for line in file:
                        train_file.write(f"__label__NEGATIVE {line}")

        super(SENTEVAL_CR, self).__init__(
            data_folder,
            label_type='sentiment',
            tokenizer=segtok_tokenizer,
            **corpusargs,
        )
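
A usage sketch for this corpus class, assuming SENTEVAL_CR is exported from flair.datasets (an assumption about the import path; the constructor above downloads the customer-review data and writes a FastText-style train.txt on first use):

from flair.datasets import SENTEVAL_CR

# builds the sentiment classification corpus; dev/test splits are
# typically sampled from the training data when not provided
corpus = SENTEVAL_CR()
print(corpus)
print(corpus.train[0])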
Example 3
    def _fetch_model(model_name: str) -> str:

        # core Flair models on Huggingface ModelHub
        huggingface_model_map = {
            "ner": "flair/ner-english",
            "ner-fast": "flair/ner-english-fast",
            "ner-ontonotes": "flair/ner-english-ontonotes",
            "ner-ontonotes-fast": "flair/ner-english-ontonotes-fast",
            # Large NER models
            "ner-large": "flair/ner-english-large",
            "ner-ontonotes-large": "flair/ner-english-ontonotes-large",
            "de-ner-large": "flair/ner-german-large",
            "nl-ner-large": "flair/ner-dutch-large",
            "es-ner-large": "flair/ner-spanish-large",
            # Multilingual NER models
            "ner-multi": "flair/ner-multi",
            "multi-ner": "flair/ner-multi",
            "ner-multi-fast": "flair/ner-multi-fast",
            # English POS models
            "upos": "flair/upos-english",
            "upos-fast": "flair/upos-english-fast",
            "pos": "flair/pos-english",
            "pos-fast": "flair/pos-english-fast",
            # Multilingual POS models
            "pos-multi": "flair/upos-multi",
            "multi-pos": "flair/upos-multi",
            "pos-multi-fast": "flair/upos-multi-fast",
            "multi-pos-fast": "flair/upos-multi-fast",
            # English SRL models
            "frame": "flair/frame-english",
            "frame-fast": "flair/frame-english-fast",
            # English chunking models
            "chunk": "flair/chunk-english",
            "chunk-fast": "flair/chunk-english-fast",
            # Language-specific NER models
            "da-ner": "flair/ner-danish",
            "de-ner": "flair/ner-german",
            "de-ler": "flair/ner-german-legal",
            "de-ner-legal": "flair/ner-german-legal",
            "fr-ner": "flair/ner-french",
            "nl-ner": "flair/ner-dutch",
        }

        hu_path: str = "https://nlp.informatik.hu-berlin.de/resources/models"

        hu_model_map = {
            # English NER models
            "ner":
            "/".join([hu_path, "ner", "en-ner-conll03-v0.4.pt"]),
            "ner-pooled":
            "/".join([hu_path, "ner-pooled", "en-ner-conll03-pooled-v0.5.pt"]),
            "ner-fast":
            "/".join([hu_path, "ner-fast", "en-ner-fast-conll03-v0.4.pt"]),
            "ner-ontonotes":
            "/".join([hu_path, "ner-ontonotes", "en-ner-ontonotes-v0.4.pt"]),
            "ner-ontonotes-fast":
            "/".join([
                hu_path, "ner-ontonotes-fast", "en-ner-ontonotes-fast-v0.4.pt"
            ]),
            # Multilingual NER models
            "ner-multi":
            "/".join([hu_path, "multi-ner", "quadner-large.pt"]),
            "multi-ner":
            "/".join([hu_path, "multi-ner", "quadner-large.pt"]),
            "ner-multi-fast":
            "/".join([hu_path, "multi-ner-fast", "ner-multi-fast.pt"]),
            # English POS models
            "upos":
            "/".join([hu_path, "upos", "en-pos-ontonotes-v0.4.pt"]),
            "upos-fast":
            "/".join([hu_path, "upos-fast", "en-upos-ontonotes-fast-v0.4.pt"]),
            "pos":
            "/".join([hu_path, "pos", "en-pos-ontonotes-v0.5.pt"]),
            "pos-fast":
            "/".join([hu_path, "pos-fast", "en-pos-ontonotes-fast-v0.5.pt"]),
            # Multilingual POS models
            "pos-multi":
            "/".join([hu_path, "multi-pos", "pos-multi-v0.1.pt"]),
            "multi-pos":
            "/".join([hu_path, "multi-pos", "pos-multi-v0.1.pt"]),
            "pos-multi-fast":
            "/".join([hu_path, "multi-pos-fast", "pos-multi-fast.pt"]),
            "multi-pos-fast":
            "/".join([hu_path, "multi-pos-fast", "pos-multi-fast.pt"]),
            # English SRL models
            "frame":
            "/".join([hu_path, "frame", "en-frame-ontonotes-v0.4.pt"]),
            "frame-fast":
            "/".join(
                [hu_path, "frame-fast", "en-frame-ontonotes-fast-v0.4.pt"]),
            # English chunking models
            "chunk":
            "/".join([hu_path, "chunk", "en-chunk-conll2000-v0.4.pt"]),
            "chunk-fast":
            "/".join(
                [hu_path, "chunk-fast", "en-chunk-conll2000-fast-v0.4.pt"]),
            # Danish models
            "da-pos":
            "/".join([hu_path, "da-pos", "da-pos-v0.1.pt"]),
            "da-ner":
            "/".join([hu_path, "NER-danish", "da-ner-v0.1.pt"]),
            # German models
            "de-pos":
            "/".join([hu_path, "de-pos", "de-pos-ud-hdt-v0.5.pt"]),
            "de-pos-tweets":
            "/".join([hu_path, "de-pos-tweets", "de-pos-twitter-v0.1.pt"]),
            "de-ner":
            "/".join([hu_path, "de-ner", "de-ner-conll03-v0.4.pt"]),
            "de-ner-germeval":
            "/".join([hu_path, "de-ner-germeval", "de-ner-germeval-0.4.1.pt"]),
            "de-ler":
            "/".join([hu_path, "de-ner-legal", "de-ner-legal.pt"]),
            "de-ner-legal":
            "/".join([hu_path, "de-ner-legal", "de-ner-legal.pt"]),
            # French models
            "fr-ner":
            "/".join([hu_path, "fr-ner", "fr-ner-wikiner-0.4.pt"]),
            # Dutch models
            "nl-ner":
            "/".join([hu_path, "nl-ner", "nl-ner-bert-conll02-v0.8.pt"]),
            "nl-ner-rnn":
            "/".join([hu_path, "nl-ner-rnn", "nl-ner-conll02-v0.5.pt"]),
            # Malayalam models
            "ml-pos":
            "https://raw.githubusercontent.com/qburst/models-repository/master/FlairMalayalamModels/malayalam-xpos-model.pt",
            "ml-upos":
            "https://raw.githubusercontent.com/qburst/models-repository/master/FlairMalayalamModels/malayalam-upos-model.pt",
            # Portuguese models
            "pt-pos-clinical":
            "/".join([
                hu_path,
                "pt-pos-clinical",
                "pucpr-flair-clinical-pos-tagging-best-model.pt",
            ]),
            # Keyphrase models
            "keyphrase":
            "/".join([hu_path, "keyphrase", "keyphrase-en-scibert.pt"]),
            "negation-speculation":
            "/".join([
                hu_path, "negation-speculation",
                "negation-speculation-model.pt"
            ]),
            # Biomedical models
            "hunflair-paper-cellline":
            "/".join([
                hu_path,
                "hunflair_smallish_models",
                "cellline",
                "hunflair-celline-v1.0.pt",
            ]),
            "hunflair-paper-chemical":
            "/".join([
                hu_path,
                "hunflair_smallish_models",
                "chemical",
                "hunflair-chemical-v1.0.pt",
            ]),
            "hunflair-paper-disease":
            "/".join([
                hu_path,
                "hunflair_smallish_models",
                "disease",
                "hunflair-disease-v1.0.pt",
            ]),
            "hunflair-paper-gene":
            "/".join([
                hu_path, "hunflair_smallish_models", "gene",
                "hunflair-gene-v1.0.pt"
            ]),
            "hunflair-paper-species":
            "/".join([
                hu_path,
                "hunflair_smallish_models",
                "species",
                "hunflair-species-v1.0.pt",
            ]),
            "hunflair-cellline":
            "/".join([
                hu_path,
                "hunflair_smallish_models",
                "cellline",
                "hunflair-celline-v1.0.pt",
            ]),
            "hunflair-chemical":
            "/".join([
                hu_path,
                "hunflair_allcorpus_models",
                "huner-chemical",
                "hunflair-chemical-full-v1.0.pt",
            ]),
            "hunflair-disease":
            "/".join([
                hu_path,
                "hunflair_allcorpus_models",
                "huner-disease",
                "hunflair-disease-full-v1.0.pt",
            ]),
            "hunflair-gene":
            "/".join([
                hu_path,
                "hunflair_allcorpus_models",
                "huner-gene",
                "hunflair-gene-full-v1.0.pt",
            ]),
            "hunflair-species":
            "/".join([
                hu_path,
                "hunflair_allcorpus_models",
                "huner-species",
                "hunflair-species-full-v1.1.pt",
            ]),
        }

        cache_dir = Path("models")

        get_from_model_hub = False

        # check if model name is a valid local file
        if Path(model_name).exists():
            model_path = model_name

        # check if model key is remapped to HF key - if so, print out information
        elif model_name in huggingface_model_map:

            # get mapped name
            hf_model_name = huggingface_model_map[model_name]

            # use mapped name instead
            model_name = hf_model_name
            get_from_model_hub = True

        # if not, check if model key is remapped to direct download location. If so, download model
        elif model_name in hu_model_map:
            model_path = cached_path(hu_model_map[model_name],
                                     cache_dir=cache_dir)

        # special handling for the taggers by the @redewiedergabe project (TODO: move to model hub)
        elif model_name == "de-historic-indirect":
            model_file = flair.cache_root / cache_dir / "indirect" / "final-model.pt"
            if not model_file.exists():
                cached_path(
                    "http://www.redewiedergabe.de/models/indirect.zip",
                    cache_dir=cache_dir,
                )
                unzip_file(
                    flair.cache_root / cache_dir / "indirect.zip",
                    flair.cache_root / cache_dir,
                )
            model_path = str(flair.cache_root / cache_dir / "indirect" /
                             "final-model.pt")

        elif model_name == "de-historic-direct":
            model_file = flair.cache_root / cache_dir / "direct" / "final-model.pt"
            if not model_file.exists():
                cached_path(
                    "http://www.redewiedergabe.de/models/direct.zip",
                    cache_dir=cache_dir,
                )
                unzip_file(
                    flair.cache_root / cache_dir / "direct.zip",
                    flair.cache_root / cache_dir,
                )
            model_path = str(flair.cache_root / cache_dir / "direct" /
                             "final-model.pt")

        elif model_name == "de-historic-reported":
            model_file = flair.cache_root / cache_dir / "reported" / "final-model.pt"
            if not model_file.exists():
                cached_path(
                    "http://www.redewiedergabe.de/models/reported.zip",
                    cache_dir=cache_dir,
                )
                unzip_file(
                    flair.cache_root / cache_dir / "reported.zip",
                    flair.cache_root / cache_dir,
                )
            model_path = str(flair.cache_root / cache_dir / "reported" /
                             "final-model.pt")

        elif model_name == "de-historic-free-indirect":
            model_file = flair.cache_root / cache_dir / "freeIndirect" / "final-model.pt"
            if not model_file.exists():
                cached_path(
                    "http://www.redewiedergabe.de/models/freeIndirect.zip",
                    cache_dir=cache_dir,
                )
                unzip_file(
                    flair.cache_root / cache_dir / "freeIndirect.zip",
                    flair.cache_root / cache_dir,
                )
            model_path = str(flair.cache_root / cache_dir / "freeIndirect" /
                             "final-model.pt")

        # for all other cases (not local file or special download location), use HF model hub
        else:
            get_from_model_hub = True

        # if not a local file, get from model hub
        if get_from_model_hub:
            hf_model_name = "pytorch_model.bin"
            revision = "main"

            if "@" in model_name:
                model_name_split = model_name.split("@")
                revision = model_name_split[-1]
                model_name = model_name_split[0]

            # use model name as subfolder
            if "/" in model_name:
                model_folder = model_name.split("/", maxsplit=1)[1]
            else:
                model_folder = model_name

            # Lazy import
            from huggingface_hub import cached_download, hf_hub_url

            url = hf_hub_url(model_name,
                             revision=revision,
                             filename=hf_model_name)

            try:
                model_path = cached_download(
                    url=url,
                    library_name="flair",
                    library_version=flair.__version__,
                    cache_dir=flair.cache_root / "models" / model_folder,
                )
            except HTTPError:
                # output information
                log.error("-" * 80)
                log.error(
                    f"ACHTUNG: The key '{model_name}' was neither found on the ModelHub nor is this a valid path to a file on your system!"
                )
                # log.error(f" - Error message: {e}")
                log.error(
                    " -> Please check https://huggingface.co/models?filter=flair for all available models."
                )
                log.error(
                    " -> Alternatively, point to a model file on your local drive."
                )
                log.error("-" * 80)
                Path(flair.cache_root / "models" /
                     model_folder).rmdir()  # remove folder again if not valid
                raise  # re-raise so an invalid key does not fall through to an unbound model_path

        return model_path
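
This helper is usually not called directly: it resolves the shorthand names shown in the maps above into a local file path, downloading from the Hugging Face ModelHub or the HU Berlin server as needed. A brief usage sketch, assuming the snippet belongs to Flair's SequenceTagger (the standard way these shorthand keys are used):

from flair.data import Sentence
from flair.models import SequenceTagger

# "ner" is remapped to "flair/ner-english" on the Hugging Face ModelHub by the map above
tagger = SequenceTagger.load("ner")

sentence = Sentence("George Washington went to Washington.")
tagger.predict(sentence)
print(sentence.to_tagged_string())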
Example 4
    def _fetch_model(model_name: str) -> str:

        hu_path: str = "https://nlp.informatik.hu-berlin.de/resources/models"

        model_map = {
            # English NER models
            "ner": "/".join([hu_path, "ner", "en-ner-conll03-v0.4.pt"]),
            "ner-pooled": "/".join([hu_path, "ner-pooled", "en-ner-conll03-pooled-v0.5.pt"]),
            "ner-fast": "/".join([hu_path, "ner-fast", "en-ner-fast-conll03-v0.4.pt"]),
            "ner-ontonotes": "/".join([hu_path, "ner-ontonotes", "en-ner-ontonotes-v0.4.pt"]),
            "ner-ontonotes-fast": "/".join([hu_path, "ner-ontonotes-fast", "en-ner-ontonotes-fast-v0.4.pt"]),
            # Multilingual NER models
            "ner-multi": "/".join([hu_path, "multi-ner", "quadner-large.pt"]),
            "multi-ner": "/".join([hu_path, "multi-ner", "quadner-large.pt"]),
            "ner-multi-fast": "/".join([hu_path, "multi-ner-fast", "ner-multi-fast.pt"]),
            # English POS models
            "upos": "/".join([hu_path, "upos", "en-pos-ontonotes-v0.4.pt"]),
            "upos-fast": "/".join([hu_path, "upos-fast", "en-upos-ontonotes-fast-v0.4.pt"]),
            "pos": "/".join([hu_path, "pos", "en-pos-ontonotes-v0.5.pt"]),
            "pos-fast": "/".join([hu_path, "pos-fast", "en-pos-ontonotes-fast-v0.5.pt"]),
            # Multilingual POS models
            "pos-multi": "/".join([hu_path, "multi-pos", "pos-multi-v0.1.pt"]),
            "multi-pos": "/".join([hu_path, "multi-pos", "pos-multi-v0.1.pt"]),
            "pos-multi-fast": "/".join([hu_path, "multi-pos-fast", "pos-multi-fast.pt"]),
            "multi-pos-fast": "/".join([hu_path, "multi-pos-fast", "pos-multi-fast.pt"]),
            # English SRL models
            "frame": "/".join([hu_path, "frame", "en-frame-ontonotes-v0.4.pt"]),
            "frame-fast": "/".join([hu_path, "frame-fast", "en-frame-ontonotes-fast-v0.4.pt"]),
            # English chunking models
            "chunk": "/".join([hu_path, "chunk", "en-chunk-conll2000-v0.4.pt"]),
            "chunk-fast": "/".join([hu_path, "chunk-fast", "en-chunk-conll2000-fast-v0.4.pt"]),
            # Danish models
            "da-pos": "/".join([hu_path, "da-pos", "da-pos-v0.1.pt"]),
            "da-ner": "/".join([hu_path, "NER-danish", "da-ner-v0.1.pt"]),
            # German models
            "de-pos": "/".join([hu_path, "de-pos", "de-pos-ud-hdt-v0.5.pt"]),
            "de-pos-tweets": "/".join([hu_path, "de-pos-tweets", "de-pos-twitter-v0.1.pt"]),
            "de-ner": "/".join([hu_path, "de-ner", "de-ner-conll03-v0.4.pt"]),
            "de-ner-germeval": "/".join([hu_path, "de-ner-germeval", "de-ner-germeval-0.4.1.pt"]),
            # French models
            "fr-ner": "/".join([hu_path, "fr-ner", "fr-ner-wikiner-0.4.pt"]),
            # Dutch models
            "nl-ner": "/".join([hu_path, "nl-ner", "nl-ner-bert-conll02-v0.5b.pt"]),
            "nl-ner-rnn": "/".join([hu_path, "nl-ner-rnn", "nl-ner-conll02-v0.5.pt"]),
            # Malayalam models
            "ml-pos": "https://raw.githubusercontent.com/qburst/models-repository/master/FlairMalayalamModels/malayalam-xpos-model.pt",
            "ml-upos": "https://raw.githubusercontent.com/qburst/models-repository/master/FlairMalayalamModels/malayalam-upos-model.pt",
            # Portuguese models
            "pt-pos-clinical": "/".join([hu_path, "pt-pos-clinical", "pucpr-flair-clinical-pos-tagging-best-model.pt"]),
            # Keyphrase models
            "keyphrase": "/".join([hu_path, "keyphrase", "keyphrase-en-scibert.pt"]),
            "negation-speculation": "/".join(
                [hu_path, "negation-speculation", "negation-speculation-model.pt"]),
            # Biomedical models
            "hunflair-paper-cellline": "/".join(
                [hu_path, "hunflair_smallish_models", "cellline", "hunflair-celline-v1.0.pt"]
            ),
            "hunflair-paper-chemical": "/".join(
                [hu_path, "hunflair_smallish_models", "chemical", "hunflair-chemical-v1.0.pt"]
            ),
            "hunflair-paper-disease": "/".join(
                [hu_path, "hunflair_smallish_models", "disease", "hunflair-disease-v1.0.pt"]
            ),
            "hunflair-paper-gene": "/".join(
                [hu_path, "hunflair_smallish_models", "gene", "hunflair-gene-v1.0.pt"]
            ),
            "hunflair-paper-species": "/".join(
                [hu_path, "hunflair_smallish_models", "species", "hunflair-species-v1.0.pt"]
            ),
            "hunflair-cellline": "/".join(
                [hu_path, "hunflair_smallish_models", "cellline", "hunflair-celline-v1.0.pt"]
            ),
            "hunflair-chemical": "/".join(
                [hu_path, "hunflair_allcorpus_models", "huner-chemical", "hunflair-chemical-full-v1.0.pt"]
            ),
            "hunflair-disease": "/".join(
                [hu_path, "hunflair_allcorpus_models", "huner-disease", "hunflair-disease-full-v1.0.pt"]
            ),
            "hunflair-gene": "/".join(
                [hu_path, "hunflair_allcorpus_models", "huner-gene", "hunflair-gene-full-v1.0.pt"]
            ),
            "hunflair-species": "/".join(
                [hu_path, "hunflair_allcorpus_models", "huner-species", "hunflair-species-full-v1.1.pt"]
            )}

        cache_dir = Path("models")
        if model_name in model_map:
            model_name = cached_path(model_map[model_name], cache_dir=cache_dir)

        # the historical German taggers by the @redewiedergabe project
        if model_name == "de-historic-indirect":
            model_file = Path(flair.cache_root) / cache_dir / 'indirect' / 'final-model.pt'
            if not model_file.exists():
                cached_path('http://www.redewiedergabe.de/models/indirect.zip', cache_dir=cache_dir)
                unzip_file(Path(flair.cache_root) / cache_dir / 'indirect.zip', Path(flair.cache_root) / cache_dir)
            model_name = str(Path(flair.cache_root) / cache_dir / 'indirect' / 'final-model.pt')

        if model_name == "de-historic-direct":
            model_file = Path(flair.cache_root) / cache_dir / 'direct' / 'final-model.pt'
            if not model_file.exists():
                cached_path('http://www.redewiedergabe.de/models/direct.zip', cache_dir=cache_dir)
                unzip_file(Path(flair.cache_root) / cache_dir / 'direct.zip', Path(flair.cache_root) / cache_dir)
            model_name = str(Path(flair.cache_root) / cache_dir / 'direct' / 'final-model.pt')

        if model_name == "de-historic-reported":
            model_file = Path(flair.cache_root) / cache_dir / 'reported' / 'final-model.pt'
            if not model_file.exists():
                cached_path('http://www.redewiedergabe.de/models/reported.zip', cache_dir=cache_dir)
                unzip_file(Path(flair.cache_root) / cache_dir / 'reported.zip', Path(flair.cache_root) / cache_dir)
            model_name = str(Path(flair.cache_root) / cache_dir / 'reported' / 'final-model.pt')

        if model_name == "de-historic-free-indirect":
            model_file = Path(flair.cache_root) / cache_dir / 'freeIndirect' / 'final-model.pt'
            if not model_file.exists():
                cached_path('http://www.redewiedergabe.de/models/freeIndirect.zip', cache_dir=cache_dir)
                unzip_file(Path(flair.cache_root) / cache_dir / 'freeIndirect.zip', Path(flair.cache_root) / cache_dir)
            model_name = str(Path(flair.cache_root) / cache_dir / 'freeIndirect' / 'final-model.pt')

        return model_name
Example 5
    def _fetch_model(model_name: str) -> str:

        model_map = {}

        aws_resource_path_v04 = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.4"
        hu_path: str = "https://nlp.informatik.hu-berlin.de/resources/models"

        model_map["ner"] = "/".join([
            aws_resource_path_v04, "NER-conll03-english",
            "en-ner-conll03-v0.4.pt"
        ])

        model_map["ner-fast"] = "/".join([
            aws_resource_path_v04,
            "NER-conll03--h256-l1-b32-p3-0.5-%2Bglove%2Bnews-forward-fast%2Bnews-backward-fast-normal-locked0.5-word0.05--release_4",
            "en-ner-fast-conll03-v0.4.pt",
        ])

        model_map["ner-ontonotes"] = "/".join([
            aws_resource_path_v04,
            "release-ner-ontonotes-0",
            "en-ner-ontonotes-v0.4.pt",
        ])

        model_map["ner-ontonotes-fast"] = "/".join([
            aws_resource_path_v04,
            "release-ner-ontonotes-fast-0",
            "en-ner-ontonotes-fast-v0.4.pt",
        ])

        for key in ["ner-multi", "multi-ner"]:
            model_map[key] = "/".join([
                aws_resource_path_v04,
                "release-quadner-512-l2-multi-embed",
                "quadner-large.pt",
            ])

        for key in ["ner-multi-fast", "multi-ner-fast"]:
            model_map[key] = "/".join(
                [aws_resource_path_v04, "NER-multi-fast", "ner-multi-fast.pt"])

        for key in ["ner-multi-fast-learn", "multi-ner-fast-learn"]:
            model_map[key] = "/".join([
                aws_resource_path_v04,
                "NER-multi-fast-evolve",
                "ner-multi-fast-learn.pt",
            ])

        model_map["upos"] = "/".join([
            aws_resource_path_v04,
            "POS-ontonotes--h256-l1-b32-p3-0.5-%2Bglove%2Bnews-forward%2Bnews-backward-normal-locked0.5-word0.05--v0.4_0",
            "en-pos-ontonotes-v0.4.pt",
        ])

        model_map["pos"] = "/".join([
            hu_path,
            "release-pos-0",
            "en-pos-ontonotes-v0.5.pt",
        ])

        model_map["upos-fast"] = "/".join([
            aws_resource_path_v04,
            "release-pos-fast-0",
            "en-pos-ontonotes-fast-v0.4.pt",
        ])

        model_map["pos-fast"] = "/".join([
            hu_path,
            "release-pos-fast-0",
            "en-pos-ontonotes-fast-v0.5.pt",
        ])

        for key in ["pos-multi", "multi-pos"]:
            model_map[key] = "/".join([
                aws_resource_path_v04,
                "release-dodekapos-512-l2-multi",
                "pos-multi-v0.1.pt",
            ])

        for key in ["pos-multi-fast", "multi-pos-fast"]:
            model_map[key] = "/".join([
                aws_resource_path_v04, "UPOS-multi-fast", "pos-multi-fast.pt"
            ])

        model_map["frame"] = "/".join([
            aws_resource_path_v04, "release-frame-1",
            "en-frame-ontonotes-v0.4.pt"
        ])

        model_map["frame-fast"] = "/".join([
            aws_resource_path_v04,
            "release-frame-fast-0",
            "en-frame-ontonotes-fast-v0.4.pt",
        ])

        model_map["chunk"] = "/".join([
            aws_resource_path_v04,
            "NP-conll2000--h256-l1-b32-p3-0.5-%2Bnews-forward%2Bnews-backward-normal-locked0.5-word0.05--v0.4_0",
            "en-chunk-conll2000-v0.4.pt",
        ])

        model_map["chunk-fast"] = "/".join([
            aws_resource_path_v04,
            "release-chunk-fast-0",
            "en-chunk-conll2000-fast-v0.4.pt",
        ])

        model_map["da-pos"] = "/".join(
            [aws_resource_path_v04, "POS-danish", "da-pos-v0.1.pt"])

        model_map["da-ner"] = "/".join(
            [aws_resource_path_v04, "NER-danish", "da-ner-v0.1.pt"])

        model_map["de-pos"] = "/".join(
            [hu_path, "release-de-pos-0", "de-pos-ud-hdt-v0.5.pt"])

        model_map["de-pos-tweets"] = "/".join([
            aws_resource_path_v04,
            "POS-fine-grained-german-tweets",
            "de-pos-twitter-v0.1.pt",
        ])

        model_map["de-ner"] = "/".join([
            aws_resource_path_v04, "release-de-ner-0", "de-ner-conll03-v0.4.pt"
        ])

        model_map["de-ner-germeval"] = "/".join([
            aws_resource_path_v04, "NER-germeval", "de-ner-germeval-0.4.1.pt"
        ])

        model_map["fr-ner"] = "/".join([
            aws_resource_path_v04, "release-fr-ner-0", "fr-ner-wikiner-0.4.pt"
        ])
        model_map["nl-ner"] = "/".join([
            aws_resource_path_v04, "NER-conll2002-dutch",
            "nl-ner-conll02-v0.1.pt"
        ])
        model_map["ml-pos"] = "https://raw.githubusercontent.com/qburst/models-repository/master/FlairMalayalamModels/malayalam-xpos-model.pt"
        model_map["ml-upos"] = "https://raw.githubusercontent.com/qburst/models-repository/master/FlairMalayalamModels/malayalam-upos-model.pt"

        cache_dir = Path("models")
        if model_name in model_map:
            model_name = cached_path(model_map[model_name],
                                     cache_dir=cache_dir)

        # the historical German taggers by the @redewiedergabe project
        if model_name == "de-historic-indirect":
            model_file = Path(
                flair.cache_root) / cache_dir / 'indirect' / 'final-model.pt'
            if not model_file.exists():
                cached_path('http://www.redewiedergabe.de/models/indirect.zip',
                            cache_dir=cache_dir)
                unzip_file(
                    Path(flair.cache_root) / cache_dir / 'indirect.zip',
                    Path(flair.cache_root) / cache_dir)
            model_name = str(
                Path(flair.cache_root) / cache_dir / 'indirect' /
                'final-model.pt')

        if model_name == "de-historic-direct":
            model_file = Path(
                flair.cache_root) / cache_dir / 'direct' / 'final-model.pt'
            if not model_file.exists():
                cached_path('http://www.redewiedergabe.de/models/direct.zip',
                            cache_dir=cache_dir)
                unzip_file(
                    Path(flair.cache_root) / cache_dir / 'direct.zip',
                    Path(flair.cache_root) / cache_dir)
            model_name = str(
                Path(flair.cache_root) / cache_dir / 'direct' /
                'final-model.pt')

        if model_name == "de-historic-reported":
            model_file = Path(
                flair.cache_root) / cache_dir / 'reported' / 'final-model.pt'
            if not model_file.exists():
                cached_path('http://www.redewiedergabe.de/models/reported.zip',
                            cache_dir=cache_dir)
                unzip_file(
                    Path(flair.cache_root) / cache_dir / 'reported.zip',
                    Path(flair.cache_root) / cache_dir)
            model_name = str(
                Path(flair.cache_root) / cache_dir / 'reported' /
                'final-model.pt')

        if model_name == "de-historic-free-indirect":
            model_file = Path(flair.cache_root
                              ) / cache_dir / 'freeIndirect' / 'final-model.pt'
            if not model_file.exists():
                cached_path(
                    'http://www.redewiedergabe.de/models/freeIndirect.zip',
                    cache_dir=cache_dir)
                unzip_file(
                    Path(flair.cache_root) / cache_dir / 'freeIndirect.zip',
                    Path(flair.cache_root) / cache_dir)
            model_name = str(
                Path(flair.cache_root) / cache_dir / 'freeIndirect' /
                'final-model.pt')

        return model_name