def load(model: str):
    """
    Load a pre-trained sequence tagger by its short name.

    :param model: case-insensitive model name; 'ner' and 'chunk' are recognised
    :return: the tagger switched to eval mode (and moved to the GPU when CUDA is
        available), or None when the name is not recognised
    """
    # short name -> location of the serialized tagger
    known_models = {
        'ner': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models/ner-conll03.pt',
        'chunk': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models/chunk-conll2000.pt',
    }

    base_path = known_models.get(model.lower())
    if base_path is None:
        return None

    model_file = cached_path(base_path)
    if model_file is None:
        return None

    # tensors saved from 'cuda:0' are first mapped to the CPU
    tagger: SequenceTaggerLSTM = torch.load(model_file, map_location={'cuda:0': 'cpu'})
    tagger.eval()
    return tagger.cuda() if torch.cuda.is_available() else tagger
def __init__(
    self,
    base_path: Union[str, Path] = None,
    tag_to_bioes: str = "ner",
    in_memory: bool = True,
):
    """
    Initialize the WNUT-17 corpus; the three data files are fetched through cached_path.

    :param base_path: folder in which the dataset folder is looked up; when empty, the
        "datasets" folder below the flair cache root is used
    :param tag_to_bioes: forwarded to the parent constructor ("ner" by default)
    :param in_memory: forwarded to the parent constructor
    """
    # isinstance instead of an exact type comparison, so str subclasses are converted too
    if isinstance(base_path, str):
        base_path = Path(base_path)

    # column format
    columns = {0: "text", 1: "ner"}

    # this dataset name
    dataset_name = self.__class__.__name__.lower()

    # default dataset folder is the cache root
    if not base_path:
        base_path = Path(flair.cache_root) / "datasets"
    data_folder = base_path / dataset_name

    # download data if necessary
    # NOTE(review): files are cached under "datasets/<name>" while the corpus is read from
    # data_folder; these presumably only coincide for the default base_path -- confirm.
    wnut_path = "https://noisy-text.github.io/2017/files/"
    download_dir = Path("datasets") / dataset_name
    for file_name in ("wnut17train.conll", "emerging.dev.conll", "emerging.test.annotated"):
        cached_path(f"{wnut_path}{file_name}", download_dir)

    super(WNUT_17, self).__init__(data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory)
def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True):
    """
    Initialize the UD Bulgarian (BTB) corpus; dev, test and train files are fetched
    through cached_path.

    :param base_path: folder in which the dataset folder is looked up; when empty, the
        "datasets" folder below the flair cache root is used
    :param in_memory: forwarded to the parent constructor
    """
    # isinstance instead of an exact type comparison, so str subclasses are converted too
    if isinstance(base_path, str):
        base_path = Path(base_path)

    # this dataset name
    dataset_name = self.__class__.__name__.lower()

    # default dataset folder is the cache root
    if not base_path:
        base_path = Path(flair.cache_root) / "datasets"
    data_folder = base_path / dataset_name

    # download data if necessary
    ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Bulgarian-BTB/master"
    download_dir = Path("datasets") / dataset_name
    for split in ("dev", "test", "train"):
        cached_path(f"{ud_path}/bg_btb-ud-{split}.conllu", download_dir)

    super(UD_BULGARIAN, self).__init__(data_folder, in_memory=in_memory)
def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True):
    """
    Initialize the UD Classical Chinese (Kyoto) corpus; dev, test and train files are
    fetched through cached_path.

    :param base_path: folder in which the dataset folder is looked up; when empty, the
        "datasets" folder below the flair cache root is used
    :param in_memory: forwarded to the parent constructor
    """
    # isinstance instead of an exact type comparison, so str subclasses are converted too
    if isinstance(base_path, str):
        base_path = Path(base_path)

    # this dataset name
    dataset_name = self.__class__.__name__.lower()

    # default dataset folder is the cache root
    if not base_path:
        base_path = Path(flair.cache_root) / "datasets"
    data_folder = base_path / dataset_name

    # download data if necessary
    web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Classical_Chinese-Kyoto/master"
    download_dir = Path("datasets") / dataset_name
    for split in ("dev", "test", "train"):
        cached_path(f"{web_path}/lzh_kyoto-ud-{split}.conllu", download_dir)

    super(UD_CHINESE_KYOTO, self).__init__(data_folder, in_memory=in_memory)
def __init__(
    self,
    base_path: Union[str, Path] = None,
    tag_to_bioes: str = "ner",
    in_memory: bool = True,
):
    """
    Initialize the DaNE corpus. When the training file is not yet in the cache, the
    archive is fetched through cached_path, unpacked, and the three split files are
    rewritten without the "name=" prefix and the "|SpaceAfter=No" marker.

    :param base_path: folder in which the dataset folder is looked up; when empty, the
        "datasets" folder below the flair cache root is used
    :param tag_to_bioes: forwarded to the parent constructor ("ner" by default)
    :param in_memory: forwarded to the parent constructor
    """
    # isinstance instead of an exact type comparison, so str subclasses are converted too
    if isinstance(base_path, str):
        base_path = Path(base_path)

    # column format
    columns = {1: 'text', 3: 'pos', 9: 'ner'}

    # this dataset name
    dataset_name = self.__class__.__name__.lower()

    # default dataset folder is the cache root
    if not base_path:
        base_path = Path(flair.cache_root) / "datasets"
    data_folder = base_path / dataset_name

    # download data if necessary
    # NOTE(review): the archive is always unpacked below the cache root, whereas the corpus
    # is read from data_folder; these presumably only coincide for the default base_path.
    data_path = Path(flair.cache_root) / "datasets" / dataset_name
    train_data_file = data_path / "ddt.train.conllu"
    if not train_data_file.is_file():
        temp_file = cached_path(
            'https://danlp.s3.eu-central-1.amazonaws.com/datasets/ddt.zip',
            Path("datasets") / dataset_name
        )
        from zipfile import ZipFile

        with ZipFile(temp_file, 'r') as zip_file:
            zip_file.extractall(path=data_path)

        # Remove CoNLL-U meta information in the last column
        for part in ['train', 'dev', 'test']:
            data_file = data_path / "ddt.{}.conllu".format(part)

            # explicit encoding: the result must not depend on the platform's locale
            with open(data_file, 'r', encoding='utf-8') as file:
                lines = []
                for line in file:
                    if line.startswith("#") or line == "\n":
                        # comment and blank lines are kept verbatim, exactly once
                        # (they used to be appended a second time by the fall-through)
                        lines.append(line)
                    else:
                        lines.append(line.replace("name=", "").replace("|SpaceAfter=No", ""))

            with open(data_file, 'w', encoding='utf-8') as file:
                file.writelines(lines)

            print(data_file)

    super(DANE, self).__init__(
        data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, comment_symbol="#"
    )
def __init__(
    self,
    base_path: Union[str, Path] = None,
    in_memory: bool = True,
    sentence_splitter: SentenceSplitter = SegtokSentenceSplitter(),
):
    """
    Initialize the DrugProt corpus. When the converted training file is missing, the
    archive is fetched through cached_path and converted with
    extract_and_convert_to_conllu.

    :param base_path: folder in which the dataset folder is looked up; when empty, the
        "datasets" folder below the flair cache root is used
    :param in_memory: forwarded to the parent constructor
    :param sentence_splitter: stored on the instance; its class name is part of the
        dataset folder name, so each splitter type gets its own folder
    """
    # isinstance instead of an exact type comparison, so str subclasses are converted too
    if isinstance(base_path, str):
        base_path = Path(base_path)

    self.sentence_splitter = sentence_splitter

    # this dataset name
    dataset_name = self.__class__.__name__.lower() + "_" + type(self.sentence_splitter).__name__

    # default dataset folder is the cache root
    if not base_path:
        base_path = flair.cache_root / "datasets"
    data_folder = base_path / dataset_name

    drugprot_url = "https://zenodo.org/record/5042151/files/drugprot-gs-training-development.zip"
    data_file = data_folder / "drugprot-train.conllu"

    # fetch and convert only when the converted training split does not exist yet
    if not data_file.is_file():
        source_data_folder = data_folder / "original"
        cached_path(drugprot_url, source_data_folder)
        self.extract_and_convert_to_conllu(
            data_file=source_data_folder / "drugprot-gs-training-development.zip",
            data_folder=data_folder,
        )

    super(DrugProt, self).__init__(
        data_folder,
        in_memory=in_memory,
        sample_missing_splits=False,
    )
def load(model: str):
    """
    Load a pre-trained sequence tagger by its short name.

    :param model: case-insensitive model name; 'ner', 'chunk' and 'pos' are recognised
    :return: the tagger switched to eval mode (and moved to the GPU when CUDA is
        available), or None when the name is not recognised
    """
    # short name -> location of the serialized tagger
    known_models = {
        'ner': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models/ner-conll03.pt',
        'chunk': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models/chunk-conll2000.pt',
        'pos': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models/pos-ontonotes-small.pt',
    }

    base_path = known_models.get(model.lower())
    if base_path is None:
        return None

    model_file = cached_path(base_path)
    if model_file is None:
        return None

    # tensors saved from 'cuda:0' are first mapped to the CPU
    tagger: SequenceTaggerLSTM = torch.load(model_file, map_location={'cuda:0': 'cpu'})
    tagger.eval()
    return tagger.cuda() if torch.cuda.is_available() else tagger
def load(model: str):
    """
    Load a pre-trained text classifier by its short name.

    :param model: case-insensitive model name; "de-offensive-language" and
        "en-sentiment" are recognised
    :return: the result of TextClassifier.load_from_file for the cached model file, or
        None when the name is not recognised
    """
    aws_resource_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.4"

    # short name -> (folder, file name) below the resource path
    locations = {
        "de-offensive-language": (
            "TEXT-CLASSIFICATION_germ-eval-2018_task-1",
            "germ-eval-2018-task-1.pt",
        ),
        "en-sentiment": ("TEXT-CLASSIFICATION_imdb", "imdb.pt"),
    }

    location = locations.get(model.lower())
    if location is None:
        return None

    base_path = "/".join((aws_resource_path,) + location)
    model_file = cached_path(base_path, cache_dir=Path("models"))
    if model_file is None:
        return None
    return TextClassifier.load_from_file(model_file)
def __init__(
    self,
    base_path: Union[str, Path] = None,
    tag_to_bioes: str = "ner",
    in_memory: bool = False,
):
    """
    Initialize the LER_GERMAN (Legal Entity Recognition) corpus. The first time you call this constructor it will
    automatically download the dataset.
    :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
    to point to a different folder but typically this should not be necessary.
    :param tag_to_bioes: "ner" by default; forwarded to the parent constructor
    :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage.
    """
    # isinstance instead of an exact type comparison, so str subclasses are converted too
    if isinstance(base_path, str):
        base_path = Path(base_path)

    # column format
    columns = {0: "text", 1: "ner"}

    # this dataset name
    dataset_name = self.__class__.__name__.lower()

    # default dataset folder is the cache root
    if not base_path:
        base_path = Path(flair.cache_root) / "datasets"
    data_folder = base_path / dataset_name

    # download data if necessary
    ler_path = "https://raw.githubusercontent.com/elenanereiss/Legal-Entity-Recognition/master/data/"
    cached_path(f"{ler_path}ler.conll", Path("datasets") / dataset_name)

    # the corpus ships as a single file, which is passed as the training file
    super(LER_GERMAN, self).__init__(
        data_folder,
        columns,
        tag_to_bioes=tag_to_bioes,
        in_memory=in_memory,
        train_file='ler.conll',
    )
def _fetch_model(model_name) -> str:
    """
    Resolve a short relation-extraction model name to a cached model file.

    :param model_name: "relations-fast", "relations", or anything else
    :return: the result of cached_path for a known name; every other value is
        returned unchanged
    """
    hu_path: str = "https://nlp.informatik.hu-berlin.de/resources/models"

    # each model lives at <hu_path>/<name>/<name>.pt
    model_map = {
        name: "/".join([hu_path, name, file_name])
        for name, file_name in (
            ("relations-fast", "relations-fast.pt"),
            ("relations", "relations.pt"),
        )
    }

    if model_name not in model_map:
        return model_name
    return cached_path(model_map[model_name], cache_dir=Path("models"))
def _fetch_model(model_name) -> str:
    """
    Resolve a short text-classification model name to a cached model file.

    :param model_name: 'de-offensive-language', 'en-sentiment', or anything else
    :return: the result of cached_path for a known name; every other value is
        returned unchanged
    """
    aws_resource_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.4'

    # short name -> (folder, file name) below the resource path
    locations = {
        'de-offensive-language': (
            'TEXT-CLASSIFICATION_germ-eval-2018_task-1',
            'germ-eval-2018-task-1.pt',
        ),
        'en-sentiment': ('TEXT-CLASSIFICATION_imdb', 'imdb.pt'),
    }
    model_map = {
        name: '/'.join((aws_resource_path,) + location)
        for name, location in locations.items()
    }

    if model_name not in model_map:
        return model_name
    return cached_path(model_map[model_name], cache_dir=Path('models'))
def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True):
    """
    Initialize the UD North Sami (Giella) corpus; only a test and a train file are
    fetched through cached_path.

    :param base_path: folder in which the dataset folder is looked up; when empty, the
        "datasets" folder below the flair cache root is used
    :param in_memory: forwarded to the parent constructor
    """
    # isinstance instead of an exact type comparison, so str subclasses are converted too
    if isinstance(base_path, str):
        base_path = Path(base_path)

    # this dataset name
    dataset_name = self.__class__.__name__.lower()

    # default dataset folder is the cache root
    if not base_path:
        base_path = Path(flair.cache_root) / "datasets"
    data_folder = base_path / dataset_name

    # download data if necessary
    web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_North_Sami-Giella/master"
    download_dir = Path("datasets") / dataset_name
    for split in ("test", "train"):
        cached_path(f"{web_path}/sme_giella-ud-{split}.conllu", download_dir)

    super(UD_NORTH_SAMI, self).__init__(data_folder, in_memory=in_memory)
def _fetch_model(model_name) -> str:
    """
    Resolve the short name "tars-base" to a cached model file.

    :param model_name: "tars-base" or anything else
    :return: the result of cached_path for the known name; every other value is
        returned unchanged
    """
    hu_path: str = "https://nlp.informatik.hu-berlin.de/resources/models"
    model_map = {
        "tars-base": "/".join([hu_path, "tars-base", "tars-base-v8.pt"]),
    }

    try:
        url = model_map[model_name]
    except KeyError:
        # not a short name: hand the value back untouched
        return model_name
    return cached_path(url, cache_dir=Path("models"))
def __init__(
    self,
    embeddings: str,
    use_local: bool = True,
    use_gensim: bool = False,
    field: str = None,
):
    """
    Initializes fasttext word embeddings. A local file must already exist; a remote
    source is fetched through cached_path into the "embeddings" cache folder.
    :param embeddings: path to your embeddings '.bin' file
    :param use_local: set this to False if you are using embeddings from a remote source
    :param use_gensim: set this to true if your fasttext embedding is trained with fasttext version below 0.9.1
    :param field: stored on the instance as ``field``
    :raises ValueError: if use_local is True and the given path does not exist
    """
    if not use_local:
        embeddings = cached_path(f"{embeddings}", cache_dir=Path("embeddings"))
    elif not Path(embeddings).exists():
        raise ValueError(
            f'The given embeddings "{embeddings}" is not available or is not a valid path.'
        )

    self.embeddings = embeddings
    self.name: str = str(embeddings)
    self.static_embeddings = True
    self.use_gensim = use_gensim

    # both loaders are handed the location as a plain string
    model_location = str(embeddings)
    if use_gensim:
        loaded = gensim.models.FastText.load_fasttext_format(model_location)
        self.precomputed_word_embeddings = loaded
        self.__embedding_length: int = self.precomputed_word_embeddings.vector_size
    else:
        loaded = ft.load_model(model_location)
        self.precomputed_word_embeddings = loaded
        self.__embedding_length: int = self.precomputed_word_embeddings.get_dimension()

    self.field = field
    super().__init__()
def __init__(
    self,
    base_path: Union[str, Path] = None,
    tag_to_bioes: str = "ner",
    in_memory: bool = True,
    document_as_sequence: bool = False,
):
    """
    Initialize the CoNLL-03 corpus for Dutch (the files are fetched from the CoNLL-2002 shared task location).
    The first time you call this constructor it will automatically download the dataset.
    :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
    to point to a different folder but typically this should not be necessary.
    :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos'
    to predict POS tags instead
    :param in_memory: If True, keeps dataset in memory giving speedups in training.
    :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
    """
    # isinstance instead of an exact type comparison, so str subclasses are converted too
    if isinstance(base_path, str):
        base_path = Path(base_path)

    # column format
    columns = {0: "text", 1: "pos", 2: "ner"}

    # this dataset name
    dataset_name = self.__class__.__name__.lower()

    # default dataset folder is the cache root
    if not base_path:
        base_path = Path(flair.cache_root) / "datasets"
    data_folder = base_path / dataset_name

    # download data if necessary
    conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/"
    download_dir = Path("datasets") / dataset_name
    for file_name in ("ned.testa", "ned.testb", "ned.train"):
        cached_path(f"{conll_02_path}{file_name}", download_dir)

    # documents are only delimited when they are to be read as one sequence
    document_separator_token = "-DOCSTART-" if document_as_sequence else None

    super(CONLL_03_DUTCH, self).__init__(
        data_folder,
        columns,
        tag_to_bioes=tag_to_bioes,
        encoding="latin-1",
        in_memory=in_memory,
        document_separator_token=document_separator_token,
    )
def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = False):
    """
    Initialize the UD German HDT corpus. Dev and test files plus four training parts are
    fetched through cached_path; the parts are concatenated once into
    "de_hdt-ud-train-all.conllu".

    :param base_path: folder in which the dataset folder is looked up; when empty, the
        "datasets" folder below the flair cache root is used
    :param in_memory: forwarded to the parent constructor
    """
    import shutil

    # isinstance instead of an exact type comparison, so str subclasses are converted too
    if isinstance(base_path, str):
        base_path = Path(base_path)

    # this dataset name
    dataset_name = self.__class__.__name__.lower()

    # default dataset folder is the cache root
    if not base_path:
        base_path = Path(flair.cache_root) / "datasets"
    data_folder = base_path / dataset_name

    # download data if necessary
    ud_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_German-HDT/dev"
    download_dir = Path("datasets") / dataset_name
    cached_path(f"{ud_path}/de_hdt-ud-dev.conllu", download_dir)
    cached_path(f"{ud_path}/de_hdt-ud-test.conllu", download_dir)

    train_filenames = [
        "de_hdt-ud-train-a-1.conllu",
        "de_hdt-ud-train-a-2.conllu",
        "de_hdt-ud-train-b-1.conllu",
        "de_hdt-ud-train-b-2.conllu",
    ]
    for train_file in train_filenames:
        cached_path(f"{ud_path}/{train_file}", download_dir / "original")

    # NOTE(review): the merged file is always written below the cache root, whereas the
    # corpus is read from data_folder; these presumably only coincide for the default
    # base_path -- confirm.
    data_path = Path(flair.cache_root) / "datasets" / dataset_name
    new_train_file: Path = data_path / "de_hdt-ud-train-all.conllu"

    if not new_train_file.is_file():
        # Concatenate byte-for-byte in chunks: no whole file is held in memory and the
        # result does not depend on the platform's default text encoding.
        # Writing to a temporary name first means an interrupted merge never leaves a
        # truncated file that would later be mistaken for the complete one.
        partial_file = data_path / "de_hdt-ud-train-all.conllu.tmp"
        with open(partial_file, "wb") as f_out:
            for train_filename in train_filenames:
                with open(data_path / "original" / train_filename, "rb") as f_in:
                    shutil.copyfileobj(f_in, f_out)
        partial_file.replace(new_train_file)

    super(UD_GERMAN_HDT, self).__init__(data_folder, in_memory=in_memory)
def evaluate(test_file, model_file, dataset_format='macss', semeval_scoring=False):
    """
    Evaluate a saved text classifier on a test file and print the result.

    :param test_file: path of the test data; 'vocabulary/embeddings.csv' is looked up
        relative to its directory
    :param model_file: path handed to TextClassifier.load_from_file
    :param dataset_format: key into dataset_loader selecting the loading function
    :param semeval_scoring: if True, gold and predicted labels are written to two temporary
        files and scored with the SemEval-2010 task 8 scorer script; otherwise a
        classification report over the first label of every sentence is printed
    """
    if semeval_scoring:
        eval_script = cached_path(
            'https://raw.githubusercontent.com/vzhong/semeval/master/dataset/SemEval2010_task8_scorer-v1.2/semeval2010_task8_scorer-v1.2.pl',
            cache_dir='scripts')
        # the script is executed directly further down, so it needs the executable bit
        # NOTE(review): 0o777 also makes it world-writable -- confirm this is intended
        chmod(eval_script, 0o777)

    classifier: TextClassifier = TextClassifier.load_from_file(model_file)

    # disabled alternative loader:
    #sentences_test: List[Sentence] = load_sentences_jsonl(test_file, attach_id=True)

    idx2item = load_idx2item(join(dirname(test_file), 'vocabulary/embeddings.csv'))

    load_dataset = dataset_loader[dataset_format]

    # the same file is loaded twice: once with is_test=False (used as reference labels)
    # and once with is_test=True (handed to the classifier)
    sentences_test: List[Sentence] = load_dataset(test_file, idx2item, is_test=False, attach_id=True)
    sentences_pred: List[Sentence] = load_dataset(test_file, idx2item, is_test=True, attach_id=True)

    sentences_pred = classifier.predict(sentences_pred)

    if semeval_scoring:
        # (id, first label) pairs for the reference and the predicted sentences
        id_labels_true = [(sentence.id_, sentence.labels[0]) for sentence in sentences_test]
        id_labels_pred = [(sentence.id_, sentence.labels[0]) for sentence in sentences_pred]

        input_files = []
        for id_labels in [id_labels_true, id_labels_pred]:
            tmp_file = NamedTemporaryFile(delete=True)
            # keep a reference to the wrapper so it is still alive when the scorer runs
            input_files.append(tmp_file)
            # the content is written through a second handle opened by name
            with open(tmp_file.name, 'w') as f:
                for id_, label in id_labels:
                    f.write('{}\t{}\n'.format(id_, label.name))
            # closes the underlying file object rather than the wrapper
            # NOTE(review): presumably done so the delete-on-close of the wrapper is not
            # triggered before the scorer has read the file -- confirm
            tmp_file.file.close()

        # scorer arguments: reference file first, prediction file second
        p = run([eval_script, input_files[0].name, input_files[1].name], stdout=PIPE, encoding='utf-8')
        main_result = p.stdout
        print(main_result)
    else:
        y_true = [sentence.labels[0].name for sentence in sentences_test]
        y_pred = [sentence.labels[0].name for sentence in sentences_pred]
        print(classification_report(y_true, y_pred))
def _fetch_model(model_name) -> str:
    """
    Resolve a short text-classification model name to a cached model file.

    :param model_name: "de-offensive-language", "en-sentiment", or anything else
    :return: the result of cached_path for a known name; every other value is
        returned unchanged
    """
    aws_resource_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.4"

    # short name -> (folder, file name) below the resource path
    locations = {
        "de-offensive-language": (
            "classy-offensive-de-rnn-cuda%3A0",
            "germ-eval-2018-task-1-v0.4.pt",
        ),
        "en-sentiment": ("classy-imdb-en-rnn-cuda%3A0", "imdb-v0.4.pt"),
    }
    model_map = {
        name: "/".join((aws_resource_path,) + location)
        for name, location in locations.items()
    }

    if model_name not in model_map:
        return model_name
    return cached_path(model_map[model_name], cache_dir=Path("models"))
def _fetch_model(model_name) -> str:
    """
    Resolve a short text-classification model name to a cached model file.

    :param model_name: "de-offensive-language", "en-sentiment", or anything else
    :return: the result of cached_path for a known name; every other value is
        returned unchanged
    """
    aws_resource_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.4"

    # short name -> (folder, file name) below the resource path
    locations = {
        "de-offensive-language": (
            "TEXT-CLASSIFICATION_germ-eval-2018_task-1",
            "germ-eval-2018-task-1.pt",
        ),
        "en-sentiment": ("TEXT-CLASSIFICATION_imdb", "imdb.pt"),
    }
    model_map = {
        name: "/".join((aws_resource_path,) + location)
        for name, location in locations.items()
    }

    if model_name not in model_map:
        return model_name
    return cached_path(model_map[model_name], cache_dir=Path("models"))
def __init__(self, **kwargs):
    """
    Build the Feidegger corpus: the JSON description is fetched through cached_path,
    every referenced image is stored in an "images" folder next to it, and the dataset
    is divided by its ``split`` values (0-7 train, 8 dev, 9 test).

    :param kwargs: forwarded unchanged to FeideggerDataset
    """
    dataset = "feidegger"

    # cache Feidegger config file
    json_link = "https://raw.githubusercontent.com/zalandoresearch/feidegger/master/data/FEIDEGGER_release_1.1.json"
    json_local_path = cached_path(json_link, Path("datasets") / dataset)

    # read the description inside a context manager so the handle is closed right away
    with open(json_local_path, "r") as json_file:
        dataset_info = json.load(json_file)

    # cache Feidegger images
    images_cache_folder = os.path.join(os.path.dirname(json_local_path), "images")
    if not os.path.isdir(images_cache_folder):
        os.mkdir(images_cache_folder)

    for image_info in tqdm(dataset_info):
        name = os.path.basename(image_info["url"])
        filename = os.path.join(images_cache_folder, name)
        if not os.path.isfile(filename):
            urllib.request.urlretrieve(image_info["url"], filename)
        # replace image URL with local cached file
        image_info["url"] = filename

    feidegger_dataset: Dataset = FeideggerDataset(dataset_info, **kwargs)

    # split ids 0..7 form the training set, 8 the dev set and 9 the test set
    train_indices = list(np.where(np.in1d(feidegger_dataset.split, list(range(8))))[0])
    train = torch.utils.data.dataset.Subset(feidegger_dataset, train_indices)

    dev_indices = list(np.where(np.in1d(feidegger_dataset.split, [8]))[0])
    dev = torch.utils.data.dataset.Subset(feidegger_dataset, dev_indices)

    test_indices = list(np.where(np.in1d(feidegger_dataset.split, [9]))[0])
    test = torch.utils.data.dataset.Subset(feidegger_dataset, test_indices)

    super(FeideggerCorpus, self).__init__(train, dev, test, name="feidegger")
def _fetch_model(model_name) -> str:
    """
    Resolve a short text-classification model name to a cached model file.

    :param model_name: one of the short names listed in the map below, or anything else
    :return: the result of cached_path for a known name; every other value is
        returned unchanged
    """
    aws_resource_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.4"
    hu_path: str = "https://nlp.informatik.hu-berlin.de/resources/models"

    # "sentiment" and "en-sentiment" are two names for the same file
    english_sentiment = "/".join(
        [hu_path, "sentiment-curated-distilbert", "sentiment-en-mix-distillbert.pt"]
    )

    model_map = {
        "de-offensive-language": "/".join(
            [
                aws_resource_path,
                "classy-offensive-de-rnn-cuda%3A0",
                "germ-eval-2018-task-1-v0.4.pt",
            ]
        ),
        # English sentiment models
        "sentiment": english_sentiment,
        "en-sentiment": english_sentiment,
        "sentiment-fast": "/".join(
            [hu_path, "sentiment-curated-fasttext-rnn", "sentiment-en-mix-ft-rnn.pt"]
        ),
        # Communicative Functions Model
        "communicative-functions": "/".join(
            [hu_path, "comfunc", "communicative-functions-v0.5b.pt"]
        ),
    }

    if model_name not in model_map:
        return model_name
    return cached_path(model_map[model_name], cache_dir=Path("models"))
def _fetch_model(model_name) -> str:
    """
    Resolve a short text-classification model name to a cached model file.

    :param model_name: one of the short names listed in the map below, or anything else
    :return: the result of cached_path for a known name; every other value is
        returned unchanged
    """
    hu_path: str = "https://nlp.informatik.hu-berlin.de/resources/models"

    # "sentiment" and "en-sentiment" are two names for the same file
    english_sentiment = "/".join(
        [hu_path, "sentiment-curated-distilbert", "sentiment-en-mix-distillbert_4.pt"]
    )

    model_map = {
        "de-offensive-language": "/".join(
            [hu_path, "de-offensive-language", "germ-eval-2018-task-1-v0.8.pt"]
        ),
        # English sentiment models
        "sentiment": english_sentiment,
        "en-sentiment": english_sentiment,
        "sentiment-fast": "/".join(
            [hu_path, "sentiment-curated-fasttext-rnn", "sentiment-en-mix-ft-rnn_v8.pt"]
        ),
        # Communicative Functions Model
        "communicative-functions": "/".join([hu_path, "comfunc", "communicative-functions.pt"]),
    }

    if model_name not in model_map:
        return model_name
    return cached_path(model_map[model_name], cache_dir=Path("models"))
def download_dataset(task: NLPTask):
    """
    Fetch and unpack the data of a task unless its 'train.txt' is already in the cache.

    Two tasks are handled; any other task is a no-op:

    * NLPTask.CONLL_2000 -- the gzipped train and test files are fetched and decompressed
    * NLPTask.IMDB -- the archive is fetched and every review is appended to
      'train.txt' / 'test.txt' as one line prefixed with ``__label__pos`` or ``__label__neg``

    :param task: the task whose data should be made available; ``task.value`` is used
        as the name of the dataset folder
    """
    import gzip
    import shutil
    import tarfile

    # conll 2000 chunking task
    if task == NLPTask.CONLL_2000:
        conll_2000_path = 'https://www.clips.uantwerpen.be/conll2000/chunking/'
        data_path = Path(flair.file_utils.CACHE_ROOT) / 'datasets' / task.value
        data_file = data_path / 'train.txt'
        if not data_file.is_file():
            cached_path(f'{conll_2000_path}train.txt.gz', Path('datasets') / task.value)
            cached_path(f'{conll_2000_path}test.txt.gz', Path('datasets') / task.value)

            # 'train.txt' doubles as the "already done" marker checked above, so it is
            # written last: a failure while unpacking the test split is retried next time
            for split in ('test', 'train'):
                with gzip.open(data_path / f'{split}.txt.gz', 'rb') as f_in:
                    with open(data_path / f'{split}.txt', 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)

    if task == NLPTask.IMDB:
        imdb_acl_path = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
        data_path = Path(flair.file_utils.CACHE_ROOT) / 'datasets' / task.value
        data_file = data_path / 'train.txt'
        if not data_file.is_file():
            cached_path(imdb_acl_path, Path('datasets') / task.value)

            # NOTE(review): extractall trusts the member paths stored in the downloaded
            # archive; consider validating them if the source is not fully trusted.
            with tarfile.open(data_path / 'aclImdb_v1.tar.gz', 'r:gz') as f_in:
                datasets = ['train', 'test']
                labels = ['pos', 'neg']

                # the member list does not change between iterations: read it once
                all_members = f_in.getmembers()

                for label in labels:
                    for dataset in datasets:
                        f_in.extractall(
                            data_path,
                            members=[m for m in all_members if f'{dataset}/{label}' in m.name],
                        )
                        # written as UTF-8 because that is how the reviews are read below
                        with open(data_path / f'{dataset}.txt', 'at', encoding='utf-8') as f_p:
                            current_path = data_path / 'aclImdb' / dataset / label
                            for file_name in current_path.iterdir():
                                if file_name.is_file() and file_name.name.endswith('.txt'):
                                    # read_text closes the review file again right away
                                    review = file_name.read_text(encoding='utf-8')
                                    f_p.write(f'__label__{label} ' + review + '\n')
def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True):
    """
    Initialize the English CoNLL04 relation extraction corpus. When the converted
    training file is missing, the three source files are fetched through cached_path
    and converted with convert_to_conllu.

    :param base_path: folder in which the dataset folder is looked up; when empty, the
        "datasets" folder below the flair cache root is used
    :param in_memory: forwarded to the parent constructor
    """
    if not base_path:
        base_path = flair.cache_root / "datasets"
    else:
        base_path = Path(base_path)

    # this dataset name
    dataset_name = self.__class__.__name__.lower()

    data_folder = base_path / dataset_name

    # TODO: change data source to original CoNLL04 -- this dataset has span formatting errors
    # download data if necessary
    conll04_url = (
        "https://raw.githubusercontent.com/bekou/multihead_joint_entity_relation_extraction/master/data/CoNLL04/"
    )
    data_file = data_folder / "conll04-train.conllu"

    # The condition used to read `True or not data_file.is_file()`, which made the
    # existence check dead code and repeated the fetch and conversion on every
    # construction; now they only run while the converted file is missing.
    if not data_file.is_file():
        source_data_folder = data_folder / "original"
        for split in ("train", "dev", "test"):
            cached_path(f"{conll04_url}{split}.txt", source_data_folder)

        self.convert_to_conllu(
            source_data_folder=source_data_folder,
            data_folder=data_folder,
        )

    super(RE_ENGLISH_CONLL04, self).__init__(
        data_folder,
        in_memory=in_memory,
        column_format={1: "text", 2: "ner"},
        comment_symbol="# ",
    )
def __init__(
    self,
    **corpusargs,
):
    """
    Initialize the fine-grained SST corpus. Unless "train.txt" already exists, the three
    raw files are fetched through cached_path and rewritten in FastText format.

    :param corpusargs: forwarded unchanged to the parent constructor
    """
    # this dataset name
    dataset_name = self.__class__.__name__.lower()

    # default dataset folder is the cache root
    data_folder = Path(flair.cache_root) / "datasets" / dataset_name

    already_converted = (data_folder / "train.txt").is_file()
    if not already_converted:
        # download the raw senteval files
        raw_cache_dir = Path("datasets") / dataset_name / 'raw'
        raw_urls = (
            'https://raw.githubusercontent.com/AcademiaSinicaNLPLab/sentiment_dataset/master/data/stsa.fine.train',
            'https://raw.githubusercontent.com/AcademiaSinicaNLPLab/sentiment_dataset/master/data/stsa.fine.test',
            'https://raw.githubusercontent.com/AcademiaSinicaNLPLab/sentiment_dataset/master/data/stsa.fine.dev',
        )
        for raw_url in raw_urls:
            cached_path(raw_url, raw_cache_dir)

        # convert to FastText format: the first character of a raw line is the label,
        # the text starts at the third character
        for split in ['train', 'dev', 'test']:
            target_path = data_folder / f"{split}.txt"
            raw_path = data_folder / 'raw' / f'stsa.fine.{split}'
            with open(target_path, "w") as converted, open(raw_path, encoding="latin1") as raw:
                for raw_line in raw:
                    label, text = raw_line[0], raw_line[2:]
                    converted.write(f"__label__{label} {text}")

    super(SENTEVAL_SST_GRANULAR, self).__init__(
        data_folder,
        tokenizer=segtok_tokenizer,
        **corpusargs,
    )
def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True):
    """
    Initialize the UD Maltese (MUDT) corpus; dev, test and train files are fetched
    through cached_path.

    :param base_path: folder in which the dataset folder is looked up; when empty, the
        "datasets" folder below the flair cache root is used
    :param in_memory: forwarded to the parent constructor
    """
    # isinstance instead of an exact type comparison, so str subclasses are converted too
    if isinstance(base_path, str):
        base_path = Path(base_path)

    # this dataset name
    dataset_name = self.__class__.__name__.lower()

    # default dataset folder is the cache root
    if not base_path:
        base_path = Path(flair.cache_root) / "datasets"
    data_folder = base_path / dataset_name

    # download data if necessary
    web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Maltese-MUDT/master"
    download_dir = Path("datasets") / dataset_name
    for split in ("dev", "test", "train"):
        cached_path(f"{web_path}/mt_mudt-ud-{split}.conllu", download_dir)

    super(UD_MALTESE, self).__init__(data_folder, in_memory=in_memory)
def __init__(
    self,
    base_path: Union[str, Path] = None,
    tag_to_bioes: str = "ner",
    in_memory: bool = True,
):
    """
    Initialize the Finnish NER corpus (digitoday files); train, dev and test files are
    fetched through cached_path and the test file is passed through
    _remove_lines_without_annotations before loading.

    :param base_path: folder in which the dataset folder is looked up; when empty, the
        "datasets" folder below the flair cache root is used
    :param tag_to_bioes: forwarded to the parent constructor ("ner" by default)
    :param in_memory: forwarded to the parent constructor
    """
    # isinstance instead of an exact type comparison, so str subclasses are converted too
    if isinstance(base_path, str):
        base_path = Path(base_path)

    # column format
    columns = {0: "text", 1: "ner"}

    # this dataset name
    dataset_name = self.__class__.__name__.lower()

    # default dataset folder is the cache root
    if not base_path:
        base_path = Path(flair.cache_root) / "datasets"
    data_folder = base_path / dataset_name

    # download data if necessary
    ner_finnish_path = "https://raw.githubusercontent.com/mpsilfve/finer-data/master/data/digitoday."
    download_dir = Path("datasets") / dataset_name
    for file_suffix in ("2014.train.csv", "2014.dev.csv", "2015.test.csv"):
        cached_path(f"{ner_finnish_path}{file_suffix}", download_dir)

    # data_folder is already a Path, so no extra Path(...) wrapper is needed here
    _remove_lines_without_annotations(data_file=data_folder / "digitoday.2015.test.csv")

    super(NER_FINNISH, self).__init__(
        data_folder,
        columns,
        tag_to_bioes=tag_to_bioes,
        in_memory=in_memory,
        skip_first_line=True,
    )
def _fetch_model(model_name) -> str:
    """
    Resolve a short sequence-tagger model name to a cached model file.

    :param model_name: one of the short names listed in the maps below, or anything else
    :return: the result of cached_path for a known name; every other value is
        returned unchanged
    """
    aws_resource_path_v04 = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.4"

    # (folder, file name) pairs shared by two alias names each
    quadner = ("release-quadner-512-l2-multi-embed", "quadner-large.pt")
    ner_multi_fast = ("NER-multi-fast", "ner-multi-fast.pt")
    ner_multi_fast_learn = ("NER-multi-fast-evolve", "ner-multi-fast-learn.pt")
    pos_multi = ("release-dodekapos-512-l2-multi", "pos-multi-v0.1.pt")
    pos_multi_fast = ("UPOS-multi-fast", "pos-multi-fast.pt")

    # short name -> (folder, file name) below the v0.4 resource path
    v04_locations = {
        "ner": ("NER-conll03-english", "en-ner-conll03-v0.4.pt"),
        "ner-fast": (
            "NER-conll03--h256-l1-b32-p3-0.5-%2Bglove%2Bnews-forward-fast%2Bnews-backward-fast-normal-locked0.5-word0.05--release_4",
            "en-ner-fast-conll03-v0.4.pt",
        ),
        "ner-ontonotes": ("release-ner-ontonotes-0", "en-ner-ontonotes-v0.4.pt"),
        "ner-ontonotes-fast": ("release-ner-ontonotes-fast-0", "en-ner-ontonotes-fast-v0.4.pt"),
        "ner-multi": quadner,
        "multi-ner": quadner,
        "ner-multi-fast": ner_multi_fast,
        "multi-ner-fast": ner_multi_fast,
        "ner-multi-fast-learn": ner_multi_fast_learn,
        "multi-ner-fast-learn": ner_multi_fast_learn,
        "pos": (
            "POS-ontonotes--h256-l1-b32-p3-0.5-%2Bglove%2Bnews-forward%2Bnews-backward-normal-locked0.5-word0.05--v0.4_0",
            "en-pos-ontonotes-v0.4.pt",
        ),
        "pos-fast": ("release-pos-fast-0", "en-pos-ontonotes-fast-v0.4.pt"),
        "pos-multi": pos_multi,
        "multi-pos": pos_multi,
        "pos-multi-fast": pos_multi_fast,
        "multi-pos-fast": pos_multi_fast,
        "frame": ("release-frame-1", "en-frame-ontonotes-v0.4.pt"),
        "frame-fast": ("release-frame-fast-0", "en-frame-ontonotes-fast-v0.4.pt"),
        "chunk": (
            "NP-conll2000--h256-l1-b32-p3-0.5-%2Bnews-forward%2Bnews-backward-normal-locked0.5-word0.05--v0.4_0",
            "en-chunk-conll2000-v0.4.pt",
        ),
        "chunk-fast": ("release-chunk-fast-0", "en-chunk-conll2000-fast-v0.4.pt"),
        "da-pos": ("POS-danish", "da-pos-v0.1.pt"),
        "da-ner": ("NER-danish", "da-ner-v0.1.pt"),
        "de-pos": ("release-de-pos-0", "de-pos-ud-hdt-v0.4.pt"),
        "de-pos-fine-grained": ("POS-fine-grained-german-tweets", "de-pos-twitter-v0.1.pt"),
        "de-ner": ("release-de-ner-0", "de-ner-conll03-v0.4.pt"),
        "de-ner-germeval": ("NER-germeval", "de-ner-germeval-0.4.1.pt"),
        "fr-ner": ("release-fr-ner-0", "fr-ner-wikiner-0.4.pt"),
        "nl-ner": ("NER-conll2002-dutch", "nl-ner-conll02-v0.1.pt"),
    }

    model_map = {
        name: "/".join((aws_resource_path_v04,) + location)
        for name, location in v04_locations.items()
    }

    # the Malayalam models are hosted elsewhere and are given as complete URLs
    model_map["ml-pos"] = "https://raw.githubusercontent.com/qburst/models-repository/master/FlairMalayalamModels/malayalam-upos-model.pt"
    model_map["ml-xpos"] = "https://raw.githubusercontent.com/qburst/models-repository/master/FlairMalayalamModels/malayalam-xpos-model.pt"

    if model_name not in model_map:
        return model_name
    return cached_path(model_map[model_name], cache_dir=Path("models"))
def _fetch_model(model_name) -> str:
    """
    Resolve a short sequence-tagger model name to a cached model file.

    :param model_name: one of the short names listed in the maps below, or anything else
    :return: the result of cached_path for a known name; every other value is
        returned unchanged
    """
    aws_resource_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.2"
    aws_resource_path_v04 = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.4"

    # short name -> (folder, file name) below the v0.2 resource path
    v02_locations = {
        "ner-fast": (
            "NER-conll03--h256-l1-b32-experimental--fast-v0.2",
            "en-ner-fast-conll03-v0.2.pt",
        ),
        "ner-ontonotes": (
            "NER-ontoner--h256-l1-b32-%2Bcrawl%2Bnews-forward%2Bnews-backward--v0.2",
            "en-ner-ontonotes-v0.3.pt",
        ),
        "ner-ontonotes-fast": (
            "NER-ontoner--h256-l1-b32-%2Bcrawl%2Bnews-forward-fast%2Bnews-backward-fast--v0.2",
            "en-ner-ontonotes-fast-v0.3.pt",
        ),
        "pos": (
            "POS-ontonotes--h256-l1-b32-%2Bmix-forward%2Bmix-backward--v0.2",
            "en-pos-ontonotes-v0.2.pt",
        ),
        "pos-fast": (
            "POS-ontonotes--h256-l1-b32-%2Bnews-forward-fast%2Bnews-backward-fast--v0.2",
            "en-pos-ontonotes-fast-v0.2.pt",
        ),
        "frame": (
            "FRAME-conll12--h256-l1-b8-%2Bnews%2Bnews-forward%2Bnews-backward--v0.2",
            "en-frame-ontonotes-v0.2.pt",
        ),
        "frame-fast": (
            "FRAME-conll12--h256-l1-b8-%2Bnews%2Bnews-forward-fast%2Bnews-backward-fast--v0.2",
            "en-frame-ontonotes-fast-v0.2.pt",
        ),
        "chunk": (
            "NP-conll2000--h256-l1-b32-%2Bnews-forward%2Bnews-backward--v0.2",
            "en-chunk-conll2000-v0.2.pt",
        ),
        "chunk-fast": (
            "NP-conll2000--h256-l1-b32-%2Bnews-forward-fast%2Bnews-backward-fast--v0.2",
            "en-chunk-conll2000-fast-v0.2.pt",
        ),
        "de-pos": (
            "UPOS-udgerman--h256-l1-b8-%2Bgerman-forward%2Bgerman-backward--v0.2",
            "de-pos-ud-v0.2.pt",
        ),
        "de-ner": (
            "NER-conll03ger--h256-l1-b32-%2Bde-fasttext%2Bgerman-forward%2Bgerman-backward--v0.2",
            "de-ner-conll03-v0.3.pt",
        ),
        "de-ner-germeval": (
            "NER-germeval--h256-l1-b32-%2Bde-fasttext%2Bgerman-forward%2Bgerman-backward--v0.2",
            "de-ner-germeval-v0.3.pt",
        ),
        "fr-ner": ("NER-aij-wikiner-fr-wp3", "fr-ner.pt"),
    }

    # (folder, file name) pairs shared by two alias names each
    quadner = ("release-quadner-512-l2-multi-embed", "quadner-large.pt")
    ner_multi_fast = ("NER-multi-fast", "ner-multi-fast.pt")
    ner_multi_fast_learn = ("NER-multi-fast-evolve", "ner-multi-fast-learn.pt")
    pos_multi = ("release-dodekapos-512-l2-multi", "pos-multi-v0.1.pt")
    pos_multi_fast = ("UPOS-multi-fast", "pos-multi-fast.pt")

    # short name -> (folder, file name) below the v0.4 resource path
    v04_locations = {
        "ner": ("NER-conll03-english", "en-ner-conll03-v0.4.pt"),
        "ner-multi": quadner,
        "multi-ner": quadner,
        "ner-multi-fast": ner_multi_fast,
        "multi-ner-fast": ner_multi_fast,
        "ner-multi-fast-learn": ner_multi_fast_learn,
        "multi-ner-fast-learn": ner_multi_fast_learn,
        "pos-multi": pos_multi,
        "multi-pos": pos_multi,
        "pos-multi-fast": pos_multi_fast,
        "multi-pos-fast": pos_multi_fast,
        "de-pos-fine-grained": ("POS-fine-grained-german-tweets", "de-pos-twitter-v0.1.pt"),
        "nl-ner": ("NER-conll2002-dutch", "nl-ner-conll02-v0.1.pt"),
    }

    model_map = {}
    for resource_path, locations in (
        (aws_resource_path, v02_locations),
        (aws_resource_path_v04, v04_locations),
    ):
        for name, (folder, file_name) in locations.items():
            model_map[name] = "/".join([resource_path, folder, file_name])

    if model_name not in model_map:
        return model_name
    return cached_path(model_map[model_name], cache_dir=Path("models"))
def _fetch_model(model_name) -> str: # core Flair models on Huggingface ModelHub huggingface_model_map = { "ner": "flair/ner-english", "ner-fast": "flair/ner-english-fast", "ner-ontonotes": "flair/ner-english-ontonotes", "ner-ontonotes-fast": "flair/ner-english-ontonotes-fast", # Large NER models, "ner-large": "flair/ner-english-large", "ner-ontonotes-large": "flair/ner-english-ontonotes-large", "de-ner-large": "flair/ner-german-large", "nl-ner-large": "flair/ner-dutch-large", "es-ner-large": "flair/ner-spanish-large", # Multilingual NER models "ner-multi": "flair/ner-multi", "multi-ner": "flair/ner-multi", "ner-multi-fast": "flair/ner-multi-fast", # English POS models "upos": "flair/upos-english", "upos-fast": "flair/upos-english-fast", "pos": "flair/pos-english", "pos-fast": "flair/pos-english-fast", # Multilingual POS models "pos-multi": "flair/upos-multi", "multi-pos": "flair/upos-multi", "pos-multi-fast": "flair/upos-multi-fast", "multi-pos-fast": "flair/upos-multi-fast", # English SRL models "frame": "flair/frame-english", "frame-fast": "flair/frame-english-fast", # English chunking models "chunk": "flair/chunk-english", "chunk-fast": "flair/chunk-english-fast", # Language-specific NER models "da-ner": "flair/ner-danish", "de-ner": "flair/ner-german", "de-ler": "flair/ner-german-legal", "de-ner-legal": "flair/ner-german-legal", "fr-ner": "flair/ner-french", "nl-ner": "flair/ner-dutch", } hu_path: str = "https://nlp.informatik.hu-berlin.de/resources/models" hu_model_map = { # English NER models "ner": "/".join([hu_path, "ner", "en-ner-conll03-v0.4.pt"]), "ner-pooled": "/".join([hu_path, "ner-pooled", "en-ner-conll03-pooled-v0.5.pt"]), "ner-fast": "/".join([hu_path, "ner-fast", "en-ner-fast-conll03-v0.4.pt"]), "ner-ontonotes": "/".join([hu_path, "ner-ontonotes", "en-ner-ontonotes-v0.4.pt"]), "ner-ontonotes-fast": "/".join([ hu_path, "ner-ontonotes-fast", "en-ner-ontonotes-fast-v0.4.pt" ]), # Multilingual NER models "ner-multi": "/".join([hu_path, "multi-ner", 
"quadner-large.pt"]), "multi-ner": "/".join([hu_path, "multi-ner", "quadner-large.pt"]), "ner-multi-fast": "/".join([hu_path, "multi-ner-fast", "ner-multi-fast.pt"]), # English POS models "upos": "/".join([hu_path, "upos", "en-pos-ontonotes-v0.4.pt"]), "upos-fast": "/".join([hu_path, "upos-fast", "en-upos-ontonotes-fast-v0.4.pt"]), "pos": "/".join([hu_path, "pos", "en-pos-ontonotes-v0.5.pt"]), "pos-fast": "/".join([hu_path, "pos-fast", "en-pos-ontonotes-fast-v0.5.pt"]), # Multilingual POS models "pos-multi": "/".join([hu_path, "multi-pos", "pos-multi-v0.1.pt"]), "multi-pos": "/".join([hu_path, "multi-pos", "pos-multi-v0.1.pt"]), "pos-multi-fast": "/".join([hu_path, "multi-pos-fast", "pos-multi-fast.pt"]), "multi-pos-fast": "/".join([hu_path, "multi-pos-fast", "pos-multi-fast.pt"]), # English SRL models "frame": "/".join([hu_path, "frame", "en-frame-ontonotes-v0.4.pt"]), "frame-fast": "/".join( [hu_path, "frame-fast", "en-frame-ontonotes-fast-v0.4.pt"]), # English chunking models "chunk": "/".join([hu_path, "chunk", "en-chunk-conll2000-v0.4.pt"]), "chunk-fast": "/".join( [hu_path, "chunk-fast", "en-chunk-conll2000-fast-v0.4.pt"]), # Danish models "da-pos": "/".join([hu_path, "da-pos", "da-pos-v0.1.pt"]), "da-ner": "/".join([hu_path, "NER-danish", "da-ner-v0.1.pt"]), # German models "de-pos": "/".join([hu_path, "de-pos", "de-pos-ud-hdt-v0.5.pt"]), "de-pos-tweets": "/".join([hu_path, "de-pos-tweets", "de-pos-twitter-v0.1.pt"]), "de-ner": "/".join([hu_path, "de-ner", "de-ner-conll03-v0.4.pt"]), "de-ner-germeval": "/".join([hu_path, "de-ner-germeval", "de-ner-germeval-0.4.1.pt"]), "de-ler": "/".join([hu_path, "de-ner-legal", "de-ner-legal.pt"]), "de-ner-legal": "/".join([hu_path, "de-ner-legal", "de-ner-legal.pt"]), # French models "fr-ner": "/".join([hu_path, "fr-ner", "fr-ner-wikiner-0.4.pt"]), # Dutch models "nl-ner": "/".join([hu_path, "nl-ner", "nl-ner-bert-conll02-v0.8.pt"]), "nl-ner-rnn": "/".join([hu_path, "nl-ner-rnn", "nl-ner-conll02-v0.5.pt"]), # Malayalam 
models "ml-pos": "https://raw.githubusercontent.com/qburst/models-repository/master/FlairMalayalamModels/malayalam-xpos-model.pt", "ml-upos": "https://raw.githubusercontent.com/qburst/models-repository/master/FlairMalayalamModels/malayalam-upos-model.pt", # Portuguese models "pt-pos-clinical": "/".join([ hu_path, "pt-pos-clinical", "pucpr-flair-clinical-pos-tagging-best-model.pt", ]), # Keyphase models "keyphrase": "/".join([hu_path, "keyphrase", "keyphrase-en-scibert.pt"]), "negation-speculation": "/".join([ hu_path, "negation-speculation", "negation-speculation-model.pt" ]), # Biomedical models "hunflair-paper-cellline": "/".join([ hu_path, "hunflair_smallish_models", "cellline", "hunflair-celline-v1.0.pt", ]), "hunflair-paper-chemical": "/".join([ hu_path, "hunflair_smallish_models", "chemical", "hunflair-chemical-v1.0.pt", ]), "hunflair-paper-disease": "/".join([ hu_path, "hunflair_smallish_models", "disease", "hunflair-disease-v1.0.pt", ]), "hunflair-paper-gene": "/".join([ hu_path, "hunflair_smallish_models", "gene", "hunflair-gene-v1.0.pt" ]), "hunflair-paper-species": "/".join([ hu_path, "hunflair_smallish_models", "species", "hunflair-species-v1.0.pt", ]), "hunflair-cellline": "/".join([ hu_path, "hunflair_smallish_models", "cellline", "hunflair-celline-v1.0.pt", ]), "hunflair-chemical": "/".join([ hu_path, "hunflair_allcorpus_models", "huner-chemical", "hunflair-chemical-full-v1.0.pt", ]), "hunflair-disease": "/".join([ hu_path, "hunflair_allcorpus_models", "huner-disease", "hunflair-disease-full-v1.0.pt", ]), "hunflair-gene": "/".join([ hu_path, "hunflair_allcorpus_models", "huner-gene", "hunflair-gene-full-v1.0.pt", ]), "hunflair-species": "/".join([ hu_path, "hunflair_allcorpus_models", "huner-species", "hunflair-species-full-v1.1.pt", ]), } cache_dir = Path("models") get_from_model_hub = False # check if model name is a valid local file if Path(model_name).exists(): model_path = model_name # check if model key is remapped to HF key - if so, print out 
information elif model_name in huggingface_model_map: # get mapped name hf_model_name = huggingface_model_map[model_name] # use mapped name instead model_name = hf_model_name get_from_model_hub = True # if not, check if model key is remapped to direct download location. If so, download model elif model_name in hu_model_map: model_path = cached_path(hu_model_map[model_name], cache_dir=cache_dir) # special handling for the taggers by the @redewiegergabe project (TODO: move to model hub) elif model_name == "de-historic-indirect": model_file = flair.cache_root / cache_dir / "indirect" / "final-model.pt" if not model_file.exists(): cached_path( "http://www.redewiedergabe.de/models/indirect.zip", cache_dir=cache_dir, ) unzip_file( flair.cache_root / cache_dir / "indirect.zip", flair.cache_root / cache_dir, ) model_path = str(flair.cache_root / cache_dir / "indirect" / "final-model.pt") elif model_name == "de-historic-direct": model_file = flair.cache_root / cache_dir / "direct" / "final-model.pt" if not model_file.exists(): cached_path( "http://www.redewiedergabe.de/models/direct.zip", cache_dir=cache_dir, ) unzip_file( flair.cache_root / cache_dir / "direct.zip", flair.cache_root / cache_dir, ) model_path = str(flair.cache_root / cache_dir / "direct" / "final-model.pt") elif model_name == "de-historic-reported": model_file = flair.cache_root / cache_dir / "reported" / "final-model.pt" if not model_file.exists(): cached_path( "http://www.redewiedergabe.de/models/reported.zip", cache_dir=cache_dir, ) unzip_file( flair.cache_root / cache_dir / "reported.zip", flair.cache_root / cache_dir, ) model_path = str(flair.cache_root / cache_dir / "reported" / "final-model.pt") elif model_name == "de-historic-free-indirect": model_file = flair.cache_root / cache_dir / "freeIndirect" / "final-model.pt" if not model_file.exists(): cached_path( "http://www.redewiedergabe.de/models/freeIndirect.zip", cache_dir=cache_dir, ) unzip_file( flair.cache_root / cache_dir / "freeIndirect.zip", 
flair.cache_root / cache_dir, ) model_path = str(flair.cache_root / cache_dir / "freeIndirect" / "final-model.pt") # for all other cases (not local file or special download location), use HF model hub else: get_from_model_hub = True # if not a local file, get from model hub if get_from_model_hub: hf_model_name = "pytorch_model.bin" revision = "main" if "@" in model_name: model_name_split = model_name.split("@") revision = model_name_split[-1] model_name = model_name_split[0] # use model name as subfolder if "/" in model_name: model_folder = model_name.split("/", maxsplit=1)[1] else: model_folder = model_name # Lazy import from huggingface_hub import cached_download, hf_hub_url url = hf_hub_url(model_name, revision=revision, filename=hf_model_name) try: model_path = cached_download( url=url, library_name="flair", library_version=flair.__version__, cache_dir=flair.cache_root / "models" / model_folder, ) except HTTPError: # output information log.error("-" * 80) log.error( f"ACHTUNG: The key '{model_name}' was neither found on the ModelHub nor is this a valid path to a file on your system!" ) # log.error(f" - Error message: {e}") log.error( " -> Please check https://huggingface.co/models?filter=flair for all available models." ) log.error( " -> Alternatively, point to a model file on your local drive." ) log.error("-" * 80) Path(flair.cache_root / "models" / model_folder).rmdir() # remove folder again if not valid return model_path
def load(model: str):
    """Load one of the pre-trained v0.2 sequence taggers by its short name.

    The name is compared case-insensitively against the table below. For a
    known name the remote file is passed to ``cached_path`` (cache directory
    ``'models'``) and the resulting file is loaded with
    ``SequenceTagger.load_from_file``.

    :param model: short model name, e.g. ``'ner'`` or ``'de-pos'``
    :return: the loaded tagger, or ``None`` when the name is not in the table
        or ``cached_path`` yields ``None``
    """
    bucket = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.2'

    # short name -> (remote folder, file name) below the bucket
    remote_files = {
        'ner': (
            'NER-conll03--h256-l1-b32-%2Bglove%2Bnews-forward%2Bnews-backward--v0.2',
            'en-ner-conll03-v0.2.pt',
        ),
        'ner-fast': (
            'NER-conll03--h256-l1-b32-experimental--fast-v0.2',
            'en-ner-fast-conll03-v0.2.pt',
        ),
        'ner-ontonotes': (
            'NER-ontoner--h256-l1-b32-%2Bcrawl%2Bnews-forward%2Bnews-backward--v0.2',
            'en-ner-ontonotes-v0.2.pt',
        ),
        'ner-ontonotes-fast': (
            'NER-ontoner--h256-l1-b32-%2Bcrawl%2Bnews-forward-fast%2Bnews-backward-fast--v0.2',
            'en-ner-ontonotes-fast-v0.2.pt',
        ),
        'pos': (
            'POS-ontonotes--h256-l1-b32-%2Bmix-forward%2Bmix-backward--v0.2',
            'en-pos-ontonotes-v0.2.pt',
        ),
        'pos-fast': (
            'POS-ontonotes--h256-l1-b32-%2Bnews-forward-fast%2Bnews-backward-fast--v0.2',
            'en-pos-ontonotes-fast-v0.2.pt',
        ),
        'frame': (
            'FRAME-conll12--h256-l1-b8-%2Bnews%2Bnews-forward%2Bnews-backward--v0.2',
            'en-frame-ontonotes-v0.2.pt',
        ),
        'frame-fast': (
            'FRAME-conll12--h256-l1-b8-%2Bnews%2Bnews-forward-fast%2Bnews-backward-fast--v0.2',
            'en-frame-ontonotes-fast-v0.2.pt',
        ),
        'chunk': (
            'NP-conll2000--h256-l1-b32-%2Bnews-forward%2Bnews-backward--v0.2',
            'en-chunk-conll2000-v0.2.pt',
        ),
        'chunk-fast': (
            'NP-conll2000--h256-l1-b32-%2Bnews-forward-fast%2Bnews-backward-fast--v0.2',
            'en-chunk-conll2000-fast-v0.2.pt',
        ),
        'de-pos': (
            'UPOS-udgerman--h256-l1-b8-%2Bgerman-forward%2Bgerman-backward--v0.2',
            'de-pos-ud-v0.2.pt',
        ),
        'de-ner': (
            'NER-conll03ger--h256-l1-b32-%2Bde-fasttext%2Bgerman-forward%2Bgerman-backward--v0.2',
            'de-ner-conll03-v0.2.pt',
        ),
        'de-ner-germeval': (
            'NER-germeval--h256-l1-b32-%2Bde-fasttext%2Bgerman-forward%2Bgerman-backward--v0.2',
            'de-ner-germeval-v0.2.pt',
        ),
    }

    location = remote_files.get(model.lower())
    if location is None:
        # unknown name: nothing to fetch, nothing to load
        return None

    folder, file_name = location
    model_file = cached_path('/'.join([bucket, folder, file_name]), cache_dir='models')
    if model_file is None:
        return None

    return SequenceTagger.load_from_file(model_file)