def test_training():
    """Smoke-test training a tiny character LM and using it as embeddings.

    Trains a 1-layer, 128-hidden-unit forward language model on the
    ``lorem_ipsum`` example corpus, loads the resulting ``best-lm.pt`` through
    ``CharLMEmbeddings``, embeds one sentence and prints the embedding size of
    ``sentence[1]``. The test has no assertions; it passes when nothing raises.
    """
    results_dir = './results'
    try:
        # get default dictionary
        dictionary: Dictionary = Dictionary.load('chars')

        # init forward LM with 128 hidden states and 1 layer
        language_model: LanguageModel = LanguageModel(
            dictionary, is_forward_lm=True, hidden_size=128, nlayers=1)

        # get the example corpus and process at character level in forward direction
        corpus: TextCorpus = TextCorpus(
            'resources/corpora/lorem_ipsum',
            dictionary,
            language_model.is_forward_lm,
            character_level=True)

        # train the language model
        trainer: LanguageModelTrainer = LanguageModelTrainer(language_model, corpus)
        trainer.train(results_dir, sequence_length=10, mini_batch_size=10, max_epochs=5)

        # use the character LM as embeddings to embed the example sentence 'I love Berlin'
        char_lm_embeddings = CharLMEmbeddings('./results/best-lm.pt')
        sentence = Sentence('I love Berlin')
        char_lm_embeddings.embed(sentence)
        print(sentence[1].embedding.size())
    finally:
        # clean up results directory even when a step above fails, so a broken
        # run does not leave './results' behind in the working directory
        shutil.rmtree(results_dir, ignore_errors=True)
def create_corpus(args, load_dict_from_lm=False, return_back='both'):
    """Build the language model and/or text corpus described by ``args``.

    :param args: object exposing ``corpus_path``, ``mapfile``, ``is_forward_lm``
        and ``hidden_size`` (``mapfile`` is only read when
        ``load_dict_from_lm`` is false).
    :param load_dict_from_lm: if true, reuse the dictionary of the pretrained
        ``'he-forward'`` Flair embeddings instead of loading
        ``<corpus_path>/<mapfile>``.
    :param return_back: ``'both'``, ``'language_model'`` or ``'corpus'``.
    :return: ``(language_model, corpus)``, the language model, or the corpus,
        depending on ``return_back``. For any other value a message is printed
        and ``None`` is returned.
    """
    # Reject an unknown selector before doing any expensive loading.
    if return_back not in ('both', 'language_model', 'corpus'):
        print('Specified what to return back')
        return None

    if not load_dict_from_lm:
        dictionary: Dictionary = Dictionary.load(
            os.path.join(args.corpus_path, args.mapfile))
    else:
        print("loading dictionary from finetune model")
        from flair.embeddings import FlairEmbeddings
        dictionary = FlairEmbeddings('he-forward').lm.dictionary

    # Only construct what the caller asked for; reading the corpus in
    # particular is wasted work when just the model is wanted.
    language_model = None
    if return_back != 'corpus':
        language_model = LanguageModel(dictionary,
                                       args.is_forward_lm,
                                       hidden_size=args.hidden_size,
                                       nlayers=1)

    corpus = None
    if return_back != 'language_model':
        corpus = TextCorpus(args.corpus_path,
                            dictionary,
                            args.is_forward_lm,
                            character_level=True)

    if return_back == 'both':
        return language_model, corpus
    if return_back == 'language_model':
        return language_model
    return corpus
def test_train_language_model(results_base_path, resources_path):
    """Train a small forward character LM, embed a sentence with it and sample text.

    Asserts that ``generate_text`` yields a non-``None`` text of at least 100
    characters, then removes ``results_base_path``.
    """
    # default character dictionary
    char_dictionary: Dictionary = Dictionary.load('chars')

    # forward LM: 128 hidden states, single layer
    forward_lm: LanguageModel = LanguageModel(
        char_dictionary, is_forward_lm=True, hidden_size=128, nlayers=1)

    # example corpus, read character by character in the model's direction
    lorem_corpus: TextCorpus = TextCorpus(
        resources_path / 'corpora/lorem_ipsum',
        char_dictionary,
        forward_lm.is_forward_lm,
        character_level=True,
    )

    # short training run
    lm_trainer: LanguageModelTrainer = LanguageModelTrainer(
        forward_lm, lorem_corpus, test_mode=True)
    lm_trainer.train(results_base_path,
                     sequence_length=10,
                     mini_batch_size=10,
                     max_epochs=2)

    # load the trained model as embeddings and embed 'I love Berlin'
    best_model_file = str(results_base_path / 'best-lm.pt')
    embeddings = FlairEmbeddings(best_model_file)
    berlin_sentence = Sentence('I love Berlin')
    embeddings.embed(berlin_sentence)

    generated, _likelihood = forward_lm.generate_text(number_of_characters=100)
    assert generated is not None
    assert len(generated) >= 100

    # remove everything the run wrote
    shutil.rmtree(results_base_path, ignore_errors=True)
def train_LM(file_path, model_path, is_forward_lm=True):
    """Train a character-level Flair language model on the corpus in ``file_path``.

    :param file_path: corpus directory; it must also contain the character
        dictionary file ``mappings``.
    :param model_path: directory handed to ``LanguageModelTrainer.train`` as
        the output location.
    :param is_forward_lm: direction of the model and of corpus processing.
    """
    import os

    from flair.data import Dictionary
    from flair.models import LanguageModel
    from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

    # Join instead of concatenating so the dictionary is found whether or not
    # ``file_path`` ends with a path separator (and when it is a Path object).
    dictionary = Dictionary.load_from_file(os.path.join(file_path, 'mappings'))

    # get your corpus, process forward and at the character level
    corpus = TextCorpus(file_path,
                        dictionary,
                        is_forward_lm,
                        character_level=True)

    # instantiate your language model, set hidden size and number of layers
    language_model = LanguageModel(dictionary,
                                   is_forward_lm,
                                   hidden_size=128,
                                   nlayers=1)

    # train your language model
    trainer = LanguageModelTrainer(language_model, corpus)
    trainer.train(model_path,
                  sequence_length=100,
                  mini_batch_size=32,
                  max_epochs=10)
def test_train_resume_language_model_training(resources_path, results_base_path, tasks_base_path):
    """Train with checkpointing enabled, then resume training from ``checkpoint.pt``.

    Passes when both training runs complete without raising; the results
    directory is removed afterwards.
    """
    chars = Dictionary.load(u'chars')
    forward_lm = LanguageModel(chars,
                               is_forward_lm=True,
                               hidden_size=128,
                               nlayers=1)
    lorem_corpus = TextCorpus(resources_path / u'corpora/lorem_ipsum',
                              chars,
                              forward_lm.is_forward_lm,
                              character_level=True)

    # settings shared by the initial and the resumed run
    run_settings = dict(sequence_length=10, mini_batch_size=10, max_epochs=2)

    # first run writes a checkpoint
    lm_trainer = LanguageModelTrainer(forward_lm, lorem_corpus, test_mode=True)
    lm_trainer.train(results_base_path, checkpoint=True, **run_settings)

    # second run starts from that checkpoint
    checkpoint_file = results_base_path / u'checkpoint.pt'
    lm_trainer = LanguageModelTrainer.load_from_checkpoint(checkpoint_file, lorem_corpus)
    lm_trainer.train(results_base_path, **run_settings)

    shutil.rmtree(results_base_path)
def load_from_checkpoint(checkpoint_file, corpus, optimizer=SGD):
    """Rebuild a ``LanguageModelTrainer`` from a saved training checkpoint.

    The checkpoint is read with ``LanguageModel.load_checkpoint``; its model,
    epoch, split, loss and optimizer state are forwarded to the trainer
    together with ``corpus`` and ``optimizer``.
    """
    state = LanguageModel.load_checkpoint(checkpoint_file)
    restored_model = state['model']
    resume_settings = {
        'epoch': state['epoch'],
        'split': state['split'],
        'loss': state['loss'],
        'optimizer_state': state['optimizer_state_dict'],
    }
    return LanguageModelTrainer(restored_model, corpus, optimizer, **resume_settings)
def load_from_checkpoint(
    checkpoint_file: Path, corpus: TextCorpus, optimizer: Optimizer = SGD
):
    """Create a trainer that continues from the state stored in ``checkpoint_file``.

    :param checkpoint_file: checkpoint passed to ``LanguageModel.load_checkpoint``.
    :param corpus: corpus the resumed trainer will train on.
    :param optimizer: optimizer forwarded to ``LanguageModelTrainer``.
    :return: a ``LanguageModelTrainer`` initialised with the checkpoint's
        model, epoch, split, loss and optimizer state.
    """
    saved = LanguageModel.load_checkpoint(checkpoint_file)
    model = saved["model"]
    resume_state = dict(
        epoch=saved["epoch"],
        split=saved["split"],
        loss=saved["loss"],
        optimizer_state=saved["optimizer_state_dict"],
    )
    return LanguageModelTrainer(model, corpus, optimizer, **resume_state)
def test_train_language_model(results_base_path, resources_path):
    """Train a small forward character LM, embed a sentence and sample 100 characters.

    NOTE(review): ``unicode`` below is only a builtin on Python 2; this looks
    like a Python 2 backport of the test - confirm the target interpreter.
    """
    chars = Dictionary.load(u'chars')
    forward_lm = LanguageModel(chars,
                               is_forward_lm=True,
                               hidden_size=128,
                               nlayers=1)
    lorem_corpus = TextCorpus(resources_path / u'corpora/lorem_ipsum',
                              chars,
                              forward_lm.is_forward_lm,
                              character_level=True)

    # short training run into the results directory
    lm_trainer = LanguageModelTrainer(forward_lm, lorem_corpus, test_mode=True)
    lm_trainer.train(results_base_path,
                     sequence_length=10,
                     mini_batch_size=10,
                     max_epochs=2)

    # load the best model written by the run as embeddings
    best_model_file = unicode(results_base_path / u'best-lm.pt')
    embeddings = FlairEmbeddings(best_model_file)
    berlin_sentence = Sentence(u'I love Berlin')
    embeddings.embed(berlin_sentence)

    generated, _likelihood = forward_lm.generate_text(number_of_characters=100)
    assert generated is not None
    assert len(generated) >= 100

    shutil.rmtree(results_base_path, ignore_errors=True)
def load_checkpoint(checkpoint_file: Union[str, Path], corpus: TextCorpus, optimizer: Optimizer = SGD):
    """Rebuild a ``LanguageModelTrainer`` from a saved training checkpoint.

    :param checkpoint_file: checkpoint location, as a string or ``Path``.
    :param corpus: corpus the resumed trainer will train on.
    :param optimizer: optimizer forwarded to ``LanguageModelTrainer``.
    :return: a ``LanguageModelTrainer`` initialised with the checkpoint's
        model, epoch, split, loss and optimizer state.
    """
    # isinstance (rather than an exact type check) also normalises str
    # subclasses, so LanguageModel.load_checkpoint always receives a Path.
    if isinstance(checkpoint_file, str):
        checkpoint_file = Path(checkpoint_file)

    checkpoint = LanguageModel.load_checkpoint(checkpoint_file)
    return LanguageModelTrainer(
        checkpoint["model"],
        corpus,
        optimizer,
        epoch=checkpoint["epoch"],
        split=checkpoint["split"],
        loss=checkpoint["loss"],
        optimizer_state=checkpoint["optimizer_state_dict"],
    )
def trainLanguage(self, corpusPath):
    """Build corpus, model and trainer on ``self`` and train a character-level LM.

    Reads ``self.dictionary`` and ``self.is_forward_lm``; stores the created
    objects as ``self.corpus``, ``self.language_model`` and ``self.trainer``.
    Training output goes to ``resources/taggers/language_model``.
    """
    corpus_dir = Path(corpusPath)
    self.corpus = TextCorpus(corpus_dir,
                             self.dictionary,
                             self.is_forward_lm,
                             character_level=True)

    model_settings = {'hidden_size': 128, 'nlayers': 10}
    self.language_model = LanguageModel(self.dictionary,
                                        self.is_forward_lm,
                                        **model_settings)

    self.trainer = LanguageModelTrainer(self.language_model, self.corpus)

    training_settings = {
        'sequence_length': 10,
        'mini_batch_size': 10,
        'max_epochs': 10,
    }
    self.trainer.train('resources/taggers/language_model', **training_settings)
def test_fine_tunable_flair_embedding():
    """Check document embeddings built on a fine-tunable ``FlairEmbeddings``.

    For ``DocumentRNNEmbeddings`` (hidden size 128) and ``DocumentLMEmbeddings``
    over a 32-hidden-unit forward LM, the sentence embedding must have the
    expected length, match ``embedding_length`` and be empty after clearing.
    """

    def _check_embedding(document_embeddings, expected_length):
        # embed a fresh sentence, verify its size, then verify clearing works
        berlin = Sentence('I love Berlin.')
        document_embeddings.embed(berlin)
        assert len(berlin.get_embedding()) == expected_length
        assert len(berlin.get_embedding()) == document_embeddings.embedding_length
        berlin.clear_embeddings()
        assert len(berlin.get_embedding()) == 0

    language_model_forward = LanguageModel(Dictionary.load('chars'),
                                           is_forward_lm=True,
                                           hidden_size=32,
                                           nlayers=1)

    rnn_embeddings = DocumentRNNEmbeddings(
        [FlairEmbeddings(language_model_forward, fine_tune=True)],
        hidden_size=128,
        bidirectional=False)
    _check_embedding(rnn_embeddings, 128)

    lm_embeddings = DocumentLMEmbeddings(
        [FlairEmbeddings(language_model_forward, fine_tune=True)])
    _check_embedding(lm_embeddings, 32)
def test_train_resume_language_model_training(resources_path, results_base_path, tasks_base_path):
    """Train with checkpointing, drop the trainer and model, then resume from the checkpoint.

    Passes when both training runs complete without raising; the results
    directory is removed afterwards.
    """
    # default character dictionary
    char_dictionary: Dictionary = Dictionary.load("chars")

    # forward LM: 128 hidden states, single layer
    forward_lm: LanguageModel = LanguageModel(
        char_dictionary, is_forward_lm=True, hidden_size=128, nlayers=1)

    # example corpus, read character by character in the model's direction
    lorem_corpus: TextCorpus = TextCorpus(
        resources_path / "corpora/lorem_ipsum",
        char_dictionary,
        forward_lm.is_forward_lm,
        character_level=True,
    )

    # first run, writing a checkpoint
    lm_trainer: LanguageModelTrainer = LanguageModelTrainer(
        forward_lm, lorem_corpus, test_mode=True)
    lm_trainer.train(results_base_path,
                     sequence_length=10,
                     mini_batch_size=10,
                     max_epochs=2,
                     checkpoint=True)

    # release the first trainer and model before restoring from disk
    del lm_trainer
    del forward_lm

    checkpoint_file = results_base_path / "checkpoint.pt"
    lm_trainer = LanguageModelTrainer.load_from_checkpoint(checkpoint_file, lorem_corpus)
    lm_trainer.train(results_base_path,
                     sequence_length=10,
                     mini_batch_size=10,
                     max_epochs=2)

    # clean up results directory
    shutil.rmtree(results_base_path)
    del lm_trainer
def interactive(lang_model_file):
    """Run a small read-eval-print loop that generates text from a language model.

    Each input line is split shell-style and parsed as
    ``[input] [--seqlen N] [--temperature T]``. ``input`` is handed to
    ``generate_text`` as the prefix; ``--seqlen`` and ``--temperature`` update
    the values used for this and all following lines (initially 20 and 1.0).
    Typing ``q`` or ``quit`` (or sending end-of-file) ends the loop.

    :param lang_model_file: model file passed to
        ``LanguageModel.load_language_model``.
    """
    import argparse
    import shlex

    lm = LanguageModel.load_language_model(lang_model_file)

    parser = argparse.ArgumentParser(description='Argparse Test script')
    # nargs='?' makes the positional optional; without it the default is never
    # used and an empty line makes argparse terminate the whole process.
    parser.add_argument("input", nargs='?', help='some parameter', default='This is a test')
    parser.add_argument("--temperature", type=float, help='some parameter', default=None, required=False)
    parser.add_argument("--seqlen", type=int, help='some parameter', default=None, required=False)

    seqlen, temperature = 20, 1.0
    while True:
        try:
            inp = input('type input: > ')
        except EOFError:
            # stdin closed (e.g. Ctrl-D or piped input exhausted)
            break
        if inp in ('q', 'quit'):
            break

        try:
            args, _ = parser.parse_known_args(shlex.split(inp))
        except ValueError as err:
            # shlex.split rejects unbalanced quotes
            print(f'could not parse input: {err}')
            continue
        except SystemExit:
            # argparse already printed its message (bad value or --help);
            # keep the session alive instead of exiting
            continue

        # options are sticky: an omitted option keeps the previous value
        if args.seqlen is not None:
            seqlen = args.seqlen
        if args.temperature is not None:
            temperature = args.temperature

        text, likelihood = lm.generate_text(args.input,
                                            number_of_characters=seqlen,
                                            temperature=temperature)
        print(text)
def process(options):
    """Train a character-level language model as described by ``options``.

    Reads ``options.is_backward_lm`` (direction), ``options.corpus_dir``
    (training corpus) and ``options.model_dir`` (output directory).
    """
    # direction: forward unless a backward model was requested
    forward = not options.is_backward_lm

    # default character dictionary
    char_dictionary: Dictionary = Dictionary.load('chars')

    # corpus, processed at character level in the chosen direction
    text_corpus = TextCorpus(options.corpus_dir,
                             char_dictionary,
                             forward,
                             character_level=True)

    # model hyper-parameters
    model_settings = {
        'hidden_size': 2048,
        'nlayers': 1,
        'embedding_size': 100,  # recommendations?
        'dropout': 0,  # dropout probs?
    }
    lm = LanguageModel(char_dictionary, forward, **model_settings)

    # training hyper-parameters
    # (open question: embeddings_in_memory=False - effect on
    # 'RuntimeError: CUDA out of memory'?)
    training_settings = {
        'sequence_length': 250,
        'learning_rate': 20,
        'mini_batch_size': 100,
        'anneal_factor': 0.25,
        # 'patience' value of the learning rate scheduler: 1/2 training splits
        'patience': 22,
        'clip': 0.25,  # clipping gradients?
        'max_epochs': 75,
    }

    lm_trainer = LanguageModelTrainer(lm, text_corpus)
    lm_trainer.train(options.model_dir, **training_settings)
def __init__(self, model, detach: bool = True, use_cache: bool = True, cache_directory: str = None):
    """
    Initializes contextual string embeddings using a character-level language model.

    :param model: either one of the pretrained model names 'news-forward', 'news-backward',
            'news-forward-fast', 'news-backward-fast', 'mix-forward', 'mix-backward',
            'german-forward', 'german-backward', 'polish-forward', 'polish-backward'
            (matched case-insensitively and fetched via ``cached_path``), or any other value,
            which is handed unchanged to ``LanguageModel.load_language_model``.
    :param detach: if set to False, the gradient will propagate into the language model. this
            dramatically slows down training and often leads to worse results, so not recommended.
    :param use_cache: if set to False, will not write embeddings to file for later retrieval. this
            saves disk space but will not allow re-use of once computed embeddings that do not fit
            into memory
    :param cache_directory: if cache_directory is not set, the cache will be written to
            ~/.flair/embeddings. otherwise the cache is written to the provided directory.
    """
    super().__init__()

    # download locations of the pretrained character language models
    pretrained_urls = {
        # news-english
        'news-forward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-v0.2rc.pt',
        'news-backward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-v0.2rc.pt',
        # news-english, smaller (1024) variants
        'news-forward-fast': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-1024-v0.2rc.pt',
        'news-backward-fast': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-1024-v0.2rc.pt',
        # mix-english
        'mix-forward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-english-forward-v0.2rc.pt',
        'mix-backward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-english-backward-v0.2rc.pt',
        # mix-german
        'german-forward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-german-forward-v0.2rc.pt',
        'german-backward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-german-backward-v0.2rc.pt',
        # common crawl Polish
        'polish-forward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-polish-forward-v0.2.pt',
        'polish-backward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-polish-backward-v0.2.pt',
    }

    # swap a known model name for its locally cached file
    base_path = pretrained_urls.get(model.lower())
    if base_path is not None:
        model = cached_path(base_path, cache_dir='embeddings')

    self.name = model
    self.static_embeddings = detach

    from flair.models import LanguageModel
    self.lm = LanguageModel.load_language_model(model)
    self.detach = detach

    self.is_forward_lm: bool = self.lm.is_forward_lm

    # caching variables
    self.use_cache: bool = use_cache
    self.cache = None
    self.cache_directory: str = cache_directory

    # embed a one-token sentence to find out the embedding length
    probe: Sentence = Sentence()
    probe.add_token(Token('hello'))
    embedded_probe = self.embed(probe)
    self.__embedding_length: int = len(embedded_probe[0].get_token(1).get_embedding())
def __init__(self, model, detach: bool = True):
    """
    Contextual string embeddings of words, as proposed in Akbik et al., 2018.

    Parameters
    ----------
    arg1 : model
        model string, one of 'news-forward', 'news-backward', 'news-forward-fast',
        'news-backward-fast', 'mix-forward', 'mix-backward', 'german-forward',
        'german-backward' depending on which character language model is desired
        (matched case-insensitively). Any other value is handed unchanged to
        ``LanguageModel.load_language_model``.
    arg2 : detach
        if set to false, the gradient will propagate into the language model. this
        dramatically slows down training and often leads to worse results, so not
        recommended.
    """
    # The docstring has to be the first statement of the function to be picked
    # up as __doc__; it previously followed this call and was silently ignored.
    super().__init__()

    # download locations of the pretrained character language models
    pretrained_urls = {
        # news-english
        'news-forward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-v0.2rc.pt',
        'news-backward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-v0.2rc.pt',
        # news-english, smaller (1024) variants
        'news-forward-fast': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-1024-v0.2rc.pt',
        'news-backward-fast': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-1024-v0.2rc.pt',
        # mix-english
        'mix-forward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-english-forward-v0.2rc.pt',
        'mix-backward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-english-backward-v0.2rc.pt',
        # mix-german
        'german-forward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-german-forward-v0.2rc.pt',
        'german-backward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-german-backward-v0.2rc.pt',
    }

    # Single lookup instead of a chain of independent ``if`` blocks: the chain
    # kept calling ``model.lower()`` on the already-resolved cache path.
    base_path = pretrained_urls.get(model.lower())
    if base_path is not None:
        model = cached_path(base_path, cache_dir='embeddings')

    self.name = model
    self.static_embeddings = detach

    from flair.models import LanguageModel
    self.lm = LanguageModel.load_language_model(model)
    self.detach = detach

    self.is_forward_lm: bool = self.lm.is_forward_lm

    # embed a one-token sentence to determine the embedding length
    dummy_sentence: Sentence = Sentence()
    dummy_sentence.add_token(Token('hello'))
    embedded_dummy = self.embed(dummy_sentence)
    self.__embedding_length: int = len(
        embedded_dummy[0].get_token(1).get_embedding())
def _define_model(self) -> LanguageModel:
    """Create a single-layer language model with 1024 hidden units.

    Uses ``self.dictionary`` and ``self.is_forward_lm`` for the dictionary and
    direction.
    """
    model = LanguageModel(
        self.dictionary,
        self.is_forward_lm,
        hidden_size=1024,
        nlayers=1,
    )
    return model
from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

# Direction of the language model.
# NOTE: forward and backward models have to be trained in separate runs.
is_forward_lm = True

# default character dictionary
dictionary: Dictionary = Dictionary.load('chars')

# corpus, processed at the character level in the chosen direction
corpus_dir = Path('/local/kevinshih/BioFlair/data/PMC_Case_Rep/')
corpus = TextCorpus(corpus_dir,
                    dictionary,
                    is_forward_lm,
                    character_level=True)

# language model: hidden size and number of layers
language_model = LanguageModel(dictionary,
                               is_forward_lm,
                               hidden_size=2048,
                               nlayers=1)

# training run
training_settings = {
    'sequence_length': 250,
    'mini_batch_size': 100,
    'max_epochs': 50,
}
trainer = LanguageModelTrainer(language_model, corpus)
trainer.train('resources/taggers/language_model', **training_settings)
## get your corpus, process forward and at the character level
# prepare raw text from Spanish PubMed Abstracts for training
prepare_mesinesp_for_flair_embeds_training()

# the subset name is the first command-line argument
mesinesp_subset = sys.argv[1]
corpus_path = "./data/datasets/mesinesp/" + str(mesinesp_subset) + "/"
corpus = TextCorpus(corpus_path,
                    dictionary,
                    is_forward_lm,
                    character_level=True)

## instantiate your language model, set hidden size and number of layers
## (hidden_size=1024: small model, hidden_size=2048: large model)
language_model = LanguageModel(dictionary,
                               is_forward_lm,
                               hidden_size=1024,
                               nlayers=1,
                               dropout=0.1)

## train your language model
trainer = LanguageModelTrainer(language_model, corpus)
# trainer.num_workers = 4
# Flair auto-detects whether you have a GPU available. If there is a GPU, it
# will automatically run training there.

output_dir = str()

if is_forward_lm:
    # exist_ok avoids the check-then-create race of a separate
    # os.path.exists() test and is a no-op when the directory is present
    os.makedirs('./trained_embeddings/' + str(mesinesp_subset) + '/fwd/',
                exist_ok=True)
parser.add_argument("--n_epochs", type=int, default=100) parser.add_argument("--n_chars", type=int, default=3000, help="number of generated characters") parser.add_argument("--ckpt_dir", type=str, default="checkpoints2") parser.add_argument("--output_dir", type=str, default="outputs2") hp = parser.parse_args() if not os.path.exists(hp.ckpt_dir): os.makedirs(hp.ckpt_dir) if not os.path.exists(hp.output_dir): os.makedirs(hp.output_dir) # device device = 'cuda' if torch.cuda.is_available() else 'cpu' print("# load existing language model") news_forward = FlairEmbeddings('news-forward') model = LanguageModel.load_language_model(news_forward) model.to(device) print("# load input data") item2idx = model.dictionary.item2idx print(item2idx["\n".encode()]) inputs = open('corpus/train/train.txt', 'r').read().splitlines()[-1] inputs = [item2idx.get(char.encode(), 0) for char in inputs] inputs = torch.LongTensor(inputs).unsqueeze(-1) # (seqlen, 1) inputs = inputs.to(device) print("# load corpus") corpus = TextCorpus(Path('corpus/'), model.dictionary, model.is_forward_lm,
# TODO: add possibility for other dictionary!
# (https://github.com/zalandoresearch/flair/issues/179#issuecomment-433942853)
print("loading Dictionary")
dictionary = Dictionary.load('chars')

# build the corpus from the configured folder
log.info(f"Making corpus from folder: {args.corpus_path}")
corpus = TextCorpus(args.corpus_path,
                    dictionary,
                    options['is_forward_lm'],
                    **options['corpus'])

# TRAINING
if not args.continue_training:
    # fresh run: new language model and trainer
    log.info("Creating language model")
    language_model = LanguageModel(dictionary,
                                   options['is_forward_lm'],
                                   **options['language_model'])
    trainer = LanguageModelTrainer(language_model, corpus)
else:
    # resume: restore the trainer from the checkpoint in the training folder
    cp_path = args.train_path + '/checkpoint.pt'
    log.info(f"Continue training from {cp_path}")
    trainer = LanguageModelTrainer.load_from_checkpoint(cp_path, corpus)

log.info(f"Starting training. See {args.train_path}")
trainer.log_interval = 500
trainer.train(args.train_path, **options['training'])
def __init__(self, model, fine_tune: bool = False, chars_per_chunk: int = 512):
    """
    initializes contextual string embeddings using a character-level language model.

    :param model: one of
            - a key of ``PRETRAINED_MODEL_ARCHIVE_MAP`` (case-insensitive), e.g. 'news-forward',
              'news-backward', 'news-forward-fast', 'news-backward-fast', 'mix-forward',
              'mix-backward', 'de-forward', 'pl-backward', ...;
            - a name that ``replace_with_language_code`` maps onto such a key;
            - a path to a saved language model;
            - an already instantiated ``LanguageModel``.
    :param fine_tune: if set to True, the gradient will propagate into the language model. This
            dramatically slows down training and often leads to overfitting, so use with caution.
    :param chars_per_chunk: max number of chars per rnn pass to control speed/memory tradeoff.
            Higher means faster but requires more memory. Lower means slower but less memory.
    :raises ValueError: if ``model`` is a string that is neither a known model name nor an
            existing path.
    """
    super().__init__()

    cache_dir = Path("embeddings")

    aws_path: str = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources"

    self.PRETRAINED_MODEL_ARCHIVE_MAP = {
        # multilingual models
        "multi-forward": f"{aws_path}/embeddings-v0.4.3/lm-jw300-forward-v0.1.pt",
        "multi-backward": f"{aws_path}/embeddings-v0.4.3/lm-jw300-backward-v0.1.pt",
        "multi-v0-forward": f"{aws_path}/embeddings-v0.4/lm-multi-forward-v0.1.pt",
        "multi-v0-backward": f"{aws_path}/embeddings-v0.4/lm-multi-backward-v0.1.pt",
        "multi-v0-forward-fast": f"{aws_path}/embeddings-v0.4/lm-multi-forward-fast-v0.1.pt",
        "multi-v0-backward-fast": f"{aws_path}/embeddings-v0.4/lm-multi-backward-fast-v0.1.pt",
        # English models
        "en-forward": f"{aws_path}/embeddings-v0.4.1/big-news-forward--h2048-l1-d0.05-lr30-0.25-20/news-forward-0.4.1.pt",
        "en-backward": f"{aws_path}/embeddings-v0.4.1/big-news-backward--h2048-l1-d0.05-lr30-0.25-20/news-backward-0.4.1.pt",
        "en-forward-fast": f"{aws_path}/embeddings/lm-news-english-forward-1024-v0.2rc.pt",
        "en-backward-fast": f"{aws_path}/embeddings/lm-news-english-backward-1024-v0.2rc.pt",
        "news-forward": f"{aws_path}/embeddings-v0.4.1/big-news-forward--h2048-l1-d0.05-lr30-0.25-20/news-forward-0.4.1.pt",
        "news-backward": f"{aws_path}/embeddings-v0.4.1/big-news-backward--h2048-l1-d0.05-lr30-0.25-20/news-backward-0.4.1.pt",
        "news-forward-fast": f"{aws_path}/embeddings/lm-news-english-forward-1024-v0.2rc.pt",
        "news-backward-fast": f"{aws_path}/embeddings/lm-news-english-backward-1024-v0.2rc.pt",
        "mix-forward": f"{aws_path}/embeddings/lm-mix-english-forward-v0.2rc.pt",
        "mix-backward": f"{aws_path}/embeddings/lm-mix-english-backward-v0.2rc.pt",
        # Arabic
        "ar-forward": f"{aws_path}/embeddings-stefan-it/lm-ar-opus-large-forward-v0.1.pt",
        "ar-backward": f"{aws_path}/embeddings-stefan-it/lm-ar-opus-large-backward-v0.1.pt",
        # Bulgarian
        "bg-forward-fast": f"{aws_path}/embeddings-v0.3/lm-bg-small-forward-v0.1.pt",
        "bg-backward-fast": f"{aws_path}/embeddings-v0.3/lm-bg-small-backward-v0.1.pt",
        "bg-forward": f"{aws_path}/embeddings-stefan-it/lm-bg-opus-large-forward-v0.1.pt",
        "bg-backward": f"{aws_path}/embeddings-stefan-it/lm-bg-opus-large-backward-v0.1.pt",
        # Czech
        "cs-forward": f"{aws_path}/embeddings-stefan-it/lm-cs-opus-large-forward-v0.1.pt",
        "cs-backward": f"{aws_path}/embeddings-stefan-it/lm-cs-opus-large-backward-v0.1.pt",
        "cs-v0-forward": f"{aws_path}/embeddings-v0.4/lm-cs-large-forward-v0.1.pt",
        "cs-v0-backward": f"{aws_path}/embeddings-v0.4/lm-cs-large-backward-v0.1.pt",
        # Danish
        "da-forward": f"{aws_path}/embeddings-stefan-it/lm-da-opus-large-forward-v0.1.pt",
        "da-backward": f"{aws_path}/embeddings-stefan-it/lm-da-opus-large-backward-v0.1.pt",
        # German
        "de-forward": f"{aws_path}/embeddings/lm-mix-german-forward-v0.2rc.pt",
        "de-backward": f"{aws_path}/embeddings/lm-mix-german-backward-v0.2rc.pt",
        "de-historic-ha-forward": f"{aws_path}/embeddings-stefan-it/lm-historic-hamburger-anzeiger-forward-v0.1.pt",
        "de-historic-ha-backward": f"{aws_path}/embeddings-stefan-it/lm-historic-hamburger-anzeiger-backward-v0.1.pt",
        "de-historic-wz-forward": f"{aws_path}/embeddings-stefan-it/lm-historic-wiener-zeitung-forward-v0.1.pt",
        "de-historic-wz-backward": f"{aws_path}/embeddings-stefan-it/lm-historic-wiener-zeitung-backward-v0.1.pt",
        # Spanish
        "es-forward": f"{aws_path}/embeddings-v0.4/language_model_es_forward_long/lm-es-forward.pt",
        "es-backward": f"{aws_path}/embeddings-v0.4/language_model_es_backward_long/lm-es-backward.pt",
        "es-forward-fast": f"{aws_path}/embeddings-v0.4/language_model_es_forward/lm-es-forward-fast.pt",
        "es-backward-fast": f"{aws_path}/embeddings-v0.4/language_model_es_backward/lm-es-backward-fast.pt",
        # Basque
        "eu-forward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-forward-v0.2.pt",
        "eu-backward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-backward-v0.2.pt",
        "eu-v1-forward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-forward-v0.1.pt",
        "eu-v1-backward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-backward-v0.1.pt",
        "eu-v0-forward": f"{aws_path}/embeddings-v0.4/lm-eu-large-forward-v0.1.pt",
        "eu-v0-backward": f"{aws_path}/embeddings-v0.4/lm-eu-large-backward-v0.1.pt",
        # Persian
        "fa-forward": f"{aws_path}/embeddings-stefan-it/lm-fa-opus-large-forward-v0.1.pt",
        "fa-backward": f"{aws_path}/embeddings-stefan-it/lm-fa-opus-large-backward-v0.1.pt",
        # Finnish
        "fi-forward": f"{aws_path}/embeddings-stefan-it/lm-fi-opus-large-forward-v0.1.pt",
        "fi-backward": f"{aws_path}/embeddings-stefan-it/lm-fi-opus-large-backward-v0.1.pt",
        # French
        "fr-forward": f"{aws_path}/embeddings/lm-fr-charlm-forward.pt",
        "fr-backward": f"{aws_path}/embeddings/lm-fr-charlm-backward.pt",
        # Hebrew
        "he-forward": f"{aws_path}/embeddings-stefan-it/lm-he-opus-large-forward-v0.1.pt",
        "he-backward": f"{aws_path}/embeddings-stefan-it/lm-he-opus-large-backward-v0.1.pt",
        # Hindi
        "hi-forward": f"{aws_path}/embeddings-stefan-it/lm-hi-opus-large-forward-v0.1.pt",
        "hi-backward": f"{aws_path}/embeddings-stefan-it/lm-hi-opus-large-backward-v0.1.pt",
        # Croatian
        "hr-forward": f"{aws_path}/embeddings-stefan-it/lm-hr-opus-large-forward-v0.1.pt",
        "hr-backward": f"{aws_path}/embeddings-stefan-it/lm-hr-opus-large-backward-v0.1.pt",
        # Indonesian
        "id-forward": f"{aws_path}/embeddings-stefan-it/lm-id-opus-large-forward-v0.1.pt",
        "id-backward": f"{aws_path}/embeddings-stefan-it/lm-id-opus-large-backward-v0.1.pt",
        # Italian
        "it-forward": f"{aws_path}/embeddings-stefan-it/lm-it-opus-large-forward-v0.1.pt",
        "it-backward": f"{aws_path}/embeddings-stefan-it/lm-it-opus-large-backward-v0.1.pt",
        # Japanese
        "ja-forward": f"{aws_path}/embeddings-v0.4.1/lm__char-forward__ja-wikipedia-3GB/japanese-forward.pt",
        "ja-backward": f"{aws_path}/embeddings-v0.4.1/lm__char-backward__ja-wikipedia-3GB/japanese-backward.pt",
        # Dutch
        "nl-forward": f"{aws_path}/embeddings-stefan-it/lm-nl-opus-large-forward-v0.1.pt",
        "nl-backward": f"{aws_path}/embeddings-stefan-it/lm-nl-opus-large-backward-v0.1.pt",
        "nl-v0-forward": f"{aws_path}/embeddings-v0.4/lm-nl-large-forward-v0.1.pt",
        "nl-v0-backward": f"{aws_path}/embeddings-v0.4/lm-nl-large-backward-v0.1.pt",
        # Norwegian
        "no-forward": f"{aws_path}/embeddings-stefan-it/lm-no-opus-large-forward-v0.1.pt",
        "no-backward": f"{aws_path}/embeddings-stefan-it/lm-no-opus-large-backward-v0.1.pt",
        # Polish
        "pl-forward": f"{aws_path}/embeddings/lm-polish-forward-v0.2.pt",
        "pl-backward": f"{aws_path}/embeddings/lm-polish-backward-v0.2.pt",
        "pl-opus-forward": f"{aws_path}/embeddings-stefan-it/lm-pl-opus-large-forward-v0.1.pt",
        "pl-opus-backward": f"{aws_path}/embeddings-stefan-it/lm-pl-opus-large-backward-v0.1.pt",
        # Portuguese
        "pt-forward": f"{aws_path}/embeddings-v0.4/lm-pt-forward.pt",
        "pt-backward": f"{aws_path}/embeddings-v0.4/lm-pt-backward.pt",
        # Pubmed
        "pubmed-forward": f"{aws_path}/embeddings-v0.4.1/pubmed-2015-fw-lm.pt",
        "pubmed-backward": f"{aws_path}/embeddings-v0.4.1/pubmed-2015-bw-lm.pt",
        # Slovenian
        "sl-forward": f"{aws_path}/embeddings-stefan-it/lm-sl-opus-large-forward-v0.1.pt",
        "sl-backward": f"{aws_path}/embeddings-stefan-it/lm-sl-opus-large-backward-v0.1.pt",
        "sl-v0-forward": f"{aws_path}/embeddings-v0.3/lm-sl-large-forward-v0.1.pt",
        "sl-v0-backward": f"{aws_path}/embeddings-v0.3/lm-sl-large-backward-v0.1.pt",
        # Swedish
        "sv-forward": f"{aws_path}/embeddings-stefan-it/lm-sv-opus-large-forward-v0.1.pt",
        "sv-backward": f"{aws_path}/embeddings-stefan-it/lm-sv-opus-large-backward-v0.1.pt",
        "sv-v0-forward": f"{aws_path}/embeddings-v0.4/lm-sv-large-forward-v0.1.pt",
        "sv-v0-backward": f"{aws_path}/embeddings-v0.4/lm-sv-large-backward-v0.1.pt",
        # Tamil
        "ta-forward": f"{aws_path}/embeddings-stefan-it/lm-ta-opus-large-forward-v0.1.pt",
        "ta-backward": f"{aws_path}/embeddings-stefan-it/lm-ta-opus-large-backward-v0.1.pt",
    }

    if isinstance(model, str):
        # Resolve the archive key: the lower-cased name first, then the name
        # with its language replaced by a language code (computed once).
        archive_key = model.lower()
        if archive_key not in self.PRETRAINED_MODEL_ARCHIVE_MAP:
            archive_key = replace_with_language_code(model)

        # load model if in pretrained model map
        if archive_key in self.PRETRAINED_MODEL_ARCHIVE_MAP:
            base_path = self.PRETRAINED_MODEL_ARCHIVE_MAP[archive_key]
            model = cached_path(base_path, cache_dir=cache_dir)
        elif not Path(model).exists():
            raise ValueError(
                f'The given model "{model}" is not available or is not a valid path.'
            )

    from flair.models import LanguageModel

    # isinstance also accepts subclasses of LanguageModel as ready-made models
    if isinstance(model, LanguageModel):
        self.lm: LanguageModel = model
        self.name = f"Task-LSTM-{self.lm.hidden_size}-{self.lm.nlayers}-{self.lm.is_forward_lm}"
    else:
        self.lm: LanguageModel = LanguageModel.load_language_model(model)
        self.name = str(model)

    # embeddings are static if we don't do finetuning
    self.fine_tune = fine_tune
    self.static_embeddings = not fine_tune

    self.is_forward_lm: bool = self.lm.is_forward_lm
    self.chars_per_chunk: int = chars_per_chunk

    # embed a dummy sentence to determine embedding_length
    dummy_sentence: Sentence = Sentence()
    dummy_sentence.add_token(Token("hello"))
    embedded_dummy = self.embed(dummy_sentence)
    self.__embedding_length: int = len(
        embedded_dummy[0].get_token(1).get_embedding())

    # set to eval mode
    self.eval()