def interactive(lang_model_file):
    """Run a read-eval-print loop that samples text from a trained language model.

    Each typed line is tokenized shell-style and parsed for an input prefix plus
    optional ``--temperature`` / ``--seqlen`` overrides; overrides are "sticky"
    and persist across iterations until set again.  Type ``q`` or ``quit`` to exit.

    :param lang_model_file: path to a serialized LanguageModel checkpoint
    """
    lm = LanguageModel.load_language_model(lang_model_file)

    import argparse
    parser = argparse.ArgumentParser(description='Argparse Test script')
    parser.add_argument("input", help='some parameter', default='This is a test')
    parser.add_argument("--temperature", help='some parameter', default=None, required=False)
    parser.add_argument("--seqlen", help='some parameter', default=None, required=False)

    # generation defaults, kept between iterations
    seqlen, temperature = 20, 1.0

    while True:  # idiomatic infinite loop (was: while (1))
        inp = input('type input: > ')
        if inp in ('q', 'quit'):
            break

        # parse_known_args tolerates stray tokens in the typed line
        args, _ = parser.parse_known_args(shlex.split(inp))
        seqlen = seqlen if args.seqlen is None else int(args.seqlen)
        temperature = temperature if args.temperature is None else float(args.temperature)
        inp = args.input

        text, likelihood = lm.generate_text(inp,
                                            number_of_characters=seqlen,
                                            temperature=temperature)
        print(text)
def test_train_language_model(results_base_path, resources_path):
    """End-to-end smoke test: train a small forward character LM, embed a
    sentence with it, generate text, reload the checkpoint, and clean up."""
    # character dictionary shipped with the library
    char_dict: Dictionary = Dictionary.load('chars')

    # forward character LM: 128 hidden units, a single layer
    lm: LanguageModel = LanguageModel(char_dict,
                                      is_forward_lm=True,
                                      hidden_size=128,
                                      nlayers=1)

    # character-level corpus over the bundled lorem-ipsum example
    corpus: TextCorpus = TextCorpus(str(resources_path / 'corpora/lorem_ipsum'),
                                    char_dict,
                                    lm.is_forward_lm,
                                    character_level=True)

    # short training run in test mode
    lm_trainer: LanguageModelTrainer = LanguageModelTrainer(lm, corpus, test_mode=True)
    lm_trainer.train(str(results_base_path),
                     sequence_length=10,
                     mini_batch_size=10,
                     max_epochs=2)

    # embed an example sentence with the freshly trained character LM
    embeddings = CharLMEmbeddings(str(results_base_path / 'best-lm.pt'))
    example: Sentence = Sentence('I love Berlin')
    embeddings.embed(example)

    # the model must generate exactly the requested number of characters
    generated = lm.generate_text(100)
    assert (generated is not None)
    assert (len(generated) == 100)

    # the saved checkpoint must reload and carry a sane best score
    reloaded = LanguageModel.load_language_model(str(results_base_path / 'best-lm.pt'))
    assert (reloaded.best_score < 100)

    # remove training artifacts
    shutil.rmtree(results_base_path, ignore_errors=True)
def __init__(self, model, detach: bool = True, use_cache: bool = True, cache_directory: str = None):
    """
    initializes contextual string embeddings using a character-level language model.
    :param model: model string, one of 'news-forward', 'news-backward', 'mix-forward', 'mix-backward', 'german-forward',
            'german-backward' depending on which character language model is desired
    :param detach: if set to False, the gradient will propagate into the language model. this dramatically slows down
            training and often leads to worse results, so not recommended.
    :param use_cache: if set to False, will not write embeddings to file for later retrieval. this saves disk space but will
            not allow re-use of once computed embeddings that do not fit into memory
    :param cache_directory: if cache_directory is not set, the cache will be written to ~/.flair/embeddings. otherwise the
            cache is written to the provided directory.
    """
    super().__init__()

    # Map of supported pretrained-model names to their download URLs.
    # Consolidates the original repeated `if model.lower() == ...` chains into
    # a single lookup; behavior and URLs are unchanged.
    pretrained_model_urls = {
        'news-forward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-v0.2rc.pt',
        'news-backward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-v0.2rc.pt',
        'news-forward-fast': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-1024-v0.2rc.pt',
        'news-backward-fast': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-1024-v0.2rc.pt',
        'mix-forward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-english-forward-v0.2rc.pt',
        'mix-backward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-english-backward-v0.2rc.pt',
        'german-forward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-german-forward-v0.2rc.pt',
        'german-backward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-german-backward-v0.2rc.pt',
        'polish-forward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-polish-forward-v0.2.pt',
        'polish-backward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-polish-backward-v0.2.pt',
    }

    # resolve a known model name to a locally cached checkpoint path
    if model.lower() in pretrained_model_urls:
        model = cached_path(pretrained_model_urls[model.lower()], cache_dir='embeddings')

    self.name = model
    self.static_embeddings = detach

    from flair.models import LanguageModel
    self.lm = LanguageModel.load_language_model(model)

    self.detach = detach
    self.is_forward_lm: bool = self.lm.is_forward_lm

    # caching variables
    self.use_cache: bool = use_cache
    self.cache = None
    self.cache_directory: str = cache_directory

    # embed a dummy sentence once to determine the embedding length
    dummy_sentence: Sentence = Sentence()
    dummy_sentence.add_token(Token('hello'))
    embedded_dummy = self.embed(dummy_sentence)
    self.__embedding_length: int = len(embedded_dummy[0].get_token(1).get_embedding())
def __init__(self, model, detach: bool = True):
    """Contextual string embeddings of words, as proposed in Akbik et al., 2018.

    Parameters
    ----------
    arg1 : model
        model string, one of 'news-forward', 'news-backward', 'mix-forward', 'mix-backward',
        'german-forward', 'german-backward' depending on which character language model is desired
    arg2 : detach
        if set to false, the gradient will propagate into the language model. this dramatically
        slows down training and often leads to worse results, so not recommended.
    """
    # FIX: the docstring originally appeared *after* super().__init__(), where it
    # was a dead string statement rather than the function docstring.
    super().__init__()

    # Map of supported pretrained-model names to their download URLs.
    # Consolidates the original repeated `if model.lower() == ...` chains into
    # a single lookup; behavior and URLs are unchanged.
    pretrained_model_urls = {
        'news-forward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-v0.2rc.pt',
        'news-backward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-v0.2rc.pt',
        'news-forward-fast': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-1024-v0.2rc.pt',
        'news-backward-fast': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-1024-v0.2rc.pt',
        'mix-forward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-english-forward-v0.2rc.pt',
        'mix-backward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-english-backward-v0.2rc.pt',
        'german-forward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-german-forward-v0.2rc.pt',
        'german-backward': 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-german-backward-v0.2rc.pt',
    }

    # resolve a known model name to a locally cached checkpoint path
    if model.lower() in pretrained_model_urls:
        model = cached_path(pretrained_model_urls[model.lower()], cache_dir='embeddings')

    self.name = model
    self.static_embeddings = detach

    from flair.models import LanguageModel
    self.lm = LanguageModel.load_language_model(model)
    self.detach = detach

    self.is_forward_lm: bool = self.lm.is_forward_lm

    # embed a dummy sentence once to determine the embedding length
    dummy_sentence: Sentence = Sentence()
    dummy_sentence.add_token(Token('hello'))
    embedded_dummy = self.embed(dummy_sentence)
    self.__embedding_length: int = len(embedded_dummy[0].get_token(1).get_embedding())
# --- script section: CLI flags, output dirs, device, model and data loading ---
# NOTE(review): `parser` is created before this chunk, and the final TextCorpus
# call continues past it — this fragment is part of a larger script.
parser.add_argument("--n_epochs", type=int, default=100)
parser.add_argument("--n_chars", type=int, default=3000,
                    help="number of generated characters")
parser.add_argument("--ckpt_dir", type=str, default="checkpoints2")
parser.add_argument("--output_dir", type=str, default="outputs2")
hp = parser.parse_args()

# make sure checkpoint/output directories exist
if not os.path.exists(hp.ckpt_dir):
    os.makedirs(hp.ckpt_dir)
if not os.path.exists(hp.output_dir):
    os.makedirs(hp.output_dir)

# device: prefer GPU when available
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print("# load existing language model")
news_forward = FlairEmbeddings('news-forward')
# NOTE(review): load_language_model is normally given a checkpoint path; here it
# receives a FlairEmbeddings object — confirm (perhaps news_forward.lm was intended).
model = LanguageModel.load_language_model(news_forward)
model.to(device)

print("# load input data")
item2idx = model.dictionary.item2idx  # mapping: encoded character -> index
print(item2idx["\n".encode()])
# seed input: last line of the training corpus, encoded character-by-character
# (characters missing from the dictionary fall back to index 0)
inputs = open('corpus/train/train.txt', 'r').read().splitlines()[-1]
inputs = [item2idx.get(char.encode(), 0) for char in inputs]
inputs = torch.LongTensor(inputs).unsqueeze(-1)  # (seqlen, 1)
inputs = inputs.to(device)

print("# load corpus")
# statement continues beyond this chunk of the file
corpus = TextCorpus(Path('corpus/'), model.dictionary, model.is_forward_lm,
def __init__(self, model, fine_tune: bool = False, chars_per_chunk: int = 512):
    """
    initializes contextual string embeddings using a character-level language model.
    :param model: model string, one of 'news-forward', 'news-backward', 'news-forward-fast', 'news-backward-fast',
            'mix-forward', 'mix-backward', 'german-forward', 'german-backward', 'polish-backward', 'polish-forward'
            depending on which character language model is desired.
    :param fine_tune: if set to True, the gradient will propagate into the language model. This dramatically slows down
            training and often leads to overfitting, so use with caution.
    :param chars_per_chunk: max number of chars per rnn pass to control speed/memory tradeoff. Higher means faster but
            requires more memory. Lower means slower but less memory.
    """
    super().__init__()

    cache_dir = Path("embeddings")

    aws_path: str = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources"

    self.PRETRAINED_MODEL_ARCHIVE_MAP = {
        # multilingual models
        "multi-forward": f"{aws_path}/embeddings-v0.4.3/lm-jw300-forward-v0.1.pt",
        "multi-backward": f"{aws_path}/embeddings-v0.4.3/lm-jw300-backward-v0.1.pt",
        "multi-v0-forward": f"{aws_path}/embeddings-v0.4/lm-multi-forward-v0.1.pt",
        "multi-v0-backward": f"{aws_path}/embeddings-v0.4/lm-multi-backward-v0.1.pt",
        "multi-v0-forward-fast": f"{aws_path}/embeddings-v0.4/lm-multi-forward-fast-v0.1.pt",
        "multi-v0-backward-fast": f"{aws_path}/embeddings-v0.4/lm-multi-backward-fast-v0.1.pt",
        # English models
        "en-forward": f"{aws_path}/embeddings-v0.4.1/big-news-forward--h2048-l1-d0.05-lr30-0.25-20/news-forward-0.4.1.pt",
        "en-backward": f"{aws_path}/embeddings-v0.4.1/big-news-backward--h2048-l1-d0.05-lr30-0.25-20/news-backward-0.4.1.pt",
        "en-forward-fast": f"{aws_path}/embeddings/lm-news-english-forward-1024-v0.2rc.pt",
        "en-backward-fast": f"{aws_path}/embeddings/lm-news-english-backward-1024-v0.2rc.pt",
        "news-forward": f"{aws_path}/embeddings-v0.4.1/big-news-forward--h2048-l1-d0.05-lr30-0.25-20/news-forward-0.4.1.pt",
        "news-backward": f"{aws_path}/embeddings-v0.4.1/big-news-backward--h2048-l1-d0.05-lr30-0.25-20/news-backward-0.4.1.pt",
        "news-forward-fast": f"{aws_path}/embeddings/lm-news-english-forward-1024-v0.2rc.pt",
        "news-backward-fast": f"{aws_path}/embeddings/lm-news-english-backward-1024-v0.2rc.pt",
        "mix-forward": f"{aws_path}/embeddings/lm-mix-english-forward-v0.2rc.pt",
        "mix-backward": f"{aws_path}/embeddings/lm-mix-english-backward-v0.2rc.pt",
        # Arabic
        "ar-forward": f"{aws_path}/embeddings-stefan-it/lm-ar-opus-large-forward-v0.1.pt",
        "ar-backward": f"{aws_path}/embeddings-stefan-it/lm-ar-opus-large-backward-v0.1.pt",
        # Bulgarian
        "bg-forward-fast": f"{aws_path}/embeddings-v0.3/lm-bg-small-forward-v0.1.pt",
        "bg-backward-fast": f"{aws_path}/embeddings-v0.3/lm-bg-small-backward-v0.1.pt",
        "bg-forward": f"{aws_path}/embeddings-stefan-it/lm-bg-opus-large-forward-v0.1.pt",
        "bg-backward": f"{aws_path}/embeddings-stefan-it/lm-bg-opus-large-backward-v0.1.pt",
        # Czech
        "cs-forward": f"{aws_path}/embeddings-stefan-it/lm-cs-opus-large-forward-v0.1.pt",
        "cs-backward": f"{aws_path}/embeddings-stefan-it/lm-cs-opus-large-backward-v0.1.pt",
        "cs-v0-forward": f"{aws_path}/embeddings-v0.4/lm-cs-large-forward-v0.1.pt",
        "cs-v0-backward": f"{aws_path}/embeddings-v0.4/lm-cs-large-backward-v0.1.pt",
        # Danish
        "da-forward": f"{aws_path}/embeddings-stefan-it/lm-da-opus-large-forward-v0.1.pt",
        "da-backward": f"{aws_path}/embeddings-stefan-it/lm-da-opus-large-backward-v0.1.pt",
        # German
        "de-forward": f"{aws_path}/embeddings/lm-mix-german-forward-v0.2rc.pt",
        "de-backward": f"{aws_path}/embeddings/lm-mix-german-backward-v0.2rc.pt",
        "de-historic-ha-forward": f"{aws_path}/embeddings-stefan-it/lm-historic-hamburger-anzeiger-forward-v0.1.pt",
        "de-historic-ha-backward": f"{aws_path}/embeddings-stefan-it/lm-historic-hamburger-anzeiger-backward-v0.1.pt",
        "de-historic-wz-forward": f"{aws_path}/embeddings-stefan-it/lm-historic-wiener-zeitung-forward-v0.1.pt",
        "de-historic-wz-backward": f"{aws_path}/embeddings-stefan-it/lm-historic-wiener-zeitung-backward-v0.1.pt",
        # Spanish
        "es-forward": f"{aws_path}/embeddings-v0.4/language_model_es_forward_long/lm-es-forward.pt",
        "es-backward": f"{aws_path}/embeddings-v0.4/language_model_es_backward_long/lm-es-backward.pt",
        "es-forward-fast": f"{aws_path}/embeddings-v0.4/language_model_es_forward/lm-es-forward-fast.pt",
        "es-backward-fast": f"{aws_path}/embeddings-v0.4/language_model_es_backward/lm-es-backward-fast.pt",
        # Basque
        "eu-forward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-forward-v0.2.pt",
        "eu-backward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-backward-v0.2.pt",
        "eu-v1-forward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-forward-v0.1.pt",
        "eu-v1-backward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-backward-v0.1.pt",
        "eu-v0-forward": f"{aws_path}/embeddings-v0.4/lm-eu-large-forward-v0.1.pt",
        "eu-v0-backward": f"{aws_path}/embeddings-v0.4/lm-eu-large-backward-v0.1.pt",
        # Persian
        "fa-forward": f"{aws_path}/embeddings-stefan-it/lm-fa-opus-large-forward-v0.1.pt",
        "fa-backward": f"{aws_path}/embeddings-stefan-it/lm-fa-opus-large-backward-v0.1.pt",
        # Finnish
        "fi-forward": f"{aws_path}/embeddings-stefan-it/lm-fi-opus-large-forward-v0.1.pt",
        "fi-backward": f"{aws_path}/embeddings-stefan-it/lm-fi-opus-large-backward-v0.1.pt",
        # French
        "fr-forward": f"{aws_path}/embeddings/lm-fr-charlm-forward.pt",
        "fr-backward": f"{aws_path}/embeddings/lm-fr-charlm-backward.pt",
        # Hebrew
        "he-forward": f"{aws_path}/embeddings-stefan-it/lm-he-opus-large-forward-v0.1.pt",
        "he-backward": f"{aws_path}/embeddings-stefan-it/lm-he-opus-large-backward-v0.1.pt",
        # Hindi
        "hi-forward": f"{aws_path}/embeddings-stefan-it/lm-hi-opus-large-forward-v0.1.pt",
        "hi-backward": f"{aws_path}/embeddings-stefan-it/lm-hi-opus-large-backward-v0.1.pt",
        # Croatian
        "hr-forward": f"{aws_path}/embeddings-stefan-it/lm-hr-opus-large-forward-v0.1.pt",
        "hr-backward": f"{aws_path}/embeddings-stefan-it/lm-hr-opus-large-backward-v0.1.pt",
        # Indonesian
        "id-forward": f"{aws_path}/embeddings-stefan-it/lm-id-opus-large-forward-v0.1.pt",
        "id-backward": f"{aws_path}/embeddings-stefan-it/lm-id-opus-large-backward-v0.1.pt",
        # Italian
        "it-forward": f"{aws_path}/embeddings-stefan-it/lm-it-opus-large-forward-v0.1.pt",
        "it-backward": f"{aws_path}/embeddings-stefan-it/lm-it-opus-large-backward-v0.1.pt",
        # Japanese
        "ja-forward": f"{aws_path}/embeddings-v0.4.1/lm__char-forward__ja-wikipedia-3GB/japanese-forward.pt",
        "ja-backward": f"{aws_path}/embeddings-v0.4.1/lm__char-backward__ja-wikipedia-3GB/japanese-backward.pt",
        # Dutch
        "nl-forward": f"{aws_path}/embeddings-stefan-it/lm-nl-opus-large-forward-v0.1.pt",
        "nl-backward": f"{aws_path}/embeddings-stefan-it/lm-nl-opus-large-backward-v0.1.pt",
        "nl-v0-forward": f"{aws_path}/embeddings-v0.4/lm-nl-large-forward-v0.1.pt",
        "nl-v0-backward": f"{aws_path}/embeddings-v0.4/lm-nl-large-backward-v0.1.pt",
        # Norwegian
        "no-forward": f"{aws_path}/embeddings-stefan-it/lm-no-opus-large-forward-v0.1.pt",
        "no-backward": f"{aws_path}/embeddings-stefan-it/lm-no-opus-large-backward-v0.1.pt",
        # Polish
        "pl-forward": f"{aws_path}/embeddings/lm-polish-forward-v0.2.pt",
        "pl-backward": f"{aws_path}/embeddings/lm-polish-backward-v0.2.pt",
        "pl-opus-forward": f"{aws_path}/embeddings-stefan-it/lm-pl-opus-large-forward-v0.1.pt",
        "pl-opus-backward": f"{aws_path}/embeddings-stefan-it/lm-pl-opus-large-backward-v0.1.pt",
        # Portuguese
        "pt-forward": f"{aws_path}/embeddings-v0.4/lm-pt-forward.pt",
        "pt-backward": f"{aws_path}/embeddings-v0.4/lm-pt-backward.pt",
        # Pubmed
        "pubmed-forward": f"{aws_path}/embeddings-v0.4.1/pubmed-2015-fw-lm.pt",
        "pubmed-backward": f"{aws_path}/embeddings-v0.4.1/pubmed-2015-bw-lm.pt",
        # Slovenian
        "sl-forward": f"{aws_path}/embeddings-stefan-it/lm-sl-opus-large-forward-v0.1.pt",
        "sl-backward": f"{aws_path}/embeddings-stefan-it/lm-sl-opus-large-backward-v0.1.pt",
        "sl-v0-forward": f"{aws_path}/embeddings-v0.3/lm-sl-large-forward-v0.1.pt",
        "sl-v0-backward": f"{aws_path}/embeddings-v0.3/lm-sl-large-backward-v0.1.pt",
        # Swedish
        "sv-forward": f"{aws_path}/embeddings-stefan-it/lm-sv-opus-large-forward-v0.1.pt",
        "sv-backward": f"{aws_path}/embeddings-stefan-it/lm-sv-opus-large-backward-v0.1.pt",
        "sv-v0-forward": f"{aws_path}/embeddings-v0.4/lm-sv-large-forward-v0.1.pt",
        "sv-v0-backward": f"{aws_path}/embeddings-v0.4/lm-sv-large-backward-v0.1.pt",
        # Tamil
        "ta-forward": f"{aws_path}/embeddings-stefan-it/lm-ta-opus-large-forward-v0.1.pt",
        "ta-backward": f"{aws_path}/embeddings-stefan-it/lm-ta-opus-large-backward-v0.1.pt",
    }

    # FIX: use isinstance() instead of `type(model) == str` — correct for
    # subclasses and the idiomatic type check.
    if isinstance(model, str):
        # load model if in pretrained model map
        if model.lower() in self.PRETRAINED_MODEL_ARCHIVE_MAP:
            base_path = self.PRETRAINED_MODEL_ARCHIVE_MAP[model.lower()]
            model = cached_path(base_path, cache_dir=cache_dir)
        else:
            # compute once instead of the original duplicated call
            language_code_key = replace_with_language_code(model)
            if language_code_key in self.PRETRAINED_MODEL_ARCHIVE_MAP:
                base_path = self.PRETRAINED_MODEL_ARCHIVE_MAP[language_code_key]
                model = cached_path(base_path, cache_dir=cache_dir)
            elif not Path(model).exists():
                raise ValueError(
                    f'The given model "{model}" is not available or is not a valid path.'
                )

    from flair.models import LanguageModel

    if isinstance(model, LanguageModel):
        # an already-instantiated language model was passed in directly
        self.lm: LanguageModel = model
        self.name = f"Task-LSTM-{self.lm.hidden_size}-{self.lm.nlayers}-{self.lm.is_forward_lm}"
    else:
        # otherwise `model` is a path to a serialized checkpoint
        self.lm: LanguageModel = LanguageModel.load_language_model(model)
        self.name = str(model)

    # embeddings are static if we don't do finetuning
    self.fine_tune = fine_tune
    self.static_embeddings = not fine_tune

    self.is_forward_lm: bool = self.lm.is_forward_lm
    self.chars_per_chunk: int = chars_per_chunk

    # embed a dummy sentence to determine embedding_length
    dummy_sentence: Sentence = Sentence()
    dummy_sentence.add_token(Token("hello"))
    embedded_dummy = self.embed(dummy_sentence)
    self.__embedding_length: int = len(
        embedded_dummy[0].get_token(1).get_embedding())

    # set to eval mode
    self.eval()