def test_training():
    """Train a small forward character LM and use it to embed a sentence.

    The model is trained on the lorem ipsum example corpus, written to
    ``./results``, loaded back through ``CharLMEmbeddings`` and applied to
    'I love Berlin'. The results directory is removed at the end.
    """
    # default character dictionary
    char_dict: Dictionary = Dictionary.load('chars')

    # forward LM: 128 hidden states, a single layer
    forward_lm: LanguageModel = LanguageModel(
        char_dict,
        is_forward_lm=True,
        hidden_size=128,
        nlayers=1,
    )

    # example corpus, read character by character in the LM's direction
    lorem_corpus: TextCorpus = TextCorpus(
        'resources/corpora/lorem_ipsum',
        char_dict,
        forward_lm.is_forward_lm,
        character_level=True,
    )

    # run the training
    lm_trainer: LanguageModelTrainer = LanguageModelTrainer(forward_lm, lorem_corpus)
    lm_trainer.train(
        './results',
        sequence_length=10,
        mini_batch_size=10,
        max_epochs=5,
    )

    # embed the example sentence with the freshly trained model
    lm_embeddings = CharLMEmbeddings('./results/best-lm.pt')
    example = Sentence('I love Berlin')
    lm_embeddings.embed(example)

    second_token = example[1]
    print(second_token.embedding.size())

    # remove the results directory again
    shutil.rmtree('./results', ignore_errors=True)
def test_train_resume_language_model_training(resources_path, results_base_path, tasks_base_path):
    """Train with checkpointing enabled, then resume training from the checkpoint.

    Uses the lorem ipsum corpus below ``resources_path`` and writes to
    ``results_base_path``, which is deleted at the end. ``tasks_base_path``
    is accepted but not used.
    """
    char_dict = Dictionary.load(u'chars')
    forward_lm = LanguageModel(
        char_dict,
        is_forward_lm=True,
        hidden_size=128,
        nlayers=1,
    )
    lorem_corpus = TextCorpus(
        (resources_path / u'corpora/lorem_ipsum'),
        char_dict,
        forward_lm.is_forward_lm,
        character_level=True,
    )

    # first run: writes a checkpoint into the results directory
    lm_trainer = LanguageModelTrainer(forward_lm, lorem_corpus, test_mode=True)
    lm_trainer.train(
        results_base_path,
        sequence_length=10,
        mini_batch_size=10,
        max_epochs=2,
        checkpoint=True,
    )

    # second run: continue from the checkpoint that was just written
    checkpoint_file = (results_base_path / u'checkpoint.pt')
    lm_trainer = LanguageModelTrainer.load_from_checkpoint(checkpoint_file, lorem_corpus)
    lm_trainer.train(
        results_base_path,
        sequence_length=10,
        mini_batch_size=10,
        max_epochs=2,
    )

    shutil.rmtree(results_base_path)
def test_train_language_model(results_base_path, resources_path):
    """Train a small forward character LM, embed a sentence and generate text.

    The trained model is written to ``results_base_path``; its ``best-lm.pt``
    file is loaded as ``FlairEmbeddings``. The in-memory model must generate
    at least 100 characters of text. The results directory is removed at the
    end.
    """
    # default character dictionary
    char_dict: Dictionary = Dictionary.load('chars')

    # forward LM: 128 hidden states, a single layer
    forward_lm: LanguageModel = LanguageModel(
        char_dict,
        is_forward_lm=True,
        hidden_size=128,
        nlayers=1,
    )

    # example corpus, read character by character in the LM's direction
    lorem_corpus: TextCorpus = TextCorpus(
        resources_path / 'corpora/lorem_ipsum',
        char_dict,
        forward_lm.is_forward_lm,
        character_level=True,
    )

    # run the training
    lm_trainer: LanguageModelTrainer = LanguageModelTrainer(
        forward_lm, lorem_corpus, test_mode=True
    )
    lm_trainer.train(
        results_base_path,
        sequence_length=10,
        mini_batch_size=10,
        max_epochs=2,
    )

    # embed the example sentence with the saved model
    lm_embeddings = FlairEmbeddings(str(results_base_path / 'best-lm.pt'))
    example = Sentence('I love Berlin')
    lm_embeddings.embed(example)

    # sample text from the trained model
    generated, _likelihood = forward_lm.generate_text(number_of_characters=100)
    assert generated is not None
    assert len(generated) >= 100

    # remove the results directory again
    shutil.rmtree(results_base_path, ignore_errors=True)
def __init__(self, params: Dict) -> None:
    """Set up language-model training from a configuration dictionary.

    The resulting model can then be used as Flair embeddings.

    Args:
        params (dict): training config. Recognised keys and their defaults:
            'checkpoint' (True), 'seq_len' (250), 'batch_size' (100),
            'lr' (20), 'patience' (25), 'forward' (True),
            'corpus_dir' ('../') and 'save_dir' ('../').

    Raises:
        ValueError: if the configured corpus directory does not exist.
    """
    option = params.get

    # training hyper-parameters
    self.checkpoint = option('checkpoint', True)
    self.sequence_length = option('seq_len', 250)
    self.mini_batch_size = option('batch_size', 100)
    self.learning_rate = option('lr', 20)
    self.patience = option('patience', 25)

    # A forward LM predicts what comes next; a backward LM reads the
    # sentence in reverse and predicts what came before.
    self.is_forward_lm = option('forward', True)

    # training cannot start without a corpus on disk
    corpus_dir = option('corpus_dir', '../')
    self.corpus_dir = corpus_dir
    if not os.path.exists(corpus_dir):
        raise ValueError('Expected a corpus to train a language model.')

    # dictionary first, then the corpus and model helpers
    self.dictionary = Dictionary.load('chars')
    self.corpus = self._define_corpus()
    self.lm = self._define_model()

    self.save_dir = option('save_dir', '../')
def create_corpus(args, load_dict_from_lm=False, return_back='both'):
    """Build a character-level language model and text corpus from ``args``.

    Args:
        args: object providing ``corpus_path``, ``mapfile``, ``is_forward_lm``
            and ``hidden_size``.
        load_dict_from_lm: when true, take the dictionary from the
            'he-forward' FlairEmbeddings model instead of loading it from
            ``args.corpus_path`` / ``args.mapfile``.
        return_back: 'both', 'language_model' or 'corpus'.

    Returns:
        ``(language_model, corpus)``, the language model, or the corpus,
        depending on ``return_back``. For any other value a message is
        printed and None is returned.
    """
    if load_dict_from_lm:
        print("loading dictionary from finetune model")
        from flair.embeddings import FlairEmbeddings

        char_dict = FlairEmbeddings('he-forward').lm.dictionary
    else:
        mapfile_path = os.path.join(args.corpus_path, args.mapfile)
        char_dict = Dictionary.load(mapfile_path)

    # both objects are always built, whichever one is returned
    model = LanguageModel(
        char_dict,
        args.is_forward_lm,
        hidden_size=args.hidden_size,
        nlayers=1,
    )
    text_corpus = TextCorpus(
        args.corpus_path,
        char_dict,
        args.is_forward_lm,
        character_level=True,
    )

    if return_back == 'both':
        return model, text_corpus
    if return_back == 'language_model':
        return model
    if return_back == 'corpus':
        return text_corpus

    print('Specified what to return back')
    return None
def test_train_resume_language_model_training(resources_path, results_base_path, tasks_base_path):
    """Check the splits of the lorem ipsum corpus.

    No training is run here: the test only verifies that the test, train and
    valid parts are present and that ``train`` has length 2.
    ``results_base_path`` and ``tasks_base_path`` are accepted but not used.
    """
    char_dict = Dictionary.load(u'chars')
    lorem_corpus = TextCorpus(
        (resources_path / u'corpora/lorem_ipsum'),
        char_dict,
        forward=True,
        character_level=True,
    )

    assert lorem_corpus.test is not None
    assert lorem_corpus.train is not None
    assert lorem_corpus.valid is not None

    assert len(lorem_corpus.train) == 2
def test_train_resume_language_model_training(resources_path, results_base_path, tasks_base_path):
    """Check the splits of the lorem ipsum corpus.

    No training is run here: the test only verifies that the test part, the
    train files and the valid part are present and that there are two train
    files. ``results_base_path`` and ``tasks_base_path`` are accepted but
    not used.
    """
    # default character dictionary
    char_dict: Dictionary = Dictionary.load('chars')

    # example corpus, read character by character in forward direction
    lorem_corpus: TextCorpus = TextCorpus(
        resources_path / 'corpora/lorem_ipsum',
        char_dict,
        forward=True,
        character_level=True,
    )

    assert lorem_corpus.test is not None
    assert lorem_corpus.train_files is not None
    assert lorem_corpus.valid is not None

    assert len(lorem_corpus.train_files) == 2
def test_fine_tunable_flair_embedding():
    """Check document embeddings built on a fine-tunable FlairEmbeddings.

    An untrained forward LM with 32 hidden states backs two document
    embeddings in turn: ``DocumentRNNEmbeddings`` (expected length 128) and
    ``DocumentLMEmbeddings`` (expected length 32). Each must match its own
    ``embedding_length`` and be empty after ``clear_embeddings``.
    """
    forward_lm = LanguageModel(
        Dictionary.load('chars'),
        is_forward_lm=True,
        hidden_size=32,
        nlayers=1,
    )

    # --- RNN document embedding ---
    rnn_doc_embeddings = DocumentRNNEmbeddings(
        [FlairEmbeddings(forward_lm, fine_tune=True)],
        hidden_size=128,
        bidirectional=False,
    )
    rnn_sentence = Sentence('I love Berlin.')
    rnn_doc_embeddings.embed(rnn_sentence)

    assert len(rnn_sentence.get_embedding()) == 128
    assert len(rnn_sentence.get_embedding()) == rnn_doc_embeddings.embedding_length

    rnn_sentence.clear_embeddings()
    assert len(rnn_sentence.get_embedding()) == 0

    # --- LM document embedding ---
    lm_doc_embeddings = DocumentLMEmbeddings(
        [FlairEmbeddings(forward_lm, fine_tune=True)]
    )
    lm_sentence = Sentence('I love Berlin.')
    lm_doc_embeddings.embed(lm_sentence)

    assert len(lm_sentence.get_embedding()) == 32
    assert len(lm_sentence.get_embedding()) == lm_doc_embeddings.embedding_length

    lm_sentence.clear_embeddings()
    assert len(lm_sentence.get_embedding()) == 0
def test_train_resume_language_model_training(resources_path, results_base_path, tasks_base_path):
    """Train with checkpointing enabled, then resume training from the checkpoint.

    The first trainer and its model are deleted before the checkpoint is
    loaded, so the resumed run works on the objects restored from
    ``checkpoint.pt``. ``results_base_path`` is removed at the end;
    ``tasks_base_path`` is accepted but not used.
    """
    # default character dictionary
    char_dict: Dictionary = Dictionary.load("chars")

    # forward LM: 128 hidden states, a single layer
    forward_lm: LanguageModel = LanguageModel(
        char_dict,
        is_forward_lm=True,
        hidden_size=128,
        nlayers=1,
    )

    # example corpus, read character by character in the LM's direction
    lorem_corpus: TextCorpus = TextCorpus(
        resources_path / "corpora/lorem_ipsum",
        char_dict,
        forward_lm.is_forward_lm,
        character_level=True,
    )

    # first run: writes a checkpoint into the results directory
    lm_trainer: LanguageModelTrainer = LanguageModelTrainer(
        forward_lm, lorem_corpus, test_mode=True
    )
    lm_trainer.train(
        results_base_path,
        sequence_length=10,
        mini_batch_size=10,
        max_epochs=2,
        checkpoint=True,
    )
    del lm_trainer, forward_lm

    # second run: continue from the checkpoint that was just written
    checkpoint_file = results_base_path / "checkpoint.pt"
    lm_trainer = LanguageModelTrainer.load_from_checkpoint(checkpoint_file, lorem_corpus)
    lm_trainer.train(
        results_base_path,
        sequence_length=10,
        mini_batch_size=10,
        max_epochs=2,
    )

    # remove the results directory again
    shutil.rmtree(results_base_path)
    del lm_trainer
def process(options):
    """Train a character-level language model as configured by ``options``.

    Reads ``options.is_backward_lm`` (direction), ``options.corpus_dir``
    (corpus location) and ``options.model_dir`` (where the trainer writes
    its output).
    """
    # direction: forward unless a backward LM was requested
    forward = not options.is_backward_lm

    # default character dictionary
    char_dict: Dictionary = Dictionary.load('chars')

    # corpus, processed at character level in the chosen direction
    text_corpus = TextCorpus(
        options.corpus_dir,
        char_dict,
        forward,
        character_level=True,
    )

    # model: hidden size, layer count, embedding size, dropout
    # NOTE(review): embedding_size=100 and dropout=0 were marked as open
    # questions by the original author.
    model = LanguageModel(
        char_dict,
        forward,
        hidden_size=2048,
        nlayers=1,
        embedding_size=100,
        dropout=0,
    )

    # training run
    lm_trainer = LanguageModelTrainer(model, text_corpus)
    lm_trainer.train(
        options.model_dir,
        sequence_length=250,
        learning_rate=20,
        mini_batch_size=100,
        anneal_factor=0.25,
        patience=22,  # patience of the learning-rate scheduler
        clip=0.25,  # gradient clipping
        max_epochs=75,
    )
def create_local_polish_letters_dictionary_based_on_common(dictionary_name):
    """Extend the 'chars' dictionary with Polish letters and save it.

    The nine Polish diacritic letters are added in upper/lower-case pairs.
    The result is saved as ``dictionary_name`` below the directory returned
    by ``use_scratch_dir_if_available('resources')``.
    """
    polish_letters = 'ĄąĆćĘęŁłŃńÓóŚśŹźŻż'

    char_dict = Dictionary.load(name="chars")
    for letter in polish_letters:
        char_dict.add_item(letter)

    target = use_scratch_dir_if_available('resources') + '/' + dictionary_name
    char_dict.save(target)
def test_train_language_model(results_base_path, resources_path):
    """Train a small forward character LM, embed a sentence and generate text.

    The trained model is written to ``results_base_path``; its ``best-lm.pt``
    file is loaded as ``FlairEmbeddings``. The in-memory model must generate
    at least 100 characters of text. The results directory is removed at the
    end.
    """
    dictionary = Dictionary.load(u'chars')
    language_model = LanguageModel(
        dictionary,
        is_forward_lm=True,
        hidden_size=128,
        nlayers=1,
    )
    corpus = TextCorpus(
        (resources_path / u'corpora/lorem_ipsum'),
        dictionary,
        language_model.is_forward_lm,
        character_level=True,
    )

    trainer = LanguageModelTrainer(language_model, corpus, test_mode=True)
    trainer.train(
        results_base_path,
        sequence_length=10,
        mini_batch_size=10,
        max_epochs=2,
    )

    # `unicode` does not exist on Python 3 (NameError); `str` converts the
    # path to text there.
    char_lm_embeddings = FlairEmbeddings(str(results_base_path / u'best-lm.pt'))
    sentence = Sentence(u'I love Berlin')
    char_lm_embeddings.embed(sentence)

    (text, likelihood) = language_model.generate_text(number_of_characters=100)
    assert text is not None
    assert len(text) >= 100

    shutil.rmtree(results_base_path, ignore_errors=True)
from pathlib import Path from flair.data import Dictionary from flair.models import LanguageModel from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus # are you training a forward or backward LM? ### NOTE: you have to train forward and backward separately ### is_forward_lm = True # load the default character dictionary dictionary: Dictionary = Dictionary.load('chars') # get your corpus, process forward and at the character level corpus = TextCorpus(Path('/local/kevinshih/BioFlair/data/PMC_Case_Rep/'), dictionary, is_forward_lm, character_level=True) # instantiate your language model, set hidden size and number of layers language_model = LanguageModel(dictionary, is_forward_lm, hidden_size=2048, nlayers=1) # train your language model trainer = LanguageModelTrainer(language_model, corpus) trainer.train('resources/taggers/language_model', sequence_length=250, mini_batch_size=100,
from collections import Counter

from flair.data import Dictionary

# Writes three character inventories, one character per line:
#   characters_flair.txt  - characters of flair's 'common-chars' dictionary
#   characters_corpus.txt - characters of the first column of the corpus files
#   characters_merged.txt - union of the two
#
# Every file is opened in a `with` block so the handle is closed (and the
# output flushed) deterministically. Output is written as UTF-8 explicitly
# because the inventories contain non-ASCII characters.

# Flair Characters
chars = Dictionary.load('common-chars')
flair_characters = sorted([b.decode("utf-8") for b in chars.idx2item])

content = "\n".join(flair_characters)
with open("characters_flair.txt", "w", encoding="utf-8") as f:
    f.write(content)

# Corpus Characters
files = ["data1/train.txt", "data1/dev.txt", "data1/test.txt"]
characters = Counter()
for path in files:
    # NOTE(review): assumes the corpus files are UTF-8 encoded - confirm.
    with open(path, encoding="utf-8") as corpus_file:
        for line in corpus_file:
            stripped = line.strip()
            if stripped:
                # only the first whitespace-separated column is counted
                characters.update(stripped.split()[0])
corpus_characters = sorted(characters)

content = "\n".join(corpus_characters)
with open("characters_corpus.txt", "w", encoding="utf-8") as f:
    f.write(content)

# Merge characters
characters = sorted(set(corpus_characters).union(flair_characters))

content = "\n".join(characters)
with open("characters_merged.txt", "w", encoding="utf-8") as f:
    f.write(content)
def __init__(
    self,
    embeddings: flair.embeddings.TokenEmbeddings = None,
    label_type: str = "lemma",
    rnn_input_size: int = 50,
    rnn_hidden_size: int = 256,
    rnn_layers: int = 2,
    encode_characters: bool = True,
    char_dict: Union[str, Dictionary] = "common-chars-lemmatizer",
    max_sequence_length_dependent_on_input: bool = True,
    max_sequence_length: int = 20,
    use_attention: bool = True,
    beam_size: int = 1,
    start_symbol_for_encoding: bool = True,
    end_symbol_for_encoding: bool = True,
    bidirectional_encoding: bool = True,
):
    """
    Initializes a Lemmatizer model.

    The model consists of an encoder and a decoder. The encoder is an RNN cell (torch.nn.GRU) over the
    characters of a token and/or a token embedding from flair, if one is handed to the constructor.
    The output of the encoder is used as the initial hidden state of the decoder, which is an RNN cell (GRU)
    that predicts the lemma of the given token one letter at a time.
    Note that one can use data in which only those words are annotated that differ from their lemma, or data
    in which all words are annotated with a (possibly equal) lemma.

    The special symbols "<>", "<S>" and "<E>" are added to the character dictionary. If a Dictionary
    instance is passed as char_dict, that instance itself is modified.

    :param embeddings: Embedding used to encode the sentence. If given, its embedding_length contributes to
        the input size of the layer that produces the decoder's initial hidden state.
    :param label_type: Name of the gold labels to use.
    :param rnn_input_size: Input size of the RNN(s). Each character index is mapped to a vector of this size
        by an embedding layer (torch.nn.Embedding) over the character dictionary.
    :param rnn_hidden_size: Size of the hidden state of the RNN(s).
    :param rnn_layers: Number of stacked RNN cells.
    :param encode_characters: Whether to encode the characters of a token. If False, use_attention is
        forced to False.
    :param char_dict: Dictionary of characters the model is able to process. The dictionary must contain
        <unk> for the handling of unknown characters. Either a Dictionary instance, which is used as is, or
        a value that is passed to Dictionary.load.
    :param max_sequence_length_dependent_on_input: If set to True, the maximum length of a decoded sequence
        in the prediction depends on the sentences you want to lemmatize. To be precise, the maximum length
        is computed as the length of the longest token in the sentences plus one.
    :param max_sequence_length: Fixed maximum length for the decoding, used for all sentences when
        max_sequence_length_dependent_on_input is False.
    :param use_attention: Whether or not to use attention. Only sensible if encoding via RNN.
    :param beam_size: Number of hypotheses used when decoding the output of the RNN. Only used in prediction.
    :param start_symbol_for_encoding: Stored as self.start_symbol; presumably controls whether the start
        symbol is added to the encoder input (not visible in this constructor, confirm at the point of use).
    :param end_symbol_for_encoding: Stored as self.end_symbol; presumably controls whether the end symbol is
        added to the encoder input (not visible in this constructor, confirm at the point of use).
    :param bidirectional_encoding: Whether the encoder GRU is bidirectional. If True, an additional linear
        layer maps the concatenated hidden states (2 * rnn_hidden_size) back to rnn_hidden_size.
    :raises KeyError: If the character dictionary has no <unk> entry (its add_unk flag is falsy).
    """

    super().__init__()

    self._label_type = label_type
    self.beam_size = beam_size
    self.max_sequence_length = max_sequence_length
    self.dependent_on_input = max_sequence_length_dependent_on_input
    self.start_symbol = start_symbol_for_encoding
    self.end_symbol = end_symbol_for_encoding
    self.bi_encoding = bidirectional_encoding
    self.rnn_hidden_size = rnn_hidden_size

    # whether to encode characters and whether to use attention
    # (attention can only be used if characters are encoded)
    self.encode_characters = encode_characters
    self.use_attention = use_attention
    if not self.encode_characters:
        self.use_attention = False

    # character dictionary for decoding and encoding: use the given Dictionary or load one
    self.char_dictionary = char_dict if isinstance(char_dict, Dictionary) else Dictionary.load(char_dict)

    # make sure <unk> is in the dictionary for the handling of unknown characters
    if not self.char_dictionary.add_unk:
        raise KeyError("<unk> must be contained in char_dict")

    # add special symbols to the dictionary and remember their indices
    self.dummy_index = self.char_dictionary.add_item("<>")
    self.start_index = self.char_dictionary.add_item("<S>")
    self.end_index = self.char_dictionary.add_item("<E>")

    # ---- ENCODER ----
    # encoder character embeddings (created after the special symbols were added, so they are covered)
    self.encoder_character_embedding = nn.Embedding(len(self.char_dictionary), rnn_input_size)

    # encoder pre-trained embeddings
    self.encoder_embeddings = embeddings

    # input size of emb_to_hidden: token embedding (if any) plus one rnn_hidden_size per
    # encoder RNN direction (if characters are encoded)
    hidden_input_size = 0
    if embeddings:
        hidden_input_size += embeddings.embedding_length
    if encode_characters:
        hidden_input_size += rnn_hidden_size
    if encode_characters and bidirectional_encoding:
        hidden_input_size += rnn_hidden_size
    self.emb_to_hidden = nn.Linear(hidden_input_size, rnn_hidden_size)

    # encoder RNN
    self.encoder_rnn = nn.GRU(
        input_size=rnn_input_size,
        hidden_size=self.rnn_hidden_size,
        batch_first=True,
        num_layers=rnn_layers,
        bidirectional=self.bi_encoding,
    )

    # additional encoder linear layer, only needed for bidirectional encoding
    if self.bi_encoding:
        self.bi_hidden_states_to_hidden_size: Optional[nn.Linear] = nn.Linear(
            2 * self.rnn_hidden_size, self.rnn_hidden_size, bias=False
        )
    else:
        self.bi_hidden_states_to_hidden_size = None

    # ---- DECODER ----
    # decoder character embeddings
    self.decoder_character_embedding = nn.Embedding(len(self.char_dictionary), rnn_input_size)

    # linear layer projecting onto the character dictionary; when using attention, the attention
    # outcome and the decoder hidden states are concatenated, hence the doubled input size
    self.character_decoder = nn.Linear(
        2 * self.rnn_hidden_size if self.use_attention else self.rnn_hidden_size,
        len(self.char_dictionary),
    )

    # decoder RNN
    self.rnn_input_size = rnn_input_size
    self.rnn_layers = rnn_layers

    self.decoder_rnn = nn.GRU(
        input_size=rnn_input_size,
        hidden_size=self.rnn_hidden_size,
        batch_first=True,
        num_layers=rnn_layers,
    )

    # loss (summed over elements) and softmax over dimension 2
    self.loss = nn.CrossEntropyLoss(reduction="sum")
    self.softmax = nn.Softmax(dim=2)

    # move the model to flair's configured device
    self.to(flair.device)
def __init__(self, charPath, is_forward=True):
    """Store the LM direction and load the character dictionary.

    Args:
        charPath: value passed to ``Dictionary.load``.
        is_forward: stored as ``is_forward_lm``; defaults to True.
    """
    self.is_forward_lm = is_forward

    char_dictionary: Dictionary = Dictionary.load(charPath)
    self.dictionary = char_dictionary