Example 1
def test_training():
    # get default dictionary
    dictionary: Dictionary = Dictionary.load('chars')

    # init forward LM with 128 hidden states and 1 layer
    language_model: LanguageModel = LanguageModel(dictionary,
                                                  is_forward_lm=True,
                                                  hidden_size=128,
                                                  nlayers=1)

    # get the example corpus and process at character level in forward direction
    corpus: TextCorpus = TextCorpus('resources/corpora/lorem_ipsum',
                                    dictionary,
                                    language_model.is_forward_lm,
                                    character_level=True)

    # train the language model
    trainer: LanguageModelTrainer = LanguageModelTrainer(
        language_model, corpus)
    trainer.train('./results',
                  sequence_length=10,
                  mini_batch_size=10,
                  max_epochs=5)

    # use the character LM as embeddings to embed the example sentence 'I love Berlin'
    char_lm_embeddings = CharLMEmbeddings('./results/best-lm.pt')
    sentence = Sentence('I love Berlin')
    char_lm_embeddings.embed(sentence)
    print(sentence[1].embedding.size())

    # clean up results directory
    shutil.rmtree('./results', ignore_errors=True)
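CharLMEmbeddings is the older name for what later Flair versions expose as FlairEmbeddings (the class used in the later examples). A minimal sketch of the equivalent call, assuming a model trained exactly as above:

from flair.data import Sentence
from flair.embeddings import FlairEmbeddings

# load the trained character LM as embeddings (newer Flair API)
char_lm_embeddings = FlairEmbeddings('./results/best-lm.pt')
sentence = Sentence('I love Berlin')
char_lm_embeddings.embed(sentence)
print(sentence[1].embedding.size())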
Example 2
def test_train_resume_language_model_training(resources_path,
                                              results_base_path,
                                              tasks_base_path):
    dictionary = Dictionary.load(u'chars')
    language_model = LanguageModel(dictionary,
                                   is_forward_lm=True,
                                   hidden_size=128,
                                   nlayers=1)
    corpus = TextCorpus((resources_path / u'corpora/lorem_ipsum'),
                        dictionary,
                        language_model.is_forward_lm,
                        character_level=True)
    trainer = LanguageModelTrainer(language_model, corpus, test_mode=True)
    trainer.train(results_base_path,
                  sequence_length=10,
                  mini_batch_size=10,
                  max_epochs=2,
                  checkpoint=True)
    trainer = LanguageModelTrainer.load_from_checkpoint(
        (results_base_path / u'checkpoint.pt'), corpus)
    trainer.train(results_base_path,
                  sequence_length=10,
                  mini_batch_size=10,
                  max_epochs=2)
    shutil.rmtree(results_base_path)
Example 3
def test_train_language_model(results_base_path, resources_path):
    # get default dictionary
    dictionary: Dictionary = Dictionary.load('chars')

    # init forward LM with 128 hidden states and 1 layer
    language_model: LanguageModel = LanguageModel(dictionary, is_forward_lm=True, hidden_size=128, nlayers=1)

    # get the example corpus and process at character level in forward direction
    corpus: TextCorpus = TextCorpus(resources_path / 'corpora/lorem_ipsum',
                                    dictionary,
                                    language_model.is_forward_lm,
                                    character_level=True)

    # train the language model
    trainer: LanguageModelTrainer = LanguageModelTrainer(language_model, corpus, test_mode=True)
    trainer.train(results_base_path, sequence_length=10, mini_batch_size=10, max_epochs=2)

    # use the character LM as embeddings to embed the example sentence 'I love Berlin'
    char_lm_embeddings = FlairEmbeddings(str(results_base_path / 'best-lm.pt'))
    sentence = Sentence('I love Berlin')
    char_lm_embeddings.embed(sentence)

    text, likelihood = language_model.generate_text(number_of_characters=100)
    assert (text is not None)
    assert (len(text) >= 100)

    # clean up results directory
    shutil.rmtree(results_base_path, ignore_errors=True)
Example 4
    def __init__(self, params: Dict) -> None:
        """Train a Language Model from scratch. This model can then be used as Flair embeddings.

        Args:
            params (dict): training config.
        """
        self.checkpoint = params.get('checkpoint', True)
        self.sequence_length = params.get('seq_len', 250)
        self.mini_batch_size = params.get('batch_size', 100)
        self.learning_rate = params.get('lr', 20)
        self.patience = params.get('patience', 25)

        # a forward LM predicts the next character; a backward LM reads the text backwards and predicts the previous character
        self.is_forward_lm = params.get('forward', True)

        self.corpus_dir = params.get('corpus_dir', '../')
        if not os.path.exists(self.corpus_dir):
            raise ValueError('Expected a corpus to train a language model.')

        # define corpus, dictionary and instantiate LM
        self.dictionary = Dictionary.load('chars')
        self.corpus = self._define_corpus()
        self.lm = self._define_model()

        self.save_dir = params.get('save_dir', '../')
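The constructor above calls two helpers, _define_corpus() and _define_model(), that are not included in the excerpt. A minimal sketch of what they might look like, assuming they simply wrap the TextCorpus and LanguageModel constructors used in the other examples (the hidden_size value is an illustrative guess, not taken from the original):

    def _define_corpus(self) -> TextCorpus:
        # character-level corpus read in the configured direction
        return TextCorpus(self.corpus_dir,
                          self.dictionary,
                          self.is_forward_lm,
                          character_level=True)

    def _define_model(self) -> LanguageModel:
        # hidden_size and nlayers are illustrative defaults, not from the original
        return LanguageModel(self.dictionary,
                             self.is_forward_lm,
                             hidden_size=1024,
                             nlayers=1)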
Example 5
def create_corpus(args, load_dict_from_lm=False, return_back='both'):
    if not load_dict_from_lm:
        dictionary: Dictionary = Dictionary.load(
            os.path.join(args.corpus_path, args.mapfile))

    else:
        print("loading dictionary from finetune model")
        from flair.embeddings import FlairEmbeddings
        dictionary = FlairEmbeddings('he-forward').lm.dictionary

    language_model = LanguageModel(dictionary,
                                   args.is_forward_lm,
                                   hidden_size=args.hidden_size,
                                   nlayers=1)

    corpus = TextCorpus(args.corpus_path,
                        dictionary,
                        args.is_forward_lm,
                        character_level=True)
    if return_back == 'both':
        return language_model, corpus
    elif return_back == 'language_model':
        return language_model
    elif return_back == 'corpus':
        return corpus
    else:
        raise ValueError("Specify what to return: 'both', 'language_model' or 'corpus'")
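A possible way to use the helper above, sketched under the assumption that args carries the attributes read inside it (corpus_path, mapfile, is_forward_lm, hidden_size); the output directory and training hyperparameters are illustrative:

language_model, corpus = create_corpus(args)
trainer = LanguageModelTrainer(language_model, corpus)
trainer.train('resources/taggers/language_model',  # illustrative output directory
              sequence_length=250,
              mini_batch_size=100,
              max_epochs=10)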
Example 6
def test_train_resume_language_model_training(resources_path,
                                              results_base_path,
                                              tasks_base_path):
    dictionary = Dictionary.load(u'chars')
    corpus = TextCorpus((resources_path / u'corpora/lorem_ipsum'),
                        dictionary,
                        forward=True,
                        character_level=True)
    assert (corpus.test is not None)
    assert (corpus.train is not None)
    assert (corpus.valid is not None)
    assert (len(corpus.train) == 2)
Example 7
def test_train_resume_language_model_training(resources_path,
                                              results_base_path,
                                              tasks_base_path):
    # get default dictionary
    dictionary: Dictionary = Dictionary.load('chars')

    # get the example corpus and process at character level in forward direction
    corpus: TextCorpus = TextCorpus(resources_path / 'corpora/lorem_ipsum',
                                    dictionary,
                                    forward=True,
                                    character_level=True)

    assert (corpus.test is not None)
    assert (corpus.train_files is not None)
    assert (corpus.valid is not None)
    assert (len(corpus.train_files) == 2)
Example 8
def test_fine_tunable_flair_embedding():
    language_model_forward = LanguageModel(Dictionary.load(
        'chars'), is_forward_lm=True, hidden_size=32, nlayers=1)
    embeddings = DocumentRNNEmbeddings([FlairEmbeddings(
        language_model_forward, fine_tune=True)], hidden_size=128, bidirectional=False)
    sentence = Sentence('I love Berlin.')
    embeddings.embed(sentence)
    assert (len(sentence.get_embedding()) == 128)
    assert (len(sentence.get_embedding()) == embeddings.embedding_length)
    sentence.clear_embeddings()
    assert (len(sentence.get_embedding()) == 0)
    embeddings = DocumentLMEmbeddings(
        [FlairEmbeddings(language_model_forward, fine_tune=True)])
    sentence = Sentence('I love Berlin.')
    embeddings.embed(sentence)
    assert (len(sentence.get_embedding()) == 32)
    assert (len(sentence.get_embedding()) == embeddings.embedding_length)
    sentence.clear_embeddings()
    assert (len(sentence.get_embedding()) == 0)
Example 9
def test_train_resume_language_model_training(resources_path,
                                              results_base_path,
                                              tasks_base_path):
    # get default dictionary
    dictionary: Dictionary = Dictionary.load("chars")

    # init forward LM with 128 hidden states and 1 layer
    language_model: LanguageModel = LanguageModel(dictionary,
                                                  is_forward_lm=True,
                                                  hidden_size=128,
                                                  nlayers=1)

    # get the example corpus and process at character level in forward direction
    corpus: TextCorpus = TextCorpus(
        resources_path / "corpora/lorem_ipsum",
        dictionary,
        language_model.is_forward_lm,
        character_level=True,
    )

    # train the language model
    trainer: LanguageModelTrainer = LanguageModelTrainer(language_model,
                                                         corpus,
                                                         test_mode=True)
    trainer.train(
        results_base_path,
        sequence_length=10,
        mini_batch_size=10,
        max_epochs=2,
        checkpoint=True,
    )
    del trainer, language_model

    trainer = LanguageModelTrainer.load_from_checkpoint(
        results_base_path / "checkpoint.pt", corpus)
    trainer.train(results_base_path,
                  sequence_length=10,
                  mini_batch_size=10,
                  max_epochs=2)

    # clean up results directory
    shutil.rmtree(results_base_path)
    del trainer
Example 10
def process(options):
    """
    Do the processing
    """

    # are you training a forward or backward LM?
    is_forward_lm = not options.is_backward_lm

    # load the default character dictionary
    dictionary: Dictionary = Dictionary.load('chars')

    # get your corpus, process forward and at the character level
    corpus = TextCorpus(options.corpus_dir,
                        dictionary,
                        is_forward_lm,
                        character_level=True)

    # instantiate your language model, set hidden size and number of layers
    language_model = LanguageModel(
        dictionary,
        is_forward_lm,
        hidden_size=2048,
        nlayers=1,
        embedding_size=100,  # recommendations?
        dropout=0)  # dropout probs?

    # train your language model
    trainer = LanguageModelTrainer(language_model, corpus)

    trainer.train(
        options.model_dir,  # embeddings_in_memory=False: effect on 'RuntimeError: CUDA out of memory'?
        sequence_length=250,
        learning_rate=20,
        mini_batch_size=100,
        anneal_factor=0.25,
        patience=22,  # 'patience' value of the learning rate scheduler: 1/2 training splits
        clip=0.25,  # clipping gradients?
        max_epochs=75)
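The process function reads options.is_backward_lm, options.corpus_dir and options.model_dir, so the surrounding script presumably parses these from the command line. A hedged argparse sketch; the flag names are assumptions, not taken from the original:

import argparse

def parse_options():
    parser = argparse.ArgumentParser(description='Train a Flair character language model')
    parser.add_argument('--corpus_dir', required=True,
                        help='corpus folder with a train/ split directory, valid.txt and test.txt')
    parser.add_argument('--model_dir', required=True,
                        help='output directory for checkpoints and best-lm.pt')
    parser.add_argument('--is_backward_lm', action='store_true',
                        help='train a backward LM instead of a forward one')
    return parser.parse_args()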
Example 11
def create_local_polish_letters_dictionary_based_on_common(dictionary_name):
    dictionary = Dictionary.load(name="chars")
    dictionary.add_item('Ą')
    dictionary.add_item('ą')
    dictionary.add_item('Ć')
    dictionary.add_item('ć')
    dictionary.add_item('Ę')
    dictionary.add_item('ę')
    dictionary.add_item('Ł')
    dictionary.add_item('ł')
    dictionary.add_item('Ń')
    dictionary.add_item('ń')
    dictionary.add_item('Ó')
    dictionary.add_item('ó')
    dictionary.add_item('Ś')
    dictionary.add_item('ś')
    dictionary.add_item('Ź')
    dictionary.add_item('ź')
    dictionary.add_item('Ż')
    dictionary.add_item('ż')
    dictionary.save(
        use_scratch_dir_if_available('resources') + '/' + dictionary_name)
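To train with the saved dictionary, it can be read back with Flair's Dictionary.load_from_file and handed to a LanguageModel. A sketch, assuming the same dictionary_name and save location as above (hidden_size is illustrative):

dictionary = Dictionary.load_from_file(
    use_scratch_dir_if_available('resources') + '/' + dictionary_name)
language_model = LanguageModel(dictionary,
                               is_forward_lm=True,
                               hidden_size=1024,  # illustrative size
                               nlayers=1)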
Example 12
def test_train_language_model(results_base_path, resources_path):
    dictionary = Dictionary.load(u'chars')
    language_model = LanguageModel(dictionary,
                                   is_forward_lm=True,
                                   hidden_size=128,
                                   nlayers=1)
    corpus = TextCorpus((resources_path / u'corpora/lorem_ipsum'),
                        dictionary,
                        language_model.is_forward_lm,
                        character_level=True)
    trainer = LanguageModelTrainer(language_model, corpus, test_mode=True)
    trainer.train(results_base_path,
                  sequence_length=10,
                  mini_batch_size=10,
                  max_epochs=2)
    char_lm_embeddings = FlairEmbeddings(
        str(results_base_path / u'best-lm.pt'))
    sentence = Sentence(u'I love Berlin')
    char_lm_embeddings.embed(sentence)
    (text, likelihood) = language_model.generate_text(number_of_characters=100)
    assert (text is not None)
    assert (len(text) >= 100)
    shutil.rmtree(results_base_path, ignore_errors=True)
Example 13
from pathlib import Path

from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

# are you training a forward or backward LM?
### NOTE: you have to train forward and backward separately ###
is_forward_lm = True

# load the default character dictionary
dictionary: Dictionary = Dictionary.load('chars')

# get your corpus, process forward and at the character level
corpus = TextCorpus(Path('/local/kevinshih/BioFlair/data/PMC_Case_Rep/'),
                    dictionary,
                    is_forward_lm,
                    character_level=True)

# instantiate your language model, set hidden size and number of layers
language_model = LanguageModel(dictionary,
                               is_forward_lm,
                               hidden_size=2048,
                               nlayers=1)

# train your language model
trainer = LanguageModelTrainer(language_model, corpus)

trainer.train('resources/taggers/language_model',
              sequence_length=250,
              mini_batch_size=100)
Example 14
from collections import Counter

from flair.data import Dictionary

# Flair characters
chars = Dictionary.load('common-chars')
flair_characters = sorted([b.decode("utf-8") for b in chars.idx2item])
with open("characters_flair.txt", "w") as f:
    f.write("\n".join(flair_characters))

# Corpus characters
files = ["data1/train.txt", "data1/dev.txt", "data1/test.txt"]

characters = Counter()
for file in files:
    with open(file) as lines:
        for line in lines:
            if line.strip():
                # count the characters of the first whitespace-separated column (the token)
                characters += Counter(line.strip().split()[0])
corpus_characters = sorted([c for c, n in characters.most_common()])
with open("characters_corpus.txt", "w") as f:
    f.write("\n".join(corpus_characters))

# Merge both character sets
characters = sorted(set(corpus_characters).union(set(flair_characters)))
with open("characters_merged.txt", "w") as f:
    f.write("\n".join(characters))
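The merged list written above is still a plain text file with one character per line; to train a model with it, it has to become a Flair Dictionary. A minimal sketch under that assumption (the output name 'characters_merged_dictionary' is hypothetical):

# build a Flair Dictionary from the merged character file and persist it
merged_dictionary = Dictionary(add_unk=True)
with open("characters_merged.txt", "r") as f:
    for line in f:
        char = line.rstrip("\n")
        if char:
            merged_dictionary.add_item(char)
merged_dictionary.save("characters_merged_dictionary")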
Example 15
    def __init__(
        self,
        embeddings: flair.embeddings.TokenEmbeddings = None,
        label_type: str = "lemma",
        rnn_input_size: int = 50,
        rnn_hidden_size: int = 256,
        rnn_layers: int = 2,
        encode_characters: bool = True,
        char_dict: Union[str, Dictionary] = "common-chars-lemmatizer",
        max_sequence_length_dependent_on_input: bool = True,
        max_sequence_length: int = 20,
        use_attention: bool = True,
        beam_size: int = 1,
        start_symbol_for_encoding: bool = True,
        end_symbol_for_encoding: bool = True,
        bidirectional_encoding: bool = True,
    ):
        """
        Initializes a Lemmatizer model
        The model consists of a decoder and an encoder. The encoder is either a RNN-cell (torch.nn.GRU)
        or a Token-Embedding from flair if an embedding is handed to the constructor (token_embedding).
        The output of the encoder is used as the initial hidden state to the decoder, which is an RNN-cell (GRU)
        that predicts the lemma of the given token one letter at a time.
        Note that one can use data in which only those words are annotated that differ from their lemma or data
        in which all words are annotated with a (maybe equal) lemma.
        :param embeddings: Embedding used to encode sentence
        :param rnn_input_size: Input size of the RNN('s). Each letter of a token is represented by a hot-one-vector
            over the given character dictionary. This vector is transformed to a input_size vector with a linear layer.
        :param rnn_hidden_size: size of the hidden state of the RNN('s).
        :param rnn_layers: Number of stacked RNN cells
        :param beam_size: Number of hypothesis used when decoding the output of the RNN. Only used in prediction.
        :param char_dict: Dictionary of characters the model is able to process. The dictionary must contain <unk> for
            the handling of unknown characters. If None, a standard dictionary will be loaded. One can either hand
            over a path to a dictionary or the dictionary itself.
        :param label_type: Name of the gold labels to use.
        :param max_sequence_length_dependent_on_input: If set to True, the maximum length of a decoded sequence in
            the prediction depends on the sentences you want to lemmatize. To be precise the maximum length is
            computed as the length of the longest token in the sentences plus one.
        :param max_sequence_length: If set to True and max_sequence_length_dependend_on_input is False a fixed
            maximum length for the decoding will be used for all sentences.
        :param use_attention: whether or not to use attention. Only sensible if encoding via RNN
        """

        super().__init__()

        self._label_type = label_type
        self.beam_size = beam_size
        self.max_sequence_length = max_sequence_length
        self.dependent_on_input = max_sequence_length_dependent_on_input
        self.start_symbol = start_symbol_for_encoding
        self.end_symbol = end_symbol_for_encoding
        self.bi_encoding = bidirectional_encoding
        self.rnn_hidden_size = rnn_hidden_size

        # whether to encode characters and whether to use attention (attention can only be used if chars are encoded)
        self.encode_characters = encode_characters
        self.use_attention = use_attention
        if not self.encode_characters:
            self.use_attention = False

        # character dictionary for decoding and encoding
        self.char_dictionary = char_dict if isinstance(
            char_dict, Dictionary) else Dictionary.load(char_dict)

        # make sure <unk> is in dictionary for handling of unknown characters
        if not self.char_dictionary.add_unk:
            raise KeyError("<unk> must be contained in char_dict")

        # add special symbols to dictionary if necessary and save respective indices
        self.dummy_index = self.char_dictionary.add_item("<>")
        self.start_index = self.char_dictionary.add_item("<S>")
        self.end_index = self.char_dictionary.add_item("<E>")

        # ---- ENCODER ----
        # encoder character embeddings
        self.encoder_character_embedding = nn.Embedding(
            len(self.char_dictionary), rnn_input_size)

        # encoder pre-trained embeddings
        self.encoder_embeddings = embeddings

        hidden_input_size = 0
        if embeddings:
            hidden_input_size += embeddings.embedding_length
        if encode_characters:
            hidden_input_size += rnn_hidden_size
        if encode_characters and bidirectional_encoding:
            hidden_input_size += rnn_hidden_size
        self.emb_to_hidden = nn.Linear(hidden_input_size, rnn_hidden_size)

        # encoder RNN
        self.encoder_rnn = nn.GRU(
            input_size=rnn_input_size,
            hidden_size=self.rnn_hidden_size,
            batch_first=True,
            num_layers=rnn_layers,
            bidirectional=self.bi_encoding,
        )

        # additional encoder linear layer if bidirectional encoding
        if self.bi_encoding:
            self.bi_hidden_states_to_hidden_size: Optional[
                nn.Linear] = nn.Linear(2 * self.rnn_hidden_size,
                                       self.rnn_hidden_size,
                                       bias=False)
        else:
            self.bi_hidden_states_to_hidden_size = None

        # ---- DECODER ----
        # decoder: linear layers to transform vectors to and from alphabet_size
        self.decoder_character_embedding = nn.Embedding(
            len(self.char_dictionary), rnn_input_size)

        # when using attention we concatenate attention outcome and decoder hidden states
        self.character_decoder = nn.Linear(
            2 * self.rnn_hidden_size
            if self.use_attention else self.rnn_hidden_size,
            len(self.char_dictionary),
        )

        # decoder RNN
        self.rnn_input_size = rnn_input_size
        self.rnn_layers = rnn_layers

        self.decoder_rnn = nn.GRU(
            input_size=rnn_input_size,
            hidden_size=self.rnn_hidden_size,
            batch_first=True,
            num_layers=rnn_layers,
        )

        # loss and softmax
        self.loss = nn.CrossEntropyLoss(reduction="sum")
        # self.unreduced_loss = nn.CrossEntropyLoss(reduction='none')  # for prediction
        self.softmax = nn.Softmax(dim=2)

        self.to(flair.device)
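Once such a Lemmatizer has been trained and saved, it is used like any other Flair model. A rough usage sketch; the checkpoint path is hypothetical:

from flair.data import Sentence
from flair.models import Lemmatizer

# load a trained lemmatizer from disk (hypothetical path) and lemmatize a sentence
lemmatizer = Lemmatizer.load('resources/taggers/lemmatizer/best-model.pt')
sentence = Sentence('The quick brown foxes jumped over the lazy dogs')
lemmatizer.predict(sentence)
print(sentence.to_tagged_string())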
Example 16
    def __init__(self, charPath, is_forward=True):
        self.is_forward_lm = is_forward
        self.dictionary: Dictionary = Dictionary.load(charPath)