Example 1
def test_plotting_training_curves_and_weights(resources_path):
    plotter = Plotter()
    plotter.plot_training_curves(resources_path / "visual/loss.tsv")
    plotter.plot_weights(resources_path / "visual/weights.txt")

    # clean up directory
    (resources_path / "visual/weights.png").unlink()
    (resources_path / "visual/training.png").unlink()
Example 2
def test_plotting_training_curves_and_weights():
    plotter = Plotter()
    plotter.plot_training_curves('./resources/visual/loss.tsv')
    plotter.plot_weights('./resources/visual/weights.txt')

    # clean up directory
    os.remove('./resources/visual/weights.png')
    os.remove('./resources/visual/training.png')
Example 3
    def plot_curve(self,
                   training_curve_path=os.path.normpath(
                       r'./resources/taggers/slow_bert/loss.tsv'),
                   weights_path=os.path.normpath(
                       r'./resources/taggers/slow_bert/weights.txt')):

        from flair.visual.training_curves import Plotter
        plotter = Plotter()

        plotter.plot_training_curves(training_curve_path)
        plotter.plot_weights(weights_path)
Example 4
def main(args=None):
    args = parser.parse_args(args)

    # 1. get the corpus
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_classification_corpus(
        args.data_dir[0],
        train_file='train.txt',
        dev_file='dev.txt',
        test_file='test.txt')

    # 2. create the label dictionary
    label_dict = corpus.make_label_dictionary()

    # 3. make a list of word embeddings
    word_embeddings = [
        WordEmbeddings('glove'),

        # comment in flair embeddings for state-of-the-art results
        # FlairEmbeddings('news-forward'),
        # FlairEmbeddings('news-backward'),
        # ELMoEmbeddings()
    ]

    # 4. init document embedding by passing list of word embeddings
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        word_embeddings,
        hidden_size=128,
        reproject_words=True,
        reproject_words_dimension=64,
    )

    # 5. create the text classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=False)

    # 6. initialize the text classifier trainer
    trainer = ModelTrainer(classifier, corpus)

    # 7. start the training
    model_out = 'resources/classifiers/sentence-classification/glove'
    trainer.train(model_out,
                  learning_rate=0.1,
                  mini_batch_size=32,
                  anneal_factor=0.5,
                  patience=5,
                  max_epochs=100)

    # 8. plot training curves (optional)
    from flair.visual.training_curves import Plotter
    plotter = Plotter()
    plotter.plot_training_curves(join(model_out, 'loss.tsv'))
    plotter.plot_weights(join(model_out, 'weights.txt'))
Example 5
def find_learning_rate(trainer, params):

    learning_rate_tsv = trainer.find_learning_rate(
        os.path.join("hyperparameter_search", params['model_output_dirpath']),
        'learning_rate_search_log.tsv',
        iterations=400,
        stop_early=False,
        mini_batch_size=16)

    from flair.visual.training_curves import Plotter
    plotter = Plotter()
    plotter.plot_learning_rate(learning_rate_tsv)
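
A hypothetical invocation of the helper above (it assumes trainer is an already-initialized Flair ModelTrainer and that params supplies the 'model_output_dirpath' key the helper reads; the directory name is a placeholder):

find_learning_rate(trainer, {'model_output_dirpath': 'glove_lstm'})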
Example 6
    def __init__(self, corpus_name: str):

        corpus = NLPTaskDataFetcher.load_column_corpus(
            loc.abs_path([loc.ASSETS, loc.MODELS, loc.DIRKSON]), {
                0: 'text',
                1: 'ner'
            },
            train_file=corpus_name + loc.DIRKSON_VALIDATION_TXT,
            test_file=corpus_name + loc.DIRKSON_TEST_TXT)

        embedding_types = [
            BertEmbeddings('bert-base-uncased'),
            FlairEmbeddings('mix-forward'),
            FlairEmbeddings('mix-backward')
        ]

        tag_type = 'ner'
        embeddings = StackedEmbeddings(embeddings=embedding_types)
        tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

        tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                                embeddings=embeddings,
                                                tag_dictionary=tag_dictionary,
                                                tag_type=tag_type,
                                                use_crf=True)

        trainer: ModelTrainer = ModelTrainer(tagger, corpus)

        model_path = loc.abs_path(
            [loc.ASSETS, loc.MODELS, loc.DIRKSON, corpus_name])
        if not os.path.exists(model_path):
            os.mkdir(model_path)
        trainer.train(model_path,
                      learning_rate=0.1,
                      mini_batch_size=32,
                      max_epochs=150)

        plotter = Plotter()
        plotter.plot_training_curves(
            loc.abs_path([
                loc.ASSETS, loc.MODELS, loc.DIRKSON, corpus_name, loc.LOSS_TSV
            ]))
        plotter.plot_weights(
            loc.abs_path([
                loc.ASSETS, loc.MODELS, loc.DIRKSON, corpus_name,
                loc.WEIGHTS_TXT
            ]))
Example 7
    def train(self,
              trainfile,
              devfile,
              testfile,
              resfolder,
              embtype="bert",
              chunk_len=100,
              batch_len=8):
        """
        *** This method can be used to train new models with the settings used in project Redewiedergabe
        It is not accessible from rwtagger_script and not documented in detail. Use at your own risk. ;-)
        ***
        :param trainfile:
        :param devfile:
        :param testfile:
        :param resfolder:
        :param embtype:
        :param chunk_len:
        :param batch_len:
        :return:
        """
        emb_name, embeddings = self._get_embeddings(embtype)

        corpus: Corpus = self.create_corpus(trainfile, devfile, testfile,
                                            chunk_len)
        tag_dictionary = corpus.make_tag_dictionary(tag_type="cat")

        if not os.path.exists(resfolder):
            os.makedirs(resfolder)

        tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                                embeddings=embeddings,
                                                tag_dictionary=tag_dictionary,
                                                tag_type="cat",
                                                use_crf=True,
                                                rnn_layers=2)
        trainer: ModelTrainer = ModelTrainer(tagger, corpus)

        trainer.train(resfolder,
                      learning_rate=0.1,
                      mini_batch_size=batch_len,
                      max_epochs=150,
                      checkpoint=True)
        # plot training curves
        plotter = Plotter()
        plotter.plot_training_curves(os.path.join(resfolder, 'loss.tsv'))
        plotter.plot_weights(os.path.join(resfolder, 'weights.txt'))
Example 8
def train(data_folder, model_output_folder):

    corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(
        NLPTask.CONLL_03, base_path=data_folder)

    # 2. what tag do we want to predict?
    tag_type = 'ner'

    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    # init Flair embeddings
    flair_forward_embedding = FlairEmbeddings('multi-forward')
    flair_backward_embedding = FlairEmbeddings('multi-backward')

    # init multilingual BERT
    bert_embedding = BertEmbeddings('bert-base-multilingual-cased')

    # 4. initialize embeddings
    embedding_types: List[TokenEmbeddings] = [
        flair_forward_embedding, flair_backward_embedding, bert_embedding
    ]

    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    # 5. initialize sequence tagger
    from flair.models import SequenceTagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type)
    # 6. initialize trainer
    from flair.trainers import ModelTrainer

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # 7. start training
    trainer.train(model_output_folder, mini_batch_size=256, max_epochs=150)

    # 8. plot training curves (optional)
    from flair.visual.training_curves import Plotter
    plotter = Plotter()
    plotter.plot_training_curves(model_output_folder + '/loss.tsv')
    plotter.plot_weights(model_output_folder + '/weights.txt')
Example 9
def train_sequence_labeling_model(data_folder, proposed_tags_vocabulary_size):
    # define columns
    columns = {0: 'text', 1: 'pos', 2: 'is_separator', 3: 'proposed_tags'}
    # init a corpus using column format, data folder and the names of the train and test files
    # 1. get the corpus
    corpus: Corpus = ColumnCorpus(data_folder, columns,
                                  train_file='train',
                                  test_file='test',
                                  dev_file=None)
    log.info(corpus)
    # 2. what tag do we want to predict
    tag_type = 'pos'
    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    log.info(tag_dictionary)
    # 4. initialize embeddings
    local_model_path = use_scratch_dir_if_available('resources/polish_FastText_embeddings')
    embedding_types: List[TokenEmbeddings] = [
        FlairEmbeddings('pl-forward', chars_per_chunk=64),
        FlairEmbeddings('pl-backward', chars_per_chunk=64),
        OneHotEmbeddings(corpus=corpus, field='is_separator', embedding_length=3, min_freq=3),
        OneHotEmbeddings(corpus=corpus, field='proposed_tags',
                         embedding_length=math.ceil((proposed_tags_vocabulary_size + 1)**0.25),
                         min_freq=3),
        WordEmbeddings(local_model_path) if os.path.exists(local_model_path) else WordEmbeddings('pl')
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)
    # 5. initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=False,
                                            rnn_layers=2)
    # 6. initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    # 7. start training
    trainer.train(use_scratch_dir_if_available('resources_pol_eval/taggers/example-pos/'),
                  learning_rate=0.1,
                  mini_batch_size=32,
                  embeddings_storage_mode='gpu',
                  max_epochs=sys.maxsize,
                  monitor_test=True)
    # 8. plot weight traces (optional)
    plotter = Plotter()
    plotter.plot_weights(use_scratch_dir_if_available('resources_pol_eval/taggers/example-pos/weights.txt'))
Example 10
    def __init__(self, path: Union[Path, str], model: str = 'final-model.pt'):
        if isinstance(path, str):
            path = Path(path)
        assert path.exists()

        self.path = path
        self.model = SequenceTagger.load(path / model)
        self.cv_results = {}
        for file in ['summary', 'details']:
            try:
                with (path / (file + '.pkl')).open(mode='rb') as f:
                    self.cv_results[file] = pickle.load(f)
            except FileNotFoundError:
                print(
                    f"{file+'.pkl'} not found. Setting cv_results['{file}'] to None"
                )

        self.plotter = Plotter()
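
A hypothetical use of the loader above (the owning class name CVResults is an assumption, not taken from the source):

results = CVResults('resources/taggers/example-ner')  # loads final-model.pt plus summary.pkl / details.pkl if present
results.plotter.plot_training_curves(results.path / 'loss.tsv')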
Example 11
    def train(
        self,
        output_dir: Union[Path, str],
        learning_rate: float = 0.07,
        mini_batch_size: int = 32,
        anneal_factor: float = 0.5,
        patience: int = 5,
        max_epochs: int = 150,
        plot_weights: bool = False,
        **kwargs,
    ) -> None:
        """
        Train the Sequence Classifier

        * **output_dir** - The output directory where the model predictions and checkpoints will be written.
        * **learning_rate** - The initial learning rate
        * **mini_batch_size** - Batch size for the dataloader
        * **anneal_factor** - The factor by which the learning rate is annealed
        * **patience** - Number of epochs with no improvement the Trainer waits before annealing the learning rate
        * **max_epochs** - Maximum number of epochs to train. Terminates training if this number is surpassed.
        * **plot_weights** - Bool to plot weights or not
        * **kwargs** - Keyword arguments for the rest of Flair's `Trainer.train()` hyperparameters
        """
        if isinstance(output_dir, str):
            output_dir = Path(output_dir)

        # Start the training
        self.trainer.train(
            output_dir,
            learning_rate=learning_rate,
            mini_batch_size=mini_batch_size,
            anneal_factor=anneal_factor,
            patience=patience,
            max_epochs=max_epochs,
            **kwargs,
        )

        # Plot weight traces
        if plot_weights:
            plotter = Plotter()
            plotter.plot_weights(output_dir / "weights.txt")
Example 12
def main():
    train_dev_corpus = NLPTaskDataFetcher.load_classification_corpus(
        Path(DATA_PATH),
        train_file='flair_train.csv',
        test_file='flair_test.csv',
        dev_file='flair_dev.csv')

    label_dict = train_dev_corpus.make_label_dictionary()

    word_embeddings = [
        WordEmbeddings('crawl'),
        FlairEmbeddings('news-forward-fast', chars_per_chunk=128),
        FlairEmbeddings('news-backward-fast', chars_per_chunk=128)
    ]

    document_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                                rnn_type='LSTM',
                                                hidden_size=128,
                                                reproject_words=True,
                                                reproject_words_dimension=64)

    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=False)

    trainer = ModelTrainer(classifier, train_dev_corpus)
    trainer.train(PRETRAINED_FLAIR,
                  max_epochs=40,
                  learning_rate=0.2,
                  mini_batch_size=32,
                  embeddings_in_memory=False,
                  checkpoint=True)

    plotter = Plotter()
    plotter.plot_training_curves(FLAIR_LOSS)
    plotter.plot_weights(FLAIR_WEIGHTS)
Example 13
def main(train_file):

    # 1. get the corpus
    # define columns
    columns = {0: 'text', 1: '', 2: '', 3: 'ner'}

    # this is the folder in which train, test and dev files reside
    data_folder = './eng_data_mini_onefile/'

    # retrieve corpus using column format, data folder and the names of the train, dev and test files
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        data_folder,
        columns,
        train_file=train_file,
        test_file='eng.testb',
        dev_file='eng.testa')

    print(corpus)

    # 2. what tag do we want to predict?
    tag_type = 'ner'

    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    # 4. initialize embeddings
    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),

        # comment in this line to use character embeddings
        # CharacterEmbeddings(),

        # comment in these lines to use flair embeddings
        # FlairEmbeddings('news-forward'),
        # FlairEmbeddings('news-backward'),
    ]

    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    # 5. initialize sequence tagger
    from flair.models import SequenceTagger

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=True)

    # 6. initialize trainer
    from flair.trainers import ModelTrainer

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # 7. start training
    trainer.train('resources/taggers/example-ner',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=150)

    # 8. plot training curves (optional)
    from flair.visual.training_curves import Plotter
    plotter = Plotter()
    plotter.plot_training_curves('resources/taggers/example-ner/loss.tsv')
    plotter.plot_weights('resources/taggers/example-ner/weights.txt')
Example 14
def visualize(path: str):
    plotter = Plotter()
    plotter.plot_training_curves(os.path.join(path, 'loss.tsv'))
    plotter.plot_weights(os.path.join(path, 'weights.txt'))
Example 15
def train_sequence_labeling_model(data_folder, proposed_tags_vocabulary_size,
                                  skf_split_no):
    """
    Trains the sequence labeling model (by default model uses one RNN layer).
    Model is trained to predict part of speech tag and takes into account information about:
    - text (plain text made of tokens that together form a sentence),
    - occurrence of separator before token,
    - proposed tags for given token.
    It is trained using Stacked Embeddings, which combine different embeddings together. Words are embedded
    using a concatenation of two vector embeddings:
    - Flair Embeddings - contextual string embeddings that capture latent syntactic-semantic
      information that goes beyond standard word embeddings. Key differences are: (1) they are trained without any
      explicit notion of words and thus fundamentally model words as sequences of characters, and (2) they are
      contextualized by their surrounding text, meaning that the same word will have different embeddings depending on
      its contextual use.
      A forward model (which reads the input plain text from left to right) and a backward model (which reads it from
      right to left) are used for part of speech (pos) tag training.
    - One Hot Embeddings - embeddings that encode each word in a vocabulary as a one-hot vector, followed by an
      embedding layer. These embeddings thus do not encode any prior knowledge, as most other embeddings do. They also
      differ in that they need to see a Corpus during instantiation, so they can build up a vocabulary consisting of
      the most common words seen in the corpus, plus an UNK token for all rare words.
      Two One Hot Embeddings are used in training:
      - the first embeds information about the occurrence of a separator before the token,
      - the second embeds information about the proposed tags, concatenated with a ';'.
    Model and training logs are saved in the resources/taggers/example-pos directory.
    In this method, the internal states of the forward and backward Flair models are taken at the end of each token
    and, supplemented by information about the occurrence of a separator before the token and the proposed tags for
    the given token, used to train the model for one of the stratified 10-fold cross-validation splits.

    :param data_folder: folder where files with column corpus split are stored. Those columns are used to initialize
    ColumnCorpus object
    :param proposed_tags_vocabulary_size: number of proposed tags
    :param skf_split_no: number that indicates one of stratified 10 fold cross validation splits (from range 1 to 10)
    used to train the model
    """
    # define columns
    columns = {0: 'text', 1: 'pos', 2: 'is_separator', 3: 'proposed_tags'}
    # init a corpus using column format, data folder and the names of the train and test files
    # 1. get the corpus
    corpus: Corpus = ColumnCorpus(data_folder,
                                  columns,
                                  train_file='train_' + str(skf_split_no),
                                  test_file='test_' + str(skf_split_no),
                                  dev_file=None)
    log.info(corpus)
    # 2. what tag do we want to predict
    tag_type = 'pos'
    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    log.info(tag_dictionary)
    # 4. initialize embeddings
    embedding_types: List[TokenEmbeddings] = [
        FlairEmbeddings('pl-forward', chars_per_chunk=64),
        FlairEmbeddings('pl-backward', chars_per_chunk=64),
        OneHotEmbeddings(corpus=corpus,
                         field='is_separator',
                         embedding_length=3,
                         min_freq=3),
        OneHotEmbeddings(corpus=corpus,
                         field='proposed_tags',
                         embedding_length=math.ceil(
                             (proposed_tags_vocabulary_size + 1)**0.25),
                         min_freq=3)
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)
    # 5. initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=False,
                                            rnn_layers=1)
    # 6. initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    # 7. start training
    trainer.train(
        use_scratch_dir_if_available('resources/taggers/example-pos/it-' +
                                     str(skf_split_no)),
        learning_rate=0.1,
        mini_batch_size=32,
        embeddings_storage_mode='gpu',
        max_epochs=sys.maxsize,
        monitor_test=True)
    # 8. plot weight traces (optional)
    plotter = Plotter()
    plotter.plot_weights(
        use_scratch_dir_if_available('resources/taggers/example-pos/it-' +
                                     str(skf_split_no) + '/weights.txt'))
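
As a worked example of the embedding_length heuristic used for the proposed-tags embedding above: with
proposed_tags_vocabulary_size = 100, math.ceil((100 + 1) ** 0.25) = math.ceil(3.17) = 4, so each
proposed-tags value is embedded in a 4-dimensional vector.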
Example 16
def train(model, selected_embeddings):
  # 1. get the corpus
  if model == 'AMT':
    corpus = read_in_AMT()
  elif model == 'CADEC':
    corpus = read_in_CADEC()
  elif model == 'TwitterADR':
    corpus = read_in_TwitterADR()
  elif model == 'Micromed':
    corpus = read_in_Micromed()
  else:
    raise ValueError('Unknown model: ' + model)
  print(corpus)

  # 2. what tag do we want to predict?
  tag_type = 'ner'

  # 3. make the tag dictionary from the corpus
  tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
  print(tag_dictionary.idx2item)

  embedding_types: List[TokenEmbeddings] = []

  if selected_embeddings['glove']:
    embedding_types.append(WordEmbeddings('glove'))

  if selected_embeddings['twitter']:
    embedding_types.append(WordEmbeddings('twitter'))

  if selected_embeddings['char']:
    embedding_types.append(CharacterEmbeddings())

  # Flair embeddings (forward)
  if selected_embeddings['flair']:
    embedding_types.append(FlairEmbeddings('news-forward'))

  # Flair embeddings (backward)
  if selected_embeddings['flair']:
    embedding_types.append(FlairEmbeddings('news-backward'))

  # PooledFlairEmbeddings
  if selected_embeddings['pooled-flair']:
    embedding_types.append(PooledFlairEmbeddings('news-forward', pooling='mean'))

  # PooledFlairEmbeddings
  if selected_embeddings['pooled-flair']:
    embedding_types.append(PooledFlairEmbeddings('news-backward', pooling='mean'))

  # init BERT
  if selected_embeddings['bert']:
    embedding_types.append(BertEmbeddings())

  # init roberta
  if selected_embeddings['roberta']:
    embedding_types.append(RoBERTaEmbeddings())

  # init BioBERT
  if selected_embeddings['biobert']:
    embedding_types.append(BertEmbeddings("data/embeddings/biobert-pubmed-pmc-cased"))

  # init clinical BERT
  if selected_embeddings['clinicalbiobert']:
    embedding_types.append(BertEmbeddings("data/embeddings/pretrained_bert_tf/biobert-base-clinical-cased"))


  # init multilingual ELMo
  if selected_embeddings['elmo']:
    embedding_types.append(ELMoEmbeddings())

  embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

  tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                          embeddings=embeddings,
                                          tag_dictionary=tag_dictionary,
                                          tag_type=tag_type,
                                          use_crf=True
                                          )

  trainer: ModelTrainer = ModelTrainer(tagger, corpus)

  selected_embeddings_text = '_'.join(
      key for key in selected_embeddings if selected_embeddings[key])

  model_dir = 'resources/taggers/FA_' + model + selected_embeddings_text

  # 7. start training
  trainer.train(model_dir,
                train_with_dev=True,
                learning_rate=0.1,
                mini_batch_size=4,
                max_epochs=200,
                checkpoint=True)

  # 8. plot training curves (optional)
  from flair.visual.training_curves import Plotter
  plotter = Plotter()
  plotter.plot_training_curves(model_dir + '/loss.tsv')
  plotter.plot_weights(model_dir + '/weights.txt')
Example 17
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)

# initialize trainer
from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

trainer.train(
    "resources/taggers/example-ner",
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=1,
    shuffle=False,
)

from flair.visual.training_curves import Plotter

plotter = Plotter()
plotter.plot_training_curves("resources/taggers/example-ner/loss.tsv")
plotter.plot_weights("resources/taggers/example-ner/weights.txt")
Example 18
    def Plot_Weights(self):
        if not os.path.exists(self.dest_path):
            print("Error: first train your models")
            sys.exit()
        plotter = Plotter()
        plotter.plot_weights(self.dest_path + "/weights.txt")
Example 19
    def train_all(self):
        extension = self.config.split('.')[-1]
        if extension == "yml":
            with open(self.config, "r") as config_file:
                datastore = yaml.safe_load(config_file)
        elif extension == "json":
            with open(self.config, "r") as config_file:
                datastore = json.load(config_file)
        else:
            print("Need a json or yaml file as config")
            sys.exit(0)

        columns = {
            int(datastore["dataset_reader"]["position_text"]): "text",
            int(datastore["dataset_reader"]["position_ner"]): "ner",
        }

        # focus_on = datastore["dataset_reader"]["focus_on"]

        if bool(datastore["dataset_reader"]["only_train"]):

            log.info("Reading data from {}".format(datastore["dataset_reader"]["data_folder"]))

            all_corpus = ColumnCorpusTrain(
                datastore["dataset_reader"]["data_folder"],
                columns,
                train_file=datastore["dataset_reader"]["train_name"],
            )

            tag_type = "ner"
            tag_dictionary = all_corpus[0].make_tag_dictionary(tag_type=tag_type)

        else:

            iobes_corpus = ColumnCorpus(
                datastore["dataset_reader"]["data_folder"],
                columns,
                train_file=datastore["dataset_reader"]["train_name"],
                dev_file=datastore["dataset_reader"]["dev_name"],
                test_file=datastore["dataset_reader"]["test_name"],
            )

            tag_type = "ner"
            tag_dictionary = iobes_corpus.make_tag_dictionary(tag_type=tag_type)

            try:
                train_ratio = float(datastore["dataset_reader"]["train_ratio"])
                iobes_corpus = Corpus(iobes_corpus.train[0:int(len(iobes_corpus.train) * train_ratio)],
                                      iobes_corpus.dev, iobes_corpus.test)
                log.info("Using only {}% of the train dataset".format(train_ratio * 100))
            except KeyError:
                pass

        embed_list = []
        word_char = []
        char_word = []
        for embed in datastore["embeddings"]["embeddings_list"]:

            if embed == "bpe":
                embed_list.append(BytePairEmbeddings(datastore["embeddings"]["lang"]))
            elif embed == "fasttext":
                embed_list.append(WordEmbeddings(datastore["embeddings"]["lang"]))
            elif embed == "flair" and datastore["embeddings"]["lang"] == "en":
                embed_list.append(FlairEmbeddings("news-forward"))
                embed_list.append(FlairEmbeddings("news-backward"))
            elif embed == "bert-base-uncased":
                if datastore["embeddings"]["lang"] == "en":
                    embed_list.append(BertEmbeddings("bert-base-uncased"))
            elif embed == "bert-base-cased":
                if datastore["embeddings"]["lang"] == "en":
                    embed_list.append(BertEmbeddings("bert-base-cased"))
            elif embed == "bert-large-uncased":
                if datastore["embeddings"]["lang"] == "en":
                    embed_list.append(BertEmbeddings("bert-large-uncased"))
            elif embed == "bert-large-cased":
                if datastore["embeddings"]["lang"] == "en":
                    embed_list.append(BertEmbeddings("bert-large-cased"))
            elif embed == "elmo-small":
                if datastore["embeddings"]["lang"] == "en":
                    embed_list.append(ELMoEmbeddings("small"))
            elif embed == "elmo-medium":
                if datastore["embeddings"]["lang"] == "en":
                    embed_list.append(ELMoEmbeddings("medium"))
            elif embed == "elmo-original":
                if datastore["embeddings"]["lang"] == "en":
                    embed_list.append(ELMoEmbeddings("original"))
            elif embed == "bert-base-chinese":
                if datastore["embeddings"]["lang"] == "zh":
                    embed_list.append(emb.BertEmbeddingsChinese("bert-base-chinese"))
            else:
                split_name = embed.split(".")
                ext = split_name[-1]
                kind = split_name[-2]

                if ext == "pt":  # Flair type

                    extra_index = 0
                    try:
                        extra_index = int(datastore["embeddings"]["extra_index"])
                    except KeyError:
                        pass

                    if kind == "char":
                        embed_list.append(emb.FlairEmbeddingsChar(embed, extra_index=extra_index))
                    elif kind == "char-seg":
                        embed_list.append(emb.FlairEmbeddingsWordLevelCharSeg(embed, extra_index=extra_index))

                if ext == "vec":  # Char type
                    if kind == "char-seg":
                        embed_list.append(emb.WordEmbeddingsVecCharSeg(embed))
                    elif kind == "char":
                        embed_list.append(emb.WordEmbeddingsVecFirst(embed))
                    elif kind == "word":
                        embed_list.append(emb.WordEmbeddingsVecWord(embed))
                    elif kind == "bichar":
                        embed_list.append(emb.WordEmbeddingsVecBichar(embed))
                if ext == "bin":
                    if kind == "word":
                        embed_list.append(emb.WordEmbeddingsBinWord(embed))
                    elif kind == "bichar":
                        embed_list.append(emb.WordEmbeddingsBinBichar(embed))

        try:
            if datastore["embeddings"]["ner_embed"]:
                print("Generate NER embeddings..")
                embed_list.append(
                    emb.nerEmbedding(
                        generateNerEmbFromTrain(
                            iobes_corpus.train, tag_dictionary.get_items()
                        )
                    )
                )
        except KeyError:
            pass
        try:
            if datastore["embeddings"]["one_hot"]:
                print("Generate one hot embeddings..")
                embed_list.append(emb.OneHotEmbeddings(iobes_corpus))
        except KeyError:
            pass
        try:
            if datastore["embeddings"]["embeddings_ngram_list"] is not None:
                embed_list.append(
                    emb.WordEmbeddingsVecNGramList(
                        datastore["embeddings"]["embeddings_ngram_list"]
                    )
                )
        except KeyError:
            pass

        if len(word_char) == 1 and len(char_word) == 1:
            embed_list.append(emb.WordEmbeddingsVecWordChar(word_char[0], char_word[0]))

        embedding_types: List[TokenEmbeddings] = embed_list

        embeddings: emb.StackedEmbeddingsNew = emb.StackedEmbeddingsNew(
            embeddings=embedding_types
        )

        if bool(datastore["dataset_reader"]["only_train"]):
            score = []
            for i in range(len(all_corpus)):

                tagger: SequenceTagger = SequenceTagger(
                    hidden_size=int(datastore["model"]["hidden_size"]),
                    embeddings=embeddings,
                    tag_dictionary=tag_dictionary,
                    tag_type=tag_type,
                    use_crf=bool(datastore["model"]["use_crf"]),
                    dropout=float(datastore["model"]["dropout"]),
                    word_dropout=float(datastore["model"]["word_dropout"]),
                    locked_dropout=float(datastore["model"]["locked_dropout"]),
                    rnn_layers=int(datastore["model"]["rnn_layers"]),
                )

                folder = datastore["train_config"]["folder"] + "/" + str(i)
                best = Path(folder + "/checkpoint.pt")
                iobes_corpus = all_corpus[i]
                if not best.exists():
                    best = Path(folder + "/best-model.pt")

                if best.exists():
                    trainer = ModelTrainer.load_checkpoint(
                        tagger.load_checkpoint(best), iobes_corpus
                    )
                else:
                    trainer: ModelTrainer = ModelTrainer(tagger, iobes_corpus)

                # 7. start training

                result = trainer.train(
                    folder,
                    learning_rate=float(datastore["train_config"]["learning_rate"]),
                    anneal_factor=float(datastore["train_config"]["anneal_factor"]),
                    min_learning_rate=float(datastore["train_config"]["min_learning_rate"]),
                    mini_batch_size=int(datastore["train_config"]["batch_size"]),
                    max_epochs=int(datastore["train_config"]["epoch"]),
                    save_final_model=bool(datastore["train_config"]["save_final_model"]),
                    checkpoint=bool(datastore["train_config"]["checkpoint"]),
                    param_selection_mode=bool(
                        datastore["train_config"]["param_selection_mode"]
                    ),
                    patience=int(datastore["train_config"]["patience"]),
                    monitor_test=bool(datastore["train_config"]["monitor_test"]),
                    embeddings_storage_mode=str(datastore["train_config"]["embeddings_storage_mode"]),
                    shuffle=bool(datastore["train_config"]["shuffle"]),
                )

                plotter = Plotter()
                if bool(datastore["train_config"]["save_plot_training_curve"]):
                    curve = folder + "/loss.tsv"
                    plotter.plot_training_curves(curve)
                if bool(datastore["train_config"]["save_plot_weights"]):
                    weight = folder + "/weights.txt"
                    plotter.plot_weights(weight)

                score.append(result["test_score"])

            print(score, "\nMean:", round(sum(score) / len(score), 2))

        else:

            tagger: SequenceTagger = SequenceTagger(
                hidden_size=int(datastore["model"]["hidden_size"]),
                embeddings=embeddings,
                tag_dictionary=tag_dictionary,
                tag_type=tag_type,
                use_crf=bool(datastore["model"]["use_crf"]),
                dropout=float(datastore["model"]["dropout"]),
                word_dropout=float(datastore["model"]["word_dropout"]),
                locked_dropout=float(datastore["model"]["locked_dropout"]),
                rnn_layers=int(datastore["model"]["rnn_layers"]),
            )

            folder = datastore["train_config"]["folder"]
            best = Path(folder + "/checkpoint.pt")
            if not best.exists():
                best = Path(folder + "/best-model.pt")

            if best.exists():
                trainer = ModelTrainer.load_checkpoint(
                    tagger.load_checkpoint(best), iobes_corpus
                )
            else:
                trainer: ModelTrainer = ModelTrainer(tagger, iobes_corpus)

            # 7. start training

            trainer.train(
                folder,
                learning_rate=float(datastore["train_config"]["learning_rate"]),
                anneal_factor=float(datastore["train_config"]["anneal_factor"]),
                min_learning_rate=float(datastore["train_config"]["min_learning_rate"]),
                mini_batch_size=int(datastore["train_config"]["batch_size"]),
                max_epochs=int(datastore["train_config"]["epoch"]),
                save_final_model=bool(datastore["train_config"]["save_final_model"]),
                checkpoint=bool(datastore["train_config"]["checkpoint"]),
                param_selection_mode=bool(
                    datastore["train_config"]["param_selection_mode"]
                ),
                patience=int(datastore["train_config"]["patience"]),
                monitor_test=bool(datastore["train_config"]["monitor_test"]),
                embeddings_storage_mode=str(datastore["train_config"]["embeddings_storage_mode"]),
                shuffle=bool(datastore["train_config"]["shuffle"]),
            )

            plotter = Plotter()
            if bool(datastore["train_config"]["save_plot_training_curve"]):
                curve = folder + "/loss.tsv"
                plotter.plot_training_curves(curve)
            if bool(datastore["train_config"]["save_plot_weights"]):
                weight = folder + "/weights.txt"
                plotter.plot_weights(weight)
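
For orientation, here is a minimal config sketch assembled from the keys train_all() reads. Every value below is a hypothetical placeholder, not a documented default:

datastore = {
    "dataset_reader": {
        "position_text": 0,
        "position_ner": 1,
        "data_folder": "data/conll",
        "train_name": "train.txt",
        "dev_name": "dev.txt",
        "test_name": "test.txt",
        "only_train": False,
        "train_ratio": 1.0,  # optional: truncate the train split
    },
    "embeddings": {
        "embeddings_list": ["fasttext", "flair"],
        "lang": "en",
        # optional keys: "extra_index", "ner_embed", "one_hot", "embeddings_ngram_list"
    },
    "model": {
        "hidden_size": 256,
        "use_crf": True,
        "dropout": 0.0,
        "word_dropout": 0.05,
        "locked_dropout": 0.5,
        "rnn_layers": 1,
    },
    "train_config": {
        "folder": "resources/taggers/exp",
        "learning_rate": 0.1,
        "anneal_factor": 0.5,
        "min_learning_rate": 0.0001,
        "batch_size": 32,
        "epoch": 150,
        "save_final_model": True,
        "checkpoint": True,
        "param_selection_mode": False,
        "patience": 3,
        "monitor_test": False,
        "embeddings_storage_mode": "cpu",
        "shuffle": True,
        "save_plot_training_curve": True,
        "save_plot_weights": True,
    },
}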
Example 20
def run_experiments(input_dir: Path, output_dir: Path):
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # retrieve corpus using column format, data folder and the names of the train, dev and test files
    corpus = ColumnCorpus(
        input_dir,
        {0: 'text', 1: 'dep', 2: 'aspect'},
        train_file='Laptops_poria-train.conll',
        # train_file='Restaurants_poria-train.conll',
        test_file='Laptops_poria-test.conll',
        # test_file='Restaurants_poria-test.conll',
        dev_file='Laptops_poria-train.conll'
        # dev_file='Restaurants_poria-train.conll'
    )

    # 2. what tag do we want to predict?
    tag_type = 'aspect'

    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    all_embedding_to_test = {
        # 'glove+aspects': [
        #     WordEmbeddings('glove'),
        #     WordEmbeddings(
        #         (output_dir / 'aspect_2_aspect_graph-en_core_web_lg.en_core_web_lg.word2vec_format.bin').as_posix()
        #     ),
        # ],
        # 'glove': [
        #     WordEmbeddings('glove'),
        # ],
        # 'charlmembedding': [
        #     FlairEmbeddings('news-forward'),
        #     FlairEmbeddings('news-backward'),
        # ],
        # 'glove-simple-char': [
        #     WordEmbeddings('glove'),
        #     CharacterEmbeddings(),
        # ],
        'bert+aspects': [
            BertEmbeddings('bert-large-cased'),
            WordEmbeddings(
                (output_dir / 'aspect_2_aspect_graph-en_core_web_lg.en_core_web_lg.word2vec_format.bin').as_posix()
            )
        ],
        'bert': [
            BertEmbeddings('bert-large-cased'),
        ],
        # 'elmo': [
        #     ELMoEmbeddings('original')
        # ]
    }

    for name, embeddings_to_stack in tqdm(
            all_embedding_to_test.items(),
            desc='Different embeddings stacked',
            total=len(all_embedding_to_test)
    ):
        results_folder = Path(DEFAULT_OUTPUT_PATH / f'sequence-tagging/aspects/laptops-{name}')
        embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embeddings_to_stack)

        # 5. initialize sequence tagger
        tagger: SequenceTagger = SequenceTagger(
            hidden_size=256,
            embeddings=embeddings,
            tag_dictionary=tag_dictionary,
            tag_type=tag_type,
            use_crf=True
        )

        trainer: ModelTrainer = ModelTrainer(tagger, corpus)

        # 7. start training
        trainer.train(
            results_folder.as_posix(),
            learning_rate=0.1,
            mini_batch_size=32,
            max_epochs=150
        )

        # 8. plot training curves (optional)
        plotter = Plotter()
        plotter.plot_training_curves(results_folder / 'loss.tsv')
        plotter.plot_weights(results_folder / 'weights.txt')
Example 21
def main():

    datasets = os.listdir("./datasets")
    print(datasets)
    language = "fr"

    nb_cells = 32
    dataset = "DESFOSSE_ARRAY"

    # 1. get the corpus
    columns = {0: 'text', 1: 'position', 2: "array", 3: "line", 4: "col"}

    # this is the folder in which train, test and dev files reside
    data_folder = './datasets/' + dataset

    # init a corpus using column format, data folder and the names of the train, dev and test files
    corpus: Corpus = ColumnCorpus(data_folder,
                                  columns,
                                  train_file="train_" + dataset + '.txt',
                                  test_file="test_" + dataset + '.txt',
                                  dev_file="valid_" + dataset + '.txt')

    print(corpus)

    # 2. what tag do we want to predict?
    tag_type = "col"

    exp_name = dataset + "_" + tag_type

    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    # initialize embeddings
    embedding_types: List[TokenEmbeddings] = []
    embedding_types.append(FlairEmbeddings(language + '-forward'))
    embedding_types.append(FlairEmbeddings(language + '-backward'))
    embedding_types.append(FloatsEmbeddings(field='position', length=4))

    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    # initialize sequence tagger
    from flair.models import SequenceTagger

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=nb_cells,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        use_crf=True,
    )

    # initialize trainer
    from flair.trainers import ModelTrainer

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train(
        "resources/taggers/" + exp_name,
        learning_rate=0.1,
        embeddings_storage_mode="cpu",
        mini_batch_size=32,
        max_epochs=150,
        shuffle=False,
    )

    plotter = Plotter()
    plotter.plot_training_curves("resources/taggers/" + exp_name + "/loss.tsv")
    plotter.plot_weights("resources/taggers/" + exp_name + "/weights.txt")

    # predict_tagger(setId, nb_cells, rubric, rubric)  # setId and rubric are not defined in this snippet
Example 22
def test_plotting_training_curves_and_weights(resources_path):
    plotter = Plotter()
    plotter.plot_training_curves(resources_path / 'visual/loss.tsv')
    plotter.plot_weights(resources_path / 'visual/weights.txt')
    (resources_path / 'visual/weights.png').unlink()
    (resources_path / 'visual/training.png').unlink()
Example 23
def trainer(file_path: Path, filenames: Tuple[str, str, str], checkpoint: str,
            stack: str, n_epochs: int) -> None:
    """Train sentiment model using Flair NLP library:
    https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md

    To provide added context, we can stack GloVe, BERT, or ELMo embeddings along with Flair embeddings.
    """
    # pip install flair allennlp
    from flair.datasets import ClassificationCorpus
    from flair.embeddings import FlairEmbeddings, DocumentRNNEmbeddings, DocumentPoolEmbeddings
    from flair.models import TextClassifier
    from flair.trainers import ModelTrainer
    from flair.training_utils import EvaluationMetric
    from flair.visual.training_curves import Plotter

    if stack == "glove":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('glove')
    elif stack == "fasttext":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('it')
    elif stack == "elmo":
        from flair.embeddings import ELMoEmbeddings
        stacked_embedding = ELMoEmbeddings('original')
    elif stack == "bert":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-uncased')
    elif stack == "bert-multi":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-multilingual-uncased')
    elif stack == 'bpe':
        from flair.embeddings import BytePairEmbeddings
        stacked_embedding = BytePairEmbeddings('it')
    else:
        stacked_embedding = None

    # Define and Load corpus from the provided dataset
    train, dev, test = filenames
    corpus = ClassificationCorpus(
        file_path,
        train_file=train,
        dev_file=dev,
        test_file=test,
    )
    # Create label dictionary from provided labels in data
    label_dict = corpus.make_label_dictionary()

    # Stack Flair string-embeddings with optional embeddings
    word_embeddings = list(
        filter(None, [
            stacked_embedding,
            FlairEmbeddings('it-forward'),
            FlairEmbeddings('it-backward'),
        ]))
    # Initialize document embedding by passing list of word embeddings
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=256,
        reproject_words=True,
        dropout=0.5,
        reproject_words_dimension=256,
    )

    #document_embeddings = DocumentPoolEmbeddings([
    #    stacked_embedding,
    #    FlairEmbeddings('it-forward'),
    #    FlairEmbeddings('it-backward')],pooling='mean')

    # Define classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=True)

    if not checkpoint:
        trainer = ModelTrainer(classifier, corpus)
    else:
        # If checkpoint file is defined, resume training
        #checkpoint = classifier.load_checkpoint(Path(checkpoint))
        trainer = ModelTrainer.load_checkpoint(checkpoint, corpus)

    # Begin training (enable checkpointing to continue training at a later time, if desired)
    trainer.train(
        file_path,
        max_epochs=n_epochs,
        checkpoint=True,
    )

    # Plot curves and store weights and losses
    plotter = Plotter()
    plotter.plot_training_curves(file_path / 'loss.tsv')
    plotter.plot_weights(file_path / 'weights.txt')
Example 24
def train_sequence_labeling_model(data_folder, proposed_tags_vocabulary_size,
                                  skf_split_no):
    """
    Trains the sequence labeling model (by default model uses one RNN layer).
    Model is trained to predict part of speech tag and takes into account information about:
    - text (plain text made of tokens that together form a sentence),
    - occurrence of separator before token,
    - proposed tags for given token.
    It is trained using Stacked Embeddings, which combine different embeddings together. Words are embedded
    using a concatenation of three vector embeddings:
    - WordEmbeddings - classic word embeddings. These embeddings are static and word-level, meaning that each
      distinct word gets exactly one pre-computed embedding. Here, FastText embeddings trained over the Polish
      Wikipedia are used.
    - CharacterEmbeddings - allow adding character-level word embeddings during model training. These embeddings are
      randomly initialized when the class is initialized, so they are not meaningful unless they are trained on
      a specific downstream task. For instance, the standard sequence labeling architecture used by Lample et al. (2016)
      is a combination of classic word embeddings with task-trained character features. Normally this would require
      implementing a hierarchical embedding architecture in which character-level embeddings for each word are computed
      using an RNN and then concatenated with word embeddings. In Flair, this is simplified by treating
      CharacterEmbeddings just like any other embedding class. To reproduce the Lample architecture, one only needs
      to combine them with standard WordEmbeddings in an embedding stack.
    - One Hot Embeddings - embeddings that encode each word in a vocabulary as a one-hot vector, followed by an
      embedding layer. These embeddings thus do not encode any prior knowledge, as most other embeddings do. They also
      differ in that they need to see a Corpus during instantiation, so they can build up a vocabulary consisting of
      the most common words seen in the corpus, plus an UNK token for all rare words.
      Two One Hot Embeddings are used in training: one embeds information about the occurrence of a separator before
      each token, and one embeds the proposed tags (concatenated with a ';').
    Model training is based on the stratified 10-fold cross-validation split indicated by the skf_split_no argument.
    Model and training logs are saved in the resources_ex_3/taggers/example-pos/it-<skf_split_no> directory (where
    <skf_split_no> is the number of the stratified 10-fold cross-validation split used to train the model).

    :param data_folder: folder where files with column corpus split are stored. Those columns are used to initialize
    ColumnCorpus object
    :param proposed_tags_vocabulary_size: number of proposed tags
    :param skf_split_no: number that indicates one of stratified 10 fold cross validation splits (from range 1 to 10)
    used to train the model
    """
    # define columns
    columns = {0: 'text', 1: 'pos', 2: 'is_separator', 3: 'proposed_tags'}
    # init a corpus using column format, data folder and the names of the train and test files
    # 1. get the corpus
    corpus: Corpus = ColumnCorpus(data_folder,
                                  columns,
                                  train_file='train_' + str(skf_split_no),
                                  test_file='test_' + str(skf_split_no),
                                  dev_file=None)
    log.info(corpus)
    # 2. what tag do we want to predict
    tag_type = 'pos'
    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    log.info(tag_dictionary)
    # 4. initialize embeddings
    local_model_path = use_scratch_dir_if_available(
        'resources/polish_FastText_embeddings')
    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings(local_model_path)
        if os.path.exists(local_model_path) else WordEmbeddings('pl'),
        CharacterEmbeddings(
            use_scratch_dir_if_available('resources/polish_letters_dict')),
        OneHotEmbeddings(corpus=corpus,
                         field='is_separator',
                         embedding_length=3,
                         min_freq=3),
        OneHotEmbeddings(corpus=corpus,
                         field='proposed_tags',
                         embedding_length=math.ceil(
                             (proposed_tags_vocabulary_size + 1)**0.25),
                         min_freq=3)
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)
    # 5. initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=False,
                                            rnn_layers=1)
    # 6. initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    # 7. start training
    trainer.train(
        use_scratch_dir_if_available('resources_ex_3/taggers/example-pos/it-' +
                                     str(skf_split_no)),
        learning_rate=0.1,
        mini_batch_size=32,
        embeddings_storage_mode='gpu',
        max_epochs=sys.maxsize,
        monitor_test=True)
    # 8. plot weight traces (optional)
    plotter = Plotter()
    plotter.plot_weights(
        use_scratch_dir_if_available('resources_ex_3/taggers/example-pos/it-' +
                                     str(skf_split_no) + '/weights.txt'))
Example 25
def train_tagger(options):
    # Define columns
    columns = {1: 'text', 2: 'pos', 3: 'ner'}

    # What tag should be predicted?
    tag_type = 'ner'

    # Folder in which train, test and dev files reside
    data_folder = options.iob_dir + '/' + options.correction_mode

    # Folder in which to save tagging model and additional information
    tagger_folder = '/'.join([
        options.tagger_dir, options.ner_cycle, options.lm_domain,
        options.correction_mode
    ]) + '-stringemb'

    # Retrieve corpus using column format, data folder and the names of the train, dev and test files
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        data_folder,
        columns,
        train_file='train.txt',
        test_file='test.txt',
        dev_file='dev.txt')

    # Make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # Initialize embeddings
    char_embeddings = [
        FlairEmbeddings(options.lm_dir + options.lm_domain + '-fw/best-lm.pt',
                        use_cache=False),
        FlairEmbeddings(options.lm_dir + options.lm_domain + '-bw/best-lm.pt',
                        use_cache=False)
    ]

    if not options.use_wiki_wordemb:
        if not options.use_press_wordemb:
            embedding_types: List[TokenEmbeddings] = char_embeddings
        else:
            embedding_types: List[TokenEmbeddings] = [
                WordEmbeddings(
                    'resources.d/embeddings/fasttext/pressfr-wikifr')
            ] + char_embeddings
            tagger_folder = tagger_folder + '-wordemb-pr'
    else:
        embedding_types: List[TokenEmbeddings] = [WordEmbeddings('fr')
                                                  ] + char_embeddings
        tagger_folder = tagger_folder + '-wordemb'

    if options.use_crf:
        tagger_folder = tagger_folder + '-crf'

    # Print information
    print(tagger_folder)
    print(corpus)
    print(tag_dictionary.idx2item)

    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    # Initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=options.use_crf)

    # Initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # Start training
    trainer.train(
        tagger_folder,
        learning_rate=0.1,
        mini_batch_size=32,
        max_epochs=50,
        patience=options.train_patience,
        #train_with_dev=True,
        anneal_against_train_loss=False,
        embeddings_in_memory=False)

    # Plot training curves (optional)
    plotter = Plotter()
    plotter.plot_training_curves(tagger_folder + '/loss.tsv')
    plotter.plot_weights(tagger_folder + '/weights.txt')

def create_train_plot(model_save_path):
    """ Plot training logs """

    plotter = Plotter()
    plotter.plot_weights(os.path.join(model_save_path, 'weights.txt'))
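
All of the snippets above follow the same pattern: ModelTrainer.train() writes loss.tsv (and, depending on the Flair version and settings, weights.txt) into its output folder, and Plotter reads those files back. A minimal self-contained sketch, assuming a completed training run in model_dir (the folder name is a placeholder):

from pathlib import Path
from flair.visual.training_curves import Plotter

model_dir = Path('resources/taggers/example-ner')  # hypothetical output folder of a ModelTrainer.train() run
plotter = Plotter()
if (model_dir / 'loss.tsv').exists():
    plotter.plot_training_curves(model_dir / 'loss.tsv')  # writes training.png next to loss.tsv
if (model_dir / 'weights.txt').exists():
    plotter.plot_weights(model_dir / 'weights.txt')  # writes weights.png next to weights.txt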