Example #1
0
def score_flair_tagger(
        splits,
        data:Union[List[Sentence],Dataset],

):
    """Train a flair SequenceTagger on index-based splits of `data` and score it.

    :param splits: dict with keys 'train'/'dev'/'test', each a list of integer
        indices into `data`
    :param data: indexable collection of flair Sentences
    :return: dict with seqtag F1 scores for the 'train' and 'test' splits
    """
    # Alias the imported module so it is not shadowed by the ModelTrainer
    # instance created below.
    from flair.trainers import ModelTrainer, trainer as trainer_module

    # Silence flair's per-epoch INFO logging.
    logger = trainer_module.log
    logger.setLevel(logging.WARNING)

    data_splits = {split_name: [data[i] for i in split]
                   for split_name, split in splits.items()}
    train_sentences = data_splits['train']
    dev_sentences = data_splits['dev']
    test_sentences = data_splits['test']

    corpus = Corpus(
        train=train_sentences,
        dev=dev_sentences,
        test=test_sentences, name='scierc')
    tag_dictionary = corpus.make_tag_dictionary(tag_type=TAG_TYPE)

    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)
    tagger: SequenceTagger = SequenceTagger(hidden_size=64,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=TAG_TYPE,
                                            locked_dropout=0.01,
                                            dropout=0.01,
                                            use_crf=True)
    model_trainer: ModelTrainer = ModelTrainer(tagger, corpus, optimizer=torch.optim.RMSprop)
    # Use the worker process *name* in the path: str(current_process()) is a
    # repr like "<ForkProcess(ForkPoolWorker-1, started)>" containing
    # characters that are unsafe in file paths.
    save_path = 'flair_sequence_tagging/scierc-ner-%s' % multiprocessing.current_process().name
    model_trainer.train(save_path, EvaluationMetric.MICRO_F1_SCORE,
                        learning_rate=0.01,
                        mini_batch_size=32,
                        max_epochs=19,
                        patience=3,
                        save_final_model=False
                        )

    def flair_tagger_predict_bio(sentences: List[Sentence]):
        # Gold BILOU tags per sentence, converted to BIO for scoring.
        train_data = [[(token.text, token.tags[tagger.tag_type].value) for token in datum]
                      for datum in sentences]
        targets = [bilou2bio([tag for token, tag in datum]) for datum in train_data]

        # NOTE(review): relies on tagger.predict() returning the sentences;
        # newer flair versions predict in place and return None — confirm the
        # pinned flair version.
        pred_sentences = tagger.predict(sentences)
        pred_data = [bilou2bio([token.tags[tagger.tag_type].value for token in datum])
                     for datum in pred_sentences]

        return pred_data, targets

    return {
        'train': calc_seqtag_f1_scores(flair_tagger_predict_bio, corpus.train),
        'test': calc_seqtag_f1_scores(flair_tagger_predict_bio, corpus.test)
    }
Example #2
0
    def fit(self, X, y):
        """ Build feature vectors and train a FLAIR sequence tagger.

            Parameters
            ----------
            X : list(list(str))
                list of sentences, each tokenized into a list of words.
            y : list(list(str))
                list of corresponding BIO tag sequences.

            Returns
            -------
            self
        """
        log.info("Creating FLAIR corpus...")
        # Hold out 10% of the data as a validation split; the test split of
        # the corpus is left empty.
        Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=0.1)
        corpus_train = Corpus(
            self._convert_to_flair(Xtrain, ytrain),
            self._convert_to_flair(Xval, yval),
            [],
            name="train-corpus")

        tag_dict = corpus_train.make_tag_dictionary(tag_type="ner")

        if self.embeddings is None:
            # Default feature set: GloVe word vectors plus character embeddings.
            self.embeddings = StackedEmbeddings(
                embeddings=[WordEmbeddings("glove"), CharacterEmbeddings()])

        log.info("Building FLAIR NER...")
        self.model_ = SequenceTagger(
            hidden_size=self.hidden_dim,
            embeddings=self.embeddings,
            tag_dictionary=tag_dict,
            tag_type="ner",
            use_crf=self.use_crf,
            use_rnn=self.use_rnn,
            rnn_layers=self.num_rnn_layers,
            dropout=self.dropout,
            word_dropout=self.word_dropout,
            locked_dropout=self.locked_dropout)

        log.info("Training FLAIR NER...")
        if self.optimizer == "sgd":
            opt = torch.optim.SGD
        else:
            opt = torch.optim.Adam
        trainer = ModelTrainer(self.model_, corpus_train, opt)
        trainer.train(
            base_path=self.basedir,
            learning_rate=self.learning_rate,
            mini_batch_size=self.batch_size,
            max_epochs=self.max_iter)

        return self
Example #3
0
def train_seqtagger(train_data:Dataset,
                    dev_data:Dataset,
                    test_data:Dataset
                    ):
    """Train a flair SequenceTagger on the scierc splits and print F1 scores.

    :param train_data: training sentences
    :param dev_data: development sentences
    :param test_data: test sentences
    :return: the trained SequenceTagger
    """
    corpus = Corpus(
        train=train_data,
        dev=dev_data,
        test=test_data,
        name='scierc')

    # Tag-frequency overview of the training split.
    tag_counts = Counter(tok.tags[TAG_TYPE].value
                         for sent in corpus.train
                         for tok in sent)
    pprint(tag_counts)

    tag_dictionary = corpus.make_tag_dictionary(tag_type=TAG_TYPE)
    print(tag_dictionary.idx2item)

    # Single GloVe embedding wrapped in a stack.
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=[WordEmbeddings('glove')])

    from flair.models import SequenceTagger

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=TAG_TYPE,
        locked_dropout=0.01,
        dropout=0.01,
        use_crf=True)

    from flair.trainers import ModelTrainer

    trainer: ModelTrainer = ModelTrainer(tagger, corpus, optimizer=torch.optim.Adam)

    save_path = 'sequence_tagging/resources/taggers/scierc-ner'
    trainer.train(save_path, EvaluationMetric.MICRO_F1_SCORE,
                  learning_rate=0.01,
                  mini_batch_size=32,
                  max_epochs=20)

    # plotter = Plotter()
    # plotter.plot_training_curves('%s/loss.tsv' % save_path)
    # plotter.plot_weights('%s/weights.txt' % save_path)

    from sequence_tagging.evaluate_flair_tagger import evaluate_sequence_tagger
    pprint('train-f1-macro: %0.2f' % evaluate_sequence_tagger(tagger, corpus.train)['f1-macro'])
    pprint('dev-f1-macro: %0.2f' % evaluate_sequence_tagger(tagger, corpus.dev)['f1-macro'])
    pprint('test-f1-macro: %0.2f' % evaluate_sequence_tagger(tagger, corpus.test)['f1-macro'])
    return tagger
    def _train(self, corpus: Corpus, params: dict, base_path: Path,
               max_epochs: int, optimization_value: str):
        """
        trains a sequence tagger model
        :param corpus: the Corpus to train and evaluate on
        :param params: dict containing the parameters
        :param base_path: directory where training artefacts are written
        :param max_epochs: maximum number of training epochs
        :param optimization_value: "score" to return the test score, anything
            else to return the final dev-loss value
        :return: dict containing result and configuration
        """
        tag_dictionary = corpus.make_tag_dictionary(self.tag_type)

        tagger = self._set_up_model(params=params,
                                    tag_dictionary=tag_dictionary)

        # Partition `params` into trainer.train(...) kwargs and
        # ModelTrainer(...) kwargs; only the keys matter, so iterate the dict
        # directly instead of .items().
        training_params = {
            key: params[key] for key in params if key in TRAINING_PARAMETERS
        }
        model_trainer_parameters = {
            key: params[key] for key in params
            if key in MODEL_TRAINER_PARAMETERS and key != 'model'
        }

        trainer: ModelTrainer = ModelTrainer(tagger, corpus,
                                             **model_trainer_parameters)

        results = trainer.train(base_path, max_epochs=max_epochs,
                                **training_params)

        if optimization_value == "score":
            result = results['test_score']
        else:
            # Lower is better: last recorded dev loss.
            result = results['dev_loss_history'][-1]

        return {'result': result, 'params': params}
Example #5
0
# Hyperparameter search space: the embedding choice is the only dimension
# actually being searched; the candidates (bert, elmo, flair_normal,
# flair_pooled) are presumably defined earlier in the file — TODO confirm.
search_space.add(Parameter.EMBEDDINGS,
                 hp.choice,
                 options=[bert, elmo, flair_normal, flair_pooled])

# Other hyperparams are kept fixed for this exercise.
# Add to the lists to add to grid.
# Unfortunately for small grids, Flair picks random search instead of true
# grid search.

search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[384])
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1])
search_space.add(Parameter.DROPOUT, hp.choice, options=[0.0])
search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[.1])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[16])
search_space.add(Parameter.USE_CRF, hp.choice, options=[True])

# Build the corpus (train/test/dev) and the tag dictionary for NER.
corpus = Corpus(train_data, test_data, dev_data)
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# Selector optimizes the dev score over a single training run per setting.
param_selector = SequenceTaggerParamSelector(
    corpus,
    tag_type='ner',
    base_path='tuning/results',
    max_epochs=55,
    training_runs=1,
    optimization_value=OptimizationValue.DEV_SCORE)

#start the search
param_selector.optimize(search_space)
Example #6
0
def build_tag_dict(sequences: List[TaggedSequence], tag_type):
    """Build a flair tag dictionary for `tag_type` from tagged sequences.

    The sequences are converted to flair Sentences and wrapped in a
    train-only Corpus, whose tag dictionary is returned.
    """
    flair_sentences = build_flair_sentences_from_sequences(sequences)
    temp_corpus = Corpus(train=flair_sentences, dev=[], test=[])
    return temp_corpus.make_tag_dictionary(tag_type)
Example #7
0
# Load each TSV corpus path as a dataset and collect them; `corpora_paths`,
# `columns` and `cs` are defined earlier in the file.
for c in corpora_paths:
    train = tsv.TSVDataset(
        Path(c),
        columns,
        tag_to_bioes=None,       # keep tags as-is, no BIOES conversion
        comment_symbol=None,
        in_memory=True,
        encoding="utf-8",
        document_separator_token=None
    )
    cs.append(train)
    print(train)
    
# Concatenate all loaded datasets into one training set; dev and test are
# left empty so the tag dictionary is built from the training data only.
cd = ConcatDataset(cs)
cc=Corpus(cd, flair.datasets.SentenceDataset([]), flair.datasets.SentenceDataset([]))
tag_dictionary = cc.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)
# sys.exit()


# Separate corpus built from the CLI-provided data folder / train file.
corpus: Corpus = tsv.TSVCorpus(args.data_folder, columns,
                               train_file=args.train)



# TODO: downsample - test 50% for training
# corpus.downsample(0.5, only_downsample_train=True)
corpus = corpus.downsample(args.downsample, only_downsample_train=args.downsample_train)
print(corpus)