import logging
import multiprocessing
from typing import List, Union

import torch
from flair.data import Corpus, Sentence
from flair.embeddings import StackedEmbeddings, TokenEmbeddings, WordEmbeddings
from flair.models import SequenceTagger
from flair.training_utils import EvaluationMetric
from torch.utils.data import Dataset

# TAG_TYPE, bilou2bio and calc_seqtag_f1_scores are defined elsewhere in this module.


def score_flair_tagger(
        splits,
        data: Union[List[Sentence], Dataset],
):
    from flair.trainers import ModelTrainer, trainer

    # Silence flair's per-epoch logging; only warnings and errors get through.
    logger = trainer.log
    logger.setLevel(logging.WARNING)

    # Materialize the index-based splits into lists of flair Sentences.
    data_splits = {split_name: [data[i] for i in split]
                   for split_name, split in splits.items()}
    train_sentences, dev_sentences, test_sentences = (
        data_splits['train'], data_splits['dev'], data_splits['test'])

    corpus = Corpus(train=train_sentences, dev=dev_sentences,
                    test=test_sentences, name='scierc')
    tag_dictionary = corpus.make_tag_dictionary(tag_type=TAG_TYPE)

    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    tagger: SequenceTagger = SequenceTagger(hidden_size=64,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=TAG_TYPE,
                                            locked_dropout=0.01,
                                            dropout=0.01,
                                            use_crf=True)
    trainer: ModelTrainer = ModelTrainer(tagger, corpus, optimizer=torch.optim.RMSprop)
    # print(tagger)
    # pprint([p_name for p_name, p in tagger.named_parameters()])

    # Use the worker's name (not the Process repr) so the path stays filesystem-safe.
    save_path = 'flair_sequence_tagging/scierc-ner-%s' % multiprocessing.current_process().name
    trainer.train(save_path,
                  EvaluationMetric.MICRO_F1_SCORE,
                  learning_rate=0.01,
                  mini_batch_size=32,
                  max_epochs=19,
                  patience=3,
                  save_final_model=False)
    # plotter = Plotter()
    # plotter.plot_training_curves('%s/loss.tsv' % save_path)
    # plotter.plot_weights('%s/weights.txt' % save_path)

    def flair_tagger_predict_bio(sentences: List[Sentence]):
        # Read the gold tags before predict() overwrites them in place.
        gold_data = [[(token.text, token.tags[tagger.tag_type].value)
                      for token in datum] for datum in sentences]
        targets = [bilou2bio([tag for token, tag in datum]) for datum in gold_data]
        pred_sentences = tagger.predict(sentences)
        pred_data = [bilou2bio([token.tags[tagger.tag_type].value for token in datum])
                     for datum in pred_sentences]
        return pred_data, targets

    return {
        'train': calc_seqtag_f1_scores(flair_tagger_predict_bio, corpus.train),
        'test': calc_seqtag_f1_scores(flair_tagger_predict_bio, corpus.test),
    }
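# Hypothetical call of score_flair_tagger, illustrating the expected shape of
# `splits`: a dict mapping split names to index lists into `data`. The loader
# name and split sizes below are assumptions for illustration only.
sentences = load_scierc_sentences()  # hypothetical: returns List[Sentence]
splits = {
    'train': list(range(0, 300)),
    'dev': list(range(300, 350)),
    'test': list(range(350, 400)),
}
scores = score_flair_tagger(splits, sentences)
print(scores['test'])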
def fit(self, X, y):
    """Build feature vectors and train a FLAIR model.

    Parameters
    ----------
    X : list(list(str))
        list of sentences, each tokenized into a list of words.
    y : list(list(str))
        list of lists of BIO tags, aligned with X.

    Returns
    -------
    self
    """
    log.info("Creating FLAIR corpus...")
    # Hold out 10% of the sentences as a dev split for flair's epoch-level evaluation.
    Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=0.1)
    sents_train = self._convert_to_flair(Xtrain, ytrain)
    sents_val = self._convert_to_flair(Xval, yval)
    corpus_train = Corpus(sents_train, sents_val, [], name="train-corpus")
    tag_dict = corpus_train.make_tag_dictionary(tag_type="ner")

    # Default to GloVe word embeddings plus character embeddings if the caller
    # did not supply an embedding stack.
    if self.embeddings is None:
        embedding_types = [
            WordEmbeddings("glove"),
            CharacterEmbeddings()
        ]
        self.embeddings = StackedEmbeddings(embeddings=embedding_types)

    log.info("Building FLAIR NER...")
    self.model_ = SequenceTagger(hidden_size=self.hidden_dim,
                                 embeddings=self.embeddings,
                                 tag_dictionary=tag_dict,
                                 tag_type="ner",
                                 use_crf=self.use_crf,
                                 use_rnn=self.use_rnn,
                                 rnn_layers=self.num_rnn_layers,
                                 dropout=self.dropout,
                                 word_dropout=self.word_dropout,
                                 locked_dropout=self.locked_dropout)

    log.info("Training FLAIR NER...")
    opt = torch.optim.SGD if self.optimizer == "sgd" else torch.optim.Adam
    trainer = ModelTrainer(self.model_, corpus_train, opt)
    trainer.train(base_path=self.basedir,
                  learning_rate=self.learning_rate,
                  mini_batch_size=self.batch_size,
                  max_epochs=self.max_iter)

    return self
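# Hypothetical usage of the scikit-learn-style wrapper that owns fit() above.
# The class name FlairNER and its constructor arguments are assumptions made
# for illustration; they are not confirmed by this snippet.
X = [["John", "lives", "in", "Berlin", "."],
     ["Acme", "Corp", "hired", "Mary", "."]]
y = [["B-PER", "O", "O", "B-LOC", "O"],
     ["B-ORG", "I-ORG", "O", "B-PER", "O"]]

ner = FlairNER(basedir="models/flair-ner")  # hypothetical constructor
ner.fit(X, y)  # holds out 10% as a dev split, builds the corpus, trains the tagger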
def train_seqtagger(train_data: Dataset,
                    dev_data: Dataset,
                    test_data: Dataset):
    corpus = Corpus(train=train_data, dev=dev_data, test=test_data, name='scierc')
    pprint(Counter([tok.tags[TAG_TYPE].value for sent in corpus.train for tok in sent]))

    tag_dictionary = corpus.make_tag_dictionary(tag_type=TAG_TYPE)
    print(tag_dictionary.idx2item)

    embedding_types: List[TokenEmbeddings] = [WordEmbeddings('glove')]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    from flair.models import SequenceTagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=64,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=TAG_TYPE,
                                            locked_dropout=0.01,
                                            dropout=0.01,
                                            use_crf=True)

    from flair.trainers import ModelTrainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus, optimizer=torch.optim.Adam)

    save_path = 'sequence_tagging/resources/taggers/scierc-ner'
    trainer.train(save_path,
                  EvaluationMetric.MICRO_F1_SCORE,
                  learning_rate=0.01,
                  mini_batch_size=32,
                  max_epochs=20)
    # plotter = Plotter()
    # plotter.plot_training_curves('%s/loss.tsv' % save_path)
    # plotter.plot_weights('%s/weights.txt' % save_path)

    from sequence_tagging.evaluate_flair_tagger import evaluate_sequence_tagger
    pprint('train-f1-macro: %0.2f' % evaluate_sequence_tagger(tagger, corpus.train)['f1-macro'])
    pprint('dev-f1-macro: %0.2f' % evaluate_sequence_tagger(tagger, corpus.dev)['f1-macro'])
    pprint('test-f1-macro: %0.2f' % evaluate_sequence_tagger(tagger, corpus.test)['f1-macro'])
    return tagger
def _train(self, corpus: Corpus, params: dict, base_path: Path, max_epochs: int,
           optimization_value: str):
    """
    Trains a sequence tagger model.
    :param params: dict containing the parameters
    :return: dict containing result and configuration
    """
    tag_dictionary = corpus.make_tag_dictionary(self.tag_type)
    tagger = self._set_up_model(params=params, tag_dictionary=tag_dictionary)

    # Route each sampled parameter to the right place: keys in TRAINING_PARAMETERS
    # go to ModelTrainer.train(), keys in MODEL_TRAINER_PARAMETERS to the
    # ModelTrainer constructor.
    training_params = {
        key: value for key, value in params.items() if key in TRAINING_PARAMETERS
    }
    model_trainer_parameters = {
        key: value for key, value in params.items()
        if key in MODEL_TRAINER_PARAMETERS and key != 'model'
    }

    trainer: ModelTrainer = ModelTrainer(tagger, corpus, **model_trainer_parameters)
    results = trainer.train(base_path, max_epochs=max_epochs, **training_params)

    if optimization_value == "score":
        result = results['test_score']
    else:
        result = results['dev_loss_history'][-1]

    return {'result': result, 'params': params}
search_space.add(Parameter.EMBEDDINGS, hp.choice,
                 options=[bert, elmo, flair_normal, flair_pooled])

# The other hyperparameters are kept fixed for this exercise; add values to the
# option lists below to widen the grid. Unfortunately, for small grids flair
# runs a random search rather than a true grid search.
search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[384])
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1])
search_space.add(Parameter.DROPOUT, hp.choice, options=[0.0])
search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.1])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[16])
search_space.add(Parameter.USE_CRF, hp.choice, options=[True])

# Corpus takes its positional arguments in (train, dev, test) order.
corpus = Corpus(train_data, dev_data, test_data)
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

param_selector = SequenceTaggerParamSelector(
    corpus,
    tag_type='ner',
    base_path='tuning/results',
    max_epochs=55,
    training_runs=1,
    optimization_value=OptimizationValue.DEV_SCORE)

# Start the search.
param_selector.optimize(search_space)
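# The four embedding options referenced above (bert, elmo, flair_normal,
# flair_pooled) are not defined in this snippet. A plausible sketch using
# flair's standard embedding classes; the concrete model names are assumptions.
from flair.embeddings import (BertEmbeddings, ELMoEmbeddings, FlairEmbeddings,
                              PooledFlairEmbeddings, StackedEmbeddings)

bert = BertEmbeddings('bert-base-cased')
elmo = ELMoEmbeddings('original')
flair_normal = StackedEmbeddings([FlairEmbeddings('news-forward'),
                                  FlairEmbeddings('news-backward')])
flair_pooled = StackedEmbeddings([PooledFlairEmbeddings('news-forward'),
                                  PooledFlairEmbeddings('news-backward')])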
def build_tag_dict(sequences: List[TaggedSequence], tag_type):
    # Wrap the sentences in a minimal Corpus (empty dev/test splits) purely to
    # reuse flair's make_tag_dictionary.
    sentences = build_flair_sentences_from_sequences(sequences)
    corpus = Corpus(train=sentences, dev=[], test=[])
    return corpus.make_tag_dictionary(tag_type)
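# Hypothetical usage of build_tag_dict; assumes a TaggedSequence is a list of
# (token, tag) pairs, which matches what the other snippets in this file pass
# around. This shape is an assumption, not confirmed by the source.
seqs = [[("Graphene", "B-Material"), ("conducts", "O"), ("electricity", "O")]]
tag_dict = build_tag_dict(seqs, tag_type="ner")
print(tag_dict.idx2item)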
cs = []
for c in corpora_paths:
    train = tsv.TSVDataset(
        Path(c),
        columns,
        tag_to_bioes=None,
        comment_symbol=None,
        in_memory=True,
        encoding="utf-8",
        document_separator_token=None,
    )
    cs.append(train)
    print(train)

# Merge all per-file datasets into a single training split with empty dev/test.
cd = ConcatDataset(cs)
cc = Corpus(cd,
            flair.datasets.SentenceDataset([]),
            flair.datasets.SentenceDataset([]))
tag_dictionary = cc.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)
# sys.exit()

corpus: Corpus = tsv.TSVCorpus(args.data_folder, columns, train_file=args.train)
# TODO: downsample - test 50% for training
# corpus.downsample(0.5, only_downsample_train=True)
corpus = corpus.downsample(args.downsample,
                           only_downsample_train=args.downsample_train)
print(corpus)
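# `columns` is not defined in the snippet above. For flair's column-format
# datasets it maps column index to annotation layer; a typical two-column
# CoNLL-style file would use the following (the exact layout of this
# particular data is an assumption):
columns = {0: "text", 1: "ner"}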