Example #1
def build_and_train_conll03en_flair_sequence_tagger(corpus, tag_type, tag_dictionary):
    '''
    do not change!
    same configuration as described in
      file:  "flair/resources/docs/EXPERIMENTS.md"
      section: "CoNLL-03 Named Entity Recognition (English)"
    '''
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=[
            WordEmbeddings("glove"),
            PooledFlairEmbeddings("news-forward", pooling="min"),
            PooledFlairEmbeddings("news-backward", pooling="min"),
        ]
    )
    from flair.models import SequenceTagger
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
    )

    from flair.trainers import ModelTrainer

    corpus = Corpus(train=corpus.train, dev=corpus.dev, test=[])
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # trainer.train("resources/taggers/example-ner", train_with_dev=True, max_epochs=150)  # original configuration
    trainer.train("flair_checkpoints", train_with_dev=False, max_epochs=40, save_final_model=False)  # shortened run

    return tagger
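
A minimal usage sketch for the function above; the data path and imports here are assumptions, not part of the original snippet:

from flair.data import Corpus
from flair.datasets import CONLL_03
from flair.embeddings import StackedEmbeddings, WordEmbeddings, PooledFlairEmbeddings

corpus = CONLL_03(base_path='resources/tasks')  # assumed data location
tag_dictionary = corpus.make_tag_dictionary(tag_type='ner')
tagger = build_and_train_conll03en_flair_sequence_tagger(corpus, 'ner', tag_dictionary)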
Example #2
def main(base_path, output_dir, nb_epochs):
    # parser = argparse.ArgumentParser()
    # parser.add_argument("--data_dir", default='./', type=str, required=True, help="The parent dir of inpu data, must contain folder name `conll_03`")
    # parser.add_argument("--output_dir", default=None, required=True, help="The output directory where is going to store the trained model")
    # parser.add_argument("--train_epochs", default=3, type=int, required=True, help="Number of epochs to train")
    # args = parser.parse_args()
    # base_path = args.data_dir
    corpus: Corpus = CONLL_03(base_path=base_path)
    tag_type = 'ner'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),
        PooledFlairEmbeddings('news-forward', pooling='min'),
        PooledFlairEmbeddings('news-backward', pooling='min'),
    ]

    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type)

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # output_dir = args.output_dir
    # nb_epochs = args.train_epochs
    # output_dir =
    # nb_epochs = 10
    trainer.train(output_dir, train_with_dev=False,
                  max_epochs=nb_epochs)  # original: 150
Example #3

def load_flair(mode='flair'):
    if mode == 'flair':
        stacked_embeddings = StackedEmbeddings([
            WordEmbeddings('glove'),
            PooledFlairEmbeddings('news-forward', pooling='min'),
            PooledFlairEmbeddings('news-backward', pooling='min')
        ])
    else:  # BERT
        stacked_embeddings = BertEmbeddings('bert-base-uncased')  # concatenating the last 4 layers gives the best results
    return stacked_embeddings
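
A short sketch of how the returned stack embeds a sentence (the sentence text is illustrative):

from flair.data import Sentence

embeddings = load_flair(mode='flair')
sentence = Sentence('Berlin is a city in Germany .')
embeddings.embed(sentence)
for token in sentence:
    print(token.text, token.embedding.shape)  # one concatenated vector per token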
Example #4
    def __init__(self, hidden_dim: int, rnn_type: str, vocab_size: int,
                 tagset_size: int, task_type: str):
        super(TaskLearner, self).__init__()

        self.task_type = task_type
        self.rnn_type = rnn_type
        self.bidirectional = True
        self.num_layers = 2
        self.num_directions = 2 if self.bidirectional else 1

        # Word Embeddings (TODO: Implement pre-trained word embeddings)

        # self.word_embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim) # TODO: Implement padding_idx=self.pad_idx

        embedding_types: List[TokenEmbeddings] = [
            WordEmbeddings('glove'),
            PooledFlairEmbeddings('news-forward', pooling='min'),
            PooledFlairEmbeddings('news-backward', pooling='min')
        ]

        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)
        self.embeddings = embeddings
        self.embedding_dim: int = self.embeddings.embedding_length

        if self.rnn_type == 'gru':
            rnn = nn.GRU
        elif self.rnn_type == 'lstm':
            rnn = nn.LSTM
        elif self.rnn_type == 'rnn':
            rnn = nn.RNN
        else:
            raise ValueError(f"Unsupported rnn_type: {self.rnn_type}")

        # Sequence tagger
        self.rnn = rnn(input_size=self.embedding_dim,
                       hidden_size=hidden_dim,
                       num_layers=self.num_layers,
                       dropout=0.0 if self.num_layers == 1 else 0.5,
                       bidirectional=self.bidirectional,
                       batch_first=True)

        if self.task_type == 'SEQ':
            # Linear layer that maps hidden state space from rnn to tag space
            self.hidden2tag = nn.Linear(in_features=hidden_dim *
                                        self.num_directions,
                                        out_features=tagset_size)

        if self.task_type == 'CLF':
            # Binary classification head (single output logit)
            self.drop = nn.Dropout(p=0.5)
            self.hidden2tag = nn.Linear(in_features=hidden_dim *
                                        self.num_directions,
                                        out_features=1)
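
The snippet defines no forward pass; purely as a hypothetical sketch (the padding strategy, the use of the last timestep for classification, and the torch imports are assumptions), it could look like:

    def forward(self, sentences):
        # embed flair Sentences, pad token embeddings into [B, T, E], run the RNN
        import torch
        from torch.nn.utils.rnn import pad_sequence
        self.embeddings.embed(sentences)
        batch = pad_sequence(
            [torch.stack([token.embedding for token in s]) for s in sentences],
            batch_first=True)
        output, _ = self.rnn(batch)
        if self.task_type == 'SEQ':
            return self.hidden2tag(output)  # tag scores per token
        # last timestep (ignores padding, for simplicity of the sketch)
        return self.hidden2tag(self.drop(output[:, -1, :]))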
Example #5

    def train_flair(self):

        # Flair Model Initialisation and Training
        # # 1. get the corpus
        # corpus: Corpus = ColumnCorpus(os.path.join(os.getcwd(), 'results', '10'),
        #                               {0: 'text', 1: 'ner'},
        #                               train_file='train.txt',
        #                               test_file='test.txt',
        #                               dev_file='valid.txt',
        #                               column_delimiter=' ')

        corpus: Corpus = CONLL_03(
            base_path=os.path.join(os.getcwd(), 'results', '10'))

        corpus.dev_file = 'valid.txt'  # rather than 'dev.txt' (note: the corpus is already loaded here, so this assignment does not re-read the data)

        # 2. what tag do we want to predict?
        tag_type = 'ner'
        # 3. make the tag dictionary from the corpus
        tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

        # initialize embeddings
        embedding_types: List[TokenEmbeddings] = [
            # GloVe embeddings
            WordEmbeddings('glove'),
            # contextual string embeddings, forward
            PooledFlairEmbeddings('news-forward', pooling='min'),
            # contextual string embeddings, backward
            PooledFlairEmbeddings('news-backward', pooling='min'),
        ]

        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)

        # initialize sequence tagger
        tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                                embeddings=embeddings,
                                                tag_dictionary=tag_dictionary,
                                                tag_type=tag_type)

        # initialize trainer
        trainer: ModelTrainer = ModelTrainer(tagger, corpus)

        results = trainer.train(os.path.join(os.getcwd(), 'results', '10',
                                             'tagger'),
                                train_with_dev=False,
                                max_epochs=50)

        print(results)
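
After training, the saved checkpoint can be reloaded for prediction (a sketch; with train_with_dev=False a 'best-model.pt' is written to the output directory):

from flair.data import Sentence

tagger = SequenceTagger.load(
    os.path.join(os.getcwd(), 'results', '10', 'tagger', 'best-model.pt'))
sentence = Sentence('George Washington went to Washington .')
tagger.predict(sentence)
print(sentence.to_tagged_string())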
Example #6
    def post_init(self):
        import flair
        flair.device = self.device
        from flair.embeddings import WordEmbeddings, FlairEmbeddings, BytePairEmbeddings, PooledFlairEmbeddings, \
            DocumentPoolEmbeddings
        embeddings_list = []
        for e in self.embeddings:
            model_name, model_id = e.split(':', maxsplit=1)
            emb = None
            try:
                if model_name == 'flair':
                    emb = FlairEmbeddings(model_id)
                elif model_name == 'pooledflair':
                    emb = PooledFlairEmbeddings(model_id)
                elif model_name == 'word':
                    emb = WordEmbeddings(model_id)
                elif model_name == 'byte-pair':
                    emb = BytePairEmbeddings(model_id)
            except ValueError:
                self.logger.error(f'embedding not found: {e}')
                continue
            if emb is not None:
                embeddings_list.append(emb)
        if embeddings_list:
            self.model = DocumentPoolEmbeddings(embeddings_list,
                                                pooling=self.pooling_strategy)
            self.logger.info(
                f'flair encoder initialized with embeddings: {self.embeddings}'
            )
        else:
            self.logger.error('flair encoder initialization failed.')
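
For reference, a minimal stand-alone sketch of the document pooling this encoder wraps (the embedding choices and pooling value are assumptions):

from flair.data import Sentence
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings

model = DocumentPoolEmbeddings([WordEmbeddings('glove'), FlairEmbeddings('news-forward')],
                               pooling='mean')
sentence = Sentence('A short test document .')
model.embed(sentence)
print(sentence.embedding.shape)  # one fixed-size vector for the whole document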
Example #7
    def post_init(self):
        from flair.embeddings import WordEmbeddings, FlairEmbeddings, BytePairEmbeddings, PooledFlairEmbeddings, \
            DocumentPoolEmbeddings

        if self.model is not None:
            return
        embeddings_list = []
        for e in self.embeddings:
            model_name, model_id = e.split(':', maxsplit=1)
            emb = None
            try:
                if model_name == 'flair':
                    emb = FlairEmbeddings(model_id)
                elif model_name == 'pooledflair':
                    emb = PooledFlairEmbeddings(model_id)
                elif model_name == 'word':
                    emb = WordEmbeddings(model_id)
                elif model_name == 'byte-pair':
                    emb = BytePairEmbeddings(model_id)
            except ValueError:
                self.logger.error('embedding not found: {}'.format(e))
                continue
            if emb is not None:
                embeddings_list.append(emb)
        if embeddings_list:
            self.model = DocumentPoolEmbeddings(embeddings_list,
                                                pooling=self.pooling_strategy)
            self.logger.info(
                'initialize flair encoder with embeddings: {}'.format(
                    self.embeddings))
Example #8
    def _train(
        self,
        output_dir: Union[str, Path],
        corpus: Optional[ColumnCorpus] = None,
        tagger: Optional[SequenceTagger] = None,
        hidden_size: int = 256,
        learning_rate: float = 0.1,
        mini_batch_size: int = 32,
        max_epochs: int = 100,
        use_crf: bool = True,
    ) -> SequenceTagger:
        tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")
        if not tagger:
            tagger = SequenceTagger(
                hidden_size=hidden_size,
                embeddings=PooledFlairEmbeddings("news-forward"),
                tag_dictionary=tag_dictionary,
                tag_type="ner",
                use_crf=use_crf,
            )
        trainer = ModelTrainer(tagger, corpus)
        trainer.train(
            output_dir,
            learning_rate=learning_rate,
            mini_batch_size=mini_batch_size,
            max_epochs=max_epochs,
        )
        model_path = Path(output_dir, "best-model.pt")
        return SequenceTagger.load(model_path)
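
A hedged usage sketch; the corpus path, column map, and the instance name `pipeline` are hypothetical:

from flair.datasets import ColumnCorpus

corpus = ColumnCorpus('data/my_ner', {0: 'text', 1: 'ner'},
                      train_file='train.txt', dev_file='dev.txt', test_file='test.txt')
tagger = pipeline._train('models/my_ner', corpus=corpus, max_epochs=10)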
Example #9
def use_flair_to_extract_context_embeddings(file, dest_folder, embedding_type, embedding_size, pretrained_model=None):
    if embedding_type.lower() == 'elmo':
        context_embedding = ELMoEmbeddings(model='pubmed')
    elif embedding_type.lower() == 'elmo_transformer':
        context_embedding = ELMoTransformerEmbeddings()
    elif embedding_type.lower() == 'flair':
        context_embedding = PooledFlairEmbeddings('news-forward')  # a model name is required; 'news-forward' assumed here
    elif embedding_type.lower() == 'bioflair':
        flair_1 = PooledFlairEmbeddings('pubmed-forward')
        flair_2 = PooledFlairEmbeddings('pubmed-backward')
        elmo = ELMoEmbeddings(model='pubmed')
        #bert = BertEmbeddings(bert_model_or_path='bert-base-multilingual-cased', layers='-1')
        context_embedding = StackedEmbeddings(embeddings=[flair_1, flair_2, elmo])
    elif embedding_type.lower() in ('biobert', 'bert'):
        context_embedding = BertEmbeddings(bert_model_or_path=pretrained_model, layers='-1')
    else:
        raise ValueError('unknown embedding_type: {}'.format(embedding_type))

    data = {}
    dest_name = os.path.basename(file).split('.')

    print(dest_folder)
    with open(file, 'r') as f, open('{}/{}.pickle'.format(dest_folder, dest_name[0]), 'wb') as d:
        sentence = ''
        instance = []
        j = 0
        for line in f.readlines():
            if line != '\n':
                parts = line.split()
                sentence += ' ' + parts[0]
            else:
                sent = Sentence(sentence.strip())
                context_embedding.embed(sent)
                for token in sent:  # do not shadow the outer loop variable
                    instance.append((token.text, token.embedding[:embedding_size]))
                sentence = ''

                if instance:
                    data[j] = list(zip(*(instance.copy())))
                    j += 1
                instance.clear()
        pickle.dump(data, d)
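
Reading the pickle back (a sketch; the file name mirrors the writer above):

import pickle

with open('{}/{}.pickle'.format(dest_folder, dest_name[0]), 'rb') as d:
    data = pickle.load(d)
tokens, vectors = data[0]  # each entry is (tuple of tokens, tuple of embedding slices)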
Example #10
def get_embeddings(pooling_op='min'):
    return StackedEmbeddings(embeddings=[
        # pre-trained embeddings
        PooledFlairEmbeddings(
            'es-forward',
            pooling=pooling_op,
        ),
        PooledFlairEmbeddings(
            'es-backward',
            pooling=pooling_op,
        ),
        BytePairEmbeddings(
            language='es',
            dim=300,
        ),

        # self-trained embeddings
        SpanishHealthCorpusEmbeddings('wang2vec'),
        # SpanishHealthCorpusEmbeddings('fastText'),
    ])
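
A quick sanity check of the stacked dimensionality (sketch):

embeddings = get_embeddings(pooling_op='min')
print(embeddings.embedding_length)  # sum of the individual embedding lengths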
Example #11

if args.task == 'ner':
    flair_corpus = NLPTask.CONLL_03
    tag_type = 'ner'
    embedding_types = [WordEmbeddings('glove')]
else:
    flair_corpus = NLPTask.CONLL_2000
    tag_type = 'np'
    embedding_types = [WordEmbeddings('extvec')]

_base_path = 'resources/taggers/{}-{}'.format(args.task, args.model_path)

if args.use_flair_embeddings:
    embedding_types.extend([
        # contextual string embeddings, forward
        PooledFlairEmbeddings('news-forward'),
        # contextual string embeddings, backward
        PooledFlairEmbeddings('news-backward')
    ])

if args.model_path != 'NA':
    embedding_types.append(EyeTrackingFeatureEmbedding(args.model_path))

corpus = NLPTaskDataFetcher.load_corpus(flair_corpus,
                                        base_path=TASK_DATASET_DIR)
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
embeddings = StackedEmbeddings(embeddings=embedding_types)
tagger = SequenceTagger(hidden_size=256,
                        embeddings=embeddings,
                        tag_dictionary=tag_dictionary,
                        tag_type=tag_type)
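
The snippet stops at tagger construction; training would proceed along the same lines as the other examples (a sketch, epoch count assumed):

from flair.trainers import ModelTrainer

trainer = ModelTrainer(tagger, corpus)
trainer.train(_base_path, max_epochs=150)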
Example #12
search_space = SearchSpace()

# Create our embedding stacks.
# Flair recommends adding GloVe to their character-level embeddings.

flair_normal = StackedEmbeddings([
    WordEmbeddings('glove'),
    FlairEmbeddings('mix-forward'),
    FlairEmbeddings('mix-backward')
])

bert = BertEmbeddings()
elmo = ELMoEmbeddings('original')
flair_pooled = StackedEmbeddings([
    WordEmbeddings('glove'),
    PooledFlairEmbeddings('mix-forward'),
    PooledFlairEmbeddings('mix-backward')
])

search_space.add(Parameter.EMBEDDINGS,
                 hp.choice,
                 options=[bert, elmo, flair_normal, flair_pooled])

# Other hyperparameters are kept fixed for this exercise.
# Add to the options lists below to enlarge the grid.
# Unfortunately, for small grids Flair uses random search rather than a true
# grid search.

search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[384])
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1])
search_space.add(Parameter.DROPOUT, hp.choice, options=[0.0])
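
The search space would then be handed to a parameter selector; a sketch assuming flair's hyperopt wrapper and a `corpus` defined elsewhere:

from flair.hyperparameter.param_selection import SequenceTaggerParamSelector, OptimizationValue

param_selector = SequenceTaggerParamSelector(corpus,
                                             'ner',
                                             'resources/results',
                                             max_epochs=50,
                                             training_runs=3,
                                             optimization_value=OptimizationValue.DEV_SCORE)
param_selector.optimize(search_space, max_evals=4)  # one evaluation per embedding option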
Example #13
# this is the folder in which train, test and dev files reside
data_folder = 'data/ner/bc5dr'

# define the column format (assumption: token in column 0, NER tag in column 1)
columns = {0: 'text', 1: 'ner'}

# init a corpus using column format, data folder and the names of the train, dev and test files
corpus: Corpus = ColumnCorpus(data_folder,
                              columns,
                              train_file='train.txt',
                              test_file='test.txt',
                              dev_file='dev.txt')
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, PooledFlairEmbeddings, ELMoEmbeddings
from typing import List
embedding_types: List[TokenEmbeddings] = [
    PooledFlairEmbeddings('pubmed-forward'),
    PooledFlairEmbeddings('pubmed-backward'),
    ELMoEmbeddings('pubmed'),
]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)
# initialize trainer
from flair.trainers import ModelTrainer
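
The snippet ends at the import; the training call would follow the usual pattern (a sketch, output path assumed):

trainer: ModelTrainer = ModelTrainer(tagger, corpus)
trainer.train('resources/taggers/bc5dr-ner',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=150)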
Example #14
def train(model, selected_embeddings):
  # 1. get the corpus
  if model == 'AMT':
    corpus = read_in_AMT()
  elif model == 'CADEC':
    corpus = read_in_CADEC()
  elif model == 'TwitterADR':
    corpus = read_in_TwitterADR()
  elif model == 'Micromed':
    corpus = read_in_Micromed()
  else:
    raise ValueError('unknown model: {}'.format(model))
  print(corpus)

  # 2. what tag do we want to predict?
  tag_type = 'ner'

  # 3. make the tag dictionary from the corpus
  tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
  print(tag_dictionary.idx2item)


  embedding_types: List[TokenEmbeddings] = []

  if selected_embeddings['glove']:
    embedding_types.append(WordEmbeddings('glove'))

  if selected_embeddings['twitter']:
    embedding_types.append(WordEmbeddings('twitter'))

  if selected_embeddings['char']:
    embedding_types.append(CharacterEmbeddings())

  # FlairEmbeddings, forward
  if selected_embeddings['flair']:
    embedding_types.append(FlairEmbeddings('news-forward'))

  # FlairEmbeddings, backward
  if selected_embeddings['flair']:
    embedding_types.append(FlairEmbeddings('news-backward'))

  # PooledFlairEmbeddings, forward
  if selected_embeddings['pooled-flair']:
    embedding_types.append(PooledFlairEmbeddings('news-forward', pooling='mean'))

  # PooledFlairEmbeddings, backward
  if selected_embeddings['pooled-flair']:
    embedding_types.append(PooledFlairEmbeddings('news-backward', pooling='mean'))

  # init BERT
  if selected_embeddings['bert']:
    embedding_types.append(BertEmbeddings())

  # init roberta
  if selected_embeddings['roberta']:
    embedding_types.append(RoBERTaEmbeddings())

  # init BioBERT
  if selected_embeddings['biobert']:
    embedding_types.append(BertEmbeddings("data/embeddings/biobert-pubmed-pmc-cased"))

  # init clinical BERT
  if selected_embeddings['clinicalbiobert']:
    embedding_types.append(BertEmbeddings("data/embeddings/pretrained_bert_tf/biobert-base-clinical-cased"))

  # init ELMo
  if selected_embeddings['elmo']:
    embedding_types.append(ELMoEmbeddings())

  embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

  tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                          embeddings=embeddings,
                                          tag_dictionary=tag_dictionary,
                                          tag_type=tag_type,
                                          use_crf=True
                                          )

  trainer: ModelTrainer = ModelTrainer(tagger, corpus)

  selected_embeddings_text = [key for key in selected_embeddings if selected_embeddings[key]]
  selected_embeddings_text = '_'.join(selected_embeddings_text)

  model_dir = 'resources/taggers/FA_' + model + selected_embeddings_text

  # 7. start training
  trainer.train(model_dir,
                train_with_dev=True,
                learning_rate=0.1,
                mini_batch_size=4,
                max_epochs=200,
                checkpoint=True)

  # 8. plot training curves (optional)
  from flair.visual.training_curves import Plotter
  plotter = Plotter()
  plotter.plot_training_curves(model_dir + '/loss.tsv')
  plotter.plot_weights(model_dir + '/weights.txt')
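
A hypothetical invocation (every dictionary key the function reads must be present):

selected = {'glove': True, 'twitter': False, 'char': True, 'flair': True,
            'pooled-flair': False, 'bert': False, 'roberta': False,
            'biobert': False, 'clinicalbiobert': False, 'elmo': False}
train('CADEC', selected)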
Example #15
    def read_dataset(self, file_dict, dataset_name, *args, **kwargs):
        """
        :param file_dict: Will have just one key:value
                          file_dict['base_path'] = <base_path>
                          base_path will have the path to the directory that will have the structure :
                          conll_03 directory
                             conll_03/eng.testa
                             conll_03/eng.testb
                             conll_03/eng.train
                         onto-ner directory
                            onto-ner/eng.testa
                            onto-ner/eng.testb
                            onto-ner/eng.train
        :param dataset_name: one of the constants from the NLPTask class (only NLPTask.CONLL_03
                             and NLPTask.ONTONER are used)
        :param args:
        :param kwargs:
        :return:
        """

        base_path = file_dict['base_path']
        self.dataset = dataset_name

        # 1. get the corpus
        corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(dataset_name, base_path)

        # 2. what tag do we want to predict?
        tag_type = 'ner'

        # 3. make the tag dictionary from the corpus
        tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

        if dataset_name == NLPTask.CONLL_03:
            # initialize embeddings
            embedding_types: List[TokenEmbeddings] = [

                # GloVe embeddings
                WordEmbeddings('glove'),

                # contextual string embeddings, forward
                PooledFlairEmbeddings('news-forward', pooling='min'),

                # contextual string embeddings, backward
                PooledFlairEmbeddings('news-backward', pooling='min'),
            ]

        elif dataset_name == NLPTask.ONTONER:
            # initialize embeddings
            embedding_types: List[TokenEmbeddings] = [
                WordEmbeddings('crawl'),
                FlairEmbeddings('news-forward'),
                FlairEmbeddings('news-backward'),
            ]

        else:
            raise ValueError('dataset_name must be NLPTask.CONLL_03 or NLPTask.ONTONER')

        embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

        # initialize sequence tagger
        from flair.models import SequenceTagger

        tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                                embeddings=embeddings,
                                                tag_dictionary=tag_dictionary,
                                                tag_type=tag_type)

        self.corpus = corpus
        self.embeddings = embeddings
        self.tag_dictionary = tag_dictionary
        self.embedding_types = embedding_types
        self.tagger = tagger
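
A usage sketch (the reader instance name is hypothetical; base_path must contain the conll_03 directory):

from flair.trainers import ModelTrainer

file_dict = {'base_path': 'resources/tasks'}
reader.read_dataset(file_dict, NLPTask.CONLL_03)
trainer = ModelTrainer(reader.tagger, reader.corpus)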
Example #16
# -*- coding: utf-8 -*-
from flair.datasets import CONLL_03
from flair.embeddings import PooledFlairEmbeddings, StackedEmbeddings, WordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

corpus = CONLL_03(base_path="data/conll-2003")

embedding_types = [
    WordEmbeddings("glove"),
    PooledFlairEmbeddings("news-forward", pooling="min"),
    PooledFlairEmbeddings("news-backward", pooling="min"),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

tagger: SequenceTagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=corpus.make_tag_dictionary(tag_type="ner"),
    tag_type="ner",
)


trainer: ModelTrainer = ModelTrainer(tagger, corpus)

trainer.train("models/checkpoints", train_with_dev=True, max_epochs=150)
Example #17
def use_flair_to_extract_context_embeddings(files,
                                            file_name,
                                            dest_folder,
                                            layer,
                                            embedding_type,
                                            embedding_size,
                                            pretrained_model=None):
    if embedding_type.lower() == 'elmo':
        context_embedding = ELMoEmbeddings(model='pubmed')
    elif embedding_type.lower() == 'elmo_transformer':
        context_embedding = ELMoTransformerEmbeddings()
    elif embedding_type.lower() == 'flair':
        context_embedding = PooledFlairEmbeddings('news-forward')  # a model name is required; 'news-forward' assumed here
    elif embedding_type.lower() == 'bioflair':
        flair_1 = PooledFlairEmbeddings('pubmed-forward')
        flair_2 = PooledFlairEmbeddings('pubmed-backward')
        elmo = ELMoEmbeddings(model='pubmed')
        context_embedding = StackedEmbeddings(
            embeddings=[flair_1, flair_2, elmo])
    elif embedding_type.lower() in ('biobert', 'bert'):
        context_embedding = TransformerWordEmbeddings(pretrained_model,
                                                      layers=layer)

    data = []
    for i in files:
        open_f = open(i, 'r')
        data += open_f.readlines()
        open_f.close()

    with open('{}/{}1.pickle'.format(dest_folder, file_name),
              'wb') as store, open(
                  '{}/ebm_comet_multilabels_p1.txt'.format(dest_folder),
                  'w') as file:
        sentence = []
        label_representations = {}
        # fetch outcome phrase vector representations grouped by their respective outcome domain labels
        if file_name.lower() == 'ebm-comet':
            label_representations = ebm_comet_preprocessing(
                data=data,
                context_embedding=context_embedding,
                sentence=[],
                label_representations={},
                file=file)
        elif file_name.lower() == 'ebm-nlp':
            label_representations, domain_label_count = ebm_nlp_processing(
                data=data,
                context_embedding=context_embedding,
                sentence=[],
                label_representations={})

        label_centroids = {}
        print(label_representations.keys())
        print([i.shape for i in list(label_representations.values())])
        if file_name.lower() == 'ebm-nlp':
            print(domain_label_count)  # only defined for the ebm-nlp branch
        # find the centroid of each group of outcome phrase vectors to represent each label
        for lab in label_representations:
            label_centroids[lab] = torch.mean(label_representations[lab], 0)
        pickle.dump(label_centroids, store)
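
The stored centroids can later serve as a nearest-centroid classifier (a sketch; `query_vec` stands for the embedding of a new outcome phrase):

import pickle
import torch

with open('{}/{}1.pickle'.format(dest_folder, file_name), 'rb') as f:
    label_centroids = pickle.load(f)
sims = {lab: torch.cosine_similarity(query_vec, centroid, dim=0)
        for lab, centroid in label_centroids.items()}
print(max(sims, key=sims.get))  # label whose centroid is closest to the query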
Example #18
# 1. get the corpus
corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03, base_path='resources/tasks')

# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# initialize embeddings
embedding_types: List[TokenEmbeddings] = [

    # GloVe embeddings
    WordEmbeddings('glove'),

    # contextual string embeddings, forward
    PooledFlairEmbeddings('news-forward', pooling='min'),

    # contextual string embeddings, backward
    PooledFlairEmbeddings('news-backward', pooling='min'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# initialize sequence tagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type)

# initialize trainer
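
The snippet ends at the trainer comment; completing it follows the standard pattern (a sketch, output path assumed):

from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)
trainer.train('resources/taggers/example-ner', max_epochs=150)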
Example #19
def resume(model1, selected_embeddings, model2):

  # 1. get the corpus
  if model2 == 'AMT':
    corpus = read_in_AMT()
  elif model2 == 'CADEC':
    corpus = read_in_CADEC()
  elif model2 == 'TwitterADR':
    corpus = read_in_TwitterADR()
  elif model2 == 'Micromed':
    corpus = read_in_Micromed()
  else:
    raise ValueError('unknown model: {}'.format(model2))
  print(corpus)

  # 2. what tag do we want to predict?
  tag_type = 'ner'

  # 3. make the tag dictionary from the corpus
  tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
  print(tag_dictionary.idx2item)


  embedding_types: List[TokenEmbeddings] = []

  if selected_embeddings['glove']:
    embedding_types.append(WordEmbeddings('glove'))

  if selected_embeddings['twitter']:
    embedding_types.append(WordEmbeddings('twitter'))

  if selected_embeddings['char']:
    embedding_types.append(CharacterEmbeddings())

  if selected_embeddings['flair']:
    embedding_types.append(FlairEmbeddings('news-forward'))

  if selected_embeddings['flair']:
    embedding_types.append(FlairEmbeddings('news-backward'))


  if selected_embeddings['pooled-flair']:
    embedding_types.append(PooledFlairEmbeddings('news-forward', pooling='mean'))

  if selected_embeddings['pooled-flair']:
    embedding_types.append(PooledFlairEmbeddings('news-backward', pooling='mean'))

  # init BERT (bert-base-uncased by default)
  if selected_embeddings['bert']:
    embedding_types.append(BertEmbeddings())


  # init ELMo
  if selected_embeddings['elmo']:
    embedding_types.append(ELMoEmbeddings())

  embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)


  # tagger: SequenceTagger = SequenceTagger(hidden_size=256,
  #                                         embeddings=embeddings,
  #                                         tag_dictionary=tag_dictionary,
  #                                         tag_type=tag_type,
  #                                         use_crf=True)

  selected_embeddings_text = [key for key in selected_embeddings if selected_embeddings[key]]
  selected_embeddings_text = '_'.join(selected_embeddings_text)
  model_dir1 = 'resources/taggers/to_resume_CoNLL-03_' + model1 + selected_embeddings_text


  #checkpoint = tagger.load_checkpoint(Path(model_dir1+ '/checkpoint.pt'))
  #trainer = ModelTrainer.load_from_checkpoint(checkpoint, corpus)

  best_model = SequenceTagger.load(Path(model_dir1 + '/best-model.pt'))

  trainer: ModelTrainer = ModelTrainer(best_model, corpus)


  # resources/taggers/to_resume_CADECglove_char_flair/

  model_dir2 = 'resources/taggers/train_with_dev_from_' + model1 + '_to_' + model2 + selected_embeddings_text + '_fine-tuned7s'

  trainer.train(model_dir2,
                EvaluationMetric.MICRO_F1_SCORE,
                train_with_dev=True,
                learning_rate=0.1,
                mini_batch_size=8,
                max_epochs=150,
                checkpoint=True)
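
A hypothetical invocation of resume (the model names and embedding switches are illustrative):

selected = {'glove': True, 'twitter': False, 'char': True, 'flair': True,
            'pooled-flair': False, 'bert': False, 'elmo': False}
resume('AMT', selected, 'CADEC')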