Example #1
def create_embeddings(params):
    embedding_type = params["embedding_type"]
    assert embedding_type in ["bert", "flair", "char"]
    if embedding_type == "bert":
        bert_embedding = BertEmbeddings(params["bert_model_dirpath_or_name"],
                                        pooling_operation="mean")

        embedding_types: List[TokenEmbeddings] = [bert_embedding]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)
    elif embedding_type == "flair":
        glove_embedding = WordEmbeddings(
            '/opt/kanarya/glove/GLOVE/GloVe/vectors.gensim')
        word2vec_embedding = WordEmbeddings(
            '/opt/kanarya/huawei_w2v/vector.gensim')
        fast_text_embedding = WordEmbeddings('tr')
        char_embedding = CharacterEmbeddings()

        # bert_embedding = BertEmbeddings('../bert_pretraining/pretraining_outputs/pretraining_output_batch_size_32')
        embedding_types: List[TokenEmbeddings] = [
            fast_text_embedding, glove_embedding, word2vec_embedding,
            char_embedding
        ]
        # embedding_types: List[TokenEmbeddings] = [custom_embedding]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)
    elif embedding_type == "char":
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=[CharacterEmbeddings()])
    else:
        embeddings = None

    return embeddings
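
A minimal usage sketch for create_embeddings; the params keys are taken from the code above and the model name is illustrative (any local paths in the "flair" branch would have to exist on your machine):

# Hypothetical call (sketch, not part of the original snippet)
params = {
    "embedding_type": "bert",
    "bert_model_dirpath_or_name": "bert-base-multilingual-cased",
}
embeddings = create_embeddings(params)
print(embeddings.embedding_length)  # total dimensionality of the stacked embedding
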
def train():
    columns = {0: 'text', 1: 'pos'}
    # init a corpus using column format, data folder and the names of the train, dev and test files
    corpus: Corpus = ColumnCorpus('', columns,
                                  train_file=args.train,
                                  test_file=args.test,
                                  dev_file=args.dev)

    tag_dictionary = corpus.make_tag_dictionary(tag_type='pos')

    # initialize embeddings
    embedding_types: List[TokenEmbeddings] = [
        CharacterEmbeddings(),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='pos',
                                            use_crf=True)

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train(args.model,
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=150)
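
The train() above reads a module-level args object that is not shown in the snippet; a hedged sketch of how it might be populated with argparse (the flag names are assumptions):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--train", default="train.txt")
parser.add_argument("--dev", default="dev.txt")
parser.add_argument("--test", default="test.txt")
parser.add_argument("--model", default="resources/taggers/example-pos")
args = parser.parse_args()

train()
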
Example #3
def train():
    # column format - word postag label
    columns = {0: "word", 1: "postag", 2: "ner"}
    data_folder = os.path.join(path, "../data/")

    # read train, dev and test set
    # here test set is same as dev set
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(data_folder, columns, train_file = "onto.train", dev_file = "onto.testa", test_file="onto.testa")
    print(corpus)

    # create label dictionary
    tag_dictionary = corpus.make_tag_dictionary(tag_type = "ner")
    print(tag_dictionary.idx2item)

    # using glove embeddings and character embeddings
    embedding_types: List[TokenEmbeddings] = [WordEmbeddings("glove"), CharacterEmbeddings()]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings = embedding_types)

    # create sequence tagger and trainer instance
    tagger: SequenceTagger = SequenceTagger(hidden_size = 256, embeddings = embeddings, tag_dictionary = tag_dictionary, tag_type = "ner", use_crf = True)
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    model_path = os.path.join(path, "../models/")

    # commence training
    # model shall be saved in model_path under filename final-model.pt
    # this step takes at least 4 hours to complete, so please ensure access to GPU
    trainer.train(model_path, learning_rate = 0.1, mini_batch_size = 64, max_epochs = 3)
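
The comments above note that the trained model is written to model_path as final-model.pt; a minimal sketch (the path is a placeholder) for loading it back and tagging a sentence:

from flair.data import Sentence
from flair.models import SequenceTagger

# load the trained tagger and predict on a new sentence (sketch)
tagger = SequenceTagger.load("../models/final-model.pt")
sentence = Sentence("Barack Obama visited Ankara .")
tagger.predict(sentence)
print(sentence.to_tagged_string())
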
Example #4
    def create_embeddings(self) -> StackedEmbeddings:

        embedding_types: List[TokenEmbeddings] = []
        
        if self.config['use_word_embeddings']:
            embedding_types.append(W2vWordEmbeddings(self.config['word_embeddings_path']))

        if self.config['use_char_embeddings']:
            embedding_types.append(CharacterEmbeddings())


        if self.config['use_flair_embeddings']:
            embedding_types.append(FlairEmbeddings('es-clinical-forward'))
            embedding_types.append(FlairEmbeddings('es-clinical-backward'))
        
        if self.config['use_beto_embeddings']:
            embedding_types.append(
                TransformerWordEmbeddings(
                    'dccuchile/bert-base-spanish-wwm-cased',
                    layers = self.config['layers'], 
                    layer_mean = self.config['layer_mean'], 
                    subtoken_pooling = self.config['subtoken_pooling']))

        embeddings: StackedEmbeddings = StackedEmbeddings(embeddings = embedding_types)
        return embeddings
    def train(self, training_dir=None):
        from flair.trainers import ModelTrainer

        if training_dir is None:
            training_dir = flair_splitter_dep_dir

        # define columns
        columns = {0: "text", 1: "ner"}

        # this is the folder in which train, test and dev files reside
        data_folder = flair_splitter_dep_dir + "data"

        # init a corpus using column format, data folder and the names of the train, dev and test files
        # note that training data should be unescaped, i.e. tokens like "&", not "&amp;"
        corpus: Corpus = ColumnCorpus(
            data_folder,
            columns,
            train_file="sent_train.txt",
            test_file="sent_test.txt",
            dev_file="sent_dev.txt",
            document_separator_token="-DOCSTART-",
        )

        print(corpus)

        tag_type = "ner"
        tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
        print(tag_dictionary)

        # initialize embeddings
        embedding_types = [
            # WordEmbeddings('glove'),
            # comment in this line to use character embeddings
            CharacterEmbeddings(),
            # comment in these lines to use flair embeddings
            #FlairEmbeddings("news-forward"),
            #FlairEmbeddings("news-backward"),
            # BertEmbeddings('distilbert-base-cased')
            TransformerWordEmbeddings('google/electra-base-discriminator')
        ]

        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)

        tagger: SequenceTagger = SequenceTagger(
            hidden_size=128,
            embeddings=embeddings,
            tag_dictionary=tag_dictionary,
            tag_type=tag_type,
            use_crf=True,
        )

        trainer: ModelTrainer = ModelTrainer(tagger, corpus)

        trainer.train(training_dir,
                      learning_rate=0.1,
                      mini_batch_size=16,
                      max_epochs=50)
        self.model = tagger
    def build_embedding(self, lang, embedding_codes: List[str]) -> None:

        self.tic = time.time()
        self.embedding_name: str = "-".join(embedding_codes)
        self.lang = lang

        embedding_types: List[TokenEmbeddings] = []

        for code in embedding_codes:

            code = code.lower()
            assert code in [
                "bpe",
                "bert",
                "flair",
                "ft",
                "char",
                "ohe",
                "elmo",
            ], f"{code} - Invalid embedding code"

            if code == "ohe":
                embedding_types.append(OneHotEmbeddings(corpus=self.corpus))
            elif code == "ft":
                embedding_types.append(WordEmbeddings(self.lang))
            elif code == "bpe":
                embedding_types.append(BytePairEmbeddings(self.lang))
            elif code == "bert":
                embedding_types.append(
                    TransformerWordEmbeddings(
                        model=self.huggingface_ref[self.lang],
                        pooling_operation="first",
                        layers="-1",
                        fine_tune=False,
                    )
                )
            elif code == "char":
                embedding_types.append(CharacterEmbeddings())
            elif code == "flair":
                embedding_types.append(FlairEmbeddings(f"{self.lang}-forward"))
                embedding_types.append(FlairEmbeddings(f"{self.lang}-backward"))
            elif code == "elmo":
                embedding_types.append(
                    ELMoEmbeddings(model="large", embedding_mode="all")
                )

        self.embedding: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types
        )

        self.tagger: SequenceTagger = SequenceTagger(
            hidden_size=256,
            embeddings=self.embedding,
            tag_dictionary=self.tag_dictionary,
            tag_type=self.tag_type,
            use_crf=True,
        )

        self.trainer: ModelTrainer = ModelTrainer(self.tagger, self.corpus)
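
    # build_embedding only wires up self.tagger and self.trainer; a hedged sketch of a
    # follow-up method that would launch training (the output path and hyperparameters
    # are illustrative, not taken from the original code):
    def run_training(self, base_path: str = "resources/taggers") -> None:
        self.trainer.train(
            f"{base_path}/{self.embedding_name}",
            learning_rate=0.1,
            mini_batch_size=32,
            max_epochs=100,
        )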
Example #7
    def get_embeddings(self):
        embeddings = [
            PolyglotEmbeddings(self.args.lang),
            CharacterEmbeddings()
        ]
        if self.args.lang not in self.embeds_unsupported_langs:
            embeddings.append(WordEmbeddings(self.args.lang))

        return StackedEmbeddings(embeddings=embeddings)
Example #8
def create_embeddings(params):
    embedding_type = params["embedding_type"]
    assert embedding_type in ["bert", "flair", "char"]
    if embedding_type == "bert":
        bert_embedding = BertEmbeddings(params["bert_model_dirpath_or_name"],
                                        pooling_operation="first")
        if params["bert_model_dirpath_or_name"] == "dbmdz/bert-base-turkish-cased":
            from transformers import AutoModel, AutoTokenizer
            bert_embedding.tokenizer = AutoTokenizer.from_pretrained(
                params["bert_model_dirpath_or_name"])
            bert_embedding.model = AutoModel.from_pretrained(
                params["bert_model_dirpath_or_name"])

        embedding_types: List[TokenEmbeddings] = [bert_embedding]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)
    elif embedding_type == "flair":
        glove_embedding = WordEmbeddings(
            '/opt/kanarya/glove/GLOVE/GloVe/vectors.gensim')
        word2vec_embedding = WordEmbeddings(
            '/opt/kanarya/huawei_w2v/vector.gensim')
        fast_text_embedding = WordEmbeddings('tr')
        char_embedding = CharacterEmbeddings()

        # bert_embedding = BertEmbeddings('../bert_pretraining/pretraining_outputs/pretraining_output_batch_size_32')
        embedding_types: List[TokenEmbeddings] = [
            fast_text_embedding, glove_embedding, word2vec_embedding,
            char_embedding
        ]
        # embedding_types: List[TokenEmbeddings] = [custom_embedding]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)
    elif embedding_type == "char":
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=[CharacterEmbeddings()])
    else:
        embeddings = None

    return embeddings
Example #9
    def fit(self, X, y):
        """ Build feature vectors and train FLAIR model.

            Parameters
            ----------
            X : list(list(str))
                list of sentences, where each sentence is tokenized into
                a list of words.
            y : list(list(str))
                list of lists of BIO tags, one tag sequence per sentence.

            Returns
            -------
            self
        """
        log.info("Creating FLAIR corpus...")
        Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=0.1)
        sents_train = self._convert_to_flair(Xtrain, ytrain)
        sents_val = self._convert_to_flair(Xval, yval)
        corpus_train = Corpus(sents_train, sents_val, [], name="train-corpus")

        tag_dict = corpus_train.make_tag_dictionary(tag_type="ner")

        if self.embeddings is None:
            embedding_types = [
                WordEmbeddings("glove"),
                CharacterEmbeddings()    
            ]
            self.embeddings = StackedEmbeddings(embeddings=embedding_types)

        log.info("Building FLAIR NER...")
        self.model_ = SequenceTagger(hidden_size=self.hidden_dim,
            embeddings=self.embeddings,
            tag_dictionary=tag_dict,
            tag_type="ner",
            use_crf=self.use_crf,
            use_rnn=self.use_rnn,
            rnn_layers=self.num_rnn_layers,
            dropout=self.dropout,
            word_dropout=self.word_dropout,
            locked_dropout=self.locked_dropout)

        log.info("Training FLAIR NER...")
        opt = torch.optim.SGD if self.optimizer == "sgd" else torch.optim.Adam
        trainer = ModelTrainer(self.model_, corpus_train, opt)
        trainer.train(base_path=self.basedir,
            learning_rate=self.learning_rate,
            mini_batch_size=self.batch_size,
            max_epochs=self.max_iter)

        return self
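
A usage sketch for fit(), following the docstring's format for X (tokenized sentences) and y (BIO tag sequences); the wrapper class name and the data below are made up:

# Hypothetical tiny dataset in the documented format
X = [["John", "lives", "in", "Berlin"],
     ["Mary", "works", "at", "Google"],
     ["Flair", "is", "a", "library"]]
y = [["B-PER", "O", "O", "B-LOC"],
     ["B-PER", "O", "O", "B-ORG"],
     ["B-MISC", "O", "O", "O"]]

ner = FlairSequenceTagger(basedir="models/flair-ner")  # hypothetical class exposing the fit() above
ner.fit(X, y)
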
Example #10
def embed_tweet(tweetList):
    # initialize the word embeddings
    tr_embedding = WordEmbeddings('tr')
    char_embedding = CharacterEmbeddings()

    # initialize the document embeddings, mode = mean
    document_embeddings = DocumentPoolEmbeddings(
        [tr_embedding, char_embedding])

    tweetTensors = []
    for tweet in tweetList:
        #print(norm_tweet(tweet))
        sentence = Sentence(norm_tweet(tweet))
        document_embeddings.embed(sentence)
        tweetTensors.append(sentence.get_embedding().data)
    return tweetTensors
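
A quick usage sketch for embed_tweet (norm_tweet is assumed to be defined elsewhere in the module; the tweets are made up):

tweets = ["bugün hava çok güzel", "maçı kazandık"]
vectors = embed_tweet(tweets)
print(len(vectors), vectors[0].shape)  # one pooled embedding tensor per tweet
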
Example #11
def get_embeddings(embeddings: List[str], character: bool, lang: str,
                   bpe_size: int) -> StackedEmbeddings:
    """Construct and return an embedding model."""
    stack = []
    for e in embeddings:
        if e != '':
            if 'forward' in e or 'backward' in e:
                stack.append(FlairEmbeddings(e))
            else:
                stack.append(WordEmbeddings(e))
    if character:
        stack.append(CharacterEmbeddings())
    if bpe_size > 0:
        stack.append(BytePairEmbeddings(language=lang, dim=bpe_size))

    return StackedEmbeddings(embeddings=stack)
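
A usage sketch for get_embeddings; the embedding names below are standard identifiers that Flair resolves itself:

stack = get_embeddings(["glove", "news-forward", "news-backward"],
                       character=True, lang="en", bpe_size=50)
print(stack.embedding_length)  # dimensionality of the concatenated embedding
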
Example #12
 def __init__(self, config):
     """
     Load pretrained language model
     """
     super(LanguageModel, self).__init__()
     embeddings_stack = []
     transformers = config.get("language_model", "transformers")
     if transformers != "":
         transformers = transformers.split(";")
         for model in transformers:
             embeddings_stack.append(
                 TransformerWordEmbeddings(
                     model,
                     layers="-1",
                     pooling_operation='mean',
                     # use_scalar_mix=True,
                     fine_tune=True))
     word_embeddings = config.get("language_model", "word_embeddings")
     if word_embeddings != "":
         word_embeddings = word_embeddings.split(";")
         for model in word_embeddings:
             embeddings_stack.append(WordEmbeddings(model))
     flair_embeddings = config.get("language_model", "flair_embeddings")
     if flair_embeddings != "":
         flair_embeddings = flair_embeddings.split(";")
         for model in flair_embeddings:
             embeddings_stack.append(FlairEmbeddings(model, fine_tune=True))
     character_embeddings = config.get("language_model",
                                       "character_embeddigs")
     if character_embeddings.lower() == "yes":
         embeddings_stack.append(CharacterEmbeddings())
     bytepair_embeddings = config.get("language_model",
                                      "bytepair_embeddings")
     if bytepair_embeddings.lower() == "yes":
         embeddings_stack.append(BytePairEmbeddings())
     custom_embeddings = config.get("language_model", "custom_embeddings")
     if custom_embeddings != "":
         custom_embeddings = custom_embeddings.split(";")
         for path in custom_embeddings:
             embeddings_stack.append(WordEmbeddings(path))
     self.lm = StackedEmbeddings(embeddings_stack)
     self.embedding_dim = self.lm.embedding_length
     self.dropout = torch.nn.Dropout(
         float(config.get("language_model", "dropout")))
     self.classify = torch.nn.Linear(self.embedding_dim, 2)
     if config.get("language_model", "relu") == "yes":
         self.relu = torch.nn.ReLU()
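
The constructor above reads a configparser-style object; a hedged sketch of what the [language_model] section might contain (option names are taken from the get() calls above, values are illustrative):

import configparser

# note: the option name "character_embeddigs" mirrors the spelling used in __init__ above
config = configparser.ConfigParser()
config.read_string("""
[language_model]
transformers = bert-base-cased
word_embeddings = glove
flair_embeddings = news-forward;news-backward
character_embeddigs = no
bytepair_embeddings = no
custom_embeddings =
dropout = 0.1
relu = yes
""")
model = LanguageModel(config)
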
Example #13
def train(data_dir: str, model_dir: str, dataset_format: str='macss', num_filters: int=150,
          word_embeddings: str='de-fasttext', offset_embedding_dim: int=50, learning_rate: float=.1,
          batch_size: int=32, max_epochs: int=50, dropout: float=.5, use_char_embeddings: bool=False,
          seed: int=0, dev_size: float=.1, test_size: float=.2):

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(message)s',
        datefmt='%d-%b-%y %H:%M:%S')

    logging.info(f'Training config: {locals().items()}')

    if dataset_format not in ['macss', 'semeval']:
        raise ValueError(f"Dataset format '{dataset_format}' not supported.")

    corpus: TaggedCorpus = dataset_loader[dataset_format](data_dir, dev_size, seed)
    label_dictionary = corpus.make_label_dictionary()

    logging.info(f'Corpus: {corpus}')
    corpus.print_statistics()

    logging.info(f'Size of label dictionary: {len(label_dictionary)}')
    logging.info(f'Labels: {label_dictionary.get_items()}')

    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings(word_embeddings),
        RelativeOffsetEmbeddings('offset_e1', max_len=200, embedding_dim=offset_embedding_dim),
        RelativeOffsetEmbeddings('offset_e2', max_len=200, embedding_dim=offset_embedding_dim),
    ]

    if use_char_embeddings:
        embedding_types.append(CharacterEmbeddings())

    document_embeddings: DocumentCNNEmbeddings = DocumentCNNEmbeddings(embedding_types,
                                                                       num_filters=num_filters,
                                                                       dropout=dropout)

    classifier: TextClassifier = TextClassifier(document_embeddings=document_embeddings,
                                                label_dictionary=label_dictionary,
                                                multi_label=False)

    trainer: TextClassifierTrainer = TextClassifierTrainer(classifier, corpus, label_dictionary)

    trainer.train(model_dir,
                  learning_rate=learning_rate,
                  mini_batch_size=batch_size,
                  max_epochs=max_epochs)
Example #14
def main():
    params, config = parse_arguments()
    print(config)
    print(params)
    print("Constructing data loaders...")

    myvlbert = ResNetVLBERT(config)
    pre_model = BertRel(params, myvlbert)

    dl = DataLoader(params)
    dlbb = DLbb(params)
    evaluator = Evaluator(params, dl)
    print("Constructing data loaders...[OK]")

    if params.mode == 0:
        print("Training...")
        t = Trainer(params, config, dl, dlbb, evaluator, pre_model)
        t.train()
        print("Training...[OK]")
    elif params.mode == 1:
        print("Loading rpbert...")
        embedding_types = [
            WordEmbeddings(
                '/media/iot538/a73dbfc5-a8a0-4021-a841-3b7d7f3fd964/mnt/xj/wnut17_advanced/pretrain/en-fasttext-crawl-300d-1M'
            ),
            CharacterEmbeddings(
                '/home/iot538/.flair/datasets/common_characters_large'),
        ]

        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)
        model = MNER(params, embeddings, pre_model)
        model_file_path = params.model_file_name
        model.load_state_dict(torch.load(model_file_path))
        if torch.cuda.is_available():
            model = model.cuda()
        print("Loading rpbert...[OK]")

        print("Evaluating rpbert on test set...")
        with torch.no_grad():
            acc, f1, prec, rec = evaluator.get_accuracy(model, 'test')
        print("Accuracy : {}".format(acc))
        print("F1 : {}".format(f1))
        print("Precision : {}".format(prec))
        print("Recall : {}".format(rec))
        print("Evaluating rpbert on test set...[OK]")
Example #15
def EmbeddingFactory(parameters, corpus):
    from flair.embeddings import FlairEmbeddings, StackedEmbeddings, \
        WordEmbeddings, OneHotEmbeddings, CharacterEmbeddings, TransformerWordEmbeddings

    stack = []
    for emb in parameters.embedding.split():
        if any((spec in emb) for spec in ("bert", "gpt", "xlnet")):
            stack.append(
                TransformerWordEmbeddings(model=pretrainedstr(
                    emb, parameters.language),
                                          fine_tune=parameters.tune_embedding))
        elif emb == "flair":
            stack += [
                FlairEmbeddings(f"{parameters.language}-forward",
                                fine_tune=parameters.tune_embedding),
                FlairEmbeddings(f"{parameters.language}-backward",
                                fine_tune=parameters.tune_embedding)
            ]
        elif emb == "pos":
            stack.append(
                OneHotEmbeddings(corpus,
                                 field="pos",
                                 embedding_length=parameters.pos_embedding_dim,
                                 min_freq=1))
        elif emb == "fasttext":
            stack.append(WordEmbeddings(parameters.language))
        elif emb == "word":
            stack.append(
                OneHotEmbeddings(
                    corpus,
                    field="text",
                    embedding_length=parameters.word_embedding_dim,
                    min_freq=parameters.word_minfreq))
        elif emb == "char":
            stack.append(
                CharacterEmbeddings(
                    char_embedding_dim=parameters.char_embedding_dim,
                    hidden_size_char=parameters.char_bilstm_dim))
        else:
            raise NotImplementedError()
    return StackedEmbeddings(stack)
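
A usage sketch for EmbeddingFactory; parameters is treated as a simple namespace carrying the attributes the function reads, and corpus is a flair Corpus defined elsewhere (all values are illustrative):

from types import SimpleNamespace

parameters = SimpleNamespace(
    embedding="fasttext flair char",   # space-separated embedding codes
    language="en",
    tune_embedding=False,
    pos_embedding_dim=32,              # only read for the "pos" option
    word_embedding_dim=64,             # only read for the "word" option
    word_minfreq=2,
    char_embedding_dim=25,
    char_bilstm_dim=25,
)
embeddings = EmbeddingFactory(parameters, corpus)
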
def get_flair_vectors(vocab):
    print("Looking for flair vectors")
    #import flair embeddings!
    #we can change here to use different embeddings
    glove_embedding = WordEmbeddings('glove')
    twitter_embedding = WordEmbeddings('en-twitter')
    character_embeddings = CharacterEmbeddings()
    stacked_embeddings = StackedEmbeddings(embeddings=[glove_embedding, character_embeddings,twitter_embedding])

    flair_vectors = {}
    found = 0
    for word in vocab:
        wt=Sentence(word)
        stacked_embeddings.embed(wt)
        vector=wt[0].embedding.detach().numpy()
        #if the word is not in the embedding dict, the vector will be all zero
        if np.sum(np.abs(vector))>0:
            flair_vectors[word]=vector
            found += 1
    print('\n')
    print('Found %d words with non-zero stacked embeddings' % found)
    return flair_vectors
Example #17
    def __init__(self, device="cpu"):
        super(RankNetWithEmbeddings, self).__init__()

        self._device = device

        fasttext_embedding = WordEmbeddings('en-news')
        # flair_embedding_forward = FlairEmbeddings('news-forward')
        # flair_embedding_backward = FlairEmbeddings('news-backward')
        byte_pair_embedding = BytePairEmbeddings('en')
        glove_embeddings = WordEmbeddings('glove')
        character_embedding = CharacterEmbeddings()

        self._mention_embedding = DocumentPoolEmbeddings([fasttext_embedding])
        self._label_embedding = DocumentPoolEmbeddings([
            fasttext_embedding,
        ])
        self._context_embedding = DocumentPoolEmbeddings([fasttext_embedding])
        self._description_embedding = DocumentPoolEmbeddings([
            fasttext_embedding,
        ])

        input_length =   self._mention_embedding.embedding_length \
                       + self._context_embedding.embedding_length \
                       + self._label_embedding.embedding_length   \
                       + self._description_embedding.embedding_length

        self.model = nn.Sequential(
            nn.Linear(input_length, 256),
            nn.ReLU(),
            # nn.Dropout(0.2),
            nn.Linear(256, 64),
            nn.ReLU(),
            # nn.Dropout(0.2),
            nn.Linear(64, 1),
            nn.Tanh(),
        )

        self.output_sig = nn.Sigmoid()
        self.to(device)
Example #18
def hyper_opt(corpus):
    print("hyper_opt is started")
    # define your search space
    search_space = SearchSpace()

    search_space.add(Parameter.EMBEDDINGS,
                     hp.choice,
                     options=[
                         StackedEmbeddings([
                             WordEmbeddings('en'),
                             WordEmbeddings('glove'),
                             CharacterEmbeddings(),
                             FlairEmbeddings('news-forward'),
                             FlairEmbeddings('news-backward'),
                             ELMoEmbeddings()
                         ])
                     ])

    search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[256])
    #search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
    #search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
    search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.01, 0.1])
    search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[32, 64])

    # create the parameter selector
    param_selector = SequenceTaggerParamSelector(
        corpus,
        'ner',
        #'/content/gdrive/My Drive/resume_ner_data/hyperparam_selection',
        model_path,
        max_epochs=50,
        training_runs=2,
        optimization_value=OptimizationValue.DEV_SCORE)

    # start the optimization
    param_selector.optimize(search_space, max_evals=100)
from flair.models import SequenceTagger
from flair.embeddings import WordEmbeddings
from flair.embeddings import CharacterEmbeddings
from flair.embeddings import FlairEmbeddings
from flair.embeddings import TransformerWordEmbeddings
from flair.embeddings import DocumentPoolEmbeddings
from flair.data import Sentence

# The Flair library supports combining various word embeddings generated by various base models.
# Constructing all the models once, instead of loading a new one at every function call, saves a lot of RAM and is faster.
# We have access to all the transformers present in the Hugging Face library.
bert_embedding = TransformerWordEmbeddings('bert-base-cased')
roberta_embedding = TransformerWordEmbeddings('roberta-base')
glove_embedding = WordEmbeddings('glove')
character_embeddings = CharacterEmbeddings()
flair_forward = FlairEmbeddings('news-forward-fast')
flair_backward = FlairEmbeddings('news-backward-fast')


def vectorize(string: str = None, selected_base_models: list = None):
    # 'Vectorizes' the input string using one or a combination of word embeddings - used when
    # 'vector representation' is selected at Algorithms construction time.
    """
    :param string: input string
    :param selected_base_models: list of the models we want to use in order to create word embeddings
    :return: embedding
    """

    if not selected_base_models:
        raise SystemExit(f"[ERROR]: function {vectorize.__name__}() -> Provide at least one base model: ['bert',"
                         f"'roberta', 'glove', 'character', 'flair_forward', 'flair_backward']")
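
    # NOTE: the snippet is cut off here. A hedged sketch of how the body might continue,
    # reusing the module-level embeddings built above (the name mapping and the pooling
    # choice are assumptions, not the original code):
    available = {
        'bert': bert_embedding,
        'roberta': roberta_embedding,
        'glove': glove_embedding,
        'character': character_embeddings,
        'flair_forward': flair_forward,
        'flair_backward': flair_backward,
    }
    document_embedding = DocumentPoolEmbeddings(
        [available[name] for name in selected_base_models])
    sentence = Sentence(string)
    document_embedding.embed(sentence)
    return sentence.get_embedding()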
Example #20
def train(data_dir: str,
          model_dir: str,
          dataset_format: str = 'macss_tdt',
          num_filters: int = 150,
          word_embeddings: str = 'de-fasttext',
          offset_embedding_dim: int = 100,
          learning_rate: float = .1,
          batch_size: int = 32,
          max_epochs: int = 1,
          dropout: float = .5,
          use_char_embeddings: bool = False,
          seed: int = 0,
          dev_size: float = .1,
          test_size: float = .2,
          concept_embedding_dim: int = 100):

    all_data = open('all_data.txt', encoding='utf8').read().split("\n")
    test_dev_percent = math.floor((len(all_data) * 25) / 100)
    k_folds = math.floor(len(all_data) / test_dev_percent)
    random.shuffle(all_data)
    config_name = '1_Some_Setting_Name'

    for i in range(k_folds):
        data_path = 'resources/' + config_name + '/' + str(i + 1)
        test_dev_set = all_data[(test_dev_percent * (i + 1)) -
                                test_dev_percent:test_dev_percent * (i + 1)]
        train = all_data[0:(test_dev_percent * (i + 1)) -
                         test_dev_percent] + all_data[test_dev_percent *
                                                      (i + 1):len(all_data)]
        random.shuffle(test_dev_set)
        test_perc = math.floor((len(test_dev_set) * 60) / 100)
        test = test_dev_set[0:test_perc]
        dev = test_dev_set[test_perc:len(test_dev_set)]
        os.makedirs(data_path, exist_ok=True)
        train_txt = open(data_path + '/train.txt', 'w+')
        test_txt = open(data_path + '/test.txt', 'w+')
        dev_txt = open(data_path + '/dev.txt', 'w+')
        os.system('cp -r ./Data/vocabulary/ ' + data_path)
        train_txt.write('\n'.join(train))
        test_txt.write('\n'.join(test))
        dev_txt.write('\n'.join(dev))

        train_txt.close()
        test_txt.close()
        dev_txt.close()

        #print("Train Directory: ", data_dir, dev_size, seed, "\n")

        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s - %(message)s',
                            datefmt='%d-%b-%y %H:%M:%S')

        if dataset_format not in ['macss_tdt']:
            raise ValueError(
                f"Dataset format '{dataset_format}' not supported.")

        corpus: TaggedCorpus = dataset_loader[dataset_format](data_path,
                                                              'train.txt',
                                                              'dev.txt',
                                                              'test.txt')
        label_dictionary = corpus.make_label_dictionary()  # rel-type

        # Comment out the embeddings that you don't need
        embedding_types: List[TokenEmbeddings] = [
            # mEx Fine-Tuned Word Embeddings
            #WordEmbeddings('../../Resources/mex-ft-wiki-de-finetuned-biomedical.gensim'),

            # Default German FastText Word Embeddings
            #WordEmbeddings('../../Resources/ft-wiki-de.gensim'),

            # Relative Offset Embeddings
            RelativeOffsetEmbeddings('offset_e1',
                                     max_len=200,
                                     embedding_dim=offset_embedding_dim),
            RelativeOffsetEmbeddings('offset_e2',
                                     max_len=200,
                                     embedding_dim=offset_embedding_dim),

            # Concept Embeddings
            ConceptEmbeddings('concept_1',
                              max_len=200,
                              embedding_dim=concept_embedding_dim),
            ConceptEmbeddings('concept_2',
                              max_len=200,
                              embedding_dim=concept_embedding_dim),
        ]

        if use_char_embeddings:
            embedding_types.append(CharacterEmbeddings())

        document_embeddings: DocumentCNNEmbeddings = DocumentCNNEmbeddings(
            embedding_types, num_filters=num_filters, dropout=dropout)

        classifier: TextClassifier = TextClassifier(
            document_embeddings=document_embeddings,
            label_dictionary=label_dictionary,
            multi_label=False)

        trainer: TextClassifierTrainer = TextClassifierTrainer(
            classifier, corpus, label_dictionary)

        trainer.train(data_path,
                      learning_rate=learning_rate,
                      mini_batch_size=batch_size,
                      max_epochs=3,
                      use_tensorboard=False,
                      embeddings_in_memory=False)
Example #21
def train_sequence_labeling_model(data_folder, proposed_tags_vocabulary_size,
                                  skf_split_no):
    """
    Trains the sequence labeling model (by default the model uses one RNN layer).
    The model is trained to predict the part-of-speech tag and takes into account information about:
    - text (plain text made of tokens that together form a sentence),
    - occurrence of a separator before the token,
    - proposed tags for the given token.
    It is trained with StackedEmbeddings, which combine different embeddings together. Words are embedded
    using a concatenation of three vector embeddings:
    - WordEmbeddings - classic word embeddings. These embeddings are static and word-level, meaning that each
      distinct word gets exactly one pre-computed embedding. Here FastText embeddings trained on the Polish Wikipedia
      are used.
    - CharacterEmbeddings - add character-level word embeddings during model training. These embeddings are
      randomly initialized when the class is instantiated, so they are not meaningful unless they are trained on
      a specific downstream task. For instance, the standard sequence labeling architecture used by Lample et al. (2016)
      combines classic word embeddings with task-trained character features. Normally this would require
      implementing a hierarchical embedding architecture in which character-level embeddings for each word are computed
      using an RNN and then concatenated with word embeddings. In Flair, this is simplified by treating
      CharacterEmbeddings just like any other embedding class. To reproduce the Lample architecture, it is enough
      to combine them with standard WordEmbeddings in an embedding stack.
    - OneHotEmbeddings - embeddings that encode each word in a vocabulary as a one-hot vector, followed by an
      embedding layer. These embeddings thus do not encode any prior knowledge, unlike most other embeddings. They also
      differ in that they require a Corpus during instantiation, so they can build up a vocabulary consisting of
      the most common words seen in the corpus, plus an UNK token for all rare words.
      Two OneHotEmbeddings are used in training: one embedding the proposed tags (concatenated with ';') and one
      embedding the appearance of a separator before each token.
    Model training is based on the stratified 10-fold cross-validation split indicated by the skf_split_no argument.
    The model and training logs are saved in the resources_ex_3/taggers/example-pos/it-<skf_split_no> directory (where
    <skf_split_no> is the number of the stratified 10-fold cross-validation split used to train the model).

    :param data_folder: folder where the files with the column corpus split are stored. Those files are used to
    initialize the ColumnCorpus object
    :param proposed_tags_vocabulary_size: number of proposed tags
    :param skf_split_no: number indicating one of the stratified 10-fold cross-validation splits (in the range 1 to 10)
    used to train the model
    """
    # define columns
    columns = {0: 'text', 1: 'pos', 2: 'is_separator', 3: 'proposed_tags'}
    # init a corpus using column format, data folder and the names of the train and test files
    # 1. get the corpus
    corpus: Corpus = ColumnCorpus(data_folder,
                                  columns,
                                  train_file='train_' + str(skf_split_no),
                                  test_file='test_' + str(skf_split_no),
                                  dev_file=None)
    log.info(corpus)
    # 2. what tag do we want to predict
    tag_type = 'pos'
    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    log.info(tag_dictionary)
    # 4. initialize embeddings
    local_model_path = use_scratch_dir_if_available(
        'resources/polish_FastText_embeddings')
    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings(local_model_path)
        if os.path.exists(local_model_path) else WordEmbeddings('pl'),
        CharacterEmbeddings(
            use_scratch_dir_if_available('resources/polish_letters_dict')),
        OneHotEmbeddings(corpus=corpus,
                         field='is_separator',
                         embedding_length=3,
                         min_freq=3),
        OneHotEmbeddings(corpus=corpus,
                         field='proposed_tags',
                         embedding_length=math.ceil(
                             (proposed_tags_vocabulary_size + 1)**0.25),
                         min_freq=3)
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)
    # 5. initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=False,
                                            rnn_layers=1)
    # 6. initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    # 7. start training
    trainer.train(
        use_scratch_dir_if_available('resources_ex_3/taggers/example-pos/it-' +
                                     str(skf_split_no)),
        learning_rate=0.1,
        mini_batch_size=32,
        embeddings_storage_mode='gpu',
        max_epochs=sys.maxsize,
        monitor_test=True)
    # 8. plot weight traces (optional)
    plotter = Plotter()
    plotter.plot_weights(
        use_scratch_dir_if_available('resources_ex_3/taggers/example-pos/it-' +
                                     str(skf_split_no) + '/weights.txt'))
Example #22
# Set up the Corpus
columns = {0: 'text', 1:'ner'}

data_folder = './data/IOBES'

corpus: Corpus = ColumnCorpus(data_folder, columns, train_file="train.txt", dev_file="dev.txt", test_file="test.txt")
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# define search_space

search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[
    StackedEmbeddings([ ELMoEmbeddings('original') ]),
    StackedEmbeddings([ ELMoEmbeddings('original'), CharacterEmbeddings() ])
])
search_space.add(Parameter.HIDDEN_SIZE, hp.randint, upper=400)
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1,2])
search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
search_space.add(Parameter.LEARNING_RATE, hp.uniform, low=0.01, high=0.25)
search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[16, 32])
search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False])


# initialise embeddings

param_selector = SequenceTaggerParamSelector(
    corpus,
    tag_type='ner',
Example #23
def train(
    review_category,
    params,
    update_model=False,
    learning_rate=0.01,
    embeddings_storage_mode='gpu',
    checkpoint=True,
    batch_growth_annealing=True,
    weight_decay=1e-4,
    shuffle=True,
    train_with_dev=True,
    mini_batch_size=2,
    maxi_batch_size=128,
    anneal_factor=0.5,
    patience=2,
    max_epochs=150
    ):
    review_category = str(review_category)
    print('loading training corpus from %s'%(params.data_folder))
    corpus: Corpus = ClassificationCorpus(params.data_folder,
                train_file=review_category + '_train.txt',
                test_file=review_category + '_test.txt',
                dev_file=review_category + '_dev.txt')
    label_dict = corpus.make_label_dictionary()
    print('labels: ',label_dict)
    if eval(params.transformer):
        print('initializing transformer document embeddings using %s ...'%(params.transformer_pretrain_lm))
        # 3. initialize transformer document embeddings (many models are available)
        document_embeddings = TransformerDocumentEmbeddings(params.transformer_pretrain_lm, fine_tune=True)
    else:
        print('initializing document embeddings')
        word_embeddings= [
            WordEmbeddings('glove'),
            # comment in this line to use character embeddings
            CharacterEmbeddings(),
            # comment in these lines to use flair embeddings
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward'),
            BertEmbeddings(),
            # TransformerXLEmbeddings(),
            #RoBERTaEmbeddings(),
            #XLNetEmbeddings()
        ]
        # Can choose between many RNN types (GRU by default, to change use rnn_type parameter)
        document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(word_embeddings,
                                                    hidden_size=512,
                                                    reproject_words=True,
                                                    reproject_words_dimension=256,
                                                    )
    if not update_model:
        print('building review_analysis classifier ...')
        # create the text classifier
        classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)
        # initialize the text classifier trainer
        print("initializing review_analysis classifier's trainer")
        trainer = ModelTrainer(classifier, corpus, optimizer=Adam)
    else:
        # continue trainer at later point
        checkpoint_path = params.checkpoint_dir+'/%s/checkpoint.pt'%(review_category)
        print('loading checkpoint from %s'%(checkpoint_path))
        trainer = ModelTrainer.load_checkpoint(checkpoint_path, corpus)
    ####### training the model
    print("training the review_category: %s model ..."%(review_category))
    try:
        trainer.train(params.checkpoint_dir+'/%s'%(review_category),
        learning_rate=learning_rate,
        embeddings_storage_mode=embeddings_storage_mode,
        checkpoint=checkpoint,
        batch_growth_annealing=batch_growth_annealing,
        weight_decay=weight_decay,
        shuffle=shuffle,
        train_with_dev=train_with_dev,
        mini_batch_size=mini_batch_size,
        maxi_batch_size=maxi_batch_size,
        anneal_factor=anneal_factor,
        patience=patience,
        max_epochs=max_epochs)
    except Exception:
        # fall back to chunked mini-batches (e.g. after a GPU out-of-memory error)
        print('chunking batch ... by %d' % (params.mini_batch_chunk_size))
        trainer.train(params.checkpoint_dir+'/%s'%(review_category),
        learning_rate=learning_rate,
        embeddings_storage_mode=embeddings_storage_mode,
        checkpoint=checkpoint,
        batch_growth_annealing=batch_growth_annealing,
        weight_decay=weight_decay,
        shuffle=shuffle,
        train_with_dev=train_with_dev,
        mini_batch_size=mini_batch_size,
        maxi_batch_size=maxi_batch_size,
        anneal_factor=anneal_factor,
        patience=patience,
        max_epochs=max_epochs,
        mini_batch_chunk_size=params.mini_batch_chunk_size)
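
A usage sketch for the train() above; params is assumed to be an argparse-style namespace carrying the attributes the function reads (paths and values are placeholders):

from types import SimpleNamespace

params = SimpleNamespace(
    data_folder="data/reviews",              # holds <category>_train.txt / _test.txt / _dev.txt
    transformer="True",                      # the code passes this string through eval()
    transformer_pretrain_lm="bert-base-uncased",
    checkpoint_dir="checkpoints",
    mini_batch_chunk_size=8,
)
train("usability", params, max_epochs=5)
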
# Set up the Corpus
columns = {0: 'text', 1:'ner'}

data_folder = './data/IOBES'

corpus: Corpus = ColumnCorpus(data_folder, columns, train_file="train.txt", dev_file="test.txt", test_file="test.txt")
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# define search_space

search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[
    StackedEmbeddings([ FlairEmbeddings('news-forward'), FlairEmbeddings('news-backward') ]),
    StackedEmbeddings([ FlairEmbeddings('news-forward'), FlairEmbeddings('news-backward'), CharacterEmbeddings() ])
])
search_space.add(Parameter.HIDDEN_SIZE, hp.randint, upper=400)
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1,2])
search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.05, 0.1, 0.15, 0.2])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[16, 32])
search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False])
search_space.add(Parameter.USE_RNN, hp.choice, options=[True])


# initialise embeddings

param_selector = SequenceTaggerParamSelector(
    corpus,
    tag_type='ner',
Example #25
def init_embeddings(corpus_name, embedding_type):
    """
    Initializes embeddings for a given corpus.

    Parameters:
        corpus_name (str): name of the corpus used to load proper embeddings
        embedding_type (str): type of embeddings (e.g. flair, elmo, bert, word+char)
    
    Returns:
        tuple(StackedEmbeddings, bool): the loaded embeddings and a flag indicating whether they can be kept in memory
    """

    from typing import List
    from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings
    from flair.embeddings import FlairEmbeddings
    from flair.embeddings import BertEmbeddings, ELMoEmbeddings
    from flair.embeddings import WordEmbeddings, CharacterEmbeddings

    embedding_types: List[TokenEmbeddings] = []

    if corpus_name in ['conll03_en']:
        if embedding_type == 'flair':
            embedding_types.append(WordEmbeddings('glove'))
            embedding_types.append(FlairEmbeddings('news-forward'))
            embedding_types.append(FlairEmbeddings('news-backward'))
            embeddings_in_memory = True
        elif embedding_type == 'bert':
            embedding_types.append(
                BertEmbeddings(bert_model_or_path='bert-base-cased'))
            #embedding_types.append(BertEmbeddings(bert_model_or_path='bert-large-cased'))
            embeddings_in_memory = True
        elif embedding_type == 'elmo':
            embedding_types.append(ELMoEmbeddings())
            embeddings_in_memory = True
        elif embedding_type == 'word+char':
            # similar to Lample et al. (2016)
            embedding_types.append(WordEmbeddings('glove'))
            embedding_types.append(CharacterEmbeddings())
            embeddings_in_memory = False  # because it contains a char model (problem with deepcopy)
        else:
            log.error(f"no settings for '{embedding_type}'!")
            exit(EXIT_FAILURE)

    elif corpus_name in ["conll03_de", "germeval"]:
        if embedding_type == 'flair':
            embedding_types.append(WordEmbeddings('de'))
            embedding_types.append(FlairEmbeddings('german-forward'))
            embedding_types.append(FlairEmbeddings('german-backward'))
            embeddings_in_memory = True
        elif embedding_type == 'word+char':
            # similar to Lample et al. (2016)
            embedding_types.append(WordEmbeddings('de'))
            embedding_types.append(CharacterEmbeddings())
            embeddings_in_memory = False  # because it contains a char model (problem with deepcopy)
        else:
            log.error(f"no settings for '{embedding_type}'!")
            exit(EXIT_FAILURE)
    else:
        log.error(f"unknown corpus or embeddings '{corpus_name}'!")
        exit(EXIT_FAILURE)

    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    log.info("'{}' function finished!".format(sys._getframe().f_code.co_name))

    return embeddings, embeddings_in_memory
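
A usage sketch for init_embeddings, matching the docstring's parameters and return value:

# load GloVe + forward/backward Flair embeddings for English CoNLL-03 (sketch)
embeddings, embeddings_in_memory = init_embeddings('conll03_en', 'flair')
print(embeddings.embedding_length, embeddings_in_memory)
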
# Set up the Corpus
columns = {0: 'text', 1:'ner'}

data_folder = './data/IOBES'

corpus: Corpus = ColumnCorpus(data_folder, columns, train_file="train.txt", dev_file="dev.txt", test_file="test.txt")
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# define search_space

search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[
    StackedEmbeddings([ FlairEmbeddings('news-forward'), FlairEmbeddings('news-backward'), BertEmbeddings('bert-large-cased') ]),
    StackedEmbeddings([ FlairEmbeddings('news-forward'), FlairEmbeddings('news-backward'), BertEmbeddings('bert-large-cased'), CharacterEmbeddings() ])
])
search_space.add(Parameter.HIDDEN_SIZE, hp.randint, upper=400)
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1,2])
search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.05, 0.1, 0.15, 0.2])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[16, 32])
search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False])


# initialise embeddings

param_selector = SequenceTaggerParamSelector(
    corpus,
    tag_type='ner',
    base_path="Optimisation_evals/stacks/flair_BERT/dev",
def train(args, tag_type):
    '''
    Training script to be run for training the NER model.

    Parameters:
    -----------
    args: arguments passed to the parser on the CLI
    '''
    data_dir = args.input_dir + '/data'
    corpus = ColumnCorpus(data_folder=data_dir,
                          column_format={
                              0: 'text',
                              1: 'ner'
                          },
                          train_file=args.train_file,
                          test_file=args.test_file,
                          dev_file=args.dev_file)

    # print(corpus.train[0])
    # print(corpus)

    # tag_type = 'ner'

    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    # print(tag_dictionary)

    if args.character_embeddings:
        embedding_types: List[TokenEmbeddings] = [
            WordEmbeddings('glove'),
            CharacterEmbeddings(),
            FlairEmbeddings(args.flair_model_name_or_path_forward),
            FlairEmbeddings(args.flair_model_name_or_path_backward),
        ]
    else:
        embedding_types: List[TokenEmbeddings] = [
            WordEmbeddings('glove'),
            FlairEmbeddings(args.flair_model_name_or_path_forward),
            FlairEmbeddings(args.flair_model_name_or_path_backward),
        ]

    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    # initialize sequence tagger

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=True)

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    if (args.train_or_predict == "continue_train"):
        print("continue training")
        checkpoint = '/Users/titashneogi/workspace/NLP/NER/data/flair/cumulative_model/checkpoint.pt'
        trainer = ModelTrainer.load_checkpoint(checkpoint, corpus)

    # start training
    trainer.train(args.model_dir,
                  learning_rate=args.train_learning_rate,
                  mini_batch_size=args.per_gpu_batch_size,
                  max_epochs=args.num_train_epochs,
                  embeddings_storage_mode=args.embeddings_storage_mode)

    model = SequenceTagger.load(args.model_dir + '/final-model.pt')
    if (args.predict_file):
        with open(data_dir + args.predict_file, 'r') as f:
            str_file = f.read()

        sentence = Sentence(str_file)

        model.predict(sentence)
        print(sentence.to_tagged_string())
Example #28
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)
cachedir = Path(
    '/media/bubbles/fecf5b15-5a64-477b-8192-f8508a986ffe/ai/nishant/embeddings'
)
# # 4. initialize embeddings
embedding_types: List[TokenEmbeddings] = [

    # WordEmbeddings('glove'),

    # comment in this line to use character embeddings
    CharacterEmbeddings(
        path_to_char_dict=
        "/media/bubbles/fecf5b15-5a64-477b-8192-f8508a986ffe/ai/abs/flair-custom/custom_dict.pkl"
    ),

    # comment in these lines to use flair embeddings
    # FlairEmbeddings('news-forward'),
    # CharLMEmbeddings('news-forward',use_cache=True),
    ELMoEmbeddings('elmo-small'),
    # BertEmbeddings(),
    # FlairEmbeddings('news-backward-fast'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

#5. initialize sequence tagger
from flair.models import SequenceTagger
Example #29
search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS,
                 hp.choice,
                 options=[
                     StackedEmbeddings([
                         ELMoEmbeddings('original'),
                         FlairEmbeddings('news-forward'),
                         FlairEmbeddings('news-backward'),
                         BertEmbeddings('bert-large-cased')
                     ]),
                     StackedEmbeddings([
                         ELMoEmbeddings('original'),
                         FlairEmbeddings('news-forward'),
                         FlairEmbeddings('news-backward'),
                         BertEmbeddings('bert-large-cased'),
                         CharacterEmbeddings()
                     ])
                 ])
search_space.add(Parameter.HIDDEN_SIZE, hp.randint, upper=400)
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
search_space.add(Parameter.LEARNING_RATE,
                 hp.choice,
                 options=[0.05, 0.1, 0.15, 0.2])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[16, 32])
search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False])

# initialise embeddings

param_selector = SequenceTaggerParamSelector(
    corpus,
Example #30
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings, BertEmbeddings, CharacterEmbeddings, ELMoEmbeddings

model_name = "glove"
runs = 1
use_glove = True
use_cui2vec = False
use_flair = False
use_elmo = False
use_bert = False
mini_batch_size = 32
word_embeddings = []

if use_glove:
    word_embeddings.append(WordEmbeddings('glove'))
    word_embeddings.append(CharacterEmbeddings())
if use_cui2vec:
    word_embeddings.append(WordEmbeddings('./cui2vec_embed_vectors.bin'))
if use_flair:
    word_embeddings.append(FlairEmbeddings('./forward-lm.pt'))
    word_embeddings.append(FlairEmbeddings('./backward-lm.pt'))
if use_elmo:
    word_embeddings.append(ELMoEmbeddings('pubmed'))
if use_bert:
    word_embeddings.append(BertEmbeddings('./bert-base-clinical-cased'))
    mini_batch_size = 8

stacked_word_embeddings = StackedEmbeddings(word_embeddings)

from flair.embeddings import DocumentRNNEmbeddings