def get_flair_embeddings():
    # Multilingual forward/backward character LMs; a smaller chars_per_chunk trades speed for lower memory use.
    jw_forward: FlairEmbeddings = FlairEmbeddings("multi-forward",
                                                  chars_per_chunk=128)
    jw_backward: FlairEmbeddings = FlairEmbeddings("multi-backward",
                                                   chars_per_chunk=128)
    embeddings: list = [jw_forward, jw_backward]
    return embeddings
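A minimal usage sketch (not part of the original snippet; the sentence text is made up): the forward/backward models returned above would typically be wrapped in a StackedEmbeddings object before embedding a Sentence.

# Hedged usage sketch: stack the two character LMs and embed one sentence.
from flair.data import Sentence
from flair.embeddings import StackedEmbeddings

stacked = StackedEmbeddings(embeddings=get_flair_embeddings())
sentence = Sentence("Flair produces contextual string embeddings.")
stacked.embed(sentence)
for token in sentence:
    print(token.text, token.embedding.shape)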
Example No. 2
def main(data_folder: str, model_folder: str, dev_size: float,
         nb_epochs: int) -> None:
    nlp = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)

    corpus: Corpus = prepare_flair_train_test_corpus(spacy_model=nlp,
                                                     data_folder=data_folder,
                                                     dev_size=dev_size)
    tag_dictionary = corpus.make_tag_dictionary(tag_type='ner')
    print(tag_dictionary.idx2item)

    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('fr'),
        FlairEmbeddings('fr-forward'),
        FlairEmbeddings('fr-backward'),
    ]

    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            use_crf=True,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner')

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train(model_folder,
                  max_epochs=nb_epochs,
                  mini_batch_size=32,
                  embeddings_storage_mode="cpu",
                  checkpoint=False)
Example No. 3
    def train():
        # load training data in FastText format
        corpus = NLPTaskDataFetcher.load_classification_corpus(
            Path('./'),
            test_file='./data/test.txt',
            train_file='./data/train.txt')

        # Combine different embeddings:
        # GloVe word embeddings + Flair contextual string embeddings
        word_embeddings = [
            WordEmbeddings('glove'),
            FlairEmbeddings('news-forward-fast'),
            FlairEmbeddings('news-backward-fast')
        ]
        # use LSTM based method for combining the different embeddings
        document_embeddings = DocumentLSTMEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256)

        classifier = TextClassifier(
            document_embeddings,
            label_dictionary=corpus.make_label_dictionary(),
            multi_label=False)

        trainer = ModelTrainer(classifier, corpus)
        trainer.train('./models', max_epochs=10)
Example No. 4
    def build_train_sequence_tagger(corpus,
                                    tag_dictionary,
                                    params: Params,
                                    TAG_TYPE="ner"):
        embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=[
            WordEmbeddings("glove"),
            FlairEmbeddings("news-forward"),
            FlairEmbeddings("news-backward"),
        ])
        from flair.models import SequenceTagger

        tagger: SequenceTagger = SequenceTagger(
            hidden_size=256,
            embeddings=embeddings,
            tag_dictionary=tag_dictionary,
            tag_type=TAG_TYPE,
        )

        from flair.trainers import ModelTrainer

        corpus = Corpus(train=corpus.train, dev=corpus.dev, test=[])
        trainer: ModelTrainer = ModelTrainer(tagger, corpus)

        trainer.train(
            "flair_checkpoints",
            train_with_dev=False,
            max_epochs=params.max_epochs,
            save_final_model=False,
        )  # original

        return tagger
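A hedged follow-up sketch (corpus, tag_dictionary and params are assumed to be prepared elsewhere, and the example sentence is made up): the returned tagger can then annotate new text.

# Hypothetical usage of the trained tagger; the inputs above are assumed to exist.
from flair.data import Sentence

tagger = build_train_sequence_tagger(corpus, tag_dictionary, params)
sentence = Sentence("George Washington went to Washington.")
tagger.predict(sentence)
print(sentence.to_tagged_string())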
Example No. 5
def download_flair_models():
    # Instantiating each embedding class triggers a one-time download into the local Flair cache.
    w = WordEmbeddings("en-crawl")
    w = WordEmbeddings("news")
    w = FlairEmbeddings("news-forward-fast")
    w = FlairEmbeddings("news-backward-fast")
    w = FlairEmbeddings("mix-forward")
    w = BertEmbeddings("bert-base-uncased")
Example No. 6
def get_scibert_flair_embeddings():
    return [
        TransformerWordEmbeddings(model="allenai/scibert_scivocab_uncased",
                                  fine_tune=True),
        FlairEmbeddings("pubmed-forward"),
        FlairEmbeddings("pubmed-backward")
    ]
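A hedged sketch of how this stack might feed a NER tagger (corpus and tag_dictionary are assumptions, prepared elsewhere):

# Hypothetical: combine SciBERT + PubMed Flair embeddings for a sequence tagger.
from flair.embeddings import StackedEmbeddings
from flair.models import SequenceTagger

embeddings = StackedEmbeddings(embeddings=get_scibert_flair_embeddings())
tagger = SequenceTagger(hidden_size=256,
                        embeddings=embeddings,
                        tag_dictionary=tag_dictionary,  # assumed: corpus.make_tag_dictionary(tag_type="ner")
                        tag_type="ner")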
Example No. 7
def main():
    args = parse_args()

    if not os.path.exists(args.data_dir):
        raise Exception(f'Path does not exist: {args.data_dir}')

    # 1. Build corpus
    columns = {0: 'text', 1: 'ner'}
    corpus: Corpus = ColumnCorpus(args.data_dir,
                                  columns,
                                  train_file=args.train_file,
                                  dev_file=args.dev_file,
                                  test_file=args.test_file)

    print(corpus)
    print(corpus.obtain_statistics())

    # 2. What tag do we want to predict?
    tag_type = 'ner'

    # 3. Build tag dictionary
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # 4. Initialize embeddings
    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('crawl'),
        FlairEmbeddings(args.forward_flair_embeddings),
        FlairEmbeddings(args.backward_flair_embeddings),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    # 5. Initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=HIDDEN_SIZE,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type)

    # 6. Initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    if args.learning_rate_find:
        print('***** Plotting learning rate')
        # 7a. Find learning rate
        learning_rate_tsv = trainer.find_learning_rate(
            'temp', 'learning_rate.tsv', mini_batch_size=MINI_BATCH_SIZE)

    else:
        print('***** Running train')
        # 7b. Run Training
        trainer.train(
            'temp',
            learning_rate=0.1,
            mini_batch_size=MINI_BATCH_SIZE,
            # large dataset, so keep embeddings out of memory during training
            embeddings_storage_mode='none')

        tag_and_output(corpus.test, tagger,
                       os.path.join(args.data_dir, args.test_output_file),
                       tag_type)
Example No. 8
def flair_embeddings(sentences, tokenized_contents, output_file=None):
    if output_file:
        f = open(output_file, 'w')
    # init embedding
    flair_embedding_forward = FlairEmbeddings('news-forward')
    for i, (sent, sent_tokens) in enumerate(zip(sentences,
                                                tokenized_contents)):
        print(
            "Encoding the {}th input sentence for Flair embedding!".format(i))
        # Getting the tokens from our own tokenized sentence!
        tokens: List[Token] = [Token(token) for token in sent_tokens]

        if len(tokens) != len(sent_tokens):
            raise ValueError("tokens length does not match sent_tokens length")

        # Create new empty sentence
        sentence = Sentence()

        # add our own tokens
        sentence.tokens = tokens

        flair_embedding_forward.embed(sentence)

        for token in sentence:

            if output_file:
                f.write(
                    token.text + " " +
                    " ".join([str(num)
                              for num in token.embedding.tolist()]) + '\n')
            else:
                print(token.text + " " +
                      " ".join([str(num)
                                for num in token.embedding.tolist()]) + '\n')
        if output_file:
            f.write('\n')

    if output_file:
        f.close()
Example No. 9
def generate_topics_on_series(series):
    """https://towardsdatascience.com/covid-19-with-a-flair-2802a9f4c90f

    Returns:
        [type]: [description]
    """
    validate_text(series)

    # initialise embedding classes
    flair_embedding_forward = FlairEmbeddings("news-forward")
    flair_embedding_backward = FlairEmbeddings("news-backward")
    bert_embedding = BertEmbeddings("bert-base-uncased")

    # combine word embedding models
    document_embeddings = DocumentPoolEmbeddings(
        [bert_embedding, flair_embedding_backward, flair_embedding_forward])

    # set up empty tensor
    X = torch.empty(size=(len(series.index), 7168)).cuda()

    # fill tensor with embeddings
    i = 0
    for text in tqdm(series):
        sentence = Sentence(text)
        document_embeddings.embed(sentence)
        embedding = sentence.get_embedding()
        X[i] = embedding
        i += 1

    X = X.cpu().detach().numpy()
    torch.cuda.empty_cache()

    return X
Example No. 10
    def __init__(
        self,
        embedding_type: str,
        datasets_manager: DatasetsManager = None,
        device: Union[str, torch.device] = "cpu",
        word_tokens_namespace: str = "tokens",
    ):
        """ Flair Embeddings. This is used to produce Named Entity Recognition. Note: This only
        works if your tokens are produced by splitting based on white space

        Parameters
        ----------
        embedding_type
        datasets_manager
        device
        word_tokens_namespace
        """
        super(FlairEmbedder, self).__init__()
        self.allowed_type = ["en", "news"]
        assert embedding_type in self.allowed_type
        self.embedder_forward = FlairEmbeddings(f"{embedding_type}-forward")
        self.embedder_backward = FlairEmbeddings(f"{embedding_type}-backward")
        self.embedder_name = f"FlairEmbedder-{embedding_type}"
        self.datasets_manager = datasets_manager
        self.device = torch.device(device) if isinstance(device, str) else device
        self.word_tokens_namespace = word_tokens_namespace
Example No. 11
def test_visualize(resources_path):

    with open(resources_path / 'visual/snippet.txt') as f:
        sentences = [x for x in f.read().split('\n') if x]

    sentences = [Sentence(x) for x in sentences]

    embeddings = FlairEmbeddings('news-forward')

    visualizer = Visualizer()

    X_forward = visualizer.prepare_char_embeddings(embeddings, sentences)

    embeddings = FlairEmbeddings('news-backward')

    X_backward = visualizer.prepare_char_embeddings(embeddings, sentences)

    X = numpy.concatenate([X_forward, X_backward], axis=1)

    contexts = visualizer.char_contexts(sentences)

    trans_ = tSNE()
    reduced = trans_.fit(X)

    visualizer.visualize(reduced, contexts,
                         str(resources_path / 'visual/char_embeddings.html'))

    # clean up directory
    (resources_path / 'visual/char_embeddings.html').unlink()
Example No. 12
    def __init__(self,
                 vocab: Vocabulary,
                 model_dir_or_name: str = 'en-base-uncased',
                 layers: str = '-1',
                 pool_method: str = 'first',
                 word_dropout=0,
                 dropout=0,
                 include_cls_sep: bool = False,
                 pooled_cls=True,
                 requires_grad: bool = True,
                 auto_truncate: bool = False,
                 **kwargs):

        super(FlairEmbedding, self).__init__(vocab,
                                             word_dropout=word_dropout,
                                             dropout=dropout)

        if word_dropout > 0:
            assert vocab.unknown is not None, "When word_dropout > 0, the Vocabulary must contain an unknown token."

        self._word_sep_index = -100
        if '[SEP]' in vocab:
            self._word_sep_index = vocab['[SEP]']
        self._word_cls_index = -100
        if '[CLS]' in vocab:
            self._word_cls_index = vocab['[CLS]']

        self.vocab = vocab

        self.model = FlairEmbeddings(model=model_dir_or_name, fine_tune=False)

        self.requires_grad = requires_grad
        self._embed_size = self.model.embedding_length
class FlairEmbedding(EmbeddingBase):
    def __init__(self):
        self.forward_model = FlairEmbeddings("pl-forward")
        self.backward_model = FlairEmbeddings("pl-backward")
        self.size = 8192

    def _get_vector(self, forward: Sentence, backward: Sentence) -> np.ndarray:
        res = np.zeros(self.size, dtype=np.float32)
        for idx in range(len(forward)):
            out_fwd = np.fromiter(forward.tokens[idx].embedding.tolist(),
                                  dtype=np.float32)
            out_bwd = np.fromiter(backward.tokens[idx].embedding.tolist(),
                                  dtype=np.float32)
            out = np.hstack((out_fwd, out_bwd))
            res += out
        res /= len(forward)
        return res

    def batcher(self, params, batch: List[List[str]]) -> np.ndarray:
        batch = [
            Sentence(" ".join(sent)) if sent else Sentence(".") for sent in batch
        ]
        embeddings = []
        outputs_forward = self.forward_model.embed(batch)
        outputs_backward = self.backward_model.embed(batch)
        for forward, backward in zip(outputs_forward, outputs_backward):
            embeddings.append(self._get_vector(forward, backward))
        embeddings = np.vstack(embeddings)
        return embeddings

    def dim(self) -> int:
        return self.size
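A small usage sketch (hypothetical input, assuming the Polish forward/backward models can be downloaded): batcher averages per-token forward+backward vectors into one 8192-dimensional vector per sentence.

# Hypothetical usage: one averaged 8192-dim vector per input sentence.
embedder = FlairEmbedding()
vectors = embedder.batcher(None, [["Ala", "ma", "kota"], ["Flair", "działa"]])
print(vectors.shape)  # (2, 8192)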
Example No. 14
def train():
    corpus: Corpus = ClassificationCorpus(sst_folder,
                                          test_file='test.csv',
                                          dev_file='dev.csv',
                                          train_file='sst_dev.csv')

    label_dict = corpus.make_label_dictionary()
    stacked_embedding = WordEmbeddings('glove')

    # Stack Flair string-embeddings with optional embeddings
    word_embeddings = list(
        filter(None, [
            stacked_embedding,
            FlairEmbeddings('news-forward-fast'),
            FlairEmbeddings('news-backward-fast'),
        ]))
    # Initialize document embedding by passing list of word embeddings
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )
    # Define classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=False)

    trainer = ModelTrainer(classifier, corpus)
    trainer.train(model_path, max_epochs=10, train_with_dev=False)
Example No. 15
def test_sequence_tagger_param_selector(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION,
                                            base_path=tasks_base_path)
    search_space = SearchSpace()
    search_space.add(Parameter.EMBEDDINGS,
                     hp.choice,
                     options=[
                         StackedEmbeddings([WordEmbeddings(u'glove')]),
                         StackedEmbeddings([
                             WordEmbeddings(u'glove'),
                             FlairEmbeddings(u'news-forward'),
                             FlairEmbeddings(u'news-backward')
                         ])
                     ])
    search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False])
    search_space.add(Parameter.DROPOUT, hp.uniform, low=0.25, high=0.75)
    search_space.add(Parameter.WORD_DROPOUT, hp.uniform, low=0.0, high=0.25)
    search_space.add(Parameter.LOCKED_DROPOUT, hp.uniform, low=0.0, high=0.5)
    search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[64, 128])
    search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
    search_space.add(Parameter.OPTIMIZER, hp.choice, options=[SGD])
    search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[4, 8, 32])
    search_space.add(Parameter.LEARNING_RATE, hp.uniform, low=0.01, high=1)
    search_space.add(Parameter.ANNEAL_FACTOR, hp.uniform, low=0.3, high=0.75)
    search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5])
    search_space.add(Parameter.WEIGHT_DECAY, hp.uniform, low=0.01, high=1)
    optimizer = SequenceTaggerParamSelector(corpus,
                                            u'ner',
                                            results_base_path,
                                            max_epochs=2)
    optimizer.optimize(search_space, max_evals=2)
    shutil.rmtree(results_base_path)
Example No. 16
    def train(self):
        path = "./src/tmp/"
        self.training_data = self.convert_format(self.training_data)
    
        corpus: Corpus = ColumnCorpus(".", {0: 'text', 1: 'ner'},
                                      train_file=self.training_data)
        tag_dictionary = corpus.make_tag_dictionary(tag_type='ner')
        embedding_types: List[TokenEmbeddings] = [

            WordEmbeddings('fr'),
            FlairEmbeddings('fr-forward'),
            FlairEmbeddings('fr-backward'),
        ]

        embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)
        tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                                embeddings=embeddings,
                                                tag_dictionary=tag_dictionary,
                                                tag_type='ner',
                                                use_crf=True)
        self.trainer = ModelTrainer(tagger, corpus)
        save_path = path + self.model_name
        self.trainer.train(save_path,
                           learning_rate=self.learning_rate,
                           mini_batch_size=self.batch_size,
                           max_epochs=self.nb_iter,
                           embeddings_storage_mode=self.mode)
        self.is_ready = 1
Example No. 17
class Embedder:
    def __init__(self):
        self.embedder = FlairEmbeddings('news-forward-fast')
        self.embedding_length = self.__len__()

    def __len__(self):
        return self.embedder.embedding_length

    def __call__(self, sentences: np.ndarray):
        return self.embed(sentences)

    def embed(self, sentences: np.ndarray):
        if not isinstance(sentences, np.ndarray):
            raise TypeError(
                f'Expected numpy ndarray input got {type(sentences)}')

        if sentences.ndim != 2:
            raise TypeError(
                f'Expected a 2-dim numpy ndarray, got {sentences.ndim} dims; '
                f'try arr.reshape(-1, 1)')

        sentences = [Sentence(sentence[0]) for sentence in sentences]

        self.embedder.embed(sentences)

        embeddings = []
        for sentence in sentences:
            embeddings.append(
                torch.stack([token.embedding.cpu() for token in sentence]))

        return embeddings
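A minimal usage sketch (the input data is made up): the embedder expects a 2-D array with one raw sentence string per row and returns one per-token embedding tensor per sentence.

# Hypothetical usage: shape (n_sentences, 1), one sentence string per row.
import numpy as np

embedder = Embedder()
batch = np.array([["I love Berlin"], ["Flair embeddings are contextual"]])
token_embeddings = embedder(batch)
print(len(token_embeddings), token_embeddings[0].shape)  # 2 sentences, (n_tokens, embedding_length)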
def test_train_language_model(results_base_path, resources_path):
    # get default dictionary
    dictionary: Dictionary = Dictionary.load('chars')

    # init forward LM with 128 hidden states and 1 layer
    language_model: LanguageModel = LanguageModel(dictionary, is_forward_lm=True, hidden_size=128, nlayers=1)

    # get the example corpus and process at character level in forward direction
    corpus: TextCorpus = TextCorpus(resources_path / 'corpora/lorem_ipsum',
                                    dictionary,
                                    language_model.is_forward_lm,
                                    character_level=True)

    # train the language model
    trainer: LanguageModelTrainer = LanguageModelTrainer(language_model, corpus, test_mode=True)
    trainer.train(results_base_path, sequence_length=10, mini_batch_size=10, max_epochs=2)

    # use the character LM as embeddings to embed the example sentence 'I love Berlin'
    char_lm_embeddings = FlairEmbeddings(str(results_base_path / 'best-lm.pt'))
    sentence = Sentence('I love Berlin')
    char_lm_embeddings.embed(sentence)

    text, likelihood = language_model.generate_text(number_of_characters=100)
    assert (text is not None)
    assert (len(text) >= 100)

    # clean up results directory
    shutil.rmtree(results_base_path, ignore_errors=True)
Example No. 19
def test_generate_text_with_small_temperatures():
    from flair.embeddings import FlairEmbeddings
    language_model = FlairEmbeddings(u'news-forward-fast').lm
    (text, likelihood) = language_model.generate_text(temperature=0.01,
                                                      number_of_characters=100)
    assert (text is not None)
    assert (len(text) >= 100)
def train():
    columns = {0: 'text', 1: 'pos'}
    # init a corpus using column format, data folder and the names of the train, dev and test files
    corpus: Corpus = ColumnCorpus('', columns,
                                  train_file=args.train,
                                  test_file=args.test,
                                  dev_file=args.dev)

    tag_dictionary = corpus.make_tag_dictionary(tag_type='pos')

    # initialize embeddings
    embedding_types: List[TokenEmbeddings] = [
        CharacterEmbeddings(),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='pos',
                                            use_crf=True)

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train(args.model,
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=150)
Example No. 21
def _get_embedding_model(
    model_name_or_path: Union[str, HFModelResult, FlairModelResult]
) -> Union[FlairEmbeddings, WordEmbeddings, TransformerWordEmbeddings,
           Sentence]:
    "Load the proper `Embeddings` model from `model_name_or_path`"
    if isinstance(model_name_or_path, FlairModelResult):
        # str.strip removes a set of characters, not a prefix; split off the repo prefix instead
        nm = model_name_or_path.name.split('/')[-1]
        try:
            return WordEmbeddings(nm)
        except Exception:
            return FlairEmbeddings(nm)

    elif isinstance(model_name_or_path, HFModelResult):
        return TransformerWordEmbeddings(model_name_or_path.name)
    else:
        res = _flair_hub.search_model_by_name(model_name_or_path,
                                              user_uploaded=True)
        if len(res) < 1:
            # No models found
            res = _hf_hub.search_model_by_name(model_name_or_path,
                                               user_uploaded=True)
            if len(res) < 1:
                raise ValueError(
                    f'Embeddings not found for the model key: {model_name_or_path}, check documentation or custom model path to verify specified model'
                )
            else:
                return TransformerWordEmbeddings(
                    res[0].name
                )  # Returning the first should always be the non-fast option
        else:
            nm = res[0].name.split('/')[-1]  # drop the 'flairNLP/' prefix safely
            try:
                return WordEmbeddings(nm)
            except Exception:
                return FlairEmbeddings(nm)
    def __init__(self,
                 parent_dir_dataset: str,
                 dataset_name: str,
                 parent_dir_data: str,
                 parent_dir_model: str,
                 sentiment_model_dir: str,
                 word_embeddings: list = None):

        self.parent_dir_dataset = parent_dir_dataset
        self.dataset_name = dataset_name
        self.parent_dir_data = parent_dir_data
        self.parent_dir_model = parent_dir_model
        self.sentiment_model_dir = sentiment_model_dir

        self.dataset_filepath = os.path.join(self.parent_dir_dataset,
                                             self.dataset_name)
        self.model_filepath = os.path.join(self.parent_dir_model,
                                           self.sentiment_model_dir)
        self.train_filename = os.path.join(self.parent_dir_data, "train.csv")
        self.test_filename = os.path.join(self.parent_dir_data, "test.csv")
        self.dev_filename = os.path.join(self.parent_dir_data, "dev.csv")
        self.column_name_map = {}
        if word_embeddings is None:
            self.word_embeddings = [
                FlairEmbeddings('news-forward'),
                FlairEmbeddings('news-backward')
            ]
        else:
            self.word_embeddings = word_embeddings
        self.corpus = None
        self.document_RNNEmbeddings = None
        self.label = gv.label_sarcasm
        self.renamed_columns = gv.sarcasm_renamed_columns
        if gv.logger is None:
            gv.init_logger_object()
Example No. 23
    def initialize_embeddings(self, fastbert=True, stackedembeddings=True):

        # Consider using pooling_operation="first", use_scalar_mix=True for the parameters

        # initialize individual embeddings
        if fastbert:
            bert_embedding = BertEmbeddings('distilbert-base-uncased',
                                            layers='-1')

        else:
            bert_embedding = BertEmbeddings('bert-base-cased', layers='-1')

        if stackedembeddings:
            glove_embedding = WordEmbeddings('glove')

            # init Flair forward and backwards embeddings
            flair_embedding_forward = FlairEmbeddings('news-forward')
            flair_embedding_backward = FlairEmbeddings('news-backward')

            embedding_types = [
                bert_embedding, glove_embedding, flair_embedding_forward,
                flair_embedding_backward
            ]

            embeddings = StackedEmbeddings(embeddings=embedding_types)

        else:

            embeddings = bert_embedding

        return embeddings
Example No. 24
    def create_embeddings(self) -> StackedEmbeddings:

        embedding_types: List[TokenEmbeddings] = []
        
        if self.config['use_word_embeddings']:
            embedding_types.append(W2vWordEmbeddings(self.config['word_embeddings_path']))

        if self.config['use_char_embeddings']:
            embedding_types.append(CharacterEmbeddings())


        if self.config['use_flair_embeddings']:
            embedding_types.append(FlairEmbeddings('es-clinical-forward'))
            embedding_types.append(FlairEmbeddings('es-clinical-backward'))
        
        if self.config['use_beto_embeddings']:
            embedding_types.append(
                TransformerWordEmbeddings(
                    'dccuchile/bert-base-spanish-wwm-cased',
                    layers=self.config['layers'],
                    layer_mean=self.config['layer_mean'],
                    subtoken_pooling=self.config['subtoken_pooling']))

        embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)
        return embeddings
Example No. 25
def out_embedding(type_, model, n_layers, stacked=False):
    '''
    Create an embedding object for later use.
    :param type_: (str) type of embedding (currently only BERT or Flair embeddings)
    :param model: (str) pretrained model for the BERT embedding
    :param n_layers: (int) number of last BERT layers to use
    :param stacked: (bool) whether to stack this embedding with the Flair embeddings
    :return embedding: (BertEmbeddings / StackedEmbeddings) embedding object
    '''
    out_layers = ','.join([str(-i) for i in range(1, n_layers + 1)])
    if not stacked:
        if type_.lower() == 'bert':
            embedding = BertEmbeddings(bert_model_or_path=model,
                                       layers=out_layers)
            return embedding
        else:
            emb = WordEmbeddings('glove')
    else:
        emb = BertEmbeddings(bert_model_or_path=model, layers=out_layers)

    flair_forward = FlairEmbeddings('news-forward-fast')
    flair_backward = FlairEmbeddings('news-backward-fast')
    embedding = StackedEmbeddings(
        embeddings=[flair_forward, flair_backward, emb])

    return embedding
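A quick usage sketch (the BERT model name is just an example): stacking the last four BERT layers with the fast news Flair models.

# Hypothetical call: BERT (last 4 layers) stacked with forward/backward Flair embeddings.
embedding = out_embedding(type_='bert', model='bert-base-uncased', n_layers=4, stacked=True)
print(embedding.embedding_length)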
def flair_embeddings(sentences, output_file=None):
    if output_file:
        f = open(output_file, 'w')

    # multilingual forward model; language-specific models such as 'nl-forward'
    # or 'fr-forward' are also available
    embedder = FlairEmbeddings("multi-forward")

    document_embedding = []
    for i, sent in enumerate(sentences):
        print("Encoding the {}th input sentence!".format(i))
        # create a sentence
        sentence = Sentence(" ".join(sent))

        # embed words in sentence
        embedder.embed(sentence)
        sentence_embedding = np.mean(
            [token.embedding.cpu().numpy() for token in sentence],
            axis=0)  #have to go from CUDA tensor to cpu tensor
        document_embedding.append(sentence_embedding)

        if output_file:
            for token in sentence:
                f.write(
                    token.text + "\t" +
                    "\t".join([str(num)
                               for num in token.embedding.tolist()]) + '\n')
    if output_file:
        f.close()

    document_embedding = np.mean(document_embedding, axis=0)
    return document_embedding
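A brief usage sketch (the token lists are made up): the function expects pre-tokenized sentences and returns the mean document vector.

# Hypothetical usage with two pre-tokenized sentences; returns their mean embedding.
doc_vector = flair_embeddings([["Flair", "is", "multilingual"], ["Zo", "werkt", "het"]])
print(doc_vector.shape)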
Example No. 27
    def __init__(self,
                 num_classes: int = 2,
                 bidirectional: bool = False,
                 rnn_layers: int = 1,
                 hidden_size: int = 256,
                 rnn_type: str = 'GRU'):

        super(ATAE_LSTM, self).__init__()

        self.stackedembeddings: StackedEmbeddings = StackedEmbeddings([
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward')
        ])
        self.wordembeddings: StackedEmbeddings = StackedEmbeddings(
            [WordEmbeddings('glove')])
        self.embedding_dimension: int = self.stackedembeddings.embedding_length + self.wordembeddings.embedding_length
        self.bidirectional: bool = bidirectional
        self.rnn_layers: int = rnn_layers
        self.rnn_type: str = rnn_type
        self.num_classes: int = num_classes
        self.hidden_size: int = hidden_size

        if self.rnn_type == 'GRU':
            self.rnn = torch.nn.GRU(self.embedding_dimension,
                                    self.hidden_size,
                                    bidirectional=self.bidirectional,
                                    num_layers=self.rnn_layers)
        else:
            self.rnn = torch.nn.LSTM(self.embedding_dimension,
                                     self.hidden_size,
                                     bidirectional=self.bidirectional,
                                     num_layers=self.rnn_layers)

        self.attention = Attention()
Example No. 28
def load_context_embeddings_with_flair(direction='bi', word_embeddings=True, cache_dir=DEFAULT_CACHE_DIR,
                                       verbose=False):
    """

    :param bidirectional:
    :param cache_dir:
    :param verbose:
    """
    from flair.embeddings import FlairEmbeddings
    from flair.embeddings import WordEmbeddings
    from flair.embeddings import StackedEmbeddings

    embeddings = []

    if word_embeddings:
        fasttext_embedding = WordEmbeddings('da')
        embeddings.append(fasttext_embedding)

    if direction == 'bi' or direction == 'fwd':
        fwd_weight_path = download_model('flair.fwd', cache_dir, verbose=verbose, process_func=_unzip_process_func)
        embeddings.append(FlairEmbeddings(fwd_weight_path))

    if direction == 'bi' or direction == 'bwd':
        bwd_weight_path = download_model('flair.bwd', cache_dir, verbose=verbose, process_func=_unzip_process_func)
        embeddings.append(FlairEmbeddings(bwd_weight_path))

    if len(embeddings) == 1:
        return embeddings[0]

    return StackedEmbeddings(embeddings=embeddings)
Example No. 29
def embed_dataset() -> List:
    # init standard GloVe embedding
    glove_embedding = WordEmbeddings('glove')
    flair_embedding_forward = FlairEmbeddings('news-forward')

    # create a StackedEmbeddings object that combines GloVe and forward Flair embeddings
    stacked_embeddings = StackedEmbeddings([
        glove_embedding,
        flair_embedding_forward,
    ])
    sentence_dataset = load_dataset(
        '/Users/haraldott/Development/thesis/anomaly_detection_main/logparser/Drain/Drain_result/st_0.2 depth_2/openstack_normal_10k.csv'
    )

    embedded_sentences = []
    count = 0.0
    for s in sentence_dataset:
        sentence = Sentence(s)
        stacked_embeddings.embed(sentence)
        embedded_sentences.append(sentence)
        if count % 50 == 0 or count == len(sentence_dataset):
            print('Processed {0:.1f}% of log lines.'.format(
                count * 100.0 / len(sentence_dataset)))
        count += 1
    words = []
    for sentence in embedded_sentences:
        for word in sentence:
            words.append(word.embedding)  #  TODO: is this correct? return all
    torch.save(words, '10k_depth_2_st_0.2.pt')
    return words
Example No. 30
def optimize_lr():

    corpus, label_dictionary = load_corpus()

    embeddings = [
        WordEmbeddings('glove'),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward')
    ]

    document_embeddings = DocumentRNNEmbeddings(embeddings,
                                                hidden_size=512,
                                                reproject_words=True,
                                                reproject_words_dimension=256,
                                                bidirectional=True)
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dictionary,
                                multi_label=False)
    trainer = ModelTrainer(classifier, corpus)

    # 7. find learning rate
    learning_rate_tsv = trainer.find_learning_rate('resources/classifiers/',
                                                   'learning_rate.tsv')

    # 8. plot the learning rate finder curve
    from flair.visual.training_curves import Plotter
    plotter = Plotter()
    plotter.plot_learning_rate(learning_rate_tsv)