Example #1
def create_embeddings(params):
    embedding_type = params["embedding_type"]
    assert embedding_type in ["bert", "flair", "char"]
    if embedding_type == "bert":
        bert_embedding = BertEmbeddings(params["bert_model_dirpath_or_name"],
                                        pooling_operation="mean")

        embedding_types: List[TokenEmbeddings] = [bert_embedding]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)
    elif embedding_type == "flair":
        glove_embedding = WordEmbeddings(
            '/opt/kanarya/glove/GLOVE/GloVe/vectors.gensim')
        word2vec_embedding = WordEmbeddings(
            '/opt/kanarya/huawei_w2v/vector.gensim')
        fast_text_embedding = WordEmbeddings('tr')
        char_embedding = CharacterEmbeddings()

        # bert_embedding = BertEmbeddings('../bert_pretraining/pretraining_outputs/pretraining_output_batch_size_32')
        embedding_types: List[TokenEmbeddings] = [
            fast_text_embedding, glove_embedding, word2vec_embedding,
            char_embedding
        ]
        # embedding_types: List[TokenEmbeddings] = [custom_embedding]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)
    elif embedding_type == "char":
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=[CharacterEmbeddings()])
    else:
        embeddings = None

    return embeddings
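A minimal call sketch (not from the original source), assuming the flair.embeddings classes used above are already imported; "char" is the only option that needs no local model paths:

params = {"embedding_type": "char"}   # hypothetical params dict
char_stack = create_embeddings(params)
print(char_stack.embedding_length)    # dimensionality of the stacked character embeddings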
Example #2
 def __init__(self, pipeline):
     self.mode = pipeline.mode
     self.type = pipeline.embedding_type
     embedders = []
     for component in pipeline.embedders:
         if "forward" in component or "backward" in component:
             embedders.append(FlairEmbeddings(component))
         elif "glove" in component:
             embedders.append(WordEmbeddings(component))
         elif "bert" in component:
             embedders.append(BertEmbeddings(component))
         elif len(component) == 2:
             # see https://github.com/zalandoresearch/flair/blob/master/resources/docs/embeddings/FASTTEXT_EMBEDDINGS.md#fasttext-embeddings
             embedders.append(WordEmbeddings(component))
             embedders.append(BytePairEmbeddings(component))
         else:
             raise ValueError(f"unknown embedder: {component}")
     if self.type == "document":
         self.embedder = self._make_doc_embedder(pipeline, embedders)
     elif self.type == "word":
         self.embedder = StackedEmbeddings(embedders)
     elif self.type == "both":
         self.embedders = [
             self._make_doc_embedder(pipeline, embedders),
             StackedEmbeddings(embedders),
         ]
     else:
         raise ValueError(
             f"Innapropriate embedding type {pipeline.embedding_type}, "
             "should be 'word', 'document', or 'both'.")
Example #3
def test_sequence_tagger_param_selector(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION,
                                            base_path=tasks_base_path)
    search_space = SearchSpace()
    search_space.add(Parameter.EMBEDDINGS,
                     hp.choice,
                     options=[
                         StackedEmbeddings([WordEmbeddings(u'glove')]),
                         StackedEmbeddings([
                             WordEmbeddings(u'glove'),
                             FlairEmbeddings(u'news-forward'),
                             FlairEmbeddings(u'news-backward')
                         ])
                     ])
    search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False])
    search_space.add(Parameter.DROPOUT, hp.uniform, low=0.25, high=0.75)
    search_space.add(Parameter.WORD_DROPOUT, hp.uniform, low=0.0, high=0.25)
    search_space.add(Parameter.LOCKED_DROPOUT, hp.uniform, low=0.0, high=0.5)
    search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[64, 128])
    search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
    search_space.add(Parameter.OPTIMIZER, hp.choice, options=[SGD])
    search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[4, 8, 32])
    search_space.add(Parameter.LEARNING_RATE, hp.uniform, low=0.01, high=1)
    search_space.add(Parameter.ANNEAL_FACTOR, hp.uniform, low=0.3, high=0.75)
    search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5])
    search_space.add(Parameter.WEIGHT_DECAY, hp.uniform, low=0.01, high=1)
    optimizer = SequenceTaggerParamSelector(corpus,
                                            u'ner',
                                            results_base_path,
                                            max_epochs=2)
    optimizer.optimize(search_space, max_evals=2)
    shutil.rmtree(results_base_path)
Example #4
    def __init__(self, *embeddings: str):
        print("May need a couple moments to instantiate...")
        self.embedding_stack = []

        # Load correct Embeddings module
        for model_name_or_path in embeddings:
            if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
                self.embedding_stack.append(BertEmbeddings(model_name_or_path))
            elif "roberta" in model_name_or_path:
                self.embedding_stack.append(
                    RoBERTaEmbeddings(model_name_or_path))
            elif "gpt2" in model_name_or_path:
                self.embedding_stack.append(
                    OpenAIGPT2Embeddings(model_name_or_path))
            elif "xlnet" in model_name_or_path:
                self.embedding_stack.append(
                    XLNetEmbeddings(model_name_or_path))
            elif "xlm" in model_name_or_path:
                self.embedding_stack.append(XLMEmbeddings(model_name_or_path))
            elif ("flair" in model_name_or_path
                  or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
                self.embedding_stack.append(
                    FlairEmbeddings(model_name_or_path))
            else:
                print(
                    f"Corresponding flair embedding module not found for {model_name_or_path}"
                )

        assert len(self.embedding_stack) != 0
        self.stacked_embeddings = StackedEmbeddings(
            embeddings=self.embedding_stack)
Example #5
    def __init__(self,
                 num_classes: int = 2,
                 bidirectional: bool = False,
                 rnn_layers: int = 1,
                 hidden_size: int = 256,
                 rnn_type: str = 'GRU'):

        super(ATAE_LSTM, self).__init__()

        self.stackedembeddings: StackedEmbeddings = StackedEmbeddings([
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward')
        ])
        self.wordembeddings: StackedEmbeddings = StackedEmbeddings(
            [WordEmbeddings('glove')])
        self.embedding_dimension: int = self.stackedembeddings.embedding_length + self.wordembeddings.embedding_length
        self.bidirectional: bool = bidirectional
        self.rnn_layers: int = rnn_layers
        self.rnn_type: str = rnn_type
        self.num_classes: int = num_classes
        self.hidden_size: int = hidden_size

        if self.rnn_type == 'GRU':
            self.rnn = torch.nn.GRU(self.embedding_dimension,
                                    self.hidden_size,
                                    bidirectional=self.bidirectional,
                                    num_layers=self.rnn_layers)
        else:
            self.rnn = torch.nn.LSTM(self.embedding_dimension,
                                     self.hidden_size,
                                     bidirectional=self.bidirectional,
                                     num_layers=self.rnn_layers)

        self.attention = Attention()
Example #6
    def __init__(self):

        # Sequence Tagging Model
        tagger_file = self.modelpath + 'tagger.pt'
        if Path(tagger_file).is_file():
            print('loading tagger from file')
            self.tagger = SequenceTagger.load_from_file(tagger_file)
        else:
            print('downloading pretrained tagger')
            self.tagger = SequenceTagger.load('ner-ontonotes')
            self.tagger.save(tagger_file)

        # Text Embedding Model
        embeddings_file = self.modelpath + 'embeddings.pickle'
        if Path(embeddings_file).is_file():
            print('loading embedder from file')
            filestream = open(embeddings_file, 'rb')
            self.embeddings = pickle.load(filestream)
        else:
            print('downloading pretrained embedders')
            self.embeddings = [
                # WordEmbeddings('glove'),
                FlairEmbeddings('multi-forward')
                # FlairEmbeddings('multi-backward')
            ]
            filestream = open(embeddings_file, 'wb')
            pickle.dump(self.embeddings, filestream)

        self.token_embedder = StackedEmbeddings(self.embeddings)
        self.doc_embedder = DocumentPoolEmbeddings(self.embeddings)
Example #7
    def __init__(self, *embeddings: str):
        print("May need a couple moments to instantiate...")
        self.embedding_stack = []

        # Load correct Embeddings module
        for model_name_or_path in embeddings:
            if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
                self.embedding_stack.append(BertEmbeddings(model_name_or_path))
            elif "roberta" in model_name_or_path:
                self.embedding_stack.append(
                    RoBERTaEmbeddings(model_name_or_path))
            elif "gpt2" in model_name_or_path:
                self.embedding_stack.append(
                    OpenAIGPT2Embeddings(model_name_or_path))
            elif "xlnet" in model_name_or_path:
                self.embedding_stack.append(
                    XLNetEmbeddings(model_name_or_path))
            elif "xlm" in model_name_or_path:
                self.embedding_stack.append(XLMEmbeddings(model_name_or_path))
            elif ("flair" in model_name_or_path
                  or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
                self.embedding_stack.append(
                    FlairEmbeddings(model_name_or_path))
            else:
                try:
                    self.embedding_stack.append(
                        WordEmbeddings(model_name_or_path))
                except ValueError:
                    raise ValueError(
                        f"Embeddings not found for the model key: {model_name_or_path}, check documentation or custom model path to verify specified model"
                    )

        assert len(self.embedding_stack) != 0
        self.stacked_embeddings = StackedEmbeddings(
            embeddings=self.embedding_stack)
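A hedged usage sketch; the enclosing class is not named in the snippet, so EasyStack below is a hypothetical stand-in, and whether "news-forward" resolves depends on the contents of FLAIR_PRETRAINED_MODEL_NAMES:

stack = EasyStack("bert-base-cased", "news-forward", "glove")   # EasyStack is hypothetical
# "bert-base-cased" -> BertEmbeddings, "news-forward" -> FlairEmbeddings (if listed in
# FLAIR_PRETRAINED_MODEL_NAMES), "glove" -> WordEmbeddings via the try/except fallback.
print(stack.stacked_embeddings.embedding_length)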
Example #8
class Embedder(object):

    def __init__(self, embedding=None, method=None, batch_size=5):

        assert method in [None, "average"], "Bad method"
        self.method = method
        self.batch_size = batch_size

        if embedding is not None:
            self.embedding = StackedEmbeddings(embedding)
        else:
            self.embedding = StackedEmbeddings([
                #WordEmbeddings('glove'),
                #WordEmbeddings('en-news'),
                #BytePairEmbeddings('en'),
                WordEmbeddings('crawl')
            ])

    def embed_data(self, sentences):
        sentences = [Sentence(s) for s in sentences]
        self.embedding.embed(sentences)

        if self.method == "average":
            sentences = [torch.stack([word.embedding.detach().cpu() for word in s]).mean(
                0) for s in sentences]
        else:
            sentences = [torch.stack(
                [word.embedding.detach().cpu() for word in s]) for s in sentences]

        return sentences

    def embed_dataset(self, sentences):
        sentences = self.embed_data(sentences)
        return sentences
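A short usage sketch, assuming flair is installed and the 'crawl' fastText vectors can be downloaded on first use:

embedder = Embedder(method="average")
vectors = embedder.embed_dataset(["a first sentence", "a second one"])
print(vectors[0].shape)   # one mean-pooled vector per sentence (300 dims for 'crawl')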
Example #9
class FlairEmbeddings(object):
    def __init__(self):
        self.stop_words = list(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.stacked_embeddings = StackedEmbeddings(
            embeddings=[flair_embedding_forward, flair_embedding_backward])

    def word_token(self, tokens, lemma=False):
        tokens = str(tokens)
        tokens = re.sub(
            r"([\w].)([\~\!\@\#\$\%\^\&\*\(\)\-\+\[\]\{\}\/\"\'\:\;])([\s\w].)",
            "\\1 \\2 \\3", tokens)
        tokens = re.sub(r"\s+", " ", tokens)
        if lemma:
            return " ".join([
                self.lemmatizer.lemmatize(token, 'v')
                for token in word_tokenize(tokens.lower())
                if token not in self.stop_words and token.isalpha()
            ])
        else:
            return " ".join([
                token for token in word_tokenize(tokens.lower())
                if token not in self.stop_words and token.isalpha()
            ])

    def cos_sim(self, a, b):
        return np.inner(a, b) / (np.linalg.norm(a) * (np.linalg.norm(b)))

    def getFlairEmbedding(self, text):
        sentence = Sentence(text)
        self.stacked_embeddings.embed(sentence)
        return np.mean([np.array(token.embedding) for token in sentence],
                       axis=0)
Example #10
def test_stacked_embeddings():
    (sentence, glove, charlm) = init_document_embeddings()
    embeddings = StackedEmbeddings([glove, charlm])
    embeddings.embed(sentence)
    for token in sentence.tokens:
        assert (len(token.get_embedding()) == 1124)
        token.clear_embeddings()
        assert (len(token.get_embedding()) == 0)
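The asserted length of 1124 presumably decomposes as GloVe (100 dimensions) plus a single 1024-dimensional character-LM embedding: 100 + 1024 = 1124; clear_embeddings() then empties the per-token vectors, so the second assertion sees length 0.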
Example #11
class FlairPretrained(ModelBase):
    """
    Encapsulates pretrained Flair Embeddings (Zalando Flair) by conforming to the ModelBase interface.
    """
    def __init__(self, model=None):
        super(FlairPretrained, self).__init__()

        if model is not None:
            self.model = model
        else:
            self.model = StackedEmbeddings([
                FlairEmbeddings('news-forward-fast'),
                FlairEmbeddings('news-backward-fast'),
            ])

    def dim(self) -> int:
        """
        The dimensionality of created embeddings.

        :return: 2048 (for now, #TODO)
        """
        return 2048

    def get_word_vector(self, word: str) -> Optional[np.ndarray]:
        """
        Returns the word vector for |word|, or None. Using this method is discouraged, as it defeats the purpose of
        Flair's contextual embeddings; prefer vectorizing the full context for more accurate results.

        In reality, Flair embeddings never return None, even for bogus words.

        :param word: The word to vectorize.
        :return: Either the word vector or None.
        """
        dummy_sentence = Sentence(word)
        self.model.embed(dummy_sentence)
        return np.array(list(dummy_sentence)[0].embedding)

    def get_word_vectors(self, words: List[str]) -> List[np.ndarray]:
        """
        Vectorizes the list of words, using pretrained Flair embeddings. These embeddings are context dependent, so this
        method is preferred over fetching word vectors for single words.

        :param words: The list of words to vectorize.
        :return: A list of word vectors.
        """
        sentence = Sentence(' '.join(words))
        self.model.embed(sentence)
        return list(
            map(lambda token: np.array(token.embedding), list(sentence)))

    def vectorize_context(self, words: List[str]) -> Optional[np.ndarray]:
        """
        Transforms the context into a single vector. May return None in extreme cases, e.g. if |words| is an empty list.

        :param words: List of tokens describing the context.
        :return: A single word vector or None.
        """
        return self.mean_of_words(self.get_word_vectors(words))
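A hedged usage sketch (run on CPU, since the snippet converts embeddings directly to numpy); the two 'fast' Flair LMs are 1024-dimensional each, consistent with dim() returning 2048:

model = FlairPretrained()
vectors = model.get_word_vectors(["the", "quick", "brown", "fox"])
print(len(vectors), vectors[0].shape)   # 4 vectors, each of length 2048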
Example #12
class DefaultFeaturizerForSeqTagging(ObservationFeaturizer):
    def __init__(self,
                 action_space: ActionSpace,
                 embedding_type: str = "fasttext",
                 device: str = "cpu"):
        self.device = device
        self._setup_device()
        embeddings = EmbeddingRegistry.get_embedding(embedding_type)
        self.doc_embeddings = StackedEmbeddings(embeddings).to(
            torch.device(self.device))
        self.action_space = action_space
        self._current_token_embeddings: List[torch.tensor] = None

    def _setup_device(self):
        import flair, torch
        flair.device = torch.device(self.device)

    def init_on_reset(self, input_text: Union[List[str], str]):
        sent = Sentence(input_text)
        self.doc_embeddings.embed(sent)
        self._current_token_embeddings = [
            token.embedding.cpu().detach() for token in sent
        ]
        sent.clear_embeddings()

    def featurize(self, observation: Observation) -> torch.Tensor:
        input_vector = self._featurize_input(observation.get_current_index())
        context_vector = self._featurize_context(
            observation.get_current_action_history())
        concatenated = torch.cat((input_vector, context_vector), dim=0)
        return concatenated

    def get_observation_dim(self) -> int:
        return self._get_input_dim() + self._get_context_dim()

    def _featurize_input(self, input_index: int) -> torch.Tensor:
        input_features = self._current_token_embeddings[input_index]
        return input_features

    def _featurize_context(self, context: List[str]) -> torch.Tensor:
        # consider only last action
        context_vector = torch.zeros(self.action_space.size())
        context_ = [context[-1]] if len(context) > 0 else []
        action_indices = [
            self.action_space.action_to_ix(action) for action in context_
        ]
        context_vector[action_indices] = 1.0
        return context_vector

    def _get_input_dim(self):
        sent = Sentence("A random text to get the embedding dimension")
        self.doc_embeddings.embed(sent)
        dim = sent[0].embedding.shape[0]
        sent.clear_embeddings()
        return dim

    def _get_context_dim(self):
        return self.action_space.size()
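A hedged driving sketch; ActionSpace and EmbeddingRegistry come from the surrounding RL project rather than from flair, so the constructor call below is illustrative only:

featurizer = DefaultFeaturizerForSeqTagging(action_space=my_action_space,   # my_action_space: an ActionSpace instance built elsewhere
                                             embedding_type="fasttext")
featurizer.init_on_reset("John lives in Berlin")
print(featurizer.get_observation_dim())   # token embedding dim + number of actions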
Example #13
def args_init(args):
    # initialize word2vec
    args.word2vec = KeyedVectors.load_word2vec_format('data/mymodel-new-5-%d' %
                                                      args.model_dim,
                                                      binary=True)

    # initialize contextual embedding dimensions
    if args.contextual_embedding == 'word2vec':
        args.word_dim = args.tag_dim = args.dis_dim = 50
        args.stacked_embeddings = 'word2vec'
    elif args.contextual_embedding == 'elmo':  #glove + elmo
        args.word_dim = args.tag_dim = args.dis_dim = 868
        ## stacked embeddings
        # create a StackedEmbedding object that combines glove and forward/backward flair embeddings
        args.stacked_embeddings = StackedEmbeddings(
            [WordEmbeddings('glove'),
             ELMoEmbeddings('small')])

    elif args.contextual_embedding == 'bert':  #glove + bert
        args.word_dim = args.tag_dim = args.dis_dim = 3172
        args.stacked_embeddings = StackedEmbeddings(
            [WordEmbeddings('glove'),
             BertEmbeddings('bert-base-uncased')])
        args.batch_size = 8

    elif args.contextual_embedding == 'flair':  #glove + flair-forward + flair-backward
        args.word_dim = args.tag_dim = args.dis_dim = 4196
        args.stacked_embeddings = StackedEmbeddings([
            WordEmbeddings('glove'),
            FlairEmbeddings('mix-forward', chars_per_chunk=128),
            FlairEmbeddings('mix-backward', chars_per_chunk=128)
        ])
        if args.agent_mode == 'act':
            args.batch_size = 8
        else:
            args.batch_size = 8

    elif args.contextual_embedding == 'glove':  # not tested
        args.word_dim = args.tag_dim = args.dis_dim = 100
        args.stacked_embeddings = StackedEmbeddings([
            WordEmbeddings('glove'),
        ])

    # weights loaded, set exploration rate to minimum
    if args.load_weights:  # 1 to 0.1. decayed to minimum.
        args.exploration_rate_start = args.exploration_rate_end

    # agent mode arguments, set number of words to 100
    if args.agent_mode == 'arg':
        args.num_words = args.context_len
        args.display_training_result = 0

    args.result_dir = 'results/%s_%s_%s' % (args.domain, args.agent_mode,
                                            args.contextual_embedding)

    return args
Example #14
def train_model(directory='Data', use_BERT=True):
    # define columns
    columns = {
        0: 'ID',
        1: 'text',
        2: 'empty_0',
        3: 'pos',
        4: 'empty_1',
        5: 'empty_2',
        6: 'empty_3',
        7: 'empty_4',
        8: 'empty_5',
        9: 'tox'
    }

    # this is the folder in which train, test and dev files reside
    data_folder = directory

    # init a corpus using column format, data folder and the names of the train, dev and test files
    corpus: Corpus = ColumnCorpus(data_folder,
                                  columns,
                                  train_file='converted_data_train.conll',
                                  test_file='converted_data_test.conll',
                                  dev_file='converted_data_dev.conll')

    # tag to predict
    tag_type = 'tox'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # embeddings
    if use_BERT:
        bert_embeddings = [
            TransformerWordEmbeddings('bert-large-uncased', fine_tune=True)
        ]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=bert_embeddings)
    else:
        embedding_types = [WordEmbeddings('glove')]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)

    # initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=True)

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # start training
    trainer.train('resources/taggers/toxic_classifier_bert',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=5)
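A minimal invocation sketch; the three .conll files named above must already exist under the data folder, and use_BERT=False keeps the run on GloVe only:

train_model(directory='Data', use_BERT=False)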
Example #15
    def __init__(self, model=None):
        super(FlairPretrained, self).__init__()

        if model is not None:
            self.model = model
        else:
            self.model = StackedEmbeddings([
                FlairEmbeddings('news-forward-fast'),
                FlairEmbeddings('news-backward-fast'),
            ])
Example #16
 def __init__(self,
              action_space: ActionSpace,
              embedding_type: str = "fasttext",
              device: str = "cpu"):
     self.device = device
     self._setup_device()
     embeddings = EmbeddingRegistry.get_embedding(embedding_type)
     self.doc_embeddings = StackedEmbeddings(embeddings).to(
         torch.device(self.device))
     self.action_space = action_space
     self._current_token_embeddings: List[torch.tensor] = None
Example #17
    def __init__(self, forward, backward, use_tokenizer, *args, **kwargs):

        super(FlairEmbeddings, self).__init__(*args, **kwargs)

        self._forward = forward
        self._backward = backward
        self._use_tokenizer = use_tokenizer

        from flair.embeddings import FlairEmbeddings as FLEmbeddings
        from flair.embeddings import StackedEmbeddings

        self._embeddings = StackedEmbeddings([FLEmbeddings(forward), FLEmbeddings(backward)])
Example #18
    def __init__(self, *embeddings: str):
        print("May need a couple moments to instantiate...")
        self.embedding_stack = []

        # Load correct Embeddings module
        for model_name_or_path in embeddings:
            self.embedding_stack.append(
                _get_embedding_model(model_name_or_path))

        assert len(self.embedding_stack) != 0
        self.stacked_embeddings = StackedEmbeddings(
            embeddings=self.embedding_stack)
Example #19
    def __init__(self, corpus, emb_path, flair=False):

        self.word2index, self.word_emb = self.get_pretrain_embeddings(emb_path, corpus.get_word_vocab())
        self.index2word = {i: w for w, i in self.word2index.items()}
        self.flair_words = None

        if config.if_flair or flair:
            # self.elmo = ELMoEmbeddings()
            # self.bert_embedding = BertEmbeddings('bert-base-cased')
            self.flair_forward_embedding = FlairEmbeddings('news-forward')
            self.flair_backward_embedding = FlairEmbeddings('news-backward')
            self.stacked_embeddings = StackedEmbeddings(
                embeddings=[self.flair_forward_embedding, self.flair_backward_embedding])
Example #20
    def __init__(self, gpu):
        super(LayerFlairEmbeddings, self).__init__(gpu)
        self.gpu = gpu
        # self.flair_embeddings_dim = flair_embeddings_dim
        # self.freeze_flair_embeddings = freeze_flair_embeddings

        self.output_dim = 4096

        self.flair_embedding_forward = FlairEmbeddings('/home/jlfu/flair_model/news-forward-0.4.1.pt')
        self.flair_embedding_backward = FlairEmbeddings('/home/jlfu/flair_model/news-backward-0.4.1.pt')
        self.stacked_embeddings = StackedEmbeddings([
            self.flair_embedding_forward,
            self.flair_embedding_backward
        ])
Example #21
def test_sequence_tagger_param_selector(results_base_path, tasks_base_path):
    corpus = flair.datasets.ColumnCorpus(data_folder=tasks_base_path /
                                         "fashion",
                                         column_format={
                                             0: "text",
                                             2: "ner"
                                         })

    # define search space
    search_space = SearchSpace()

    # sequence tagger parameter
    search_space.add(
        Parameter.EMBEDDINGS,
        hp.choice,
        options=[
            StackedEmbeddings([WordEmbeddings("glove")]),
            StackedEmbeddings([
                WordEmbeddings("glove"),
                FlairEmbeddings("news-forward-fast"),
                FlairEmbeddings("news-backward-fast"),
            ]),
        ],
    )
    search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False])
    search_space.add(Parameter.DROPOUT, hp.uniform, low=0.25, high=0.75)
    search_space.add(Parameter.WORD_DROPOUT, hp.uniform, low=0.0, high=0.25)
    search_space.add(Parameter.LOCKED_DROPOUT, hp.uniform, low=0.0, high=0.5)
    search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[64, 128])
    search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])

    # model trainer parameter
    search_space.add(Parameter.OPTIMIZER, hp.choice, options=[SGD])

    # training parameter
    search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[4, 8, 32])
    search_space.add(Parameter.LEARNING_RATE, hp.uniform, low=0.01, high=1)
    search_space.add(Parameter.ANNEAL_FACTOR, hp.uniform, low=0.3, high=0.75)
    search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5])
    search_space.add(Parameter.WEIGHT_DECAY, hp.uniform, low=0.01, high=1)

    # find best parameter settings
    optimizer = SequenceTaggerParamSelector(corpus,
                                            "ner",
                                            results_base_path,
                                            max_epochs=2)
    optimizer.optimize(search_space, max_evals=2)

    # clean up results directory
    shutil.rmtree(results_base_path)
Example #22
    def init_emb(self):
        # init standard GloVe embedding
        flair.device = torch.device("cpu")
        glove_embedding = WordEmbeddings('glove')

        # init Flair forward and backwards embeddings
        flair_embedding_forward = FlairEmbeddings('news-forward')
        flair_embedding_backward = FlairEmbeddings('news-backward')
        # create a StackedEmbedding object that combines glove and forward/backward flair embeddings
        self.stacked_embeddings = StackedEmbeddings([
            glove_embedding,
            flair_embedding_forward,
            flair_embedding_backward,
        ])
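A hedged follow-up inside the same class, assuming flair.data.Sentence is imported; the stacked length should be 100 (GloVe) + 2048 + 2048 (the two Flair LMs) = 4196:

sentence = Sentence("Berlin is a city")
self.stacked_embeddings.embed(sentence)
print(len(sentence[0].embedding))   # expected 4196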
Example #23
    def __init__(self):
        """
        initialize the word embedding and document embedding classes
        """
        self.word_embedding = flair.embeddings.WordEmbeddings('glove')
        self.doc_embedding = flair.embeddings.DocumentPoolEmbeddings([self.word_embedding])

        # embedding
        self.flair_forward = FlairEmbeddings('news-forward-fast')
        self.backward_flair = FlairEmbeddings('news-backward-fast')

        # stacked embedding
        self.stacked_embedding = StackedEmbeddings(embeddings=[
            self.flair_forward,
            self.backward_flair])
Example #24
    def __init__(self, embedding=None, method=None, batch_size=5):

        assert method in [None, "average"], "Bad method"
        self.method = method
        self.batch_size = batch_size

        if embedding is not None:
            self.embedding = StackedEmbeddings(embedding)
        else:
            self.embedding = StackedEmbeddings([
                #WordEmbeddings('glove'),
                #WordEmbeddings('en-news'),
                #BytePairEmbeddings('en'),
                WordEmbeddings('crawl')
            ])
Example #25
class LayerFlairEmbeddings(LayerBase):
    """LayerFlairEmbeddings implements Flair contextual string embeddings."""
    def __init__(self, gpu):
        super(LayerFlairEmbeddings, self).__init__(gpu)
        self.gpu = gpu
        # self.flair_embeddings_dim = flair_embeddings_dim
        # self.freeze_flair_embeddings = freeze_flair_embeddings

        self.output_dim = 4096

        self.flair_embedding_forward = FlairEmbeddings('/home/jlfu/flair_model/news-forward-0.4.1.pt')
        self.flair_embedding_backward = FlairEmbeddings('/home/jlfu/flair_model/news-backward-0.4.1.pt')
        self.stacked_embeddings = StackedEmbeddings([
            self.flair_embedding_forward,
            self.flair_embedding_backward
        ])
        # self.glove_embedding = WordEmbeddings('glove')
        # self.args= args
        # if self.args.use_flair_glove:
        #     self.stacked_embeddings = StackedEmbeddings([
        #         self.glove_embedding,
        #         self.flair_embedding_forward,
        #         self.flair_embedding_backward
        #     ])
        #     self.output_dim = 4096



    def is_cuda(self):
        return self.embeddings.weight.is_cuda

    def forward(self, word_sequences):
        batch_size = len(word_sequences)
        max_seq_len = max([len(word_seq) for word_seq in word_sequences])
        flair_embedding = torch.zeros(batch_size, max_seq_len, self.output_dim)

        # create a sentence
        for i,word_sequence in enumerate(word_sequences):
            word_seq_str = ' '.join(word_sequence)
            sentence = Sentence(word_seq_str)
            # self.flair_embedding_forward.embed(sentence)
            self.stacked_embeddings.embed(sentence)
            for j,token in enumerate(sentence):
                # print('token.embedding',token.embedding)
                flair_embedding[i][j][:] = token.embedding
            # print('flair_embedding',flair_embedding)
            # break
        return flair_embedding
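A hedged usage sketch, assuming the local Flair LM files referenced in __init__ exist and LayerBase accepts the gpu argument; the output tensor is zero-padded to the longest sequence in the batch:

layer = LayerFlairEmbeddings(gpu=-1)
batch = [["John", "lives", "in", "Berlin"], ["Hello", "world"]]
out = layer.forward(batch)
print(out.shape)   # (2, 4, 4096)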
Example #26
def main(data_folder: str, model_folder: str, dev_size: float,
         nb_epochs: int) -> None:
    nlp = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)

    corpus: Corpus = prepare_flair_train_test_corpus(spacy_model=nlp,
                                                     data_folder=data_folder,
                                                     dev_size=dev_size)
    tag_dictionary = corpus.make_tag_dictionary(tag_type='ner')
    print(tag_dictionary.idx2item)

    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('fr'),
        FlairEmbeddings('fr-forward'),
        FlairEmbeddings('fr-backward'),
    ]

    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            use_crf=True,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner')

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train(model_folder,
                  max_epochs=nb_epochs,
                  mini_batch_size=32,
                  embeddings_storage_mode="cpu",
                  checkpoint=False)
Example #27
def out_embedding(type_, model, n_layers, stacked=False):
    '''
    Create an embedding object for later use.
    :param:
        :type_: (str) type of embedding (currently only BERT or Flair embeddings are supported)
        :model: (str) pretrained model for the BERT embedding
        :n_layers: (int) number of final BERT layers to use
        :stacked: (bool) whether this embedding combines several embeddings (True/False)
    :return:
        :embedding: (BertEmbeddings / StackedEmbeddings) embedding object
    '''
    out_layers = ','.join([str(-i) for i in range(1, n_layers + 1)])
    if not stacked:
        if type_.lower() == 'bert':
            embedding = BertEmbeddings(bert_model_or_path=model,
                                       layers=out_layers)
            return embedding
        else:
            emb = WordEmbeddings('glove')
    else:
        emb = BertEmbeddings(bert_model_or_path=model, layers=out_layers)

    flair_forward = FlairEmbeddings('news-forward-fast')
    flair_backward = FlairEmbeddings('news-backward-fast')
    embedding = StackedEmbeddings(
        embeddings=[flair_forward, flair_backward, emb])

    return embedding
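A hedged call sketch, assuming flair.data.Sentence is imported; with stacked=True the chosen BERT layers are concatenated with the two fast Flair LMs:

emb = out_embedding('bert', 'bert-base-uncased', n_layers=4, stacked=True)
sent = Sentence("An example sentence")
emb.embed(sent)
print(len(sent[0].embedding))   # roughly 4*768 BERT dims + 2*1024 Flair dims = 5120 for bert-base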
Example #28
    def initialize_embeddings(self, fastbert=True, stackedembeddings=True):

        # Consider using pooling_operation="first", use_scalar_mix=True for the parameters

        # initialize individual embeddings
        if fastbert:
            bert_embedding = BertEmbeddings('distilbert-base-uncased',
                                            layers='-1')

        else:
            bert_embedding = BertEmbeddings('bert-base-cased', layers='-1')

        if stackedembeddings:
            glove_embedding = WordEmbeddings('glove')

            # init Flair forward and backwards embeddings
            flair_embedding_forward = FlairEmbeddings('news-forward')
            flair_embedding_backward = FlairEmbeddings('news-backward')

            embedding_types = [
                bert_embedding, glove_embedding, flair_embedding_forward,
                flair_embedding_backward
            ]

            embeddings = StackedEmbeddings(embeddings=embedding_types)

        else:

            embeddings = bert_embedding

        return embeddings
Example #29
def embed_dataset() -> List:
    # init standard GloVe embedding
    glove_embedding = WordEmbeddings('glove')
    flair_embedding_forward = FlairEmbeddings('news-forward')

    # create a StackedEmbedding object that combines glove and forward/backward flair embeddings
    stacked_embeddings = StackedEmbeddings([
        glove_embedding,
        flair_embedding_forward,
    ])
    sentence_dataset = load_dataset(
        '/Users/haraldott/Development/thesis/anomaly_detection_main/logparser/Drain/Drain_result/st_0.2 depth_2/openstack_normal_10k.csv'
    )

    embedded_sentences = []
    count = 0.0
    for s in sentence_dataset:
        sentence = Sentence(s)
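        # NOTE: only the forward Flair embedding is applied below; the StackedEmbeddings
        # object built above (stacked_embeddings) is constructed but never used.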
        flair_embedding_forward.embed(sentence)
        embedded_sentences.append(sentence)
        if count % 50 == 0 or count == len(sentence_dataset):
            print('Processed {0:.1f}% of log lines.'.format(
                count * 100.0 / len(sentence_dataset)))
        count += 1
    words = []
    for sentence in embedded_sentences:
        for word in sentence:
            words.append(word.embedding)  #  TODO: is this correct? return all
    torch.save(words, '10k_depth_2_st_0.2.pt')
    return words
Example #30
def train():
    # column format - word postag label
    columns = {0: "word", 1: "postag", 2: "ner"}
    data_folder = os.path.join(path, "../data/")

    # read train, dev and test set
    # here test set is same as dev set
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(data_folder, columns, train_file = "onto.train", dev_file = "onto.testa", test_file="onto.testa")
    print(corpus)

    # create label dictionary
    tag_dictionary = corpus.make_tag_dictionary(tag_type = "ner")
    print(tag_dictionary.idx2item)

    # using glove embeddings and character embeddings
    embedding_types: List[TokenEmbeddings] = [WordEmbeddings("glove"), CharacterEmbeddings()]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings = embedding_types)

    # create sequence tagger and trainer instance
    tagger: SequenceTagger = SequenceTagger(hidden_size = 256, embeddings = embeddings, tag_dictionary = tag_dictionary, tag_type = "ner", use_crf = True)
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    model_path = os.path.join(path, "../models/")

    # commence training
    # model shall be saved in model_path under filename final-model.pt
    # this step takes at least 4 hours to complete, so please ensure access to GPU
    trainer.train(model_path, learning_rate = 0.1, mini_batch_size = 64, max_epochs = 3)
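A hedged follow-up for using the saved model; Flair 0.4.x (matching the NLPTaskDataFetcher API above) loads file paths via load_from_file, while newer releases accept the path in SequenceTagger.load. Assumes flair.data.Sentence is imported:

trained = SequenceTagger.load_from_file(os.path.join(path, "../models/final-model.pt"))
example = Sentence("George Washington went to Washington .")
trained.predict(example)
print(example.to_tagged_string())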