def other_embeddings(embd):
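    # Embed each train/test text with the selected Flair document embedding
    # and return one pooled vector (a numpy array) per row.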
    train_data_list = []
    test_data_list = []
    if embd == 'glove':
        print('Starting GloVe Embedding...')
        glove_embedding = WordEmbeddings('glove')
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[glove_embedding])
    elif embd == 'xlnet':
        print('Starting XLNet Embedding...')
        xlnet_embedding = XLNetEmbeddings('xlnet-large-cased')
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[xlnet_embedding])
    elif embd == 'fasttext':
        print('Starting fastText Embedding...')
        fasttext_embedding = WordEmbeddings('en')  # 'en' loads the English fastText vectors
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[fasttext_embedding])
    elif embd == 'elmo':
        print('Starting ELMo Embedding...')
        elmo_embedding = ELMoEmbeddings()
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[elmo_embedding])
    else:
        # init Flair embeddings
        flair_forward_embedding = FlairEmbeddings('multi-forward')
        flair_backward_embedding = FlairEmbeddings('multi-backward')
        glove_embedding = WordEmbeddings('glove')
        # now create the DocumentPoolEmbeddings object that combines all embeddings
        document_embeddings = DocumentPoolEmbeddings(embeddings=[
            glove_embedding, flair_forward_embedding, flair_backward_embedding
        ])
    print('Train embedding started...')
    for text in final_train['text'].tolist():
        sentence = Sentence(text)
        document_embeddings.embed(sentence)
        # move to CPU before converting, in case the embeddings live on the GPU
        emb = sentence.get_embedding().cpu().detach().numpy()
        train_data_list.append(emb)
    print('Embedded train data!')
    print('Test embedding started...')
    for text in final_test['text'].tolist():
        sentence = Sentence(text)
        document_embeddings.embed(sentence)
        emb = sentence.get_embedding().cpu().detach().numpy()
        test_data_list.append(emb)
    print('Embedded test data!')
    return train_data_list, test_data_list
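
A minimal usage sketch (assuming, as the function itself does, that final_train and final_test are pandas DataFrames with a 'text' column):

train_vecs, test_vecs = other_embeddings('glove')
print(len(train_vecs), train_vecs[0].shape)  # one pooled vector per input row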
Code example #2
def prepare_embeddings(
    tokens_list: List[List[str]], model_name: str = 'sl'
) -> Tuple[
    np.ndarray, Dict[str, np.ndarray], Dict[str, Set[int]], List[Set[str]]
]:
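    # Document vectors are the running mean of their word vectors; word2doc and
    # doc2word record which words occur in which documents.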
    embedder = WordEmbeddings(model_name)
    word_embs = {}
    doc_embs = list()
    doc2word = list()
    word2doc = dict()

    for i, tokens in enumerate(tokens_list):
        sent = Sentence(" ".join(tokens))
        embedder.embed(sent)
        doc_emb = np.zeros(embedder.embedding_length)
        doc2word.append(set())
        for token in sent.tokens:
            if token.text not in word2doc:
                word2doc[token.text] = set()
            word2doc[token.text].add(i)
            doc2word[i].add(token.text)

            if token.text not in word_embs:
                emb = token.embedding.cpu().detach().numpy()
                word_embs[token.text] = emb
            else:
                emb = word_embs[token.text]

            doc_emb += emb / len(tokens)
        doc_embs.append(doc_emb)

    doc_embs = np.array(doc_embs)

    return doc_embs, word_embs, word2doc, doc2word
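
A small usage sketch (hypothetical two-document corpus; 'glove' swapped in for the default model name):

docs = [["the", "cat", "sat"], ["dogs", "bark"]]
doc_embs, word_embs, word2doc, doc2word = prepare_embeddings(docs, model_name='glove')
print(doc_embs.shape)   # (2, embedding_length)
print(word2doc["the"])  # {0}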
Code example #3
    def train():
        # load training data in FastText format
        corpus = NLPTaskDataFetcher.load_classification_corpus(
            Path('./'),
            test_file='./data/test.txt',
            train_file='./data/train.txt')

        # Combine different embeddings:
        # GloVe word embeddings + Flair contextual string embeddings
        word_embeddings = [
            WordEmbeddings('glove'),
            FlairEmbeddings('news-forward-fast'),
            FlairEmbeddings('news-backward-fast')
        ]
        # use LSTM based method for combining the different embeddings
        document_embeddings = DocumentLSTMEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256)

        classifier = TextClassifier(
            document_embeddings,
            label_dictionary=corpus.make_label_dictionary(),
            multi_label=False)

        trainer = ModelTrainer(classifier, corpus)
        trainer.train('./models', max_epochs=10)
Code example #4
def test_find_learning_rate(results_base_path, tasks_base_path):
    corpus = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion",
        column_format={0: "text", 2: "ner"})
    tag_dictionary = corpus.make_tag_dictionary("ner")

    embeddings = WordEmbeddings("turian")
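    # ("turian" is a tiny embedding model, which keeps this test fast)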

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
    )

    optimizer: Optimizer = SGD

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus, optimizer=optimizer)

    trainer.find_learning_rate(results_base_path, iterations=5)

    # clean up results directory
    shutil.rmtree(results_base_path)
Code example #5
def test_train_resume_sequence_tagging_training(results_base_path, tasks_base_path):
    corpus_1 = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion", column_format={0: "text", 2: "ner"}
    )
    corpus_2 = flair.datasets.GERMEVAL(base_path=tasks_base_path)

    corpus = MultiCorpus([corpus_1, corpus_2])
    tag_dictionary = corpus.make_tag_dictionary("ner")

    embeddings = WordEmbeddings("turian")

    model: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
    )

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    checkpoint = SequenceTagger.load_checkpoint(results_base_path / "checkpoint.pt")
    trainer = ModelTrainer.load_from_checkpoint(checkpoint, corpus)

    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
Code example #6
File: ATAE_LSTM.py Project: HuBoMax/ATAE_LSTM
    def __init__(self,
                 num_classes: int = 2,
                 bidirectional: bool = False,
                 rnn_layers: int = 1,
                 hidden_size: int = 256,
                 rnn_type: str = 'GRU'):

        super(ATAE_LSTM, self).__init__()

        self.stackedembeddings: StackedEmbeddings = StackedEmbeddings([
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward')
        ])
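        # plain GloVe word vectors, stacked separately from the character-LM embeddings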
        self.wordembeddings: StackedEmbeddings = StackedEmbeddings(
            [WordEmbeddings('glove')])
        self.embedding_dimension: int = (self.stackedembeddings.embedding_length +
                                         self.wordembeddings.embedding_length)
        self.bidirectional: bool = bidirectional
        self.rnn_layers: int = rnn_layers
        self.rnn_type: str = rnn_type
        self.num_classes: int = num_classes
        self.hidden_size: int = hidden_size

        if self.rnn_type == 'GRU':
            self.rnn = torch.nn.GRU(self.embedding_dimension,
                                    self.hidden_size,
                                    bidirectional=self.bidirectional,
                                    num_layers=self.rnn_layers)
        else:
            self.rnn = torch.nn.LSTM(self.embedding_dimension,
                                     self.hidden_size,
                                     bidirectional=self.bidirectional,
                                     num_layers=self.rnn_layers)

        self.attention = Attention()
Code example #7
def test_train_load_use_classifier_multi_label(results_base_path, tasks_base_path):
    # corpus = NLPTaskDataFetcher.load_corpus('multi_class', base_path=tasks_base_path)
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        data_folder=tasks_base_path / "multi_class"
    )
    label_dict = corpus.make_label_dictionary()

    word_embedding: WordEmbeddings = WordEmbeddings("turian")
    document_embeddings = DocumentRNNEmbeddings(
        embeddings=[word_embedding],
        hidden_size=32,
        reproject_words=False,
        bidirectional=False,
    )

    model = TextClassifier(document_embeddings, label_dict, multi_label=True)

    trainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path,
        EvaluationMetric.MICRO_F1_SCORE,
        mini_batch_size=1,
        max_epochs=100,
        test_mode=True,
        checkpoint=False,
    )

    sentence = Sentence("apple tv")

    for s in model.predict(sentence):
        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    sentence = Sentence("apple tv")

    for s in model.predict(sentence):

        assert "apple" in sentence.get_label_names()
        assert "tv" in sentence.get_label_names()

        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load_from_file(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
Code example #8
def main(data_folder: str, model_folder: str, dev_size: float,
         nb_epochs: int) -> None:
    nlp = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)

    corpus: Corpus = prepare_flair_train_test_corpus(spacy_model=nlp,
                                                     data_folder=data_folder,
                                                     dev_size=dev_size)
    tag_dictionary = corpus.make_tag_dictionary(tag_type='ner')
    print(tag_dictionary.idx2item)

    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('fr'),
        FlairEmbeddings('fr-forward'),
        FlairEmbeddings('fr-backward'),
    ]

    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            use_crf=True,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner')

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train(model_folder,
                  max_epochs=nb_epochs,
                  mini_batch_size=32,
                  embeddings_storage_mode="cpu",
                  checkpoint=False)
Code example #9
def train_tagger(data_path, model_path):
    tag_type = 'ct'
    # define columns
    columns = {0: 'text', 1: 'pos', 2: 'ct'}
    # retrieve corpus using the column format, data folder and the names of the train and test files
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        data_path, columns, train_file='train.tsv', test_file='test.tsv')

    # initialize embeddings
    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),
        CharLMEmbeddings('news-forward'),
        CharLMEmbeddings('news-backward'),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    tag_dictionary = corpus.make_tag_dictionary(tag_type='ct')

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=True)

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    # start training
    trainer.train(model_path, learning_rate=0.1, mini_batch_size=16, max_epochs=30)
Code example #10
def train():
    # Get the SST-5 corpus
    corpus: Corpus = SENTEVAL_SST_GRANULAR()

    # create the label dictionary
    label_dict = corpus.make_label_dictionary()

    # make a list of word embeddings (using GloVe for testing)
    word_embeddings = [WordEmbeddings('glove')]

    # initialize document embedding by passing list of word embeddings
    document_embeddings = DocumentRNNEmbeddings(word_embeddings, hidden_size=256)

    # create the text classifier
    classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

    # initialize the text classifier trainer
    trainer = ModelTrainer(classifier, corpus)

    # start the training
    trainer.train('resources/taggers/trec',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  anneal_factor=0.5,
                  patience=5,
                  embeddings_storage_mode='gpu',
                  max_epochs=15)
Code example #11
File: models.py Project: rafalposwiata/annobot
    def __init__(self):
        print('Creating new model')
        self._name: str = 'SVM_binary'
        self._output_type: OutputType = OutputType.SINGLE_LABEL
        self._labels = ['positive', 'negative']
        self._embedding = WordEmbeddings('glove')
        self._model = self._load_model()
Code example #12
    def __init__(self):
        """ Virtually private constructor. """
        if DocEmbeddings.__instance is not None:
            raise Exception("This class is a singleton!")
        else:
            doc_embeddings = DocumentPoolEmbeddings([WordEmbeddings("glove")])
            DocEmbeddings.__instance = doc_embeddings
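
The accessor is not shown in this excerpt; a typical companion method under the usual singleton pattern (hypothetical, assuming the class also declares a __instance = None class attribute) would be:

    @staticmethod
    def get_instance():
        # hypothetical accessor: create the singleton on first use
        if DocEmbeddings.__instance is None:
            DocEmbeddings()
        return DocEmbeddings.__instance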
Code example #13
    def post_init(self):
        from flair.embeddings import WordEmbeddings, FlairEmbeddings, BytePairEmbeddings, PooledFlairEmbeddings, \
            DocumentPoolEmbeddings

        if self.model is not None:
            return
        embeddings_list = []
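        # each entry in self.embeddings is a "<type>:<model_id>" spec, e.g. "word:glove"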
        for e in self.embeddings:
            model_name, model_id = e.split(':', maxsplit=1)
            emb = None
            try:
                if model_name == 'flair':
                    emb = FlairEmbeddings(model_id)
                elif model_name == 'pooledflair':
                    emb = PooledFlairEmbeddings(model_id)
                elif model_name == 'word':
                    emb = WordEmbeddings(model_id)
                elif model_name == 'byte-pair':
                    emb = BytePairEmbeddings(model_id)
            except ValueError:
                self.logger.error('embedding not found: {}'.format(e))
                continue
            if emb is not None:
                embeddings_list.append(emb)
        if embeddings_list:
            self.model = DocumentPoolEmbeddings(embeddings_list,
                                                pooling=self.pooling_strategy)
            self.logger.info(
                'initialize flair encoder with embeddings: {}'.format(
                    self.embeddings))
Code example #14
File: embedding.py Project: RiTUAL-UH/trending_NER
def get_word_vectors(embedding, scale='none'):
    assert embedding in {'twitter', 'glove', 'crawl'}
    model = WordEmbeddings(embedding).precomputed_word_embeddings

    vectors = model.vectors
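    # rows of `vectors` line up with the vocabulary order in model.index2word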

    if scale == 'z-standardization':
        print("[LOG] Scaling embeddings using {}".format(scale))
        mu = vectors.mean(axis=0)
        sigma = vectors.std(axis=0)
        vectors = (vectors - mu) / sigma

    elif scale == 'normalization':
        print("[LOG] Scaling embeddings using {}".format(scale))
        vectors = vectors / np.linalg.norm(
            vectors, ord=2, axis=1, keepdims=True)

    elif scale == 'scale(-1,1)':
        print("[LOG] Scaling embeddings using {}".format(scale))
        min_vector = vectors.min(axis=0)
        max_vector = vectors.max(axis=0)

        min_target = -1
        max_target = 1

        vectors = (
            (vectors - min_vector) /
            (max_vector - min_vector)) * (max_target - min_target) + min_target
    else:
        print("[LOG] Embeddings are not scaled and will be loaded as-is")

    return model.index2word, vectors
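
A quick usage sketch (any of the three asserted embedding names works; 'glove' shown here with L2 normalization):

words, vecs = get_word_vectors('glove', scale='normalization')
print(len(words), vecs.shape)  # vocabulary size and (vocab_size, dim)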
Code example #15
    def __init__(self,
                 embedding_method: Union[List, None] = None,
                 min_similarity: float = 0.75,
                 top_n: int = 1,
                 cosine_method: str = "sparse",
                 model_id: str = None):
        super().__init__(model_id)
        self.type = "Embeddings"

        if not embedding_method:
            self.document_embeddings = DocumentPoolEmbeddings(
                [WordEmbeddings('news')])

        elif isinstance(embedding_method, list):
            self.document_embeddings = DocumentPoolEmbeddings(embedding_method)

        elif isinstance(embedding_method, TokenEmbeddings):
            self.document_embeddings = DocumentPoolEmbeddings(
                [embedding_method])

        else:
            self.document_embeddings = embedding_method

        self.min_similarity = min_similarity
        self.top_n = top_n
        self.cosine_method = cosine_method

        self.embeddings_to = None
Code example #16
    def __init__(self, *embeddings: str):
        print("May need a couple moments to instantiate...")
        self.embedding_stack = []

        # Load correct Embeddings module
        for model_name_or_path in embeddings:
            if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
                self.embedding_stack.append(BertEmbeddings(model_name_or_path))
            elif "roberta" in model_name_or_path:
                self.embedding_stack.append(
                    RoBERTaEmbeddings(model_name_or_path))
            elif "gpt2" in model_name_or_path:
                self.embedding_stack.append(
                    OpenAIGPT2Embeddings(model_name_or_path))
            elif "xlnet" in model_name_or_path:
                self.embedding_stack.append(
                    XLNetEmbeddings(model_name_or_path))
            elif "xlm" in model_name_or_path:
                self.embedding_stack.append(XLMEmbeddings(model_name_or_path))
            elif ("flair" in model_name_or_path
                  or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
                self.embedding_stack.append(
                    FlairEmbeddings(model_name_or_path))
            else:
                try:
                    self.embedding_stack.append(
                        WordEmbeddings(model_name_or_path))
                except ValueError:
                    raise ValueError(
                        f"Embeddings not found for the model key: {model_name_or_path}, "
                        "check documentation or custom model path to verify specified model"
                    )

        assert len(self.embedding_stack) != 0
        self.stacked_embeddings = StackedEmbeddings(
            embeddings=self.embedding_stack)
Code example #17
File: inference_utils.py Project: zzg-971030/flair
    def __init__(self,
                 embedding: WordEmbeddings,
                 backend='sqlite',
                 verbose=True):
        """
        :param embedding: Flair WordEmbeddings instance.
        :param backend: cache database backend name, e.g. ``'sqlite'`` or ``'lmdb'``.
                        Default value is ``'sqlite'``.
        :param verbose: If ``True``, print information to standard output.
        """
        # some unused attributes, kept only so the object can be printed
        self._modules = dict()
        self.items = ""

        # get db filename from embedding name
        self.name = embedding.name
        self.store_path: Path = WordEmbeddingsStore._get_store_path(
            embedding, backend)
        if verbose:
            logger.info(f"store filename: {str(self.store_path)}")

        if backend == 'sqlite':
            self.backend = SqliteWordEmbeddingsStoreBackend(embedding, verbose)
        elif backend == 'lmdb':
            self.backend = LmdbWordEmbeddingsStoreBackend(embedding, verbose)
        else:
            raise ValueError(
                f'The given backend "{backend}" is not available.')
        # In case initialization of cached version failed, just fallback to the original WordEmbeddings
        if not self.backend.is_ok:
            self.backend = WordEmbeddings(embedding.embeddings)
Code example #18
def main():
    args = parse_args()

    if not os.path.exists(args.data_dir):
        raise Exception(f'Path does not exist: {args.data_dir}')

    # 1. Build corpus
    columns = {0: 'text', 1: 'ner'}
    corpus: Corpus = ColumnCorpus(args.data_dir,
                                  columns,
                                  train_file=args.train_file,
                                  dev_file=args.dev_file,
                                  test_file=args.test_file)

    print(corpus)
    print(corpus.obtain_statistics())

    # 2. What tag do we want to predict?
    tag_type = 'ner'

    # 3. Build tag dictionary
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # 4. Initialize embeddings
    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('crawl'),
        FlairEmbeddings(args.forward_flair_embeddings),
        FlairEmbeddings(args.backward_flair_embeddings),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    # 5. Initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=HIDDEN_SIZE,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type)

    # 6. Initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    if args.learning_rate_find:
        print('***** Plotting learning rate')
        # 7a. Find learning rate
        learning_rate_tsv = trainer.find_learning_rate(
            'temp', 'learning_rate.tsv', mini_batch_size=MINI_BATCH_SIZE)

    else:
        print('***** Running train')
        # 7b. Run Training
        trainer.train(
            'temp',
            learning_rate=0.1,
            mini_batch_size=MINI_BATCH_SIZE,
            # large dataset, so do not keep embeddings in memory
            embeddings_storage_mode='none')

        tag_and_output(corpus.test, tagger,
                       os.path.join(args.data_dir, args.test_output_file),
                       tag_type)
Code example #19
def test_train_load_use_classifier(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus("imdb", base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    word_embedding: WordEmbeddings = WordEmbeddings("turian")
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [word_embedding], 128, 1, False, 64, False, False
    )

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path, EvaluationMetric.MICRO_F1_SCORE, max_epochs=2, test_mode=True
    )

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load_from_file(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
Code example #20
def main(base_path, output_dir, nb_epochs):
    # parser = argparse.ArgumentParser()
    # parser.add_argument("--data_dir", default='./', type=str, required=True, help="The parent dir of inpu data, must contain folder name `conll_03`")
    # parser.add_argument("--output_dir", default=None, required=True, help="The output directory where is going to store the trained model")
    # parser.add_argument("--train_epochs", default=3, type=int, required=True, help="Number of epochs to train")
    # args = parser.parse_args()
    # base_path = args.data_dir
    corpus: Corpus = CONLL_03(base_path=base_path)
    tag_type = 'ner'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),
        PooledFlairEmbeddings('news-forward', pooling='min'),
        PooledFlairEmbeddings('news-backward', pooling='min'),
    ]

    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type)

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # output_dir = args.output_dir
    # nb_epochs = args.train_epochs
    # output_dir =
    # nb_epochs = 10
    trainer.train(output_dir, train_with_dev=False,
                  max_epochs=nb_epochs)  # 150
Code example #21
def test_train_resume_sequence_tagging_training(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpora(
        [NLPTask.FASHION, NLPTask.GERMEVAL], base_path=tasks_base_path
    )
    tag_dictionary = corpus.make_tag_dictionary("ner")

    embeddings = WordEmbeddings("turian")

    model: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
    )

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, test_mode=True, checkpoint=True)

    trainer = ModelTrainer.load_from_checkpoint(
        results_base_path / "checkpoint.pt", "SequenceTagger", corpus
    )
    trainer.train(results_base_path, max_epochs=2, test_mode=True, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
Code example #22
    def build_train_sequence_tagger(corpus,
                                    tag_dictionary,
                                    params: Params,
                                    TAG_TYPE="ner"):
        embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=[
            WordEmbeddings("glove"),
            FlairEmbeddings("news-forward"),
            FlairEmbeddings("news-backward"),
        ])
        from flair.models import SequenceTagger

        tagger: SequenceTagger = SequenceTagger(
            hidden_size=256,
            embeddings=embeddings,
            tag_dictionary=tag_dictionary,
            tag_type=TAG_TYPE,
        )

        from flair.trainers import ModelTrainer

        corpus = Corpus(train=corpus.train, dev=corpus.dev, test=[])
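        # the rebuilt corpus deliberately has no test split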
        trainer: ModelTrainer = ModelTrainer(tagger, corpus)

        trainer.train(
            "flair_checkpoints",
            train_with_dev=False,
            max_epochs=params.max_epochs,
            save_final_model=False,
        )  # original

        return tagger
Code example #23
    def transform(self, X: dt.Frame):
        X.replace([None, math.inf, -math.inf], self._repl_val)
        from flair.embeddings import WordEmbeddings, BertEmbeddings, DocumentPoolEmbeddings, Sentence
        if self.embedding_name in ["glove", "en"]:
            self.embedding = WordEmbeddings(self.embedding_name)
        elif self.embedding_name in ["bert"]:
            self.embedding = BertEmbeddings()
        self.doc_embedding = DocumentPoolEmbeddings([self.embedding])
        output = []
        X = X.to_pandas()
        text1_arr = X.iloc[:, 0].values
        text2_arr = X.iloc[:, 1].values
        for ind, text1 in enumerate(text1_arr):
            try:
                text1 = Sentence(str(text1).lower())
                self.doc_embedding.embed(text1)
                text2 = text2_arr[ind]
                text2 = Sentence(str(text2).lower())
                self.doc_embedding.embed(text2)
                score = cosine_similarity(
                    text1.get_embedding().reshape(1, -1),
                    text2.get_embedding().reshape(1, -1))[0, 0]
                output.append(score)
            except Exception:
                # fall back to a sentinel score if embedding or similarity fails
                output.append(-99)
        return np.array(output)
Code example #24
def train():
    # column format - word postag label
    columns = {0: "word", 1: "postag", 2: "ner"}
    data_folder = os.path.join(path, "../data/")

    # read train, dev and test set
    # here the test set is the same as the dev set
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        data_folder, columns,
        train_file="onto.train", dev_file="onto.testa", test_file="onto.testa")
    print(corpus)

    # create label dictionary
    tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")
    print(tag_dictionary.idx2item)

    # using glove embeddings and character embeddings
    embedding_types: List[TokenEmbeddings] = [WordEmbeddings("glove"),
                                              CharacterEmbeddings()]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    # create sequence tagger and trainer instance
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type="ner",
                                            use_crf=True)
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    model_path = os.path.join(path, "../models/")

    # commence training
    # the model will be saved in model_path under the filename final-model.pt
    # this step takes at least 4 hours to complete, so please ensure access to a GPU
    trainer.train(model_path, learning_rate=0.1, mini_batch_size=64, max_epochs=3)
Code example #25
def test_training():

    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION)
    tag_dictionary = corpus.make_tag_dictionary('ner')

    embeddings = WordEmbeddings('glove')

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner',
                                            use_crf=False)

    # initialize trainer
    trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger,
                                                           corpus,
                                                           test_mode=True)

    trainer.train('./results',
                  learning_rate=0.1,
                  mini_batch_size=2,
                  max_epochs=10)

    # clean up results directory
    shutil.rmtree('./results')
Code example #26
File: train_sequencer.py Project: doehae/xrenner
    def initialize_embeddings(self, fastbert=True, stackedembeddings=True):

        # Consider using pooling_operation="first", use_scalar_mix=True for the parameters

        # initialize individual embeddings
        if fastbert:
            bert_embedding = BertEmbeddings('distilbert-base-uncased',
                                            layers='-1')

        else:
            bert_embedding = BertEmbeddings('bert-base-cased', layers='-1')

        if stackedembeddings:
            glove_embedding = WordEmbeddings('glove')

            # init Flair forward and backwards embeddings
            flair_embedding_forward = FlairEmbeddings('news-forward')
            flair_embedding_backward = FlairEmbeddings('news-backward')

            embedding_types = [
                bert_embedding, glove_embedding, flair_embedding_forward,
                flair_embedding_backward
            ]

            embeddings = StackedEmbeddings(embeddings=embedding_types)

        else:

            embeddings = bert_embedding

        return embeddings
Code example #27
def test_train_load_use_classifier_with_prob(results_base_path,
                                             tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    label_dict = corpus.make_label_dictionary()

    word_embedding: WordEmbeddings = WordEmbeddings("turian")
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [word_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence, multi_class_prob=True):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence, multi_class_prob=True)
    loaded_model.predict([sentence, sentence_empty], multi_class_prob=True)
    loaded_model.predict([sentence_empty], multi_class_prob=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
Code example #28
    def post_init(self):
        import flair
        flair.device = self.device
        from flair.embeddings import WordEmbeddings, FlairEmbeddings, BytePairEmbeddings, PooledFlairEmbeddings, \
            DocumentPoolEmbeddings
        embeddings_list = []
        for e in self.embeddings:
            model_name, model_id = e.split(':', maxsplit=1)
            emb = None
            try:
                if model_name == 'flair':
                    emb = FlairEmbeddings(model_id)
                elif model_name == 'pooledflair':
                    emb = PooledFlairEmbeddings(model_id)
                elif model_name == 'word':
                    emb = WordEmbeddings(model_id)
                elif model_name == 'byte-pair':
                    emb = BytePairEmbeddings(model_id)
            except ValueError:
                self.logger.error(f'embedding not found: {e}')
                continue
            if emb is not None:
                embeddings_list.append(emb)
        if embeddings_list:
            self.model = DocumentPoolEmbeddings(embeddings_list,
                                                pooling=self.pooling_strategy)
            self.logger.info(
                f'flair encoder initialized with embeddings: {self.embeddings}'
            )
        else:
            self.logger.error('flair encoder initialization failed.')
Code example #29
def embed_dataset() -> List:
    # init standard GloVe embedding
    glove_embedding = WordEmbeddings('glove')
    flair_embedding_forward = FlairEmbeddings('news-forward')

    # create a StackedEmbeddings object that combines the GloVe and forward Flair embeddings
    stacked_embeddings = StackedEmbeddings([
        glove_embedding,
        flair_embedding_forward,
    ])
    sentence_dataset = load_dataset(
        '/Users/haraldott/Development/thesis/anomaly_detection_main/logparser/Drain/Drain_result/st_0.2 depth_2/openstack_normal_10k.csv'
    )

    embedded_sentences = []
    for count, s in enumerate(sentence_dataset):
        sentence = Sentence(s)
        # embed with the full stack (GloVe + forward Flair)
        stacked_embeddings.embed(sentence)
        embedded_sentences.append(sentence)
        if count % 50 == 0:
            print('Processed {0:.1f}% of log lines.'.format(
                count * 100.0 / len(sentence_dataset)))

    words = []
    for sentence in embedded_sentences:
        for word in sentence:
            words.append(word.embedding)  # TODO: is this correct? return all
    torch.save(words, '10k_depth_2_st_0.2.pt')
    return words
Code example #30
def test_train_load_use_tagger_multicorpus(results_base_path, tasks_base_path):

    corpus = NLPTaskDataFetcher.load_corpora(
        [NLPTask.FASHION, NLPTask.GERMEVAL], base_path=tasks_base_path)
    tag_dictionary = corpus.make_tag_dictionary('ner')

    embeddings = WordEmbeddings('glove')

    tagger: SequenceTagger = SequenceTagger(hidden_size=64,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner',
                                            use_crf=False)

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train(results_base_path,
                  learning_rate=0.1,
                  mini_batch_size=2,
                  max_epochs=2,
                  test_mode=True)

    loaded_model: SequenceTagger = SequenceTagger.load_from_file(
        results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)