def other_embeddings(embd):
    train_data_list = []
    test_data_list = []

    if embd == 'glove':
        print('Starting Glove Embedding...')
        glove_embedding = WordEmbeddings('glove')
        document_embeddings = DocumentPoolEmbeddings(embeddings=[glove_embedding])
    elif embd == 'xlnet':
        print('Starting XLNet Embedding...')
        xlnet_embedding = XLNetEmbeddings('xlnet-large-cased')
        document_embeddings = DocumentPoolEmbeddings(embeddings=[xlnet_embedding])
    elif embd == 'fasttext':
        print('Starting Fasttext Embedding...')
        fasttext_embedding = WordEmbeddings('en')
        document_embeddings = DocumentPoolEmbeddings(embeddings=[fasttext_embedding])
    elif embd == 'elmo':
        print('Starting ELMo Embedding...')
        elmo_embedding = ELMoEmbeddings()
        document_embeddings = DocumentPoolEmbeddings(embeddings=[elmo_embedding])
    else:
        # init Flair embeddings
        flair_forward_embedding = FlairEmbeddings('multi-forward')
        flair_backward_embedding = FlairEmbeddings('multi-backward')
        glove_embedding = WordEmbeddings('glove')
        # now create the DocumentPoolEmbeddings object that combines all embeddings
        document_embeddings = DocumentPoolEmbeddings(embeddings=[
            glove_embedding, flair_forward_embedding, flair_backward_embedding
        ])

    print('Train embedding Started...')
    for text in final_train['text'].tolist():
        sentence = Sentence(text)
        document_embeddings.embed(sentence)
        # .detach().numpy() already yields a NumPy array; the original
        # tf.constant(...).eval() round-trip (and the InteractiveSession) was redundant
        train_data_list.append(sentence.get_embedding().detach().numpy())
    print('Embedded Train data!!')

    print('Test embedding Started...')
    for text in final_test['text'].tolist():
        sentence = Sentence(text)
        document_embeddings.embed(sentence)
        test_data_list.append(sentence.get_embedding().detach().numpy())
    print('Embedded Test data!!')

    return train_data_list, test_data_list
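# A minimal usage sketch for other_embeddings, assuming flair and pandas are
# installed; the function reads final_train / final_test as module-level
# DataFrames with a 'text' column, so this demo defines them first.
import pandas as pd

final_train = pd.DataFrame({'text': ['a great movie', 'a dull plot']})
final_test = pd.DataFrame({'text': ['worth watching']})

train_vecs, test_vecs = other_embeddings('glove')
print(len(train_vecs), train_vecs[0].shape)  # 2 (100,) -- pooled GloVe is 100-d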
def prepare_embeddings(
    tokens_list: List[List[str]], model_name: str = 'sl'
) -> Tuple[
    np.ndarray, Dict[str, np.ndarray], Dict[str, Set[int]], List[Set[str]]
]:
    embedder = WordEmbeddings(model_name)
    word_embs = {}
    doc_embs = list()
    doc2word = list()
    word2doc = dict()
    for i, tokens in enumerate(tokens_list):
        sent = Sentence(" ".join(tokens))
        embedder.embed(sent)
        doc_emb = np.zeros(embedder.embedding_length)
        doc2word.append(set())
        for token in sent.tokens:
            if token.text not in word2doc:
                word2doc[token.text] = set()
            word2doc[token.text].add(i)
            doc2word[i].add(token.text)
            if token.text not in word_embs:
                emb = token.embedding.cpu().detach().numpy()
                word_embs[token.text] = emb
            else:
                emb = word_embs[token.text]
            doc_emb += emb / len(tokens)
        doc_embs.append(doc_emb)
    doc_embs = np.array(doc_embs)
    return doc_embs, word_embs, word2doc, doc2word
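# A quick sketch of prepare_embeddings on a toy corpus; 'glove' is used here
# instead of the 'sl' default so the output dimensions are easy to check.
docs = [["the", "cat", "sat"], ["dogs", "bark"]]
doc_embs, word_embs, word2doc, doc2word = prepare_embeddings(docs, model_name='glove')
print(doc_embs.shape)    # (2, 100) -- one mean-pooled vector per document
print(word2doc["the"])   # {0} -- indices of documents containing "the"
print(doc2word[1])       # {'dogs', 'bark'} -- vocabulary of document 1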
def train():
    # load training data in FastText format
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        Path('./'),
        test_file='./data/test.txt',
        train_file='./data/train.txt')

    # combine different embeddings:
    # GloVe word embeddings + Flair contextual string embeddings
    word_embeddings = [
        WordEmbeddings('glove'),
        FlairEmbeddings('news-forward-fast'),
        FlairEmbeddings('news-backward-fast')
    ]

    # use an LSTM-based method for combining the different embeddings
    document_embeddings = DocumentLSTMEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256)

    classifier = TextClassifier(
        document_embeddings,
        label_dictionary=corpus.make_label_dictionary(),
        multi_label=False)

    trainer = ModelTrainer(classifier, corpus)
    trainer.train('./models', max_epochs=10)
def test_find_learning_rate(results_base_path, tasks_base_path):
    corpus = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion",
        column_format={0: "text", 2: "ner"})
    tag_dictionary = corpus.make_tag_dictionary("ner")

    embeddings = WordEmbeddings("turian")
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
    )
    optimizer: Optimizer = SGD

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus, optimizer=optimizer)
    trainer.find_learning_rate(results_base_path, iterations=5)

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_resume_sequence_tagging_training(results_base_path, tasks_base_path):
    corpus_1 = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion",
        column_format={0: "text", 2: "ner"}
    )
    corpus_2 = flair.datasets.GERMEVAL(base_path=tasks_base_path)
    corpus = MultiCorpus([corpus_1, corpus_2])
    tag_dictionary = corpus.make_tag_dictionary("ner")

    embeddings = WordEmbeddings("turian")
    model: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
    )

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    checkpoint = SequenceTagger.load_checkpoint(results_base_path / "checkpoint.pt")
    trainer = ModelTrainer.load_from_checkpoint(checkpoint, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
def __init__(self,
             num_classes: int = 2,
             bidirectional: bool = False,
             rnn_layers: int = 1,
             hidden_size: int = 256,
             rnn_type: str = 'GRU'):
    super(ATAE_LSTM, self).__init__()
    self.stackedembeddings: StackedEmbeddings = StackedEmbeddings([
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward')
    ])
    self.wordembeddings: StackedEmbeddings = StackedEmbeddings(
        [WordEmbeddings('glove')])
    self.embedding_dimension: int = (self.stackedembeddings.embedding_length +
                                     self.wordembeddings.embedding_length)
    self.bidirectional: bool = bidirectional
    self.rnn_layers: int = rnn_layers
    self.rnn_type: str = rnn_type
    self.num_classes: int = num_classes
    self.hidden_size: int = hidden_size
    if self.rnn_type == 'GRU':
        self.rnn = torch.nn.GRU(self.embedding_dimension,
                                self.hidden_size,
                                bidirectional=self.bidirectional,
                                num_layers=self.rnn_layers)
    else:
        self.rnn = torch.nn.LSTM(self.embedding_dimension,
                                 self.hidden_size,
                                 bidirectional=self.bidirectional,
                                 num_layers=self.rnn_layers)
    self.attention = Attention()
def test_train_load_use_classifier_multi_label(results_base_path, tasks_base_path):
    # corpus = NLPTaskDataFetcher.load_corpus('multi_class', base_path=tasks_base_path)
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        data_folder=tasks_base_path / "multi_class"
    )
    label_dict = corpus.make_label_dictionary()

    word_embedding: WordEmbeddings = WordEmbeddings("turian")
    document_embeddings = DocumentRNNEmbeddings(
        embeddings=[word_embedding],
        hidden_size=32,
        reproject_words=False,
        bidirectional=False,
    )

    model = TextClassifier(document_embeddings, label_dict, multi_label=True)

    trainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path,
        EvaluationMetric.MICRO_F1_SCORE,
        mini_batch_size=1,
        max_epochs=100,
        test_mode=True,
        checkpoint=False,
    )

    sentence = Sentence("apple tv")
    for s in model.predict(sentence):
        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    sentence = Sentence("apple tv")
    for s in model.predict(sentence):
        assert "apple" in sentence.get_label_names()
        assert "tv" in sentence.get_label_names()
        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load_from_file(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
def main(data_folder: str, model_folder: str, dev_size: float, nb_epochs: int) -> None:
    nlp = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)

    corpus: Corpus = prepare_flair_train_test_corpus(
        spacy_model=nlp, data_folder=data_folder, dev_size=dev_size)
    tag_dictionary = corpus.make_tag_dictionary(tag_type='ner')
    print(tag_dictionary.idx2item)

    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('fr'),
        FlairEmbeddings('fr-forward'),
        FlairEmbeddings('fr-backward'),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            use_crf=True,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner')

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(model_folder,
                  max_epochs=nb_epochs,
                  mini_batch_size=32,
                  embeddings_storage_mode="cpu",
                  checkpoint=False)
def train_tagger(data_path, model_path):
    tag_type = 'ct'
    # define columns
    columns = {0: 'text', 1: 'pos', 2: 'ct'}

    # retrieve corpus using the column format, data folder and the names of the train and test files
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        data_path, columns, train_file='train.tsv', test_file='test.tsv')

    # initialize embeddings
    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),
        CharLMEmbeddings('news-forward'),
        CharLMEmbeddings('news-backward'),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=True)

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # start training
    trainer.train(model_path,
                  learning_rate=0.1,
                  mini_batch_size=16,
                  max_epochs=30)
def train():
    # get the SST-5 corpus
    corpus: Corpus = SENTEVAL_SST_GRANULAR()

    # create the label dictionary
    label_dict = corpus.make_label_dictionary()

    # make a list of word embeddings (using GloVe for testing)
    word_embeddings = [WordEmbeddings('glove')]

    # initialize document embedding by passing a list of word embeddings
    document_embeddings = DocumentRNNEmbeddings(word_embeddings, hidden_size=256)

    # create the text classifier
    classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

    # initialize the text classifier trainer
    trainer = ModelTrainer(classifier, corpus)

    # start the training
    trainer.train('resources/taggers/trec',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  anneal_factor=0.5,
                  patience=5,
                  embeddings_storage_mode='gpu',
                  max_epochs=15)
def __init__(self):
    print('Creating new model')
    self._name: str = 'SVM_binary'
    self._output_type: OutputType = OutputType.SINGLE_LABEL
    self._labels = ['positive', 'negative']
    self._embedding = WordEmbeddings('glove')
    self._model = self._load_model()
def __init__(self): """ Virtually private constructor. """ if DocEmbeddings.__instance is not None: raise Exception("This class is a singleton!") else: doc_embeddings = DocumentPoolEmbeddings([WordEmbeddings("glove")]) DocEmbeddings.__instance = doc_embeddings
def post_init(self):
    from flair.embeddings import WordEmbeddings, FlairEmbeddings, BytePairEmbeddings, \
        PooledFlairEmbeddings, DocumentPoolEmbeddings

    if self.model is not None:
        return

    embeddings_list = []
    for e in self.embeddings:
        model_name, model_id = e.split(':', maxsplit=1)
        emb = None
        try:
            if model_name == 'flair':
                emb = FlairEmbeddings(model_id)
            elif model_name == 'pooledflair':
                emb = PooledFlairEmbeddings(model_id)
            elif model_name == 'word':
                emb = WordEmbeddings(model_id)
            elif model_name == 'byte-pair':
                emb = BytePairEmbeddings(model_id)
        except ValueError:
            self.logger.error('embedding not found: {}'.format(e))
            continue
        if emb is not None:
            embeddings_list.append(emb)

    if embeddings_list:
        self.model = DocumentPoolEmbeddings(embeddings_list,
                                            pooling=self.pooling_strategy)
        self.logger.info(
            'initialize flair encoder with embeddings: {}'.format(self.embeddings))
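# The strings in self.embeddings above are '<type>:<model_id>' specs; a sketch
# of the expected configuration (the model ids here are illustrative):
#
#     embeddings = ['word:glove', 'flair:news-forward', 'byte-pair:en']
#
# 'word:glove' becomes WordEmbeddings('glove'), 'flair:news-forward' becomes
# FlairEmbeddings('news-forward'), and all resolved embeddings are pooled into
# one DocumentPoolEmbeddings using the configured pooling strategy.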
def get_word_vectors(embedding, scale='none'):
    assert embedding in {'twitter', 'glove', 'crawl'}
    model = WordEmbeddings(embedding).precomputed_word_embeddings
    vectors = model.vectors

    if scale == 'z-standardization':
        print("[LOG] Scaling embeddings using {}".format(scale))
        mu = vectors.mean(axis=0)
        sigma = vectors.std(axis=0)
        vectors = (vectors - mu) / sigma
    elif scale == 'normalization':
        print("[LOG] Scaling embeddings using {}".format(scale))
        vectors = vectors / np.linalg.norm(vectors, ord=2, axis=1, keepdims=True)
    elif scale == 'scale(-1,1)':
        print("[LOG] Scaling embeddings using {}".format(scale))
        min_vector = vectors.min(axis=0)
        max_vector = vectors.max(axis=0)
        min_target = -1
        max_target = 1
        vectors = ((vectors - min_vector) / (max_vector - min_vector)) * \
                  (max_target - min_target) + min_target
    else:
        print("[LOG] Embeddings are not scaled and will be loaded as-is")

    return model.index2word, vectors
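# Usage sketch for get_word_vectors; with scale='normalization' every row of
# the returned matrix should have unit L2 norm, which the check below verifies.
import numpy as np

words, vectors = get_word_vectors('glove', scale='normalization')
print(len(words), vectors.shape)                          # vocab size, (vocab, dim)
print(np.allclose(np.linalg.norm(vectors, axis=1), 1.0))  # True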
def __init__(self,
             embedding_method: Union[List, None] = None,
             min_similarity: float = 0.75,
             top_n: int = 1,
             cosine_method: str = "sparse",
             model_id: str = None):
    super().__init__(model_id)
    self.type = "Embeddings"

    if not embedding_method:
        self.document_embeddings = DocumentPoolEmbeddings(
            [WordEmbeddings('news')])
    elif isinstance(embedding_method, list):
        self.document_embeddings = DocumentPoolEmbeddings(embedding_method)
    elif isinstance(embedding_method, TokenEmbeddings):
        self.document_embeddings = DocumentPoolEmbeddings([embedding_method])
    else:
        self.document_embeddings = embedding_method

    self.min_similarity = min_similarity
    self.top_n = top_n
    self.cosine_method = cosine_method
    self.embeddings_to = None
def __init__(self, *embeddings: str):
    print("May need a couple moments to instantiate...")
    self.embedding_stack = []

    # load the correct Embeddings module for each model key
    for model_name_or_path in embeddings:
        if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
            self.embedding_stack.append(BertEmbeddings(model_name_or_path))
        elif "roberta" in model_name_or_path:
            self.embedding_stack.append(RoBERTaEmbeddings(model_name_or_path))
        elif "gpt2" in model_name_or_path:
            self.embedding_stack.append(OpenAIGPT2Embeddings(model_name_or_path))
        elif "xlnet" in model_name_or_path:
            self.embedding_stack.append(XLNetEmbeddings(model_name_or_path))
        elif "xlm" in model_name_or_path:
            self.embedding_stack.append(XLMEmbeddings(model_name_or_path))
        elif ("flair" in model_name_or_path
              or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
            self.embedding_stack.append(FlairEmbeddings(model_name_or_path))
        else:
            try:
                self.embedding_stack.append(WordEmbeddings(model_name_or_path))
            except ValueError:
                raise ValueError(
                    f"Embeddings not found for the model key: {model_name_or_path}, "
                    f"check documentation or custom model path to verify specified model"
                )

    assert len(self.embedding_stack) != 0
    self.stacked_embeddings = StackedEmbeddings(embeddings=self.embedding_stack)
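# A usage sketch, assuming this __init__ belongs to a stacked-embedder wrapper
# (StackedEmbedder is a hypothetical name for the owning class):
#
#     embedder = StackedEmbedder('glove', 'news-forward')
#
# 'glove' falls through to WordEmbeddings, while 'news-forward' is routed to
# FlairEmbeddings because it appears in FLAIR_PRETRAINED_MODEL_NAMES.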
def __init__(self, embedding: WordEmbeddings, backend='sqlite', verbose=True):
    """
    :param embedding: Flair WordEmbeddings instance.
    :param backend: cache database backend name, e.g. ``'sqlite'`` or ``'lmdb'``.
        Default value is ``'sqlite'``.
    :param verbose: if `True`, print information on standard output
    """
    # unused attributes, kept only so that printing the store works
    self._modules = dict()
    self.items = ""

    # get db filename from embedding name
    self.name = embedding.name
    self.store_path: Path = WordEmbeddingsStore._get_store_path(embedding, backend)
    if verbose:
        logger.info(f"store filename: {str(self.store_path)}")

    if backend == 'sqlite':
        self.backend = SqliteWordEmbeddingsStoreBackend(embedding, verbose)
    elif backend == 'lmdb':
        self.backend = LmdbWordEmbeddingsStoreBackend(embedding, verbose)
    else:
        raise ValueError(f'The given backend "{backend}" is not available.')

    # in case initialization of the cached version failed, just fall back to
    # the original WordEmbeddings
    if not self.backend.is_ok:
        self.backend = WordEmbeddings(embedding.embeddings)
def main():
    args = parse_args()

    if not os.path.exists(args.data_dir):
        raise Exception(f'Path does not exist: {args.data_dir}')

    # 1. Build corpus
    columns = {0: 'text', 1: 'ner'}
    corpus: Corpus = ColumnCorpus(args.data_dir,
                                  columns,
                                  train_file=args.train_file,
                                  dev_file=args.dev_file,
                                  test_file=args.test_file)
    print(corpus)
    print(corpus.obtain_statistics())

    # 2. What tag do we want to predict?
    tag_type = 'ner'

    # 3. Build tag dictionary
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # 4. Initialize embeddings
    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('crawl'),
        FlairEmbeddings(args.forward_flair_embeddings),
        FlairEmbeddings(args.backward_flair_embeddings),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    # 5. Initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=HIDDEN_SIZE,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type)

    # 6. Initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    if args.learning_rate_find:
        print('***** Plotting learning rate')
        # 7a. Find learning rate
        learning_rate_tsv = trainer.find_learning_rate(
            'temp', 'learning_rate.tsv', mini_batch_size=MINI_BATCH_SIZE)
    else:
        print('***** Running train')
        # 7b. Run training
        trainer.train(
            'temp',
            learning_rate=0.1,
            mini_batch_size=MINI_BATCH_SIZE,
            # it's a big dataset, so keep embeddings out of memory
            embeddings_storage_mode='none')

    tag_and_output(corpus.test, tagger,
                   os.path.join(args.data_dir, args.test_output_file), tag_type)
def test_train_load_use_classifier(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus("imdb", base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    word_embedding: WordEmbeddings = WordEmbeddings("turian")
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [word_embedding], 128, 1, False, 64, False, False
    )

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path, EvaluationMetric.MICRO_F1_SCORE, max_epochs=2, test_mode=True
    )

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load_from_file(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
def main(base_path, output_dir, nb_epochs):
    # base_path, output_dir and nb_epochs are supplied by the caller
    corpus: Corpus = CONLL_03(base_path=base_path)

    tag_type = 'ner'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),
        PooledFlairEmbeddings('news-forward', pooling='min'),
        PooledFlairEmbeddings('news-backward', pooling='min'),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type)

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(output_dir, train_with_dev=False, max_epochs=nb_epochs)
def test_train_resume_sequence_tagging_training(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpora(
        [NLPTask.FASHION, NLPTask.GERMEVAL], base_path=tasks_base_path
    )
    tag_dictionary = corpus.make_tag_dictionary("ner")

    embeddings = WordEmbeddings("turian")
    model: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
    )

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, test_mode=True, checkpoint=True)

    trainer = ModelTrainer.load_from_checkpoint(
        results_base_path / "checkpoint.pt", "SequenceTagger", corpus
    )
    trainer.train(results_base_path, max_epochs=2, test_mode=True, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
def build_train_sequence_tagger(corpus, tag_dictionary, params: Params, TAG_TYPE="ner"):
    from flair.models import SequenceTagger
    from flair.trainers import ModelTrainer

    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=[
        WordEmbeddings("glove"),
        FlairEmbeddings("news-forward"),
        FlairEmbeddings("news-backward"),
    ])
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=TAG_TYPE,
    )

    # use only the train and dev splits; the test split is intentionally left empty
    corpus = Corpus(train=corpus.train, dev=corpus.dev, test=[])
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(
        "flair_checkpoints",
        train_with_dev=False,
        max_epochs=params.max_epochs,
        save_final_model=False,
    )
    return tagger
def transform(self, X: dt.Frame):
    X.replace([None, math.inf, -math.inf], self._repl_val)
    from flair.embeddings import WordEmbeddings, BertEmbeddings, DocumentPoolEmbeddings, Sentence

    if self.embedding_name in ["glove", "en"]:
        self.embedding = WordEmbeddings(self.embedding_name)
    elif self.embedding_name in ["bert"]:
        self.embedding = BertEmbeddings()
    self.doc_embedding = DocumentPoolEmbeddings([self.embedding])

    output = []
    X = X.to_pandas()
    text1_arr = X.iloc[:, 0].values
    text2_arr = X.iloc[:, 1].values
    for ind, text1 in enumerate(text1_arr):
        try:
            text1 = Sentence(str(text1).lower())
            self.doc_embedding.embed(text1)
            text2 = text2_arr[ind]
            text2 = Sentence(str(text2).lower())
            self.doc_embedding.embed(text2)
            score = cosine_similarity(text1.get_embedding().reshape(1, -1),
                                      text2.get_embedding().reshape(1, -1))[0, 0]
            output.append(score)
        except Exception:  # a bare `except:` would also swallow KeyboardInterrupt
            output.append(-99)
    return np.array(output)
def train():
    # column format: word, postag, label
    columns = {0: "word", 1: "postag", 2: "ner"}
    data_folder = os.path.join(path, "../data/")

    # read the train, dev and test sets (here the test set is the same as the dev set)
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        data_folder,
        columns,
        train_file="onto.train",
        dev_file="onto.testa",
        test_file="onto.testa")
    print(corpus)

    # create the tag dictionary
    tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")
    print(tag_dictionary.idx2item)

    # use GloVe word embeddings and character embeddings
    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings("glove"),
        CharacterEmbeddings()
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    # create sequence tagger and trainer instances
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type="ner",
                                            use_crf=True)
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    model_path = os.path.join(path, "../models/")

    # commence training; the model is saved in model_path under the filename final-model.pt.
    # this step takes at least 4 hours to complete, so please ensure access to a GPU
    trainer.train(model_path,
                  learning_rate=0.1,
                  mini_batch_size=64,
                  max_epochs=3)
def test_training():
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION)
    tag_dictionary = corpus.make_tag_dictionary('ner')

    embeddings = WordEmbeddings('glove')
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner',
                                            use_crf=False)

    # initialize trainer
    trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger, corpus, test_mode=True)
    trainer.train('./results', learning_rate=0.1, mini_batch_size=2, max_epochs=10)

    # clean up results directory
    shutil.rmtree('./results')
def initialize_embeddings(self, fastbert=True, stackedembeddings=True):
    # consider using pooling_operation="first", use_scalar_mix=True for the parameters

    # initialize individual embeddings
    if fastbert:
        bert_embedding = BertEmbeddings('distilbert-base-uncased', layers='-1')
    else:
        bert_embedding = BertEmbeddings('bert-base-cased', layers='-1')

    if stackedembeddings:
        glove_embedding = WordEmbeddings('glove')

        # init Flair forward and backward embeddings
        flair_embedding_forward = FlairEmbeddings('news-forward')
        flair_embedding_backward = FlairEmbeddings('news-backward')

        embedding_types = [
            bert_embedding,
            glove_embedding,
            flair_embedding_forward,
            flair_embedding_backward
        ]
        embeddings = StackedEmbeddings(embeddings=embedding_types)
    else:
        embeddings = bert_embedding

    return embeddings
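# Sketch: feeding the returned embeddings into a flair SequenceTagger (the
# `builder` name is hypothetical and stands for whatever object owns
# initialize_embeddings; corpus and tag_dictionary as in the training snippets above):
#
#     embeddings = builder.initialize_embeddings(fastbert=True, stackedembeddings=False)
#     tagger = SequenceTagger(hidden_size=256,
#                             embeddings=embeddings,
#                             tag_dictionary=tag_dictionary,
#                             tag_type='ner')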
def test_train_load_use_classifier_with_prob(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    label_dict = corpus.make_label_dictionary()

    word_embedding: WordEmbeddings = WordEmbeddings("turian")
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [word_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence, multi_class_prob=True):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")

    loaded_model.predict(sentence, multi_class_prob=True)
    loaded_model.predict([sentence, sentence_empty], multi_class_prob=True)
    loaded_model.predict([sentence_empty], multi_class_prob=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
def post_init(self):
    import flair
    flair.device = self.device
    from flair.embeddings import WordEmbeddings, FlairEmbeddings, BytePairEmbeddings, \
        PooledFlairEmbeddings, DocumentPoolEmbeddings

    embeddings_list = []
    for e in self.embeddings:
        model_name, model_id = e.split(':', maxsplit=1)
        emb = None
        try:
            if model_name == 'flair':
                emb = FlairEmbeddings(model_id)
            elif model_name == 'pooledflair':
                emb = PooledFlairEmbeddings(model_id)
            elif model_name == 'word':
                emb = WordEmbeddings(model_id)
            elif model_name == 'byte-pair':
                emb = BytePairEmbeddings(model_id)
        except ValueError:
            self.logger.error(f'embedding not found: {e}')
            continue
        if emb is not None:
            embeddings_list.append(emb)

    if embeddings_list:
        self.model = DocumentPoolEmbeddings(embeddings_list,
                                            pooling=self.pooling_strategy)
        self.logger.info(
            f'flair encoder initialized with embeddings: {self.embeddings}')
    else:
        self.logger.error('flair encoder initialization failed.')
def embed_dataset() -> List:
    # init a standard GloVe embedding and a forward Flair embedding
    glove_embedding = WordEmbeddings('glove')
    flair_embedding_forward = FlairEmbeddings('news-forward')

    # create a StackedEmbeddings object that combines the GloVe and forward flair embeddings
    stacked_embeddings = StackedEmbeddings([
        glove_embedding,
        flair_embedding_forward,
    ])

    sentence_dataset = load_dataset(
        '/Users/haraldott/Development/thesis/anomaly_detection_main/logparser/Drain/Drain_result/st_0.2 depth_2/openstack_normal_10k.csv'
    )
    embedded_sentences = []
    count = 0
    for s in sentence_dataset:
        sentence = Sentence(s)
        # embed with the full stack (the original called only the forward
        # flair embedding, leaving stacked_embeddings unused)
        stacked_embeddings.embed(sentence)
        embedded_sentences.append(sentence)
        if count % 50 == 0 or count == len(sentence_dataset):
            print('Processed {0:.1f}% of log lines.'.format(
                count * 100.0 / len(sentence_dataset)))
        count += 1

    words = []
    for sentence in embedded_sentences:
        for word in sentence:
            words.append(word.embedding)  # TODO: is this correct? returns all token embeddings
    torch.save(words, '10k_depth_2_st_0.2.pt')
    return words
def test_train_load_use_tagger_multicorpus(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpora(
        [NLPTask.FASHION, NLPTask.GERMEVAL], base_path=tasks_base_path)
    tag_dictionary = corpus.make_tag_dictionary('ner')

    embeddings = WordEmbeddings('glove')
    tagger: SequenceTagger = SequenceTagger(hidden_size=64,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner',
                                            use_crf=False)

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(results_base_path,
                  learning_rate=0.1,
                  mini_batch_size=2,
                  max_epochs=2,
                  test_mode=True)

    loaded_model: SequenceTagger = SequenceTagger.load_from_file(
        results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)