def init_document_embeddings():
    text = 'I love Berlin. Berlin is a great place to live.'
    sentence: Sentence = Sentence(text)
    glove: TokenEmbeddings = WordEmbeddings('en-glove')
    charlm: TokenEmbeddings = CharLMEmbeddings('news-forward-fast')
    return sentence, glove, charlm
def init(tasks_base_path) -> Tuple[Corpus, Dictionary, TextClassifier]:
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / 'ag_news')
    label_dict = corpus.make_label_dictionary()
    glove_embedding = WordEmbeddings('turian')
    document_embeddings = DocumentRNNEmbeddings([glove_embedding], 128, 1, False, 64, False, False)
    model = TextClassifier(document_embeddings, label_dict, False)
    return corpus, label_dict, model
def post_init(self):
    from flair.embeddings import DocumentPoolEmbeddings, WordEmbeddings, FlairEmbeddings
    self._flair = DocumentPoolEmbeddings(
        [
            WordEmbeddings('glove'),
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward'),
        ],
        pooling=self.pooling_strategy,
    )
def init_document_embeddings(): text = "I love Berlin. Berlin is a great place to live." sentence: Sentence = Sentence(text) glove: TokenEmbeddings = WordEmbeddings("turian") charlm: TokenEmbeddings = FlairEmbeddings("news-forward-fast") return sentence, glove, charlm
def init(tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.AG_NEWS, tasks_base_path)
    label_dict = corpus.make_label_dictionary()
    glove_embedding = WordEmbeddings('en-glove')
    document_embeddings = DocumentLSTMEmbeddings([glove_embedding], 128, 1, False, 64, False, False)
    model = TextClassifier(document_embeddings, label_dict, False)
    return corpus, label_dict, model
def test_sequence_tagger_param_selector(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION, base_path=tasks_base_path)

    # define search space
    search_space = SearchSpace()

    # sequence tagger parameters
    search_space.add(
        Parameter.EMBEDDINGS,
        hp.choice,
        options=[
            StackedEmbeddings([WordEmbeddings('glove')]),
            StackedEmbeddings([
                WordEmbeddings('glove'),
                CharLMEmbeddings('news-forward'),
                CharLMEmbeddings('news-backward'),
            ]),
        ],
    )
    search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False])
    search_space.add(Parameter.DROPOUT, hp.uniform, low=0.25, high=0.75)
    search_space.add(Parameter.WORD_DROPOUT, hp.uniform, low=0.0, high=0.25)
    search_space.add(Parameter.LOCKED_DROPOUT, hp.uniform, low=0.0, high=0.5)
    search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[64, 128, 256, 512])
    search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])

    # model trainer parameters
    search_space.add(Parameter.OPTIMIZER, hp.choice, options=[SGD, AdamW])

    # training parameters
    search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[4, 8, 32])
    search_space.add(Parameter.LEARNING_RATE, hp.uniform, low=0.01, high=1)
    search_space.add(Parameter.ANNEAL_FACTOR, hp.uniform, low=0.3, high=0.75)
    search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5])
    search_space.add(Parameter.WEIGHT_DECAY, hp.uniform, low=0.01, high=1)

    # find the best parameter settings
    optimizer = SequenceTaggerParamSelector(corpus, 'ner', results_base_path, max_epochs=2)
    optimizer.optimize(search_space, max_evals=2)

    # clean up results directory
    shutil.rmtree(results_base_path)
def create_flair_embeddings(emb_name):
    emb_type, emb_subname = emb_name.split('+')
    if emb_type == 'elmo':
        return ELMoEmbeddings(emb_subname)
    elif emb_type == 'fasttext':
        return WordEmbeddings(emb_subname)
    elif emb_type == 'custom_elmo':
        return ELMoEmbeddings(
            options_file=Path(emb_subname) / 'options.json',
            weight_file=Path(emb_subname) / 'model.hdf5',
        )
    else:
        # fail loudly instead of silently returning None
        raise ValueError('Unknown embedding type: {}'.format(emb_type))
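# Illustrative calls for the factory above; the '<type>+<name>' ids follow the
# convention the function parses, and the concrete names are examples only.
fasttext_en = create_flair_embeddings('fasttext+en')
elmo_small = create_flair_embeddings('elmo+small')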
def test_train_load_use_classifier_multi_label(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "multi_class")
    label_dict = corpus.make_label_dictionary()

    word_embedding: WordEmbeddings = WordEmbeddings("turian")
    document_embeddings = DocumentRNNEmbeddings(
        embeddings=[word_embedding],
        hidden_size=32,
        reproject_words=False,
        bidirectional=False,
    )

    model = TextClassifier(document_embeddings, label_dict, multi_label=True)

    trainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path,
        mini_batch_size=1,
        max_epochs=100,
        shuffle=False,
        checkpoint=False,
    )

    sentence = Sentence("apple tv")
    for s in model.predict(sentence):
        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    sentence = Sentence("apple tv")
    for s in model.predict(sentence):
        assert "apple" in sentence.get_label_names()
        assert "tv" in sentence.get_label_names()
        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
def _set_up_model(self, params: dict, label_dictionary):
    document_embedding = params['document_embeddings'].__name__
    if document_embedding == "DocumentRNNEmbeddings":
        embedding_params = {
            key: params[key]
            for key in params
            if key in DOCUMENT_RNN_EMBEDDING_PARAMETERS
        }
        # wrap a single embedding name in a list before instantiating, so a
        # bare string is not iterated character by character
        if isinstance(params['embeddings'], list):
            embedding_params['embeddings'] = [
                WordEmbeddings(name) for name in params['embeddings']
            ]
        else:
            embedding_params['embeddings'] = [WordEmbeddings(params['embeddings'])]
        document_embedding = DocumentRNNEmbeddings(**embedding_params)

    elif document_embedding == "DocumentPoolEmbeddings":
        embedding_params = {
            key: params[key]
            for key in params
            if key in DOCUMENT_POOL_EMBEDDING_PARAMETERS
        }
        embedding_params['embeddings'] = [
            WordEmbeddings(name) for name in params['embeddings']
        ]
        document_embedding = DocumentPoolEmbeddings(**embedding_params)

    elif document_embedding == "TransformerDocumentEmbeddings":
        embedding_params = {
            key: params[key]
            for key in params
            if key in DOCUMENT_TRANSFORMER_EMBEDDING_PARAMETERS
        }
        document_embedding = TransformerDocumentEmbeddings(**embedding_params)

    else:
        raise Exception("Please provide a flair document embedding class")

    text_classifier: TextClassifier = TextClassifier(
        label_dictionary=label_dictionary,
        multi_label=self.multi_label,
        document_embeddings=document_embedding,
    )
    return text_classifier
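# A hypothetical invocation sketch for the selector method above. The params
# dict mirrors what a parameter-selection run would sample; the concrete values,
# and the 'selector' and 'label_dictionary' names, are illustrative only and
# not taken from the surrounding class.
params = {
    'document_embeddings': DocumentRNNEmbeddings,
    'embeddings': ['glove'],
    'hidden_size': 128,
}
classifier = selector._set_up_model(params, label_dictionary)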
def get_embeddings(self):
    embeddings = [
        PolyglotEmbeddings(self.args.lang),
        CharacterEmbeddings(),
    ]
    if self.args.lang not in self.embeds_unsupported_langs:
        embeddings.append(WordEmbeddings(self.args.lang))
    return StackedEmbeddings(embeddings=embeddings)
def main(args):
    args = parser.parse_args()

    # 1. get the corpus
    column_format = {0: 'word', 1: 'pos', 2: 'ner'}
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        Path(args.data_file[0]), column_format, tag_to_biloes='ner')
    print(corpus)

    # 2. what tag do we want to predict?
    tag_type = 'ner'

    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    # 4. initialize embeddings
    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),
        # comment in this line to use character embeddings
        # CharacterEmbeddings(),
        # comment in these lines to use contextual string embeddings
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
        # comment in these lines to use Bert embeddings
        # BertEmbeddings(),
        # comment in these lines to use Elmo embeddings
        # ELMoEmbeddings(),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    # 5. initialize sequence tagger
    from flair.models import SequenceTagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=True)

    # 6. initialize trainer
    from flair.trainers import ModelTrainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # 7. start training
    trainer.train('resources/taggers/glove',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=50)
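# A follow-up sketch (not part of the original script): loading the trained
# tagger and tagging a new sentence. 'final-model.pt' is flair's default
# artifact name under the folder passed to trainer.train above; this assumes
# a flair version where SequenceTagger.load accepts a path (older releases
# used SequenceTagger.load_from_file for paths).
def tag_example_sentence():
    from flair.data import Sentence
    from flair.models import SequenceTagger
    tagger = SequenceTagger.load('resources/taggers/glove/final-model.pt')
    sentence = Sentence('George Washington went to Washington.')
    tagger.predict(sentence)
    print(sentence.to_tagged_string())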
def get_embeddings(emb_name):
    emb_type, emb_name = emb_name.split('+')
    if emb_type == 'elmo':
        return lambda: ELMoEmbeddings(emb_name)  # e.g. pubmed
    elif emb_type == 'fasttext':
        return lambda: WordEmbeddings(emb_name)  # e.g. en
    else:
        raise ValueError('Wrong embedding type')
def test_train_load_use_classifier_multi_label(results_base_path, tasks_base_path):
    # corpus = NLPTaskDataFetcher.load_corpus('multi_class', base_path=tasks_base_path)
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        data_folder=tasks_base_path / 'multi_class')
    label_dict = corpus.make_label_dictionary()

    glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings = DocumentLSTMEmbeddings(embeddings=[glove_embedding],
                                                 hidden_size=32,
                                                 reproject_words=False,
                                                 bidirectional=False)

    model = TextClassifier(document_embeddings, label_dict, multi_label=True)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  EvaluationMetric.MICRO_F1_SCORE,
                  max_epochs=100,
                  test_mode=True,
                  checkpoint=False)

    sentence = Sentence('apple tv')
    for s in model.predict(sentence):
        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    sentence = Sentence('apple tv')
    for s in model.predict(sentence):
        assert 'apple' in sentence.get_label_names()
        assert 'tv' in sentence.get_label_names()
        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load_from_file(results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
def de_lang(cls):
    """Factory method for German embeddings."""
    embeddings = WordEmbeddings('de')  # German FastText embeddings
    # embeddings = WordEmbeddings('de-crawl')  # German FastText embeddings trained over crawls
    # embeddings = BertEmbeddings('bert-base-multilingual-cased')
    return cls(embeddings)
def test_stacked_embeddings():
    text = 'I love Berlin.'
    sentence: Sentence = Sentence(text)

    glove: TokenEmbeddings = WordEmbeddings('en-glove')
    news: TokenEmbeddings = WordEmbeddings('en-news')
    charlm: TokenEmbeddings = CharLMEmbeddings('mix-backward')

    embeddings: StackedEmbeddings = StackedEmbeddings([glove, news, charlm])
    embeddings.embed(sentence)

    for token in sentence.tokens:
        assert len(token.get_embedding()) != 0
        token.clear_embeddings()
        assert len(token.get_embedding()) == 0
def init(tasks_base_path) -> Tuple[Corpus, TextRegressor, ModelTrainer]:
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.REGRESSION, tasks_base_path)
    glove_embedding = WordEmbeddings('glove')
    document_embeddings = DocumentRNNEmbeddings([glove_embedding], 128, 1, False, 64, False, False)
    model = TextRegressor(document_embeddings)
    trainer = ModelTrainer(model, corpus)
    return corpus, model, trainer
def train_model(directory='Data', use_BERT=True):
    # define columns
    columns = {
        0: 'ID',
        1: 'text',
        2: 'empty_0',
        3: 'pos',
        4: 'empty_1',
        5: 'empty_2',
        6: 'empty_3',
        7: 'empty_4',
        8: 'empty_5',
        9: 'tox',
    }

    # this is the folder in which train, test and dev files reside
    data_folder = directory

    # init a corpus using column format, data folder and the names of the train, dev and test files
    corpus: Corpus = ColumnCorpus(data_folder, columns,
                                  train_file='converted_data_train.conll',
                                  test_file='converted_data_test.conll',
                                  dev_file='converted_data_dev.conll')

    # tag to predict
    tag_type = 'tox'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # embeddings
    if use_BERT:
        bert_embeddings = [TransformerWordEmbeddings('bert-large-uncased', fine_tune=True)]
        embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=bert_embeddings)
    else:
        embedding_types = [WordEmbeddings('glove')]
        embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    # initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=True)

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # start training
    trainer.train('resources/taggers/toxic_classifier_bert',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=5)
def init() -> Tuple[TaggedCorpus, Dictionary, TextClassifier]:
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.AG_NEWS)
    label_dict = corpus.make_label_dictionary()
    glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [glove_embedding], 128, 1, False, 64, False, False)
    model = TextClassifier(document_embeddings, label_dict, False)
    return corpus, label_dict, model
def collect_features(embeddings):
    for embedding in embeddings:
        if embedding in {"fasttext"}:
            yield WordEmbeddings("de")
        elif embedding in {"bert"}:
            yield BertEmbeddings("bert-base-multilingual-cased", layers="-1")
        elif embedding in {"flair-forward"}:
            yield FlairEmbeddings("german-forward")
        elif embedding in {"flair-backward"}:
            yield FlairEmbeddings("german-backward")
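# A small usage sketch: materializing the generator above into one stack
# (StackedEmbeddings import assumed from flair.embeddings).
stacked = StackedEmbeddings(
    list(collect_features(["fasttext", "flair-forward", "flair-backward"])))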
def get_doc_embeddings():
    # initialize the word embeddings
    glove_embedding = WordEmbeddings('glove')
    flair_embedding_forward = FlairEmbeddings('news-forward-fast')
    flair_embedding_backward = FlairEmbeddings('news-backward-fast')

    # initialize the document embeddings, mode = mean
    return DocumentPoolEmbeddings(
        [glove_embedding, flair_embedding_backward, flair_embedding_forward],
        fine_tune_mode='none')
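# Usage sketch for the pooled document embeddings above: embed one sentence and
# read back the pooled vector (Sentence import assumed from flair.data).
def _example_pool_one_sentence():
    doc_embeddings = get_doc_embeddings()
    sentence = Sentence('I love Berlin.')
    doc_embeddings.embed(sentence)
    print(sentence.embedding.shape)  # one fixed-size vector per sentence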
def load_flair(mode='flair'):
    if mode == 'flair':
        stacked_embeddings = StackedEmbeddings([
            WordEmbeddings('glove'),
            PooledFlairEmbeddings('news-forward', pooling='min'),
            PooledFlairEmbeddings('news-backward', pooling='min'),
        ])
    else:
        # BERT: concatenating the last 4 layers gives the best results
        stacked_embeddings = BertEmbeddings('bert-base-uncased')
    return stacked_embeddings
def trainFlairClassifier(df, columns, trainNameCsv, testNameCsv, devNameCsv,
                         classifierFileName):
    # 70/20/10 split into train, test and dev sets
    ids = df['id'].tolist()
    nSamples = len(ids)
    idx70 = int(nSamples * 0.7)
    idx90 = int(nSamples * 0.9)
    train_ids = ids[:idx70]
    test_ids = ids[idx70:idx90]
    dev_ids = ids[idx90:]

    with TemporaryDirectory() as temp_dir:
        # join with the temp dir; plain string concatenation dropped the path separator
        trainCsv = os.path.join(temp_dir, trainNameCsv)
        testCsv = os.path.join(temp_dir, testNameCsv)
        devCsv = os.path.join(temp_dir, devNameCsv)

        df[df['id'].isin(train_ids)].to_csv(trainCsv, columns=columns, sep='\t', index=False, header=False)
        df[df['id'].isin(test_ids)].to_csv(testCsv, columns=columns, sep='\t', index=False, header=False)
        df[df['id'].isin(dev_ids)].to_csv(devCsv, columns=columns, sep='\t', index=False, header=False)

        corpus = NLPTaskDataFetcher.load_classification_corpus(
            temp_dir, train_file=trainCsv, test_file=testCsv, dev_file=devCsv)

        word_embeddings = [
            WordEmbeddings('glove'),
            FlairEmbeddings('news-forward-fast'),
            FlairEmbeddings('news-backward-fast'),
        ]
        document_embeddings = DocumentLSTMEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256)

        classifier = TextClassifier(
            document_embeddings,
            label_dictionary=corpus.make_label_dictionary(),
            multi_label=False)

        trainer = ModelTrainer(classifier, corpus)
        trainer.train(temp_dir, max_epochs=50)
        classifier.save(classifierFileName)
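# Hypothetical follow-up sketch: reloading the classifier saved above and
# predicting on new text. load_from_file matches the older flair API used in
# this function; newer releases expose TextClassifier.load instead.
def predictWithFlairClassifier(classifierFileName, text):
    classifier = TextClassifier.load_from_file(classifierFileName)
    sentence = Sentence(text)
    classifier.predict(sentence)
    return sentence.labels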
def __init__(self, device_number='cuda:2', use_cuda=True):
    self.device_number = device_number

    if use_cuda:
        flair.device = torch.device(self.device_number)

    self.stacked_embeddings = StackedEmbeddings([
        WordEmbeddings('glove'),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ])
def __init__(self, hidden_dim: int, rnn_type: str, vocab_size: int,
             tagset_size: int, task_type: str):
    super(TaskLearner, self).__init__()
    self.task_type = task_type
    self.rnn_type = rnn_type
    self.bidirectional = True
    self.num_layers = 2
    self.num_directions = 2 if self.bidirectional else 1

    # Word embeddings (TODO: implement pre-trained word embeddings)
    # self.word_embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
    # TODO: implement padding_idx=self.pad_idx
    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),
        PooledFlairEmbeddings('news-forward', pooling='min'),
        PooledFlairEmbeddings('news-backward', pooling='min'),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)
    self.embeddings = embeddings
    self.embedding_dim: int = self.embeddings.embedding_length

    if self.rnn_type == 'gru':
        rnn = nn.GRU
    elif self.rnn_type == 'lstm':
        rnn = nn.LSTM
    elif self.rnn_type == 'rnn':
        rnn = nn.RNN
    else:
        raise ValueError('Unsupported rnn_type: {}'.format(self.rnn_type))

    # Sequence tagger
    self.rnn = rnn(input_size=self.embedding_dim,
                   hidden_size=hidden_dim,
                   num_layers=self.num_layers,
                   dropout=0.0 if self.num_layers == 1 else 0.5,
                   bidirectional=self.bidirectional,
                   batch_first=True)

    if self.task_type == 'SEQ':
        # Linear layer that maps the RNN hidden state space to tag space
        self.hidden2tag = nn.Linear(in_features=hidden_dim * self.num_directions,
                                    out_features=tagset_size)

    if self.task_type == 'CLF':
        # COME BACK LATER...
        self.drop = nn.Dropout(p=0.5)
        self.hidden2tag = nn.Linear(in_features=hidden_dim * self.num_directions,
                                    out_features=1)
def init(tasks_base_path) -> Tuple[TaggedCorpus, Dictionary, TextClassifier]:
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.AG_NEWS, tasks_base_path)
    label_dict = corpus.make_label_dictionary()
    glove_embedding: WordEmbeddings = WordEmbeddings('turian')
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [glove_embedding], 128, 1, False, 64, False, False)
    model = TextClassifier(document_embeddings, label_dict, False)
    return corpus, label_dict, model
def embedding():
    # initialize the word embeddings
    glove_embedding = WordEmbeddings('glove')
    flair_embedding_forward = FlairEmbeddings('news-forward')
    flair_embedding_backward = FlairEmbeddings('news-backward')

    # initialize the document embeddings, mode = mean
    document_embeddings = DocumentPoolEmbeddings(
        [glove_embedding, flair_embedding_backward, flair_embedding_forward])
    return document_embeddings
def create_embeddings_flair(data: pd.DataFrame,
                            column: str = "text",
                            path: str = None,
                            embeddings_type: str = "transformer",
                            typs: str = "train"):
    assert column in data.columns.tolist(), \
        "[embeddings.py] -> [create_embeddings_flair] -> Input column not in dataframe columns"
    assert embeddings_type in ["transformer", "stacked"]

    from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings, DocumentPoolEmbeddings, TransformerDocumentEmbeddings
    from flair.data import Sentence

    fast_text_embedding = WordEmbeddings('de')
    flair_embedding_forward = FlairEmbeddings('de-forward')
    flair_embedding_backward = FlairEmbeddings('de-backward')
    stacked_embeddings = DocumentPoolEmbeddings([
        fast_text_embedding, flair_embedding_forward, flair_embedding_backward
    ])
    transformer_embedding = TransformerDocumentEmbeddings(
        'bert-base-german-cased', fine_tune=False)

    tic = time.time()
    embeddings = []
    for i, text in enumerate(data[column].values):
        print("sentence {}/{}".format(i, len(data)))
        sentence = Sentence(text)
        if embeddings_type == "stacked":
            stacked_embeddings.embed(sentence)
        elif embeddings_type == "transformer":
            transformer_embedding.embed(sentence)
        embedding = sentence.embedding.detach().cpu().numpy()
        embeddings.append(embedding)

    embeddings = np.array(embeddings)
    columns = ["embedding_{}".format(feature) for feature in range(embeddings.shape[1])]
    csv = pd.DataFrame(embeddings, columns=columns)
    csv.to_csv(path + embeddings_type + "_" + typs + ".csv", index=False)

    toc = time.time()
    print("[create_embeddings_flair] -> [embeddings_type: {}, typs: {}] -> time {}'s"
          .format(embeddings_type, typs, toc - tic))
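# Hypothetical usage of the function above, assuming pandas (pd), numpy (np)
# and time are imported at module level as the body requires; the texts and
# output path are examples only.
df = pd.DataFrame({"text": ["Ich liebe Berlin.", "Das ist ein Test."]})
create_embeddings_flair(df, column="text", path="embeddings/",
                        embeddings_type="transformer", typs="train")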
def init(tasks_base_path) -> Tuple[Corpus, Dictionary, TextClassifier]:
    # get training, test and dev data
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "ag_news")
    label_dict = corpus.make_label_dictionary()

    glove_embedding: WordEmbeddings = WordEmbeddings("turian")
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [glove_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, multi_label=False)
    return corpus, label_dict, model
def load_and_apply_word_embeddings(emb_type: str):
    text = "I love Berlin."
    sentence: Sentence = Sentence(text)
    embeddings: TokenEmbeddings = WordEmbeddings(emb_type)
    embeddings.embed(sentence)

    for token in sentence.tokens:
        assert len(token.get_embedding()) != 0
        token.clear_embeddings()
        assert len(token.get_embedding()) == 0
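# Example invocations with two of flair's standard embedding ids; the helper
# above accepts any id that WordEmbeddings resolves.
load_and_apply_word_embeddings('glove')
load_and_apply_word_embeddings('en-crawl')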
def en_lang(cls):
    """Factory method for English embeddings."""
    # embeddings = WordEmbeddings('en-glove')
    embeddings = WordEmbeddings('en-crawl')  # FastText embeddings trained over web crawls
    # embeddings = WordEmbeddings('en-news')
    # embeddings = FlairEmbeddings('news-forward')
    # embeddings = BertEmbeddings()
    return cls(embeddings)