def embedize(self, data_subset_list):
    tweet = Sentence(data_subset_list)
    embedding = TransformerDocumentEmbeddings(self.embedding)
    embedding.embed(tweet)
    tweet_emb = tweet.get_embedding()
    tweet_emb_np = tweet_emb.detach().numpy()
    return tweet_emb_np
class Embedding:
    """
    Performs embedding on sentences.
    """

    def __init__(self, model='gpt2-medium'):
        """
        Initializes the embedding model.

        :param {str} model - The model architecture. Must be one of
            https://huggingface.co/transformers/pretrained_models.html
        """
        self.model = TransformerDocumentEmbeddings(model, batch_size=8)

    def embed(self, sentence: str) -> list:
        """
        Embeds a given sentence. If it fails, returns None.

        :param {str} sentence - A cased or uncased sentence.
        """
        if isinstance(sentence, bytes):
            sentence = sentence.decode('ascii')
        if isinstance(sentence, list):
            sentence = ' '.join(sentence)
        if sentence == '':
            return None
        try:
            sent = Sentence(sentence)
            self.model.embed(sent)
            return sent.embedding.detach().cpu().numpy()
        except TypeError:
            return None
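# Hedged usage sketch for the Embedding wrapper above; assumes
# `from flair.data import Sentence` and
# `from flair.embeddings import TransformerDocumentEmbeddings` are already imported,
# and the example text is illustrative only.
embedder = Embedding(model='gpt2-medium')
vector = embedder.embed("Berlin is the capital of Germany.")
if vector is not None:
    print(vector.shape)  # 1-D numpy array holding the document embedding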
def create_embeddings_flair(data: pd.DataFrame,
                            column: str = "text",
                            path: str = None,
                            embeddings_type: str = "transformer",
                            typs: str = "train"):
    assert column in data.columns.tolist(), \
        "[embeddings.py] -> [create_embeddings_flair] -> Input column not in dataframe columns"
    assert embeddings_type in ["transformer", "stacked"]

    from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings, DocumentPoolEmbeddings, TransformerDocumentEmbeddings
    from flair.data import Sentence

    fast_text_embedding = WordEmbeddings('de')
    flair_embedding_forward = FlairEmbeddings('de-forward')
    flair_embedding_backward = FlairEmbeddings('de-backward')
    stacked_embeddings = DocumentPoolEmbeddings([
        fast_text_embedding, flair_embedding_forward, flair_embedding_backward
    ])
    transformer_embedding = TransformerDocumentEmbeddings(
        'bert-base-german-cased', fine_tune=False)

    tic = time.time()
    embeddings = []
    for i, text in enumerate(data[column].values):
        print("sentence {}/{}".format(i, len(data)))
        sentence = Sentence(text)
        if embeddings_type == "stacked":
            stacked_embeddings.embed(sentence)
        elif embeddings_type == "transformer":
            transformer_embedding.embed(sentence)
        embedding = sentence.embedding.detach().cpu().numpy()
        embeddings.append(embedding)

    embeddings = np.array(embeddings)
    columns = [
        "embedding_{}".format(feature) for feature in range(embeddings.shape[1])
    ]
    csv = pd.DataFrame(embeddings, columns=columns)
    csv.to_csv(path + embeddings_type + "_" + typs + ".csv", index=False)
    toc = time.time()
    print(
        "[create_embeddings_flair] -> [embeddings_type: {}, typs: {}] -> time {}'s"
        .format(embeddings_type, typs, toc - tic))
def vectorize(self, X):
    # init embedding model
    print(f"Load {self.model_name} model ...")
    model = TransformerDocumentEmbeddings(self.model_name, fine_tune=False)

    # convert to Sentence objects (progress_apply requires a prior tqdm.pandas() call)
    print("Convert to Sentence objects ...")
    X = X.str.lower()
    sentences = X.progress_apply(lambda x: Sentence(x))

    # get vectors from BERT: embed each Sentence in place, then read back its embedding
    print(f"Get {self.model_name} embeddings ...")
    sentences.progress_apply(lambda x: model.embed(x))
    docvecs = sentences.progress_apply(lambda x: x.embedding.cpu().numpy())
    return list(docvecs)
def test_text_classifier_transformer_finetune(results_base_path, tasks_base_path):
    flair.set_seed(123)

    corpus = ClassificationCorpus(
        tasks_base_path / "trivial" / "trivial_text_classification_single",
        label_type="city",
    )
    label_dict = corpus.make_label_dictionary(label_type="city")

    model: TextClassifier = TextClassifier(
        document_embeddings=TransformerDocumentEmbeddings("distilbert-base-uncased"),
        label_dictionary=label_dict,
        label_type="city",
        multi_label=False,
    )

    trainer = ModelTrainer(model, corpus)
    trainer.fine_tune(
        results_base_path,
        mini_batch_size=2,
        max_epochs=10,
        shuffle=True,
        learning_rate=0.5e-5,
        num_workers=2,
    )

    # check if model can predict
    sentence = Sentence("this is Berlin")
    sentence_empty = Sentence(" ")

    model.predict(sentence)
    model.predict([sentence, sentence_empty])
    model.predict([sentence_empty])

    # load model
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    # check if model predicts correct label
    sentence = Sentence("this is Berlin")
    sentence_empty = Sentence(" ")

    loaded_model.predict([sentence, sentence_empty])

    values = []
    for label in sentence.labels:
        assert label.value is not None
        assert 0.0 <= label.score <= 1.0
        assert type(label.score) is float
        values.append(label.value)

    assert "Berlin" in values

    # check if loaded model successfully fit the training data
    result: Result = loaded_model.evaluate(corpus.test, gold_label_type="city")
    assert result.classification_report["micro avg"]["f1-score"] == 1.0

    del loaded_model
def __init__(
        self,
        task_name: str,
        label_dictionary: Dictionary,
        label_type: str,
        embeddings: str = 'bert-base-uncased',
        num_negative_labels_to_sample: int = 2,
        prefix: bool = True,
        **tagger_args,
):
    """
    Initializes a TARSClassifier.

    :param task_name: a string depicting the name of the task
    :param label_dictionary: dictionary of labels you want to predict
    :param label_type: label type of the provided label dictionary
    :param embeddings: name of the pre-trained transformer model, e.g. 'bert-base-uncased'
    :param num_negative_labels_to_sample: number of negative labels to sample for each
        positive label of a sentence during training. Defaults to 2 negative labels per
        positive label. If None is passed, the model samples all negative labels, which
        slows down training considerably.
    :param prefix: if True, the label is concatenated at the beginning of the sentence,
        otherwise at the end
    :param multi_label: auto-detected by default, but you can set this to True to force
        multi-label prediction or False to force single-label prediction
    :param multi_label_threshold: if multi-label, you can set the threshold to make predictions
    :param beta: parameter for F-beta score for evaluation and training annealing
    """
    super(TARSClassifier, self).__init__()

    from flair.embeddings import TransformerDocumentEmbeddings

    if not isinstance(embeddings, TransformerDocumentEmbeddings):
        embeddings = TransformerDocumentEmbeddings(model=embeddings,
                                                   fine_tune=True,
                                                   layers='-1',
                                                   layer_mean=False,
                                                   )

    # prepare TARS dictionary
    tars_dictionary = Dictionary(add_unk=False)
    tars_dictionary.add_item('False')
    tars_dictionary.add_item('True')

    # initialize a bare-bones text classifier
    self.tars_model = TextClassifier(document_embeddings=embeddings,
                                     label_dictionary=tars_dictionary,
                                     label_type=self.static_label_type,
                                     **tagger_args,
                                     )

    # transformer separator
    self.separator = str(self.tars_embeddings.tokenizer.sep_token)
    if self.tars_embeddings.tokenizer._bos_token:
        self.separator += str(self.tars_embeddings.tokenizer.bos_token)

    self.prefix = prefix
    self.num_negative_labels_to_sample = num_negative_labels_to_sample

    # Store task-specific labels since TARS can handle multiple tasks
    self.add_and_switch_to_new_task(task_name, label_dictionary, label_type)
def generate_doc_embedding(document: str, embeddings: list, doc2vec="transformer_roberta"): doc_embedding: np.ndarray = np.array([]) try: logger.info("Generating embedding for document .... ") # 1. Initialise Document Embedding # a) Pooling if doc2vec == "pool": document_embeddings: DocumentPoolEmbeddings = DocumentPoolEmbeddings( embeddings=embeddings) elif doc2vec == "rnn": document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings( embeddings=embeddings, hidden_size=256, rnn_type='LSTM') # b) Transformer elif doc2vec == "transformer_bert": document_embeddings: TransformerDocumentEmbeddings = TransformerDocumentEmbeddings( 'bert-base-multilingual-cased') else: document_embeddings: TransformerDocumentEmbeddings = TransformerDocumentEmbeddings( 'roberta-base') # 2. Create an example sentence sentence: Sentence = Sentence(document) # 3. Embed the sentence with our document embedding document_embeddings.embed(sentence) # 4. Save embedding into CPU if "cuda" in str(flair.device).lower(): doc_emb_cpu: Tensor = sentence.embedding.cpu() # 5. Convert to numpy array doc_embedding: np.ndarray = doc_emb_cpu.detach().numpy() else: doc_embedding: np.ndarray = sentence.get_embedding().detach( ).numpy() except Exception as e: logger.error(e) return doc_embedding
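# Hedged usage sketch for generate_doc_embedding above. The transformer modes ignore
# the `embeddings` argument, so an empty list is passed here; the text and printed
# shape are illustrative (768 dimensions for bert-base-multilingual-cased).
doc_vector = generate_doc_embedding(
    "Flair makes document embeddings easy to use.",
    embeddings=[],
    doc2vec="transformer_bert")
print(doc_vector.shape)  # e.g. (768,)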
def _set_up_model(self, params: dict): text_classification_params = { key: params[key] for key in params if key in TEXT_CLASSIFICATION_PARAMETERS } document_embedding = TransformerDocumentEmbeddings( fine_tune=self.fine_tune, **text_classification_params) text_classifier: TextClassifier = TextClassifier( label_dictionary=self.label_dictionary, multi_label=self.multi_label, label_type=self.label_type, document_embeddings=document_embedding, ) return text_classifier
def _set_up_model(self, params: dict, label_dictionary):
    document_embedding = params['document_embeddings'].__name__

    if document_embedding == "DocumentRNNEmbeddings":
        embedding_params = {
            key: params[key]
            for key in params if key in DOCUMENT_RNN_EMBEDDING_PARAMETERS
        }
        # accept either a list of embedding names or a single name
        token_embeddings = params['embeddings']
        if not isinstance(token_embeddings, list):
            token_embeddings = [token_embeddings]
        embedding_params['embeddings'] = [
            WordEmbeddings(token_embedding) for token_embedding in token_embeddings
        ]
        document_embedding = DocumentRNNEmbeddings(**embedding_params)

    elif document_embedding == "DocumentPoolEmbeddings":
        embedding_params = {
            key: params[key]
            for key in params if key in DOCUMENT_POOL_EMBEDDING_PARAMETERS
        }
        embedding_params['embeddings'] = [
            WordEmbeddings(token_embedding)
            for token_embedding in params['embeddings']
        ]
        document_embedding = DocumentPoolEmbeddings(**embedding_params)

    elif document_embedding == "TransformerDocumentEmbeddings":
        embedding_params = {
            key: params[key]
            for key in params if key in DOCUMENT_TRANSFORMER_EMBEDDING_PARAMETERS
        }
        document_embedding = TransformerDocumentEmbeddings(**embedding_params)

    else:
        raise Exception("Please provide a flair document embedding class")

    text_classifier: TextClassifier = TextClassifier(
        label_dictionary=label_dictionary,
        multi_label=self.multi_label,
        document_embeddings=document_embedding,
    )
    return text_classifier
def load_model(bert=None, document=False, flair=False):
    """Load word embeddings model."""
    if bert == 'bio':
        # https://github.com/flairNLP/flair/issues/1085
        # also see readme for instructions
        bertpath = './bert/bert-base-biobert-cased'
    elif bert == 'sci':
        # https://github.com/flairNLP/flair/issues/744
        # https://github.com/flairNLP/flair/issues/1239
        bertpath = './bert/scibert_scivocab_uncased'
    else:
        bertpath = 'bert-base-uncased'

    if document and not flair:
        bert_embedding = TransformerDocumentEmbeddings(model=bertpath, batch_size=4)
        return bert_embedding

    bert_embedding = TransformerWordEmbeddings(model=bertpath,
                                               pooling_operation='first',
                                               batch_size=4)
    if flair:
        flair_embedding_forward = FlairEmbeddings('en-forward')
        flair_embedding_backward = FlairEmbeddings('en-backward')
        embed_arr = [
            bert_embedding,
            flair_embedding_backward,
            flair_embedding_forward,
        ]
    else:
        embed_arr = [bert_embedding]

    if document:
        document_embeddings = DocumentPoolEmbeddings(embed_arr,
                                                     fine_tune_mode='nonlinear')
    else:
        document_embeddings = StackedEmbeddings(embed_arr)
    return document_embeddings
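# Hedged usage sketch for load_model above; assumes `from flair.data import Sentence`
# is in scope and uses the default 'bert-base-uncased' path so no local checkpoint is
# needed. The example sentence is illustrative.
doc_model = load_model(document=True)  # pure BERT document embeddings
sent = Sentence("Proteins are large biomolecules.")
doc_model.embed(sent)
print(sent.get_embedding().shape)  # torch.Size([768]) for a BERT-base style model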
# print the number of Sentences in the train split
print(len(corpus.train))
# print the number of Sentences in the test split
print(len(corpus.test))
# print the number of Sentences in the dev split
print(len(corpus.dev))

# 2. create the label dictionary
label_dict = corpus.make_label_dictionary()

# 3. initialize transformer document embeddings (many models are available)
document_embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased',
                                                    fine_tune=True)

# 4. create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

# 5. initialize the text classifier trainer with the Adam optimizer
trainer = ModelTrainer(classifier, corpus, optimizer=Adam)

# 6. start the training
trainer.train(
    './model_result',
    learning_rate=3e-5,  # use very small learning rate
    mini_batch_size=16,
    mini_batch_chunk_size=4,  # optionally set this if the transformer is too much for your machine
    max_epochs=5,  # terminate after 5 epochs
)
def train( review_category, params, update_model= False, learning_rate=0.01, embeddings_storage_mode='gpu', checkpoint= True, batch_growth_annealing= True, weight_decay = 1e-4, shuffle=True, train_with_dev=True, mini_batch_size=2, maxi_batch_size=128, anneal_factor=0.5, patience=2, max_epochs=150 ): review_category = str(review_category) print('loading training corpus from %s'%(params.data_folder)) corpus: Corpus = ClassificationCorpus(params.data_folder, train_file= review_category+'_train.txt', test_file= review_category+'_test.txt', dev_file= review_category+'_dev.txt') label_dict = corpus.make_label_dictionary() print('labels: ',label_dict) if eval(params.transformer): print('initializing transformer document embeddings using %s ...'%(params.transformer_pretrain_lm)) # 3. initialize transformer document embeddings (many models are available) document_embeddings = TransformerDocumentEmbeddings(params.transformer_pretrain_lm, fine_tune=True) else: print('initializing document embeddings') word_embeddings= [ WordEmbeddings('glove'), # comment in this line to use character embeddings CharacterEmbeddings(), # comment in these lines to use flair embeddings FlairEmbeddings('news-forward'), FlairEmbeddings('news-backward'), BertEmbeddings(), # TransformerXLEmbeddings(), #RoBERTaEmbeddings(), #XLNetEmbeddings() ] # Can choose between many RNN types (GRU by default, to change use rnn_type parameter) document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(word_embeddings, hidden_size=512, reproject_words=True, reproject_words_dimension=256, ) if not update_model: print('building review_analysis classifier ...') # create the text classifier classifier = TextClassifier(document_embeddings, label_dictionary=label_dict) # initialize the text classifier trainer print("initializing review_analysis classifier's trainer") trainer = ModelTrainer(classifier, corpus, optimizer=Adam) else: # continue trainer at later point checkpoint_path = params.checkpoint_dir+'/%s/checkpoint.pt'%(review_category) print('loading checkpoint from %s'%(checkpoint_path)) trainer = ModelTrainer.load_checkpoint(checkpoint_path, corpus) ####### training the model print("training the review_category: %s model ..."%(review_category)) try: trainer.train(params.checkpoint_dir+'/%s'%(review_category), learning_rate=learning_rate, embeddings_storage_mode=embeddings_storage_mode, checkpoint= checkpoint, batch_growth_annealing= batch_growth_annealing, weight_decay = weight_decay, shuffle=shuffle, train_with_dev=train_with_dev, mini_batch_size=mini_batch_size, maxi_batch_size=maxi_batch_size, anneal_factor=anneal_factor, patience=patience, max_epochs=max_epochs) except: print('chuncking batch ... by %d'%(params.mini_batch_chunk_size)) trainer.train(params.checkpoint_dir+'/%s'%(review_category), learning_rate=learning_rate, embeddings_storage_mode=embeddings_storage_mode, checkpoint= checkpoint, batch_growth_annealing= batch_growth_annealing, weight_decay = weight_decay, shuffle=shuffle, train_with_dev=train_with_dev, mini_batch_size=mini_batch_size, maxi_batch_size=maxi_batch_size, anneal_factor=anneal_factor, patience=patience, max_epochs=max_epochs, mini_batch_chunk_size=params.mini_batch_chunk_size)
def main(args): X_train = np.load("{}/X_train.npy".format(args.data)) X_label_train = np.load("{}/X_label_train.npy".format(args.data)) Y_train = np.load("{}/Y_train.npy".format(args.data)) X_dev = np.load("{}/X_dev.npy".format( args.fixed_test if args.fixed_test else args.data)) X_label_dev = np.load("{}/X_label_dev.npy".format( args.fixed_test if args.fixed_test else args.data)) Y_dev = np.load("{}/Y_dev.npy".format( args.fixed_test if args.fixed_test else args.data)) token_label_embed = np.load("{}/token_labels_embed.npy".format( args.vocabularly)) sentence_label_embed = np.load("{}/sentence_labels_embed.npy".format( args.vocabularly)) word_embed = text.embedding.CustomEmbedding('{}/word_embed.txt'.format( args.vocabularly)) token_tag_map = json.load( open('{}/token_labels.json'.format(args.vocabularly), 'r')) vocab_map = json.load(open('{}/vocab.json'.format(args.vocabularly), 'r')) current_date_time = datetime.now().strftime("%d-%m-%Y_%H-%M-%S") outputdir = ut.create_directories_per_series_des( args.data + '/LCAM_BioBERT{}{}_{}'.format( args.model_name, args.model_name2, current_date_time)) chkpointdir = ut.create_directories_per_series_des(args.data + '/checkpoints') sentence_tags = [ i.strip() for i in open('{}/sentence_labels.txt'.format(args.vocabularly), 'r').readlines() ] sentence_tag_map = dict([(k, v) for k, v in enumerate(sentence_tags)]) if args.transformer: print('Transfromer used for encoding the input sequence') transfomer_model = TransformerWordEmbeddings( model=args.transformer, layers=args.layer, subtoken_pooling=args.pooling, fine_tune=True) abs_transformer_model = TransformerDocumentEmbeddings( model=args.transformer, layers=args.layer, fine_tune=True) train_data = data_utils.TensorDataset( torch.from_numpy(X_train).type(torch.LongTensor), torch.from_numpy(X_label_train).type(torch.LongTensor), torch.from_numpy(Y_train).type(torch.LongTensor)) dev_data = data_utils.TensorDataset( torch.from_numpy(X_dev).type(torch.LongTensor), torch.from_numpy(X_label_dev).type(torch.LongTensor), torch.from_numpy(Y_dev).type(torch.LongTensor)) train_loader = data_utils.DataLoader(train_data, batch_size=64, drop_last=True) test_loader = data_utils.DataLoader(dev_data, batch_size=64, drop_last=True) word_embed = word_embed.idx_to_vec.asnumpy() word_embed = torch.from_numpy(word_embed).float() token_label_embed = torch.from_numpy(token_label_embed).float() sentence_label_embed = torch.from_numpy(sentence_label_embed).float() lwan_model = lwan.LabelWordAttention( drop_out=0.1, batch_size=64, emb_dim=300, trans_hdim=args.hidden_size, d_a=200, token_tag_map=token_tag_map, sentence_tag_map=sentence_tag_map, token_label_embeddings=token_label_embed, embeddings=word_embed) san_model = lwan.LabelSentenceAttention( lstm_hdim=args.hidden_size, emb_dim=300, drop_out=0.1, d_a=200, sentence_tag_map=sentence_tag_map, sentence_label_embeddings=sentence_label_embed) criterion = torch.nn.BCELoss() combined_params = list(lwan_model.parameters()) + list( san_model.parameters()) opt = torch.optim.Adam(combined_params, lr=0.001) train(lwan_model, transfomer_model, abs_transformer_model, san_model, train_loader, test_loader, criterion, opt, outputdir, chkpointdir, vocab_map, token_tag_map, sentence_tag_map, sent_pool='mean', epochs=10, abs_encoder=args.abs_encoder, GPU=True)
def initialize_training(text_column_index, label_column_index, delimiter=';', model_type=None, model=None, max_epochs=10, patience=3, use_amp=0, calc_class_weights=0): """ Create a text classification model using FLAIR, SentenceTransformers and Huggingface Transformers. Params: data_folder_path: Folder path with each file titled appropriately i.e. train.csv test.csv dev.csv. Will create a 80/10/10 split if only train is supplied. output_folder_path: Folder path for storing the best model & checkpoints. text_column_index: In which index (starting from 0) the input column is located. label_column_index: In which index (starting from 0) the label column is located. delimiter: type of delimiter used in the .csv file. model_type: SentenceTransformerDocumentEmbeddings or TransformerDocumentEmbeddings model: Which model to use. max_epochs: Number of epochs to train the model for. patience: Number of epochs without improvement before adjusting learning rate. use_amp: Whether to enable automatic mixed precisions (AMP). calc_class_weights: Whether to create a dictionary with class weights to deal with imbalanced datasets. Output: best-model.pt final-model.pt training.log """ # 1. Column format indicating which columns hold the text and label(s) column_name_map = { text_column_index: "text", label_column_index: "label_topic" } # 2. Load corpus containing training, test and dev data. corpus: Corpus = CSVClassificationCorpus("/root/text-classification/data/", column_name_map, skip_header=True, delimiter=delimiter) # Print statistics about the corpus. training_data_statistics = corpus.obtain_statistics() print(training_data_statistics) # 3A. Create a label dictionary. label_dict = corpus.make_label_dictionary() # 3B. Calculate class weights. if bool(calc_class_weights): weight_dict = create_weight_dict(delimiter=delimiter, label_index=label_column_index) else: weight_dict = None # 4. Initialize the sentence_transformers model. if model_type == "SentenceTransformerDocumentEmbeddings": document_embeddings = SentenceTransformerDocumentEmbeddings(model) elif model_type == "TransformerDocumentEmbeddings": document_embeddings = TransformerDocumentEmbeddings(model, fine_tune=True) elif model_type == "WordEmbeddings": word_embeddings = [WordEmbeddings(model)] document_embeddings = DocumentRNNEmbeddings(word_embeddings, hidden_size=256) elif model_type == "StackedEmbeddings": document_embeddings = DocumentRNNEmbeddings([ WordEmbeddings('glove'), FlairEmbeddings(model + '-backward'), FlairEmbeddings(model + '-forward') ]) else: raise Exception( "Pick SentenceTransformerDocumentEmbeddings, StackedEmbeddings, WordEmbeddings or TransformerDocumentEmbeddings." ) # 5. create the text classifier classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, loss_weights=weight_dict) # 6. initialize the text classifier trainer with Adam optimizer trainer = ModelTrainer(classifier, corpus, optimizer=Adam, use_tensorboard=False) # 7. start the training trainer.train("/root/text-classification/checkpoint/", learning_rate=3e-5, max_epochs=max_epochs, patience=patience, use_amp=bool(use_amp), checkpoint=True, mini_batch_size=16, mini_batch_chunk_size=4)
from flair.data import Sentence
from flair.embeddings import TransformerDocumentEmbeddings
import pandas as pd
import numpy as np
import sys

text_col = "review_text"
id_col = "row_id"

file_name = sys.argv[1]
batch_size = int(sys.argv[2])

df = pd.read_csv(file_name)
embedding = TransformerDocumentEmbeddings('bert-base-uncased')

outs = list()
df['batch'] = np.arange(len(df)) // batch_size
for b in df['batch'].unique():
    print(b)
    current_batch = df[df['batch'] == b]
    out = current_batch[text_col].apply(lambda k: pd.Series(
        embedding.embed(Sentence(k))[0].embedding.tolist()))
    out = pd.concat([current_batch[id_col], out], axis=1)
    outs.append(out)

outs = pd.concat(outs)
outs.columns = [id_col] + ['emb_' + str(c) for c in outs.columns[1:]]
outs.to_csv("embeddings.csv")
def load_bert_embeddings(ename):
    # See BERT paper, section 5.3 and table 7 for layers
    return TransformerDocumentEmbeddings(ename, layers='-1,-2,-3,-4')
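# Hedged usage sketch for load_bert_embeddings above; assumes `from flair.data import
# Sentence` is in scope. With flair's default layer_mean=False the four selected layers
# are concatenated, giving 4 * hidden_size dimensions (3072 for a BERT-base model).
embedder = load_bert_embeddings('bert-base-uncased')
sent = Sentence("An example sentence.")
embedder.embed(sent)
print(len(sent.get_embedding()))  # 3072 = 4 * 768 when layers are concatenated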
def generate_embeddings(docs, batch_size, model_name='bert-base-cased', pooling='mean', offset=0): """ Generator function for generating embeddings from strings using a flair model. Takes a list of sentences and returns a list tuple. The first element represents failure (0) or success (1 or 2) and the second element contains a list of embeddings as numpy arrays if successful, and the indices of the failed batch if unsuccessful. The first element is 1, if batch_size embeddings were created :param docs: a list of strings for which embeddings should be created :param batch_size: integer representing how many embeddings should be created at once :param model_name: the model for creating the embeddings. Defaults to document embeddings using BERT-Base :param pooling: the pooling strategy to generate Document Embeddings :param offset: the offset of the integers, for printing out the correct index :return: a tuple (success/failure, embeddings/failed_indices) """ rest = len(docs) % batch_size model = False if pooling == 'mean': embedding = TransformerWordEmbeddings(model_name, layers='-1', allow_long_sentences=True) model = DocumentPoolEmbeddings([embedding], fine_tune_mode='none') elif pooling == 'CLS': model = TransformerDocumentEmbeddings(model_name) if model: for i in range(0, len(docs) - rest, batch_size): sentences = [ Sentence(sentence) for sentence in docs[i:i + batch_size] ] try: model.embed(sentences) print( f'successfully embedded sentences {offset + i} to {offset + i + batch_size-1}' ) yield 1, [ sentence.get_embedding().detach().cpu().numpy() for sentence in sentences ] except RuntimeError: print( f'could not embed sentences with index {offset + i} ' f'to {offset + i + batch_size-1}\nstoring in failed index list' ) yield 0, (offset + i, offset + i + batch_size - 1) if rest: sentences = [Sentence(sentence) for sentence in docs[-rest:]] try: model.embed(sentences) print( f'successfully embedded sentences from {len(docs) + offset - rest} to the end' ) yield 1, [ sentence.get_embedding().detach().cpu().numpy() for sentence in sentences ] except RuntimeError: yield 0, (len(docs) - rest, 0) elif pooling == 'SentenceBert': model = SentenceTransformer(model_name) for i in range(0, len(docs) - rest, batch_size): try: embeddings = model.encode(docs[i:i + batch_size]) print( f'successfully embedded sentences {offset + i} to {offset + i + batch_size-1}' ) yield 1, embeddings except RuntimeError: print( f'could not embed sentences with index {offset + i} ' f'to {offset + i + batch_size-1}\nstoring in failed index list' ) yield 0, (offset + i, offset + i + batch_size - 1) if rest: try: embeddings = model.encode(docs[-rest:]) print( f'successfully embedded sentences from {len(docs) + offset - rest} to the end' ) yield 1, embeddings except RuntimeError: yield 0, (len(docs) - rest, 0) else: raise Exception("No Valid model")
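# Hedged sketch of consuming the generate_embeddings generator above: collect the
# successful batches and remember failed index ranges for a retry pass. The input
# list and variable names are illustrative.
docs = ["first example text", "second example text", "third example text"]
all_embeddings, failed_ranges = [], []
for status, payload in generate_embeddings(docs, batch_size=2, pooling='CLS'):
    if status:
        all_embeddings.extend(payload)   # list of numpy arrays
    else:
        failed_ranges.append(payload)    # (start_index, end_index) of the failed batch
print(f'{len(all_embeddings)} embeddings, {len(failed_ranges)} failed batches')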
import os

import joblib
from flair.data import Sentence
from flair.embeddings import TransformerDocumentEmbeddings
from mair.data_loading import load_legal_documents
from tqdm import tqdm

OUT_DIR = "data/processed"
OUT_FILE = "docs-bert-embeddings.joblib"
out_path = os.path.join(OUT_DIR, OUT_FILE)

embedder = TransformerDocumentEmbeddings("roberta-base")
os.makedirs(OUT_DIR, exist_ok=True)


def get_bert_embedding(text):
    sent = Sentence(text)
    sent = embedder.embed(sent)[0]
    return sent.embedding


data = load_legal_documents()
embeddings = dict()
for p, text in tqdm(data.items()):
    embeddings[p] = get_bert_embedding(text)

joblib.dump(embeddings, out_path)
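# Hedged follow-up sketch: the values dumped above are torch tensors (possibly on GPU),
# so a downstream consumer may want plain CPU numpy arrays after loading. Reuses
# out_path from the script above; purely illustrative.
loaded = joblib.load(out_path)
as_numpy = {p: emb.detach().cpu().numpy() for p, emb in loaded.items()}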
def train_classifier(pre_trained_model,
                     layer,
                     lr,
                     batch_size,
                     pooling_sub_token,
                     epochs,
                     hidden_size,
                     word_level=False,
                     task='text_classification'):
    # corpus = NLPTaskDataFetcher.load_classification_corpus(data_folder='label_embs_2/', test_file='test.csv', train_file='train.csv', dev_file='dev.csv')
    if not word_level:
        document_embeddings = TransformerDocumentEmbeddings(pre_trained_model,
                                                            fine_tune=True)
    else:
        token_embeddings = TransformerWordEmbeddings(
            pre_trained_model,
            layers=layer,
            pooling_operation=pooling_sub_token,
            fine_tune=True)

    # text classification
    if task == 'text_classification':
        corpus: Corpus = ClassificationCorpus(data_folder=dataset_folder,
                                              test_file='test.txt',
                                              dev_file='dev.txt',
                                              train_file='train.txt')
        label_dict = corpus.make_label_dictionary()
        # text classification requires the document-level embeddings defined above
        classifier = TextClassifier(document_embeddings=document_embeddings,
                                    label_dictionary=label_dict,
                                    multi_label=False)
        trainer = ModelTrainer(model=classifier, corpus=corpus, optimizer=SGD)

    # sequence labelling
    elif task == 'sequence_labelling':
        columns = {0: 'text', 1: 'tag'}
        corpus: Corpus = ColumnCorpus(dataset_folder,
                                      columns,
                                      train_file='train.txt',
                                      test_file='test.txt',
                                      dev_file='dev.txt')
        token_tag_dictionary = corpus.make_tag_dictionary(tag_type=columns[1])
        embedding_types = [
            TransformerWordEmbeddings(pre_trained_model,
                                      layers=layer,
                                      pooling_operation=pooling_sub_token,
                                      fine_tune=True)
        ]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)
        classifier: SequenceTagger = SequenceTagger(
            hidden_size=hidden_size,
            embeddings=embeddings,
            tag_dictionary=token_tag_dictionary,
            tag_type=columns[1],
            use_crf=True)
        trainer: ModelTrainer = ModelTrainer(model=classifier,
                                             corpus=corpus,
                                             optimizer=SGD)

    trainer.train(dest_folder + '/{}-output'.format(task),
                  learning_rate=lr,
                  mini_batch_size=batch_size,
                  max_epochs=epochs)
def __init__(self, word_embedding_base: str = None, document_embedding: str = None, fine_tune: bool = False, pretuned: bool = False): """ :param word_embedding_base: - glove: 'glove', (only en), - fasttext: 'en', 'de' :param document_embedding: pool vs rnn for w2v mode - bert: 'bert', 'bert-de' - 'longformer' (only en) - 'flair', 'stacked-flair', 'flair-de', 'stacked-flair-de' """ # document embedding self.fine_tune = fine_tune self.document_embedding = None if word_embedding_base: self.word_embedding_base = WordEmbeddings(word_embedding_base) if document_embedding.lower() == 'pool': self.document_embedding = DocumentPoolEmbeddings( [self.word_embedding_base]) elif document_embedding.lower() == 'rnn': self.document_embedding = DocumentRNNEmbeddings( [self.word_embedding_base]) else: raise UserWarning( f'{document_embedding} is not supported for combination with word embeedings' ) elif document_embedding: print(document_embedding, pretuned) if pretuned: if document_embedding.lower( ) == 'bert' or document_embedding.lower() == 'bert-de': self.document_embedding = SentenceTransformer( 'stsb-bert-large') # self.document_embedding = SentenceTransformerDocumentEmbeddings('stsb-bert-large') elif document_embedding.lower() == 'roberta': self.document_embedding = SentenceTransformer( 'stsb-roberta-large') # self.document_embedding = SentenceTransformerDocumentEmbeddings('stsb-roberta-large') elif document_embedding.lower() == 'xlm': self.document_embedding = SentenceTransformer( 'stsb-xlm-r-multilingual') # self.document_embedding = SentenceTransformerDocumentEmbeddings('stsb-xlm-r-multilingual') else: if document_embedding.lower() == 'bert': self.document_embedding = TransformerDocumentEmbeddings( 'bert-base-cased', fine_tune=fine_tune) elif document_embedding.lower() == 'bert-de': self.document_embedding = TransformerDocumentEmbeddings( 'bert-base-german-cased', fine_tune=fine_tune) elif document_embedding.lower() == 'longformer': self.document_embedding = TransformerDocumentEmbeddings( 'allenai/longformer-base-4096', fine_tune=fine_tune) elif document_embedding.lower() == 'xlnet': self.document_embedding = TransformerDocumentEmbeddings( 'xlnet-base-cased', fine_tune=fine_tune) elif document_embedding.lower() == 'xlnet-de': self.document_embedding = TransformerDocumentEmbeddings( 'xlm-mlm-ende-1024', fine_tune=fine_tune) elif document_embedding.lower() == 'flair': self.document_embedding = FlairEmbeddings( 'en-forward', fine_tune=fine_tune) elif document_embedding.lower() == 'flair-de': self.document_embedding = FlairEmbeddings( 'de-forward', fine_tune=fine_tune) elif document_embedding.lower() == 'stack-flair': self.document_embedding = StackedEmbeddings([ FlairEmbeddings('en-forward'), FlairEmbeddings('en-backward'), ]) elif document_embedding.lower() == 'stack-flair-de': self.document_embedding = StackedEmbeddings([ FlairEmbeddings('de-forward'), FlairEmbeddings('de-backward'), ]) else: raise UserWarning(f'No embeddings defined')
def from_transformers(cls) -> 'SimpleFeaturizer': return cls(TransformerDocumentEmbeddings())
# 1. get the corpus column_name_map = {0: config["label_name"], 1: "text"} corpus: Corpus = CSVClassificationCorpus( config["data_folder"], column_name_map, skip_header=True, delimiter='\t', # tab-separated files ) print(corpus) # 2. create the label dictionary label_dict = corpus.make_label_dictionary() class_weights = utils.get_inverted_class_balance(corpus.train.dataset) # 3. initialize transformer document embeddings (many models are available) document_embeddings = TransformerDocumentEmbeddings( 'allenai/scibert_scivocab_uncased', fine_tune=True) # 4. create the text classifier classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, loss_weights=class_weights) # 5. initialize the text classifier trainer with Adam optimizer trainer = ModelTrainer(classifier, corpus, optimizer=Adam) # 6. start the training trainer.train( sys.argv[2], learning_rate=3e-5, # use very small learning rate mini_batch_size=16, mini_batch_chunk_size=
if not fine_tune: # define the RNN model document_embeddings = DocumentRNNEmbeddings( word_embeddings, hidden_size=512, reproject_words=True, reproject_words_dimension=256, rnn_type="LSTM", bidirectional=True, rnn_layers=2, ) # Case 2: fine-tune transformer model and use CLS output else: transformer_model = "roberta-large" document_embeddings = TransformerDocumentEmbeddings( model=transformer_model, fine_tune=True) # define the neural classifier classifier = TextClassifier( document_embeddings, label_dictionary=corpus.make_label_dictionary(), multi_label=False) # define the training regime for model+RNN if not fine_tune: # train model trainer = ModelTrainer(classifier, corpus) trainer.train( base_path="{}".format(path2[i]), max_epochs=epochs,
class CompanyMatcher: companies: Dict[str, Company] = {} embedding: Union[TransformerDocumentEmbeddings, SentenceTransformerDocumentEmbeddings] description_embeddings: Dict[str, torch.tensor] = {} embeddings_pkl_file_path: Path = Path("description-embeddings.pkl") geolocator: Nominatim = Nominatim(user_agent="company-matcher") headquarters_locations: Dict[str, Tuple[float, float]] = {} locations_pkl_file_path: Path = Path("headquarters-locations.pkl") _similarity_component_weights: Dict[str, float] = { "description": 0.6, "founded": 0.2, "headquarters": 0.2 } _founding_year_normalizer: int = None _hq_location_normalizer: int = 20000 # half of earth diameter # largest distance between two headquarters takes two long to compute # (at least with the inefficient method implemented in _calc_hq_location_normalizer def __init__(self, company_file_path: Path, transformer_model: str, sentence_transformer: bool = False, similarity_component_weights: Dict[str, float] = None, load_n: int = None): self._load_transformer_model(transformer_model, sentence_transformer) self._load_companies(company_file_path, load_n) self._embed_descriptions() self._locate_headquarters() self._load_similarity_component_weights(similarity_component_weights) self._calc_founding_year_normalizer() def _load_similarity_component_weights( self, similarity_component_weights: Dict[str, float]): if similarity_component_weights: if {"description", "founded", "headquarters" } == set(self._similarity_component_weights.keys()): self._similarity_component_weights = similarity_component_weights else: logger.warning( f"Invalid similarity component weights gives: {similarity_component_weights}" ) logger.warning("Using default values!") def _load_transformer_model(self, transformer_model: str, sentence_transformer: bool): logger.info("Loading transformer model...") if sentence_transformer: try: self.embedding = SentenceTransformerDocumentEmbeddings( transformer_model, ) except OSError as e: logger.error("Could not load transformer model: " + str(e)) exit() else: try: self.embedding = TransformerDocumentEmbeddings( transformer_model, fine_tune=False) except OSError as e: logger.error("Could not load sentence transformer model: " + str(e)) exit() logger.info("Done loading transformer model!") def _load_companies(self, company_file_path: Path, load_n: int): logger.info("Loading company data from file...") try: json_data = load_json(company_file_path) except OSError as e: logger.error("Could not company data file: " + str(e)) exit() if 0 < load_n <= len(json_data): json_data = json_data[:load_n] try: companies_list = [Company(**entry) for entry in json_data] except ValidationError as e: logger.error("Company data does not follow valid format: " + str(e)) exit() try: companies_url_list = [c.url for c in companies_list] assert len(companies_url_list) == len(set(companies_url_list)) except AssertionError: logger.warning("Company URLs are not unique!") # check which which companies are duplicates duplicate_company_urls = [] for company in companies_list: if self.companies.get(company.url, None): duplicate_company_urls.append(company.url) else: self.companies[company.url] = company logger.warning( f"Following company URLs have multiple entries: {duplicate_company_urls}" ) logger.warning("Duplicate entries will be ignored!") logger.info("Done loading company data!") def _embed_descriptions(self, chunk_size: int = 30, load_from_pickle: bool = True, save_to_pickle: bool = True): if load_from_pickle: self.description_embeddings = load_pickle( 
self.embeddings_pkl_file_path, error_msg="Could not load stored embeddings!") descriptions_ = [(company.url, Sentence(company.description)) for company in self.companies.values() if company.url not in self.description_embeddings] # chunking for progress bar if descriptions_: logger.info("Computing description embeddings...") with tqdm(total=len(descriptions_)) as pbar: for start_idx in range(0, len(descriptions_), chunk_size): end_idx = start_idx + chunk_size if not end_idx < len(descriptions_): end_idx = len(descriptions_) chunk_size = end_idx - start_idx descriptions_chunk = descriptions_[start_idx:end_idx] self.embedding.embed([ description_[1] for description_ in descriptions_chunk ]) self.description_embeddings.update({ description_[0]: description_[1].embedding for description_ in descriptions_chunk }) # remove embedding from sentence objects for _, description_sentence in descriptions_chunk: description_sentence.clear_embeddings() if save_to_pickle: save_pickle( object_=self.description_embeddings, pkl_file_path=self.embeddings_pkl_file_path, error_msg="Could not save new embeddings!") pbar.update(chunk_size) # DEBUGGING # # snapshot = tracemalloc.take_snapshot() # display_top_malloc_lines(snapshot) # --------- # logger.info("Done computing description embeddings!") def _locate_headquarters(self, load_from_pickle: bool = True, save_to_pickle: bool = True): if load_from_pickle: self.headquarters_locations = load_pickle( self.locations_pkl_file_path, error_msg="Could not load stored locations!") not_located_companies = [ company_url for company_url in self.companies if company_url not in self.headquarters_locations ] # not_located_companies = [] # DEBUGGING if not_located_companies: logger.info("Geo-locating company headquarters...") for company_url in not_located_companies: company = self.companies[company_url] if company.headquarters and not self.headquarters_locations.get( company_url, None): location = None try: location = self.geolocator.geocode( company.headquarters) except (MaxRetryError, GeocoderUnavailable): pass if location: self.headquarters_locations[company_url] = ( location.latitude, location.longitude) if save_to_pickle: save_pickle( object_=self.headquarters_locations, pkl_file_path=self.locations_pkl_file_path, error_msg="Could not save locations!") logger.info("Done locating company headquarters!") def _calc_founding_year_normalizer(self): company_founding_years = [ company.founded for company in self.companies.values() if company.founded ] min_founding_year = min(company_founding_years) max_founding_year = max(company_founding_years) self._founding_year_normalizer = max_founding_year - min_founding_year # function is too inefficient for datasets with more than a few hundred entries # def _calc_hq_location_normalizer(self): # location_pairs = combinations(self.headquarters_locations.values(), 2) # location_distances = [geodesic(location_pair[0], location_pair[1]).kilometers # for location_pair in location_pairs] # self._hq_location_normalizer = max(location_distances) # -----------------------------# # --- Similarity functions --- # def _description_similarities(self, query_company: Company): query_embedding = self.description_embeddings[query_company.url] return { candidate_url: float(cos_similarity(query_embedding, candidate_embedding)) for candidate_url, candidate_embedding in self.description_embeddings.items() if candidate_url != query_company.url } def _founded_similarities(self, query_company: Company): return { company.url: 
self._calc_founded_similarity(query_company, company) for company in self.companies.values() if company.url != query_company.url } def _calc_founded_similarity( self, query_company: Company, candidate_company: Company) -> Union[float, None]: if query_company.founded and candidate_company.founded: return 1 - abs(query_company.founded - candidate_company.founded ) / float(self._founding_year_normalizer) else: return None def _headquarters_similarities(self, query_company: Company): return { company.url: self._calc_headquarters_similarity(query_company, company) for company in self.companies.values() if company.url != query_company.url } def _calc_headquarters_similarity( self, query_company: Company, candidate_company: Company) -> Union[float, None]: query_location = self.headquarters_locations.get( query_company.url, None) candidate_location = self.headquarters_locations.get( candidate_company.url, None) if query_location and candidate_location: return 1 - geodesic(query_location, candidate_location).kilometers / float( self._hq_location_normalizer) else: return None def get_peers(self, query_url: str, top_k: int = 10) -> Union[List[str], None]: if query_url not in self.companies: logger.warning(f"Company with URL '{query_url}' does not exist!") return None query_company = self.companies[query_url] description_similarities = self._description_similarities( query_company) founded_similarities = self._founded_similarities(query_company) headquarters_similarities = self._headquarters_similarities( query_company) similarities = [] for company_url in description_similarities: if founded_similarities[company_url] and headquarters_similarities[ company_url]: similarity = ( self._similarity_component_weights['description'] * description_similarities[company_url] + self._similarity_component_weights['founded'] * founded_similarities[company_url] + self._similarity_component_weights['headquarters'] * headquarters_similarities[company_url]) elif founded_similarities[ company_url] and not headquarters_similarities[company_url]: weight_normalizer = self._similarity_component_weights[ 'description'] + self._similarity_component_weights[ 'founded'] similarity = (self._similarity_component_weights['description'] * description_similarities[company_url] + self._similarity_component_weights['founded'] * founded_similarities[company_url] ) / float(weight_normalizer) elif not founded_similarities[ company_url] and headquarters_similarities[company_url]: weight_normalizer = self._similarity_component_weights[ 'description'] + self._similarity_component_weights[ 'headquarters'] similarity = ( self._similarity_component_weights['description'] * description_similarities[company_url] + self._similarity_component_weights['headquarters'] * headquarters_similarities[company_url] ) / float(weight_normalizer) else: weight_normalizer = self._similarity_component_weights['description'] \ + self._similarity_component_weights['founded'] + self._similarity_component_weights['headquarters'] similarity = description_similarities[company_url] / float( weight_normalizer) similarities.append((company_url, similarity)) sorted_similarities = sorted(similarities, key=lambda x: x[1], reverse=True) return [x[0] for x in sorted_similarities][:top_k]
def test_transformer_document_embeddings():
    embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased')

    sentence: Sentence = Sentence("I love Berlin")
    embeddings.embed(sentence)

    assert len(sentence.get_embedding()) == 768
    sentence.clear_embeddings()
    assert len(sentence.get_embedding()) == 0

    embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased', layers='all')
    embeddings.embed(sentence)

    assert len(sentence.get_embedding()) == 5376
    sentence.clear_embeddings()
    assert len(sentence.get_embedding()) == 0

    embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased',
                                               layers='all',
                                               layer_mean=True)
    embeddings.embed(sentence)

    assert len(sentence.get_embedding()) == 768
    sentence.clear_embeddings()

    del embeddings
corpus.filter_empty_sentences()
print(corpus)

label_dictionary = corpus.make_label_dictionary()
print(label_dictionary)

flat_labels = [item for sublist in labels for item in sublist]
# newer scikit-learn versions require keyword arguments here
class_weights = compute_class_weight('balanced',
                                     classes=np.unique(flat_labels),
                                     y=flat_labels)
unique_labels = np.unique(flat_labels)
weights = {}
for i in range(len(unique_labels)):
    weights[unique_labels[i]] = class_weights[i]

document_embeddings = TransformerDocumentEmbeddings(params['version_model'],
                                                    fine_tune=True)
classifier = TextClassifier(document_embeddings,
                            label_dictionary=label_dictionary,
                            loss_weights=weights)

trainer = ModelTrainer(classifier, corpus)
trainer.train(params['model_dir'],
              learning_rate=params['learning_rate'],
              mini_batch_size=params['batch_size'],
              anneal_factor=params['anneal_factor'],
              patience=params['patience'],
              max_epochs=params['epochs'],
              embeddings_storage_mode=params['embeddings_storage_mode'])