Example #1
 def embedize(self, data_subset_list):
     tweet = Sentence(data_subset_list)
     embedding = TransformerDocumentEmbeddings(self.embedding)
     embedding.embed(tweet)
     tweet_emb = tweet.get_embedding()
     tweet_emb_np = tweet_emb.detach().numpy()
     return (tweet_emb_np)
Example #2
class Embedding:
    """
    Performs embedding on sentences.
    """

    def __init__(self, model='gpt2-medium'):
        """
        Initializes the embedding model.

        :param {str} model - The model architecture. Must be one of
        https://huggingface.co/transformers/pretrained_models.html
        """
        self.model = TransformerDocumentEmbeddings(model, batch_size=8)

    def embed(self, sentence: str) -> list:
        """
        Embeds a given sentence. If it fails, returns None.

        :param {str} sentence - A cased or uncased sentence.
        """
        if isinstance(sentence, bytes):
            sentence = sentence.decode('ascii')

        if isinstance(sentence, list):
            sentence = ' '.join(sentence)

        if sentence == '':
            return None

        try:
            sent = Sentence(sentence)
            self.model.embed(sent)
            return sent.embedding.detach().cpu().numpy()
        except TypeError:
            return None
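
A minimal usage sketch for the Embedding wrapper above (hedged: it assumes the class is defined in scope alongside the flair imports it relies on, and 'distilbert-base-uncased' is only a placeholder model name):

from flair.embeddings import TransformerDocumentEmbeddings  # dependency of the class above
from flair.data import Sentence  # dependency of the class above

embedder = Embedding(model='distilbert-base-uncased')
vector = embedder.embed("Flair document embeddings in one call.")
if vector is not None:
    print(vector.shape)  # e.g. (768,) for distilbert-base-uncased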
Example #3
    def __init__(self, model='gpt2-medium'):
        """
        Initializes the embedding model.

        :param {str} model - The model architecture. Must be one of
        https://huggingface.co/transformers/pretrained_models.html
        """
        self.model = TransformerDocumentEmbeddings(model, batch_size=8)
Example #4
def create_embeddings_flair(data: pd.DataFrame,
                            column: str = "text",
                            path: str = None,
                            embeddings_type: str = "transformer",
                            typs: str = "train"):
    assert column in data.columns.tolist(
    ), "[embeddings.py] -> [create_embedding_flair] -> Input column not in dataframe columns"
    assert embeddings_type in ["transformer", "stacked"]

    from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings, DocumentPoolEmbeddings, TransformerDocumentEmbeddings
    from flair.data import Sentence

    fast_text_embedding = WordEmbeddings('de')
    flair_embedding_forward = FlairEmbeddings('de-forward')
    flair_embedding_backward = FlairEmbeddings('de-backward')

    stacked_embeddings = DocumentPoolEmbeddings([
        fast_text_embedding, flair_embedding_forward, flair_embedding_backward
    ])

    transformer_embedding = TransformerDocumentEmbeddings(
        'bert-base-german-cased', fine_tune=False)

    tic = time.time()

    embeddings = []

    for i, text in enumerate(data[column].values):
        print("sentence {}/{}".format(i, len(data)))
        sentence = Sentence(text)

        if embeddings_type == "stacked":
            stacked_embeddings.embed(sentence)
        elif embeddings_type == "transformer":
            transformer_embedding.embed(sentence)

        embedding = sentence.embedding.detach().cpu().numpy()
        embeddings.append(embedding)

    embeddings = np.array(embeddings)

    columns = [
        "embedding_{}".format(feature)
        for feature in range(embeddings.shape[1])
    ]

    csv = pd.DataFrame(embeddings, columns=columns)
    csv.to_csv(path + embeddings_type + "_" + typs + ".csv", index=False)

    toc = time.time()

    print(
        "[create_embeddings_flair] -> [embeddings_type: {}, typs: {}] -> time {}'s"
        .format(embeddings_type, typs, toc - tic))
Example #5
    def vectorize(self, X):
        # init embedding model
        print(f"Load {self.model_name} model ...")
        model = TransformerDocumentEmbeddings(self.model_name, fine_tune=False)

        # convert to Sentence objects
        print("Convert to Sentence objects ...")
        X = X.str.lower()
        sentences = X.progress_apply(lambda x: Sentence(x))

        # get vectors from BERT
        print(f"Get {self.model_name} embeddings ...")
        sentences.progress_apply(lambda x: model.embed(x))  # embeds each Sentence in place
        docvecs = sentences.progress_apply(lambda x: x.embedding.cpu().numpy())
        return list(docvecs)
Example #6
def test_text_classifier_transformer_finetune(results_base_path,
                                              tasks_base_path):
    flair.set_seed(123)

    corpus = ClassificationCorpus(
        tasks_base_path / "trivial" / "trivial_text_classification_single",
        label_type="city",
    )
    label_dict = corpus.make_label_dictionary(label_type="city")

    model: TextClassifier = TextClassifier(
        document_embeddings=TransformerDocumentEmbeddings(
            "distilbert-base-uncased"),
        label_dictionary=label_dict,
        label_type="city",
        multi_label=False,
    )

    trainer = ModelTrainer(model, corpus)
    trainer.fine_tune(
        results_base_path,
        mini_batch_size=2,
        max_epochs=10,
        shuffle=True,
        learning_rate=0.5e-5,
        num_workers=2,
    )

    # check if model can predict
    sentence = Sentence("this is Berlin")
    sentence_empty = Sentence("       ")

    model.predict(sentence)
    model.predict([sentence, sentence_empty])
    model.predict([sentence_empty])

    # load model
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    # check if model predicts the correct label
    sentence = Sentence("this is Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict([sentence, sentence_empty])

    values = []
    for label in sentence.labels:
        assert label.value is not None
        assert 0.0 <= label.score <= 1.0
        assert type(label.score) is float
        values.append(label.value)

    assert "Berlin" in values

    # check if loaded model successfully fit the training data
    result: Result = loaded_model.evaluate(corpus.test, gold_label_type="city")
    assert result.classification_report["micro avg"]["f1-score"] == 1.0

    del loaded_model
Example #7
    def __init__(
            self,
            task_name: str,
            label_dictionary: Dictionary,
            label_type: str,
            embeddings: str = 'bert-base-uncased',
            num_negative_labels_to_sample: int = 2,
            prefix: bool = True,
            **tagger_args,
    ):
        """
        Initializes a TARSClassifier.
        :param task_name: a string naming the task
        :param label_dictionary: dictionary of labels you want to predict
        :param embeddings: name of the pre-trained transformer model, e.g.
        'bert-base-uncased'
        :param num_negative_labels_to_sample: number of negative labels to sample for each
        positive label against a sentence during training. Defaults to 2 negative
        labels per positive label. If None is passed, the model samples all negative
        labels, which slows down training considerably.
        :param multi_label: auto-detected by default, but you can set this to True
        to force multi-label prediction or to False to force single-label prediction
        :param multi_label_threshold: if multi-label, the threshold used to make predictions
        :param beta: parameter of the F-beta score used for evaluation and training annealing
        """
        super(TARSClassifier, self).__init__()

        from flair.embeddings import TransformerDocumentEmbeddings

        if not isinstance(embeddings, TransformerDocumentEmbeddings):
            embeddings = TransformerDocumentEmbeddings(model=embeddings,
                                                       fine_tune=True,
                                                       layers='-1',
                                                       layer_mean=False,
                                                       )

        # prepare TARS dictionary
        tars_dictionary = Dictionary(add_unk=False)
        tars_dictionary.add_item('False')
        tars_dictionary.add_item('True')

        # initialize a bare-bones sequence tagger
        self.tars_model = TextClassifier(document_embeddings=embeddings,
                                         label_dictionary=tars_dictionary,
                                         label_type=self.static_label_type,
                                         **tagger_args,
                                         )

        # transformer separator
        self.separator = str(self.tars_embeddings.tokenizer.sep_token)
        if self.tars_embeddings.tokenizer._bos_token:
            self.separator += str(self.tars_embeddings.tokenizer.bos_token)

        self.prefix = prefix
        self.num_negative_labels_to_sample = num_negative_labels_to_sample

        # Store task specific labels since TARS can handle multiple tasks
        self.add_and_switch_to_new_task(task_name, label_dictionary, label_type)
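
For reference, a hedged sketch of constructing the classifier defined above (the task name, labels, and label type are placeholders, and the snippet assumes TARSClassifier is importable from flair.models):

from flair.data import Dictionary
from flair.models import TARSClassifier

labels = Dictionary(add_unk=False)
for value in ["question", "statement"]:
    labels.add_item(value)

tars = TARSClassifier(
    task_name="question_vs_statement",
    label_dictionary=labels,
    label_type="class",
    embeddings="bert-base-uncased",
    num_negative_labels_to_sample=2,
)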
Example #8
 def _load_transformer_model(self, transformer_model: str,
                             sentence_transformer: bool):
     logger.info("Loading transformer model...")
     if sentence_transformer:
         try:
             self.embedding = SentenceTransformerDocumentEmbeddings(
                 transformer_model, )
         except OSError as e:
             logger.error("Could not load sentence transformer model: " + str(e))
             exit()
     else:
         try:
             self.embedding = TransformerDocumentEmbeddings(
                 transformer_model, fine_tune=False)
         except OSError as e:
             logger.error("Could not load transformer model: " + str(e))
             exit()
     logger.info("Done loading transformer model!")
    def generate_doc_embedding(document: str,
                               embeddings: list,
                               doc2vec="transformer_roberta"):
        doc_embedding: np.ndarray = np.array([])
        try:
            logger.info("Generating embedding for document .... ")
            # 1. Initialise Document Embedding

            # a) Pooling
            if doc2vec == "pool":
                document_embeddings: DocumentPoolEmbeddings = DocumentPoolEmbeddings(
                    embeddings=embeddings)
            elif doc2vec == "rnn":
                document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
                    embeddings=embeddings, hidden_size=256, rnn_type='LSTM')

            # b) Transformer
            elif doc2vec == "transformer_bert":
                document_embeddings: TransformerDocumentEmbeddings = TransformerDocumentEmbeddings(
                    'bert-base-multilingual-cased')
            else:
                document_embeddings: TransformerDocumentEmbeddings = TransformerDocumentEmbeddings(
                    'roberta-base')

            # 2. Create an example sentence
            sentence: Sentence = Sentence(document)

            # 3. Embed the sentence with our document embedding
            document_embeddings.embed(sentence)

            # 4. Save embedding into CPU
            if "cuda" in str(flair.device).lower():
                doc_emb_cpu: Tensor = sentence.embedding.cpu()
                # 5. Convert to numpy array
                doc_embedding: np.ndarray = doc_emb_cpu.detach().numpy()
            else:
                doc_embedding: np.ndarray = sentence.get_embedding().detach(
                ).numpy()
        except Exception as e:
            logger.error(e)
        return doc_embedding
Example #10
    def _set_up_model(self, params: dict):
        text_classification_params = {
            key: params[key]
            for key in params if key in TEXT_CLASSIFICATION_PARAMETERS
        }

        document_embedding = TransformerDocumentEmbeddings(
            fine_tune=self.fine_tune, **text_classification_params)

        text_classifier: TextClassifier = TextClassifier(
            label_dictionary=self.label_dictionary,
            multi_label=self.multi_label,
            label_type=self.label_type,
            document_embeddings=document_embedding,
        )

        return text_classifier
    def _set_up_model(self, params: dict, label_dictionary):
        document_embedding = params['document_embeddings'].__name__
        if document_embedding == "DocumentRNNEmbeddings":
            embedding_params = {
                key: params[key]
                for key, value in params.items()
                if key in DOCUMENT_RNN_EMBEDDING_PARAMETERS
            }
            embedding_params['embeddings'] = [
                WordEmbeddings(TokenEmbedding)
                if type(params['embeddings']) == list
                else WordEmbeddings(params['embeddings'])
                for TokenEmbedding in params['embeddings']
            ]
            document_embedding = DocumentRNNEmbeddings(**embedding_params)

        elif document_embedding == "DocumentPoolEmbeddings":
            embedding_params = {
                key: params[key]
                for key, value in params.items()
                if key in DOCUMENT_POOL_EMBEDDING_PARAMETERS
            }
            embedding_params['embeddings'] = [
                WordEmbeddings(TokenEmbedding)
                for TokenEmbedding in params['embeddings']
            ]
            document_embedding = DocumentPoolEmbeddings(**embedding_params)

        elif document_embedding == "TransformerDocumentEmbeddings":
            embedding_params = {
                key: params[key]
                for key, value in params.items()
                if key in DOCUMENT_TRANSFORMER_EMBEDDING_PARAMETERS
            }
            document_embedding = TransformerDocumentEmbeddings(
                **embedding_params)

        else:
            raise Exception("Please provide a flair document embedding class")

        text_classifier: TextClassifier = TextClassifier(
            label_dictionary=label_dictionary,
            multi_label=self.multi_label,
            document_embeddings=document_embedding,
        )
        return text_classifier
Example #12
def load_model(bert=None, document=False, flair=False):
    """Load word embeddings model."""
    if bert == 'bio':
        # https://github.com/flairNLP/flair/issues/1085
        # also see readme for instructions
        bertpath = './bert/bert-base-biobert-cased'
    elif bert == 'sci':
        # https://github.com/flairNLP/flair/issues/744
        # https://github.com/flairNLP/flair/issues/1239
        bertpath = './bert/scibert_scivocab_uncased'
    else:
        bertpath = 'bert-base-uncased'

    if document and not flair:
        bert_embedding = TransformerDocumentEmbeddings(model=bertpath,
                                                       batch_size=4)
        return bert_embedding

    bert_embedding = TransformerWordEmbeddings(model=bertpath,
                                               pooling_operation='first',
                                               batch_size=4)

    if flair:
        flair_embedding_forward = FlairEmbeddings('en-forward')
        flair_embedding_backward = FlairEmbeddings('en-backward')
        embed_arr = [
            bert_embedding,
            flair_embedding_backward,
            flair_embedding_forward,
        ]
    else:
        embed_arr = [bert_embedding]

    if document:
        document_embeddings = DocumentPoolEmbeddings(
            embed_arr, fine_tune_mode='nonlinear')
    else:
        document_embeddings = StackedEmbeddings(embed_arr)

    return document_embeddings
)

# print the number of Sentences in the train split
print(len(corpus.train))

# print the number of Sentences in the test split
print(len(corpus.test))

# print the number of Sentences in the dev split
print(len(corpus.dev))

# 2. create the label dictionary
label_dict = corpus.make_label_dictionary()

# 3. initialize transformer document embeddings (many models are available)
document_embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased',
                                                    fine_tune=True)

# 4. create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

# 5. initialize the text classifier trainer with Adam optimizer
trainer = ModelTrainer(classifier, corpus, optimizer=Adam)

# 6. start the training
trainer.train(
    './model_result',
    learning_rate=3e-5,  # use very small learning rate
    mini_batch_size=16,
    mini_batch_chunk_size=4,  # optionally set this if transformer is too much for your machine
    max_epochs=5,  # terminate after 5 epochs
Example #14
def train(
    review_category,
    params,
    update_model= False,
    learning_rate=0.01,
    embeddings_storage_mode='gpu',
    checkpoint= True,
    batch_growth_annealing= True,
    weight_decay = 1e-4,
    shuffle=True,
    train_with_dev=True,
    mini_batch_size=2,
    maxi_batch_size=128,
    anneal_factor=0.5,
    patience=2,
    max_epochs=150
    ):
    review_category = str(review_category)
    print('loading training corpus from %s'%(params.data_folder))
    corpus: Corpus = ClassificationCorpus(params.data_folder,
                train_file= review_category+'_train.txt',
                test_file= review_category+'_test.txt',
                dev_file= review_category+'_dev.txt')
    label_dict = corpus.make_label_dictionary()
    print('labels: ',label_dict)
    if eval(params.transformer):
        print('initializing transformer document embeddings using %s ...'%(params.transformer_pretrain_lm))
        # 3. initialize transformer document embeddings (many models are available)
        document_embeddings = TransformerDocumentEmbeddings(params.transformer_pretrain_lm, fine_tune=True)
    else:
        print('initializing document embeddings')
        word_embeddings= [
            WordEmbeddings('glove'),
            # comment in this line to use character embeddings
            CharacterEmbeddings(),
            # comment in these lines to use flair embeddings
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward'),
            BertEmbeddings(),
            # TransformerXLEmbeddings(),
            #RoBERTaEmbeddings(),
            #XLNetEmbeddings()
        ]
        # Can choose between many RNN types (GRU by default, to change use rnn_type parameter)
        document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(word_embeddings,
                                                    hidden_size=512,
                                                    reproject_words=True,
                                                    reproject_words_dimension=256,
                                                    )
    if not update_model:
        print('building review_analysis classifier ...')
        # create the text classifier
        classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)
        # initialize the text classifier trainer
        print("initializing review_analysis classifier's trainer")
        trainer = ModelTrainer(classifier, corpus, optimizer=Adam)
    else:
        # continue trainer at later point
        checkpoint_path = params.checkpoint_dir+'/%s/checkpoint.pt'%(review_category)
        print('loading checkpoint from %s'%(checkpoint_path))
        trainer = ModelTrainer.load_checkpoint(checkpoint_path, corpus)
    ####### training the model
    print("training the review_category: %s model ..."%(review_category))
    try:
        trainer.train(params.checkpoint_dir+'/%s'%(review_category),
        learning_rate=learning_rate,
        embeddings_storage_mode=embeddings_storage_mode,
        checkpoint= checkpoint,
        batch_growth_annealing= batch_growth_annealing,
        weight_decay = weight_decay,
        shuffle=shuffle,
        train_with_dev=train_with_dev,
        mini_batch_size=mini_batch_size,
        maxi_batch_size=maxi_batch_size,
        anneal_factor=anneal_factor,
        patience=patience,
        max_epochs=max_epochs)
    except:
        print('chunking batch ... by %d'%(params.mini_batch_chunk_size))
        trainer.train(params.checkpoint_dir+'/%s'%(review_category),
        learning_rate=learning_rate,
        embeddings_storage_mode=embeddings_storage_mode,
        checkpoint= checkpoint,
        batch_growth_annealing= batch_growth_annealing,
        weight_decay = weight_decay,
        shuffle=shuffle,
        train_with_dev=train_with_dev,
        mini_batch_size=mini_batch_size,
        maxi_batch_size=maxi_batch_size,
        anneal_factor=anneal_factor,
        patience=patience,
        max_epochs=max_epochs,
        mini_batch_chunk_size=params.mini_batch_chunk_size)
Example #15
def main(args):
    X_train = np.load("{}/X_train.npy".format(args.data))
    X_label_train = np.load("{}/X_label_train.npy".format(args.data))
    Y_train = np.load("{}/Y_train.npy".format(args.data))
    X_dev = np.load("{}/X_dev.npy".format(
        args.fixed_test if args.fixed_test else args.data))
    X_label_dev = np.load("{}/X_label_dev.npy".format(
        args.fixed_test if args.fixed_test else args.data))
    Y_dev = np.load("{}/Y_dev.npy".format(
        args.fixed_test if args.fixed_test else args.data))
    token_label_embed = np.load("{}/token_labels_embed.npy".format(
        args.vocabularly))
    sentence_label_embed = np.load("{}/sentence_labels_embed.npy".format(
        args.vocabularly))
    word_embed = text.embedding.CustomEmbedding('{}/word_embed.txt'.format(
        args.vocabularly))
    token_tag_map = json.load(
        open('{}/token_labels.json'.format(args.vocabularly), 'r'))
    vocab_map = json.load(open('{}/vocab.json'.format(args.vocabularly), 'r'))
    current_date_time = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
    outputdir = ut.create_directories_per_series_des(
        args.data + '/LCAM_BioBERT{}{}_{}'.format(
            args.model_name, args.model_name2, current_date_time))
    chkpointdir = ut.create_directories_per_series_des(args.data +
                                                       '/checkpoints')
    sentence_tags = [
        i.strip()
        for i in open('{}/sentence_labels.txt'.format(args.vocabularly),
                      'r').readlines()
    ]
    sentence_tag_map = dict([(k, v) for k, v in enumerate(sentence_tags)])

    if args.transformer:
        print('Transformer used for encoding the input sequence')
        transformer_model = TransformerWordEmbeddings(
            model=args.transformer,
            layers=args.layer,
            subtoken_pooling=args.pooling,
            fine_tune=True)
        abs_transformer_model = TransformerDocumentEmbeddings(
            model=args.transformer, layers=args.layer, fine_tune=True)

    train_data = data_utils.TensorDataset(
        torch.from_numpy(X_train).type(torch.LongTensor),
        torch.from_numpy(X_label_train).type(torch.LongTensor),
        torch.from_numpy(Y_train).type(torch.LongTensor))

    dev_data = data_utils.TensorDataset(
        torch.from_numpy(X_dev).type(torch.LongTensor),
        torch.from_numpy(X_label_dev).type(torch.LongTensor),
        torch.from_numpy(Y_dev).type(torch.LongTensor))

    train_loader = data_utils.DataLoader(train_data,
                                         batch_size=64,
                                         drop_last=True)
    test_loader = data_utils.DataLoader(dev_data,
                                        batch_size=64,
                                        drop_last=True)

    word_embed = word_embed.idx_to_vec.asnumpy()
    word_embed = torch.from_numpy(word_embed).float()
    token_label_embed = torch.from_numpy(token_label_embed).float()
    sentence_label_embed = torch.from_numpy(sentence_label_embed).float()

    lwan_model = lwan.LabelWordAttention(
        drop_out=0.1,
        batch_size=64,
        emb_dim=300,
        trans_hdim=args.hidden_size,
        d_a=200,
        token_tag_map=token_tag_map,
        sentence_tag_map=sentence_tag_map,
        token_label_embeddings=token_label_embed,
        embeddings=word_embed)

    san_model = lwan.LabelSentenceAttention(
        lstm_hdim=args.hidden_size,
        emb_dim=300,
        drop_out=0.1,
        d_a=200,
        sentence_tag_map=sentence_tag_map,
        sentence_label_embeddings=sentence_label_embed)

    criterion = torch.nn.BCELoss()
    combined_params = list(lwan_model.parameters()) + list(
        san_model.parameters())
    opt = torch.optim.Adam(combined_params, lr=0.001)
    train(lwan_model,
          transformer_model,
          abs_transformer_model,
          san_model,
          train_loader,
          test_loader,
          criterion,
          opt,
          outputdir,
          chkpointdir,
          vocab_map,
          token_tag_map,
          sentence_tag_map,
          sent_pool='mean',
          epochs=10,
          abs_encoder=args.abs_encoder,
          GPU=True)
Example #16
def initialize_training(text_column_index,
                        label_column_index,
                        delimiter=';',
                        model_type=None,
                        model=None,
                        max_epochs=10,
                        patience=3,
                        use_amp=0,
                        calc_class_weights=0):
    """
    Create a text classification model using FLAIR, SentenceTransformers and
    Huggingface Transformers.
    Params:
    data_folder_path: Folder path with each file titled appropriately i.e.
                      train.csv test.csv dev.csv.
                      Will create an 80/10/10 split if only train is supplied.
    output_folder_path: Folder path for storing the best model & checkpoints.
    text_column_index: In which index (starting from 0) the input column is located.
    label_column_index: In which index (starting from 0) the label column is located.
    delimiter: type of delimiter used in the .csv file.
    model_type: SentenceTransformerDocumentEmbeddings or TransformerDocumentEmbeddings
    model: Which model to use.
    max_epochs: Number of epochs to train the model for.
    patience: Number of epochs without improvement before adjusting learning rate.
    use_amp: Whether to enable automatic mixed precision (AMP).
    calc_class_weights: Whether to create a dictionary with class weights to deal
                        with imbalanced datasets.
    Output:
        best-model.pt
        final-model.pt
        training.log
    """

    # 1. Column format indicating which columns hold the text and label(s)
    column_name_map = {
        text_column_index: "text",
        label_column_index: "label_topic"
    }

    # 2. Load corpus containing training, test and dev data.
    corpus: Corpus = CSVClassificationCorpus("/root/text-classification/data/",
                                             column_name_map,
                                             skip_header=True,
                                             delimiter=delimiter)

    # Print statistics about the corpus.
    training_data_statistics = corpus.obtain_statistics()
    print(training_data_statistics)

    # 3A. Create a label dictionary.
    label_dict = corpus.make_label_dictionary()

    # 3B. Calculate class weights.
    if bool(calc_class_weights):
        weight_dict = create_weight_dict(delimiter=delimiter,
                                         label_index=label_column_index)
    else:
        weight_dict = None

    # 4. Initialize the sentence_transformers model.
    if model_type == "SentenceTransformerDocumentEmbeddings":
        document_embeddings = SentenceTransformerDocumentEmbeddings(model)
    elif model_type == "TransformerDocumentEmbeddings":
        document_embeddings = TransformerDocumentEmbeddings(model,
                                                            fine_tune=True)
    elif model_type == "WordEmbeddings":
        word_embeddings = [WordEmbeddings(model)]
        document_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                                    hidden_size=256)
    elif model_type == "StackedEmbeddings":
        document_embeddings = DocumentRNNEmbeddings([
            WordEmbeddings('glove'),
            FlairEmbeddings(model + '-backward'),
            FlairEmbeddings(model + '-forward')
        ])
    else:
        raise Exception(
            "Pick SentenceTransformerDocumentEmbeddings, StackedEmbeddings, WordEmbeddings or TransformerDocumentEmbeddings."
        )

    # 5. create the text classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                loss_weights=weight_dict)

    # 6. initialize the text classifier trainer with Adam optimizer
    trainer = ModelTrainer(classifier,
                           corpus,
                           optimizer=Adam,
                           use_tensorboard=False)

    # 7. start the training
    trainer.train("/root/text-classification/checkpoint/",
                  learning_rate=3e-5,
                  max_epochs=max_epochs,
                  patience=patience,
                  use_amp=bool(use_amp),
                  checkpoint=True,
                  mini_batch_size=16,
                  mini_batch_chunk_size=4)
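
A hedged example call for the function above (the column indices, delimiter, and model name are placeholders; the corpus and checkpoint paths are hardcoded inside the function body):

initialize_training(
    text_column_index=1,
    label_column_index=0,
    delimiter=';',
    model_type="TransformerDocumentEmbeddings",
    model="distilbert-base-uncased",
    max_epochs=10,
    patience=3,
    use_amp=0,
    calc_class_weights=0,
)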
from flair.data import Sentence
from flair.embeddings import TransformerDocumentEmbeddings
import pandas as pd
import numpy as np
import sys

text_col = "review_text"
id_col = "row_id"

file_name = sys.argv[1]
batch_size = int(sys.argv[2])

df = pd.read_csv(file_name)
embedding = TransformerDocumentEmbeddings('bert-base-uncased')
outs = list()

df['batch'] = np.arange(len(df)) // batch_size
for b in df['batch'].unique():
    print(b)
    current_batch = df[df['batch'] == b]
    out = current_batch[text_col].apply(lambda k: pd.Series(
        embedding.embed(Sentence(k))[0].embedding.tolist()))
    out = pd.concat([current_batch[id_col], out], axis=1)
    outs.append(out)

outs = pd.concat(outs)
outs.columns = [id_col] + ['emb_' + str(c) for c in outs.columns[1:]]
outs.to_csv("embeddings.csv")
Example #18
def load_bert_embeddings(ename):
    # See BERT paper, section 5.3 and table 7 for layers
    return TransformerDocumentEmbeddings(ename, layers='-1,-2,-3,-4')
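
A short, hedged usage sketch for the helper above ('bert-base-uncased' stands in for any Hugging Face model name):

from flair.data import Sentence

emb = load_bert_embeddings('bert-base-uncased')
sent = Sentence("Embedding built from the last four hidden layers.")
emb.embed(sent)
print(len(sent.get_embedding()))  # 4 * hidden size when the four layers are concatenated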
Example #19
def generate_embeddings(docs,
                        batch_size,
                        model_name='bert-base-cased',
                        pooling='mean',
                        offset=0):
    """
    Generator function for creating embeddings from strings with a flair model. Takes a list of strings and
    yields tuples. The first element signals failure (0) or success (1); the second element is a list of
    embeddings as numpy arrays on success, or the index range of the failed batch on failure.
    :param docs: a list of strings for which embeddings should be created
    :param batch_size: integer representing how many embeddings should be created at once
    :param model_name: the model for creating the embeddings. Defaults to document embeddings using BERT-Base
    :param pooling: the pooling strategy to generate Document Embeddings
    :param offset: the offset of the integers, for printing out the correct index
    :return: a tuple (success/failure, embeddings/failed_indices)
    """
    rest = len(docs) % batch_size
    model = False
    if pooling == 'mean':
        embedding = TransformerWordEmbeddings(model_name,
                                              layers='-1',
                                              allow_long_sentences=True)
        model = DocumentPoolEmbeddings([embedding], fine_tune_mode='none')
    elif pooling == 'CLS':
        model = TransformerDocumentEmbeddings(model_name)
    if model:
        for i in range(0, len(docs) - rest, batch_size):
            sentences = [
                Sentence(sentence) for sentence in docs[i:i + batch_size]
            ]
            try:
                model.embed(sentences)
                print(
                    f'successfully embedded sentences {offset + i} to {offset + i + batch_size-1}'
                )
                yield 1, [
                    sentence.get_embedding().detach().cpu().numpy()
                    for sentence in sentences
                ]
            except RuntimeError:
                print(
                    f'could not embed sentences with index {offset + i} '
                    f'to {offset + i + batch_size-1}\nstoring in failed index list'
                )
                yield 0, (offset + i, offset + i + batch_size - 1)
        if rest:
            sentences = [Sentence(sentence) for sentence in docs[-rest:]]
            try:
                model.embed(sentences)
                print(
                    f'successfully embedded sentences from {len(docs) + offset - rest} to the end'
                )
                yield 1, [
                    sentence.get_embedding().detach().cpu().numpy()
                    for sentence in sentences
                ]
            except RuntimeError:
                yield 0, (len(docs) - rest, 0)
    elif pooling == 'SentenceBert':
        model = SentenceTransformer(model_name)
        for i in range(0, len(docs) - rest, batch_size):
            try:
                embeddings = model.encode(docs[i:i + batch_size])
                print(
                    f'successfully embedded sentences {offset + i} to {offset + i + batch_size-1}'
                )
                yield 1, embeddings
            except RuntimeError:
                print(
                    f'could not embed sentences with index {offset + i} '
                    f'to {offset + i + batch_size-1}\nstoring in failed index list'
                )
                yield 0, (offset + i, offset + i + batch_size - 1)
        if rest:
            try:
                embeddings = model.encode(docs[-rest:])
                print(
                    f'successfully embedded sentences from {len(docs) + offset - rest} to the end'
                )
                yield 1, embeddings
            except RuntimeError:
                yield 0, (len(docs) - rest, 0)
    else:
        raise Exception("No Valid model")
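
A minimal sketch of consuming the generator above (the document strings are placeholders; the failure branch simply records the failed index range):

docs = ["first document", "second document", "third document"]
vectors, failed_ranges = [], []
for status, payload in generate_embeddings(docs, batch_size=2, pooling='mean'):
    if status:
        vectors.extend(payload)  # list of numpy arrays
    else:
        failed_ranges.append(payload)  # (start_index, end_index) of the failed batch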
Example #20
import os

import joblib
from flair.data import Sentence
from flair.embeddings import TransformerDocumentEmbeddings
from mair.data_loading import load_legal_documents
from tqdm import tqdm

OUT_DIR = "data/processed"
OUT_FILE = "docs-bert-embeddings.joblib"
out_path = os.path.join(OUT_DIR, OUT_FILE)

embedder = TransformerDocumentEmbeddings("roberta-base")
os.makedirs(OUT_DIR, exist_ok=True)


def get_bert_embedding(text):
    sent = Sentence(text)
    sent = embedder.embed(sent)[0]
    return sent.embedding


data = load_legal_documents()

embeddings = dict()
for p, text in tqdm(data.items()):
    embeddings[p] = get_bert_embedding(text)

joblib.dump(embeddings, out_path)
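
One caveat with the script above: sentence.embedding stays on the GPU when flair runs on CUDA, so the dumped file may only be reloadable on a machine with CUDA available. A hedged variant of the helper that detaches the vector and moves it to the CPU first:

def get_bert_embedding_cpu(text):
    # hypothetical variant of get_bert_embedding returning a plain numpy array
    sent = Sentence(text)
    embedder.embed(sent)
    return sent.embedding.detach().cpu().numpy()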
Example #21
def train_classifier(pre_trained_model,
                     layer,
                     lr,
                     batch_size,
                     pooling_sub_token,
                     epochs,
                     hidden_size,
                     word_level=False,
                     task='text_classification'):
    # corpus = NLPTaskDataFetcher.load_classification_corpus(data_folder='label_embs_2/', test_file='test.csv', train_file='train.csv', dev_file='dev.csv')
    if not word_level:
        document_embeddings = TransformerDocumentEmbeddings(pre_trained_model,
                                                            fine_tune=True)
    else:
        token_embeddings = TransformerWordEmbeddings(
            pre_trained_model,
            layers=layer,
            pooling_operation=pooling_sub_token,
            fine_tune=True)

    #text classification
    if task == 'text_classification':
        corpus: Corpus = ClassificationCorpus(data_folder=dataset_folder,
                                              test_file='test.txt',
                                              dev_file='dev.txt',
                                              train_file='train.txt')
        label_dict = corpus.make_label_dictionary()
        classifier = TextClassifier(document_embeddings=token_embeddings,
                                    label_dictionary=label_dict,
                                    multi_label=False)
        # trainer = ModelTrainer(model=classifier, corpus=corpus, optimizer=SGD)
    #sequence labelling
    elif task == 'sequence_labelling':
        columns = {0: 'text', 1: 'tag'}
        corpus: Corpus = ColumnCorpus(dataset_folder,
                                      columns,
                                      train_file='train.txt',
                                      test_file='test.txt',
                                      dev_file='dev.txt')
        token_tag_dictionary = corpus.make_tag_dictionary(tag_type=columns[1])
        embedding_types = [
            TransformerWordEmbeddings(pre_trained_model,
                                      layers=layer,
                                      pooling_operation=pooling_sub_token,
                                      fine_tune=True)
        ]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)
        classifier: SequenceTagger = SequenceTagger(
            hidden_size=hidden_size,
            embeddings=embeddings,
            tag_dictionary=token_tag_dictionary,
            tag_type=columns[1],
            use_crf=True)
    trainer: ModelTrainer = ModelTrainer(model=classifier,
                                         corpus=corpus,
                                         optimizer=SGD)
    trainer.train(dest_folder + '/{}-output'.format(task),
                  learning_rate=lr,
                  mini_batch_size=batch_size,
                  max_epochs=epochs)
    def __init__(self,
                 word_embedding_base: str = None,
                 document_embedding: str = None,
                 fine_tune: bool = False,
                 pretuned: bool = False):
        """

        :param word_embedding_base: - glove: 'glove', (only en), - fasttext: 'en', 'de'
        :param document_embedding: 'pool' or 'rnn' when a word_embedding_base is given; otherwise 'bert', 'bert-de',
        'longformer' (only en), 'xlnet', 'xlnet-de', 'flair', 'flair-de', 'stack-flair', 'stack-flair-de'
        """
        # document embedding
        self.fine_tune = fine_tune
        self.document_embedding = None
        if word_embedding_base:
            self.word_embedding_base = WordEmbeddings(word_embedding_base)

            if document_embedding.lower() == 'pool':
                self.document_embedding = DocumentPoolEmbeddings(
                    [self.word_embedding_base])
            elif document_embedding.lower() == 'rnn':
                self.document_embedding = DocumentRNNEmbeddings(
                    [self.word_embedding_base])
            else:
                raise UserWarning(
                    f'{document_embedding} is not supported for combination with word embeddings'
                )
        elif document_embedding:
            print(document_embedding, pretuned)
            if pretuned:
                if document_embedding.lower(
                ) == 'bert' or document_embedding.lower() == 'bert-de':
                    self.document_embedding = SentenceTransformer(
                        'stsb-bert-large')
                    # self.document_embedding = SentenceTransformerDocumentEmbeddings('stsb-bert-large')
                elif document_embedding.lower() == 'roberta':
                    self.document_embedding = SentenceTransformer(
                        'stsb-roberta-large')
                    # self.document_embedding = SentenceTransformerDocumentEmbeddings('stsb-roberta-large')
                elif document_embedding.lower() == 'xlm':
                    self.document_embedding = SentenceTransformer(
                        'stsb-xlm-r-multilingual')
                    # self.document_embedding = SentenceTransformerDocumentEmbeddings('stsb-xlm-r-multilingual')
            else:
                if document_embedding.lower() == 'bert':
                    self.document_embedding = TransformerDocumentEmbeddings(
                        'bert-base-cased', fine_tune=fine_tune)
                elif document_embedding.lower() == 'bert-de':
                    self.document_embedding = TransformerDocumentEmbeddings(
                        'bert-base-german-cased', fine_tune=fine_tune)
                elif document_embedding.lower() == 'longformer':
                    self.document_embedding = TransformerDocumentEmbeddings(
                        'allenai/longformer-base-4096', fine_tune=fine_tune)
                elif document_embedding.lower() == 'xlnet':
                    self.document_embedding = TransformerDocumentEmbeddings(
                        'xlnet-base-cased', fine_tune=fine_tune)
                elif document_embedding.lower() == 'xlnet-de':
                    self.document_embedding = TransformerDocumentEmbeddings(
                        'xlm-mlm-ende-1024', fine_tune=fine_tune)
                elif document_embedding.lower() == 'flair':
                    self.document_embedding = FlairEmbeddings(
                        'en-forward', fine_tune=fine_tune)
                elif document_embedding.lower() == 'flair-de':
                    self.document_embedding = FlairEmbeddings(
                        'de-forward', fine_tune=fine_tune)
                elif document_embedding.lower() == 'stack-flair':
                    self.document_embedding = StackedEmbeddings([
                        FlairEmbeddings('en-forward'),
                        FlairEmbeddings('en-backward'),
                    ])
                elif document_embedding.lower() == 'stack-flair-de':
                    self.document_embedding = StackedEmbeddings([
                        FlairEmbeddings('de-forward'),
                        FlairEmbeddings('de-backward'),
                    ])
        else:
            raise UserWarning(f'No embeddings defined')
Example #23
 def from_transformers(cls) -> 'SimpleFeaturizer':
     return cls(TransformerDocumentEmbeddings())
Example #24
    # 1. get the corpus
    column_name_map = {0: config["label_name"], 1: "text"}
    corpus: Corpus = CSVClassificationCorpus(
        config["data_folder"],
        column_name_map,
        skip_header=True,
        delimiter='\t',  # tab-separated files
    )
    print(corpus)

    # 2. create the label dictionary
    label_dict = corpus.make_label_dictionary()
    class_weights = utils.get_inverted_class_balance(corpus.train.dataset)

    # 3. initialize transformer document embeddings (many models are available)
    document_embeddings = TransformerDocumentEmbeddings(
        'allenai/scibert_scivocab_uncased', fine_tune=True)

    # 4. create the text classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                loss_weights=class_weights)

    # 5. initialize the text classifier trainer with Adam optimizer
    trainer = ModelTrainer(classifier, corpus, optimizer=Adam)

    # 6. start the training
    trainer.train(
        sys.argv[2],
        learning_rate=3e-5,  # use very small learning rate
        mini_batch_size=16,
        mini_batch_chunk_size=
Example #25
    if not fine_tune:
        # define the RNN model
        document_embeddings = DocumentRNNEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256,
            rnn_type="LSTM",
            bidirectional=True,
            rnn_layers=2,
        )

    # Case 2: fine-tune transformer model and use CLS output
    else:
        transformer_model = "roberta-large"
        document_embeddings = TransformerDocumentEmbeddings(
            model=transformer_model, fine_tune=True)

    # define the neural classifier
    classifier = TextClassifier(
        document_embeddings,
        label_dictionary=corpus.make_label_dictionary(),
        multi_label=False)

    # define the training regime for model+RNN
    if not fine_tune:
        # train model
        trainer = ModelTrainer(classifier, corpus)

        trainer.train(
            base_path="{}".format(path2[i]),
            max_epochs=epochs,
Example #26
class CompanyMatcher:
    companies: Dict[str, Company] = {}

    embedding: Union[TransformerDocumentEmbeddings,
                     SentenceTransformerDocumentEmbeddings]
    description_embeddings: Dict[str, torch.tensor] = {}
    embeddings_pkl_file_path: Path = Path("description-embeddings.pkl")

    geolocator: Nominatim = Nominatim(user_agent="company-matcher")
    headquarters_locations: Dict[str, Tuple[float, float]] = {}
    locations_pkl_file_path: Path = Path("headquarters-locations.pkl")

    _similarity_component_weights: Dict[str, float] = {
        "description": 0.6,
        "founded": 0.2,
        "headquarters": 0.2
    }

    _founding_year_normalizer: int = None
    _hq_location_normalizer: int = 20000  # roughly half of Earth's circumference in km

    # the largest distance between two headquarters takes too long to compute
    # (at least with the inefficient method implemented in _calc_hq_location_normalizer)

    def __init__(self,
                 company_file_path: Path,
                 transformer_model: str,
                 sentence_transformer: bool = False,
                 similarity_component_weights: Dict[str, float] = None,
                 load_n: int = None):
        self._load_transformer_model(transformer_model, sentence_transformer)
        self._load_companies(company_file_path, load_n)
        self._embed_descriptions()
        self._locate_headquarters()
        self._load_similarity_component_weights(similarity_component_weights)
        self._calc_founding_year_normalizer()

    def _load_similarity_component_weights(
            self, similarity_component_weights: Dict[str, float]):
        if similarity_component_weights:
            if {"description", "founded", "headquarters"
                } == set(similarity_component_weights.keys()):
                self._similarity_component_weights = similarity_component_weights
            else:
                logger.warning(
                    f"Invalid similarity component weights given: {similarity_component_weights}"
                )
                logger.warning("Using default values!")

    def _load_transformer_model(self, transformer_model: str,
                                sentence_transformer: bool):
        logger.info("Loading transformer model...")
        if sentence_transformer:
            try:
                self.embedding = SentenceTransformerDocumentEmbeddings(
                    transformer_model, )
            except OSError as e:
                logger.error("Could not load sentence transformer model: " + str(e))
                exit()
        else:
            try:
                self.embedding = TransformerDocumentEmbeddings(
                    transformer_model, fine_tune=False)
            except OSError as e:
                logger.error("Could not load transformer model: " + str(e))
                exit()
        logger.info("Done loading transformer model!")

    def _load_companies(self, company_file_path: Path, load_n: int):
        logger.info("Loading company data from file...")
        try:
            json_data = load_json(company_file_path)
        except OSError as e:
            logger.error("Could not load company data file: " + str(e))
            exit()

        if load_n and 0 < load_n <= len(json_data):
            json_data = json_data[:load_n]

        try:
            companies_list = [Company(**entry) for entry in json_data]
        except ValidationError as e:
            logger.error("Company data does not follow valid format: " +
                         str(e))
            exit()

        try:
            companies_url_list = [c.url for c in companies_list]
            assert len(companies_url_list) == len(set(companies_url_list))
        except AssertionError:
            logger.warning("Company URLs are not unique!")

        # check which companies are duplicates
        duplicate_company_urls = []
        for company in companies_list:
            if self.companies.get(company.url, None):
                duplicate_company_urls.append(company.url)
            else:
                self.companies[company.url] = company
        logger.warning(
            f"Following company URLs have multiple entries: {duplicate_company_urls}"
        )
        logger.warning("Duplicate entries will be ignored!")

        logger.info("Done loading company data!")

    def _embed_descriptions(self,
                            chunk_size: int = 30,
                            load_from_pickle: bool = True,
                            save_to_pickle: bool = True):
        if load_from_pickle:
            self.description_embeddings = load_pickle(
                self.embeddings_pkl_file_path,
                error_msg="Could not load stored embeddings!")

        descriptions_ = [(company.url, Sentence(company.description))
                         for company in self.companies.values()
                         if company.url not in self.description_embeddings]

        # chunking for progress bar
        if descriptions_:
            logger.info("Computing description embeddings...")

            with tqdm(total=len(descriptions_)) as pbar:
                for start_idx in range(0, len(descriptions_), chunk_size):
                    end_idx = start_idx + chunk_size
                    if not end_idx < len(descriptions_):
                        end_idx = len(descriptions_)
                        chunk_size = end_idx - start_idx

                    descriptions_chunk = descriptions_[start_idx:end_idx]
                    self.embedding.embed([
                        description_[1] for description_ in descriptions_chunk
                    ])
                    self.description_embeddings.update({
                        description_[0]: description_[1].embedding
                        for description_ in descriptions_chunk
                    })
                    # remove embedding from sentence objects
                    for _, description_sentence in descriptions_chunk:
                        description_sentence.clear_embeddings()

                    if save_to_pickle:
                        save_pickle(
                            object_=self.description_embeddings,
                            pkl_file_path=self.embeddings_pkl_file_path,
                            error_msg="Could not save new embeddings!")

                    pbar.update(chunk_size)

                    # DEBUGGING #
                    # snapshot = tracemalloc.take_snapshot()
                    # display_top_malloc_lines(snapshot)
                    # --------- #

            logger.info("Done computing description embeddings!")

    def _locate_headquarters(self,
                             load_from_pickle: bool = True,
                             save_to_pickle: bool = True):
        if load_from_pickle:
            self.headquarters_locations = load_pickle(
                self.locations_pkl_file_path,
                error_msg="Could not load stored locations!")

        not_located_companies = [
            company_url for company_url in self.companies
            if company_url not in self.headquarters_locations
        ]

        # not_located_companies = []  # DEBUGGING

        if not_located_companies:
            logger.info("Geo-locating company headquarters...")

            for company_url in not_located_companies:
                company = self.companies[company_url]
                if company.headquarters and not self.headquarters_locations.get(
                        company_url, None):
                    location = None
                    try:
                        location = self.geolocator.geocode(
                            company.headquarters)
                    except (MaxRetryError, GeocoderUnavailable):
                        pass
                    if location:
                        self.headquarters_locations[company_url] = (
                            location.latitude, location.longitude)

                        if save_to_pickle:
                            save_pickle(
                                object_=self.headquarters_locations,
                                pkl_file_path=self.locations_pkl_file_path,
                                error_msg="Could not save locations!")

            logger.info("Done locating company headquarters!")

    def _calc_founding_year_normalizer(self):
        company_founding_years = [
            company.founded for company in self.companies.values()
            if company.founded
        ]
        min_founding_year = min(company_founding_years)
        max_founding_year = max(company_founding_years)
        self._founding_year_normalizer = max_founding_year - min_founding_year

    # function is too inefficient for datasets with more than a few hundred entries
    # def _calc_hq_location_normalizer(self):
    #     location_pairs = combinations(self.headquarters_locations.values(), 2)
    #     location_distances = [geodesic(location_pair[0], location_pair[1]).kilometers
    #                           for location_pair in location_pairs]
    #     self._hq_location_normalizer = max(location_distances)

    # -----------------------------#
    # --- Similarity functions --- #

    def _description_similarities(self, query_company: Company):
        query_embedding = self.description_embeddings[query_company.url]

        return {
            candidate_url:
            float(cos_similarity(query_embedding, candidate_embedding))
            for candidate_url, candidate_embedding in
            self.description_embeddings.items()
            if candidate_url != query_company.url
        }

    def _founded_similarities(self, query_company: Company):
        return {
            company.url: self._calc_founded_similarity(query_company, company)
            for company in self.companies.values()
            if company.url != query_company.url
        }

    def _calc_founded_similarity(
            self, query_company: Company,
            candidate_company: Company) -> Union[float, None]:
        if query_company.founded and candidate_company.founded:
            return 1 - abs(query_company.founded - candidate_company.founded
                           ) / float(self._founding_year_normalizer)
        else:
            return None

    def _headquarters_similarities(self, query_company: Company):
        return {
            company.url:
            self._calc_headquarters_similarity(query_company, company)
            for company in self.companies.values()
            if company.url != query_company.url
        }

    def _calc_headquarters_similarity(
            self, query_company: Company,
            candidate_company: Company) -> Union[float, None]:
        query_location = self.headquarters_locations.get(
            query_company.url, None)
        candidate_location = self.headquarters_locations.get(
            candidate_company.url, None)

        if query_location and candidate_location:
            return 1 - geodesic(query_location,
                                candidate_location).kilometers / float(
                                    self._hq_location_normalizer)
        else:
            return None

    def get_peers(self,
                  query_url: str,
                  top_k: int = 10) -> Union[List[str], None]:
        if query_url not in self.companies:
            logger.warning(f"Company with URL '{query_url}' does not exist!")
            return None

        query_company = self.companies[query_url]

        description_similarities = self._description_similarities(
            query_company)
        founded_similarities = self._founded_similarities(query_company)
        headquarters_similarities = self._headquarters_similarities(
            query_company)

        similarities = []
        for company_url in description_similarities:
            if founded_similarities[company_url] and headquarters_similarities[
                    company_url]:
                similarity = (
                    self._similarity_component_weights['description'] *
                    description_similarities[company_url] +
                    self._similarity_component_weights['founded'] *
                    founded_similarities[company_url] +
                    self._similarity_component_weights['headquarters'] *
                    headquarters_similarities[company_url])

            elif founded_similarities[
                    company_url] and not headquarters_similarities[company_url]:
                weight_normalizer = self._similarity_component_weights[
                    'description'] + self._similarity_component_weights[
                        'founded']
                similarity = (self._similarity_component_weights['description']
                              * description_similarities[company_url] +
                              self._similarity_component_weights['founded'] *
                              founded_similarities[company_url]
                              ) / float(weight_normalizer)

            elif not founded_similarities[
                    company_url] and headquarters_similarities[company_url]:
                weight_normalizer = self._similarity_component_weights[
                    'description'] + self._similarity_component_weights[
                        'headquarters']
                similarity = (
                    self._similarity_component_weights['description'] *
                    description_similarities[company_url] +
                    self._similarity_component_weights['headquarters'] *
                    headquarters_similarities[company_url]
                ) / float(weight_normalizer)

            else:
                weight_normalizer = self._similarity_component_weights['description'] \
                    + self._similarity_component_weights['founded'] + self._similarity_component_weights['headquarters']
                similarity = description_similarities[company_url] / float(
                    weight_normalizer)

            similarities.append((company_url, similarity))

        sorted_similarities = sorted(similarities,
                                     key=lambda x: x[1],
                                     reverse=True)
        return [x[0] for x in sorted_similarities][:top_k]
def test_transformer_document_embeddings():

    embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased')

    sentence: Sentence = Sentence("I love Berlin")
    embeddings.embed(sentence)

    assert len(sentence.get_embedding()) == 768

    sentence.clear_embeddings()

    assert len(sentence.get_embedding()) == 0

    embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased',
                                               layers='all')
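    # 'all' concatenates the embedding layer plus distilbert's 6 transformer layers: 7 * 768 = 5376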

    embeddings.embed(sentence)

    assert len(sentence.get_embedding()) == 5376

    sentence.clear_embeddings()

    assert len(sentence.get_embedding()) == 0

    embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased',
                                               layers='all',
                                               layer_mean=True)

    embeddings.embed(sentence)

    assert len(sentence.get_embedding()) == 768

    sentence.clear_embeddings()

    del embeddings
Example #28
    corpus.filter_empty_sentences()
    print(corpus)

    label_dictionary = corpus.make_label_dictionary()

    print(label_dictionary)

    flat_labels = [item for sublist in labels for item in sublist]
    class_weights = compute_class_weight('balanced', np.unique(flat_labels),
                                         flat_labels)
    unique_labels = np.unique(flat_labels)
    weights = {}
    for i in range(len(unique_labels)):
        weights[unique_labels[i]] = class_weights[i]

    document_embeddings = TransformerDocumentEmbeddings(
        params['version_model'], fine_tune=True)

    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dictionary,
                                loss_weights=weights)

    trainer = ModelTrainer(classifier, corpus)

    trainer.train(params['model_dir'],
                  learning_rate=params['learning_rate'],
                  mini_batch_size=params['batch_size'],
                  anneal_factor=params['anneal_factor'],
                  patience=params['patience'],
                  max_epochs=params['epochs'],
                  embeddings_storage_mode=params['embeddings_storage_mode'])