Example #1
0
 def get_labelled_corpus(self, samples: List[Sentence]):
     """Persist *samples* to a timestamped CSV and wrap it in a flair corpus.

     The labelled sentences are written into the experiment folder and
     loaded as a ``CSVClassificationCorpus`` together with the configured
     dev/test splits (column 0 = text, column 1 = label, no header row).
     """
     # Timestamped name so repeated labelling rounds never overwrite each other.
     csv_name = f'labelled_{len(samples)}_{int(time())}.csv'
     self.save_labelled_csv(samples, csv_name)
     return CSVClassificationCorpus(
         data_folder=self.experiment_name,
         column_name_map={0: 'text', 1: 'label'},
         train_file=csv_name,
         dev_file=self.valid_file,
         test_file=self.test_file,
         skip_header=False)
Example #2
0
 def __init__(self, data_folder):
     """Load a tab-separated classification corpus and set up Hindi embeddings.

     Parameters
     ----------
     data_folder:
         Folder containing the train/dev/test TSV files with column 0 = text
         and column 1 = label; the header row of each file is skipped.
     """
     import os

     self.column_name_map = {0: "text", 1: "label"}
     self.corpus: Corpus = CSVClassificationCorpus(data_folder,
                                                   self.column_name_map,
                                                   skip_header=True,
                                                   delimiter='\t')
     self.label_dict = self.corpus.make_label_dictionary()
     # Forward + backward Hindi character language-model embeddings.
     self.embeddings = [
         FlairEmbeddings('hi-forward'),
         FlairEmbeddings('hi-backward'),
     ]
     # Kept for backward compatibility with callers that read this attribute.
     self.download_path_split = torch.__file__.split("/")[:-2]
     # BUG FIX: the original joined path pieces with a hard-coded "/", which
     # breaks on Windows; derive site-packages from torch's install location
     # portably instead (two levels up from .../torch/__init__.py).
     site_packages = os.path.dirname(os.path.dirname(torch.__file__))
     self.download_dir = os.path.join(site_packages, "HindiNLP")
     self.dest_path = os.path.join(self.download_dir,
                                   "resources", "taggers", "classifiers")
Example #3
0
    def __init__(self,
                 path: Union[Path, str],
                 column_name_map: dict = None,
                 corpus: Corpus = None,
                 **corpus_params):
        """Wrap an existing corpus or load one from *path*.

        An already-built *corpus* takes precedence. Otherwise a
        ``CSVClassificationCorpus`` is constructed when *column_name_map*
        is supplied, falling back to a plain ``ClassificationCorpus``.
        Extra keyword arguments are forwarded to the corpus constructor.
        """
        if isinstance(path, str):
            path = Path(path)
        assert path.exists()

        self.path = path
        # Pick the corpus source: explicit object > CSV layout > folder layout.
        if corpus:
            self.corpus = corpus
        elif column_name_map:
            self.corpus = CSVClassificationCorpus(self.path,
                                                  column_name_map,
                                                  **corpus_params)
        else:
            self.corpus = ClassificationCorpus(self.path, **corpus_params)
        self.sentences = self.corpus.get_all_sentences()
        print(self.corpus)
Example #4
0
def initialize_training(text_column_index,
                        label_column_index,
                        delimiter=';',
                        model_type=None,
                        model=None,
                        max_epochs=10,
                        patience=3,
                        use_amp=0,
                        calc_class_weights=0,
                        data_folder_path="/root/text-classification/data/",
                        output_folder_path="/root/text-classification/checkpoint/"):
    """
    Create a text classification model using FLAIR, SentenceTransformers and
    Huggingface Transformers.
    Params:
    text_column_index: In which index (starting from 0) the input column is located.
    label_column_index: In which index (starting from 0) the label column is located.
    delimiter: type of delimiter used in the .csv file.
    model_type: SentenceTransformerDocumentEmbeddings, TransformerDocumentEmbeddings,
                WordEmbeddings or StackedEmbeddings.
    model: Which model to use.
    max_epochs: Number of epochs to train the model for.
    patience: Number of epochs without improvement before adjusting learning rate.
    use_amp: Whether to enable automatic mixed precision (AMP).
    calc_class_weights: Whether to create a dictionary with class weights to deal
                        with imbalanced datasets.
    data_folder_path: Folder path with each file titled appropriately i.e.
                      train.csv test.csv dev.csv.
                      Will create a 80/10/10 split if only train is supplied.
    output_folder_path: Folder path for storing the best model & checkpoints.
    Output:
        best-model.pt
        final-model.pt
        training.log
    """

    # 1. Column format indicating which columns hold the text and label(s)
    column_name_map = {
        text_column_index: "text",
        label_column_index: "label_topic"
    }

    # 2. Load corpus containing training, test and dev data.
    # NOTE: the folder was previously hard-coded; it is now a parameter with
    # the old value as default, so existing callers are unaffected.
    corpus: Corpus = CSVClassificationCorpus(data_folder_path,
                                             column_name_map,
                                             skip_header=True,
                                             delimiter=delimiter)

    # Print statistics about the corpus.
    print(corpus.obtain_statistics())

    # 3A. Create a label dictionary.
    label_dict = corpus.make_label_dictionary()

    # 3B. Calculate class weights (optional, for imbalanced datasets).
    if bool(calc_class_weights):
        weight_dict = create_weight_dict(delimiter=delimiter,
                                         label_index=label_column_index)
    else:
        weight_dict = None

    # 4. Initialize the document embeddings according to model_type.
    if model_type == "SentenceTransformerDocumentEmbeddings":
        document_embeddings = SentenceTransformerDocumentEmbeddings(model)
    elif model_type == "TransformerDocumentEmbeddings":
        document_embeddings = TransformerDocumentEmbeddings(model,
                                                            fine_tune=True)
    elif model_type == "WordEmbeddings":
        word_embeddings = [WordEmbeddings(model)]
        document_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                                    hidden_size=256)
    elif model_type == "StackedEmbeddings":
        document_embeddings = DocumentRNNEmbeddings([
            WordEmbeddings('glove'),
            FlairEmbeddings(model + '-backward'),
            FlairEmbeddings(model + '-forward')
        ])
    else:
        # ValueError is more precise than a bare Exception and is still
        # caught by any existing `except Exception` handlers.
        raise ValueError(
            "Pick SentenceTransformerDocumentEmbeddings, StackedEmbeddings, WordEmbeddings or TransformerDocumentEmbeddings."
        )

    # 5. create the text classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                loss_weights=weight_dict)

    # 6. initialize the text classifier trainer with Adam optimizer
    trainer = ModelTrainer(classifier,
                           corpus,
                           optimizer=Adam,
                           use_tensorboard=False)

    # 7. start the training
    trainer.train(output_folder_path,
                  learning_rate=3e-5,
                  max_epochs=max_epochs,
                  patience=patience,
                  use_amp=bool(use_amp),
                  checkpoint=True,
                  mini_batch_size=16,
                  mini_batch_chunk_size=4)
Example #5
0
    }

    for split_name, split_value in labels_splits.items():
        df = pd.DataFrame(split_value)
        df.to_csv(os.path.join(data_path, f'{split_name}.csv'),
                  index=False,
                  header=False,
                  sep="\t")

from flair.data import Corpus
from flair.datasets import CSVClassificationCorpus

# 1. Load the corpus: tab-separated files, column 0 = label, column 1 = text.
# Streaming mode (in_memory=False) keeps RAM usage low on large splits.
corpus = CSVClassificationCorpus(data_path,
                                 column_name_map={
                                     0: 'label',
                                     1: 'text'
                                 },
                                 skip_header=False,
                                 delimiter='\t',
                                 in_memory=False,
                                 max_tokens_per_doc=1000 * 10)

import torch
import flair
# BUG FIX: the original assigned to `flair.devide` (a typo), which merely
# created a dead attribute and left flair on its default device; the attribute
# flair actually reads is `flair.device`.
flair.device = torch.device('cuda:0')
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

# 2. create the label dictionary
label_dict = corpus.make_label_dictionary()
Example #6
0
    def train_model(self,
                    model_name="text_classification_model",
                    custom_word_embeddings=None,
                    rnn_type="GRU",
                    use_pool_embedding=False,
                    hidden_size=16,
                    reproject_words=True,
                    reproject_words_dimension=128,
                    learning_rate=1e-3,
                    batch_size=8,
                    anneal_factor=0.5,
                    patience=2,
                    max_epochs=30,
                    **kwargs):
        """
        Train flair model and save it in your data folder

        Parameters
        ----------
        model_name: str
            Name of your model
        custom_word_embeddings: list<embedding>
            Use custom flair embedding (defaults to French WordEmbeddings)
        rnn_type: str
            RNN flavor for DocumentRNNEmbeddings (e.g. "GRU")
        use_pool_embedding: bool
            Use max-pooled DocumentPoolEmbeddings instead of an RNN
        Remaining parameters are standard flair training knobs; extra keyword
        arguments are forwarded to ``ModelTrainer.train``.

        See more in flair documentation: https://github.com/zalandoresearch/flair/tree/master/resources/docs

        Return
        -------
        None
        """
        import os

        self.model_name = model_name
        corpus = CSVClassificationCorpus(self.data_folder,
                                         self.column_name_map,
                                         skip_header=True)
        label_dict = corpus.make_label_dictionary()

        # Word embedding selection: default to French fastText vectors.
        if custom_word_embeddings is None:
            word_embeddings = [WordEmbeddings('fr')]
        else:
            word_embeddings = custom_word_embeddings

        # initialize document embedding by passing list of word embeddings and parameters
        if use_pool_embedding:
            document_embeddings = DocumentPoolEmbeddings(
                word_embeddings, pooling='max', fine_tune_mode='nonlinear')
        else:
            document_embeddings = DocumentRNNEmbeddings(
                word_embeddings,
                hidden_size=hidden_size,
                reproject_words=reproject_words,
                reproject_words_dimension=reproject_words_dimension,
                rnn_type=rnn_type)

        # create the text classifier and initialize trainer
        classifier = TextClassifier(document_embeddings,
                                    label_dictionary=label_dict)
        trainer = ModelTrainer(classifier, corpus, optimizer=Adam)

        # BUG FIX: the output path was built with a hard-coded backslash
        # ("{0}\\{1}"), which only works on Windows; os.path.join is portable.
        output_dir = os.path.join(self.data_folder, self.model_name)

        # let's train !
        num_workers = cpu_count()
        trainer.train(output_dir,
                      learning_rate=learning_rate,
                      num_workers=num_workers,
                      mini_batch_size=batch_size,
                      anneal_factor=anneal_factor,
                      patience=patience,
                      max_epochs=max_epochs,
                      **kwargs)
Example #7
0
from flair.embeddings import CharacterEmbeddings, DocumentPoolEmbeddings, DocumentRNNEmbeddings, BytePairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from torch.optim.adam import Adam
from flair.datasets import CSVClassificationCorpus

# Train one character-embedding classifier per language.
for lang in ['malayalam', 'tamil']:
    # Tab-separated corpus with a header row: column 0 = text, column 1 = topic label.
    corpus = CSVClassificationCorpus('data',
                                     column_name_map={
                                         0: 'text',
                                         1: 'label_topic'
                                     },
                                     train_file=f'train_{lang}.tsv',
                                     test_file=f'{lang}_dev.tsv',
                                     dev_file=f'dev_{lang}.tsv',
                                     delimiter='\t',
                                     skip_header=True)
    label_dict = corpus.make_label_dictionary()

    # Character-level embeddings pooled by a document-level RNN.
    document_embeddings = DocumentRNNEmbeddings([CharacterEmbeddings()],
                                                hidden_size=256)
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict)

    trainer = ModelTrainer(classifier, corpus, optimizer=Adam)
    trainer.train(f'models/char/{lang}',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  anneal_factor=0.5,
                  patience=5,
                  max_epochs=20)