def get_labelled_corpus(self, samples: List[Sentence]):
    """Persist *samples* to a timestamped CSV and wrap it in a flair corpus.

    Parameters
    ----------
    samples : List[Sentence]
        Labelled sentences that become the training split.

    Returns
    -------
    CSVClassificationCorpus
        Corpus whose train split is the freshly written file and whose
        dev/test splits come from ``self.valid_file`` / ``self.test_file``.
    """
    # Timestamp in the file name so repeated calls never overwrite a
    # previous snapshot.  (An f-string is already a str; the original's
    # extra str() wrapper was redundant.)
    train_file_name = f'labelled_{len(samples)}_{int(time())}.csv'
    self.save_labelled_csv(samples, train_file_name)
    corpus = CSVClassificationCorpus(
        data_folder=self.experiment_name,
        column_name_map={0: 'text', 1: 'label'},
        train_file=train_file_name,
        dev_file=self.valid_file,
        test_file=self.test_file,
        skip_header=False)
    return corpus
def __init__(self, data_folder):
    """Load a tab-separated classification corpus and prepare Hindi embeddings.

    Parameters
    ----------
    data_folder :
        Directory with tab-delimited CSV splits: text in column 0,
        label in column 1, first row being a header.
    """
    import os  # local import keeps this fix self-contained

    self.column_name_map = {0: "text", 1: "label"}
    self.corpus: Corpus = CSVClassificationCorpus(
        data_folder, self.column_name_map, skip_header=True, delimiter='\t')
    self.label_dict = self.corpus.make_label_dictionary()
    self.embeddings = [
        FlairEmbeddings('hi-forward'),
        FlairEmbeddings('hi-backward'),
    ]
    # Derive the install root two levels above torch's package file in an
    # OS-portable way.  The original split torch.__file__ on "/" which
    # produces a broken path on Windows; os.path.dirname twice is the
    # portable equivalent and yields the same string on POSIX.
    install_root = os.path.dirname(os.path.dirname(torch.__file__))
    self.download_dir = os.path.join(install_root, "HindiNLP")
    self.dest_path = os.path.join(self.download_dir,
                                  "resources", "taggers", "classifiers")
def __init__(self, path: Union[Path, str], column_name_map: dict = None,
             corpus: Corpus = None, **corpus_params):
    """Wrap a flair classification corpus rooted at *path*.

    Parameters
    ----------
    path :
        Data folder; a ``str`` is converted to ``pathlib.Path``.
    column_name_map :
        Optional column-index -> role mapping.  When given, a
        ``CSVClassificationCorpus`` is built; otherwise a plain
        ``ClassificationCorpus`` is used.
    corpus :
        Pre-built corpus to reuse instead of constructing one.
    **corpus_params :
        Extra keyword arguments forwarded to the corpus constructor.

    Raises
    ------
    FileNotFoundError
        If *path* does not exist.
    """
    if isinstance(path, str):
        path = Path(path)
    # Explicit check instead of `assert`: asserts are stripped under -O,
    # which would defer the failure to a confusing error deep inside flair.
    if not path.exists():
        raise FileNotFoundError(f"Corpus folder does not exist: {path}")
    self.path = path
    if corpus:
        self.corpus = corpus
    elif column_name_map:
        self.corpus = CSVClassificationCorpus(self.path, column_name_map,
                                              **corpus_params)
    else:
        self.corpus = ClassificationCorpus(self.path, **corpus_params)
    self.sentences = self.corpus.get_all_sentences()
    print(self.corpus)
def initialize_training(text_column_index, label_column_index, delimiter=';',
                        model_type=None, model=None, max_epochs=10, patience=3,
                        use_amp=0, calc_class_weights=0,
                        data_folder_path="/root/text-classification/data/",
                        output_folder_path="/root/text-classification/checkpoint/"):
    """ Create a text classification model using FLAIR, SentenceTransformers and
    Huggingface Transformers.

    Params:
    data_folder_path: Folder path with each file titled appropriately i.e.
        train.csv test.csv dev.csv. Will create a 80/10/10 split if only
        train is supplied.
    output_folder_path: Folder path for storing the best model & checkpoints.
    text_column_index: In which index (starting from 0) the input column is located.
    label_column_index: In which index (starting from 0) the label column is located.
    delimiter: type of delimiter used in the .csv file.
    model_type: SentenceTransformerDocumentEmbeddings or TransformerDocumentEmbeddings
    model: Which model to use.
    max_epochs: Number of epochs to train the model for.
    patience: Number of epochs without improvement before adjusting learning rate.
    use_amp: Whether to enable automatic mixed precisions (AMP).
    calc_class_weights: Whether to create a dictionary with class weights to
        deal with imbalanced datasets.

    Output:
    best-model.pt
    final-model.pt
    training.log
    """
    # NOTE: the docstring always advertised data_folder_path/output_folder_path,
    # but they were hard-coded to /root/... — they are now real parameters whose
    # defaults preserve the old behaviour.

    # 1. Column format indicating which columns hold the text and label(s).
    column_name_map = {
        text_column_index: "text",
        label_column_index: "label_topic"
    }

    # 2. Load corpus containing training, test and dev data.
    corpus: Corpus = CSVClassificationCorpus(data_folder_path,
                                             column_name_map,
                                             skip_header=True,
                                             delimiter=delimiter)

    # Print statistics about the corpus.
    training_data_statistics = corpus.obtain_statistics()
    print(training_data_statistics)

    # 3A. Create a label dictionary.
    label_dict = corpus.make_label_dictionary()

    # 3B. Calculate class weights only when requested (flag arrives as 0/1).
    if bool(calc_class_weights):
        weight_dict = create_weight_dict(delimiter=delimiter,
                                         label_index=label_column_index)
    else:
        weight_dict = None

    # 4. Initialize the document embeddings for the chosen model family.
    if model_type == "SentenceTransformerDocumentEmbeddings":
        document_embeddings = SentenceTransformerDocumentEmbeddings(model)
    elif model_type == "TransformerDocumentEmbeddings":
        document_embeddings = TransformerDocumentEmbeddings(model, fine_tune=True)
    elif model_type == "WordEmbeddings":
        word_embeddings = [WordEmbeddings(model)]
        document_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                                    hidden_size=256)
    elif model_type == "StackedEmbeddings":
        document_embeddings = DocumentRNNEmbeddings([
            WordEmbeddings('glove'),
            FlairEmbeddings(model + '-backward'),
            FlairEmbeddings(model + '-forward')
        ])
    else:
        raise Exception(
            "Pick SentenceTransformerDocumentEmbeddings, StackedEmbeddings, WordEmbeddings or TransformerDocumentEmbeddings."
        )

    # 5. Create the text classifier.
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                loss_weights=weight_dict)

    # 6. Initialize the text classifier trainer with Adam optimizer.
    trainer = ModelTrainer(classifier, corpus, optimizer=Adam,
                           use_tensorboard=False)

    # 7. Start the training.
    trainer.train(output_folder_path,
                  learning_rate=3e-5,
                  max_epochs=max_epochs,
                  patience=patience,
                  use_amp=bool(use_amp),
                  checkpoint=True,
                  mini_batch_size=16,
                  mini_batch_chunk_size=4)
} for split_name, split_value in labels_splits.items(): df = pd.DataFrame(split_value) df.to_csv(os.path.join(data_path, f'{split_name}.csv'), index=False, header=False, sep="\t") from flair.data import Corpus from flair.datasets import CSVClassificationCorpus corpus = CSVClassificationCorpus(data_path, column_name_map={ 0: 'label', 1: 'text' }, skip_header=False, delimiter='\t', in_memory=False, max_tokens_per_doc=1000 * 10) import torch import flair flair.devide = torch.device('cuda:0') from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings from flair.models import TextClassifier from flair.trainers import ModelTrainer # 2. create the label dictionary label_dict = corpus.make_label_dictionary()
def train_model(self, model_name="text_classification_model",
                custom_word_embeddings=None, rnn_type="GRU",
                use_pool_embedding=False, hidden_size=16,
                reproject_words=True, reproject_words_dimension=128,
                learning_rate=1e-3, batch_size=8, anneal_factor=0.5,
                patience=2, max_epochs=30, **kwargs):
    """
    Train flair model and save it in your data folder

    Parameters
    ----------
    model_name: str
        Name of your model
    custom_word_embeddings: list<embedding>
        Use custom flair embedding; defaults to French WordEmbeddings.
        See more in flair documentation:
        https://github.com/zalandoresearch/flair/tree/master/resources/docs

    Return
    -------
    None
    """
    import os  # local import so the portable path join below needs no file-level change

    self.model_name = model_name
    corpus = CSVClassificationCorpus(self.data_folder, self.column_name_map,
                                     skip_header=True)
    label_dict = corpus.make_label_dictionary()

    # Word embedding selection
    if custom_word_embeddings is None:
        word_embeddings = [WordEmbeddings('fr')]
    else:
        word_embeddings = custom_word_embeddings

    # initialize document embedding by passing list of word embeddings and parameters
    if use_pool_embedding:
        document_embeddings = DocumentPoolEmbeddings(
            word_embeddings, pooling='max', fine_tune_mode='nonlinear')
    else:
        document_embeddings = DocumentRNNEmbeddings(
            word_embeddings,
            hidden_size=hidden_size,
            reproject_words=reproject_words,
            reproject_words_dimension=reproject_words_dimension,
            rnn_type=rnn_type)

    # create the text classifier and initialize trainer
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict)
    trainer = ModelTrainer(classifier, corpus, optimizer=Adam)

    # BUG FIX: the original built the output path with a literal backslash
    # ("{0}\\{1}"), which only forms a valid directory on Windows;
    # os.path.join is portable across platforms.
    output_dir = os.path.join(self.data_folder, self.model_name)

    # let's train !
    num_workers = cpu_count()
    trainer.train(output_dir,
                  learning_rate=learning_rate,
                  num_workers=num_workers,
                  mini_batch_size=batch_size,
                  anneal_factor=anneal_factor,
                  patience=patience,
                  max_epochs=max_epochs,
                  **kwargs)
from flair.embeddings import CharacterEmbeddings, DocumentPoolEmbeddings, DocumentRNNEmbeddings, BytePairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from torch.optim.adam import Adam
from flair.datasets import CSVClassificationCorpus

# Train one character-embedding text classifier per language.
# NOTE(review): test_file is '{lang}_dev.tsv' while dev_file is
# 'dev_{lang}.tsv' — the asymmetric naming looks suspicious; confirm these
# are the actual file names on disk.
for language in ['malayalam', 'tamil']:
    lang_corpus = CSVClassificationCorpus(
        'data',
        train_file=f'train_{language}.tsv',
        test_file=f'{language}_dev.tsv',
        dev_file=f'dev_{language}.tsv',
        delimiter='\t',
        skip_header=True,
        column_name_map={0: 'text', 1: 'label_topic'})
    labels = lang_corpus.make_label_dictionary()

    # Character-level embeddings pooled through a document RNN.
    doc_embeddings = DocumentRNNEmbeddings([CharacterEmbeddings()],
                                           hidden_size=256)
    model = TextClassifier(doc_embeddings, label_dictionary=labels)

    trainer = ModelTrainer(model, lang_corpus, optimizer=Adam)
    trainer.train(f'models/char/{language}',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  anneal_factor=0.5,
                  patience=5,
                  max_epochs=20)