def test_train_tars(tasks_base_path, results_base_path):
    """Smoke-test a one-epoch TARS training run followed by a prediction.

    Args:
        tasks_base_path: Base directory of the test corpora; the
            ``imdb_underscore`` corpus is loaded from it.
        results_base_path: Directory passed to the trainer as ``base_path``.
    """
    imdb_corpus = ClassificationCorpus(tasks_base_path / "imdb_underscore")

    # TARS can hold several tasks, so one has to be registered and selected
    # before the model can be trained.
    classifier = TARSClassifier(embeddings="sshleifer/tiny-distilroberta-base")
    class_labels = imdb_corpus.make_label_dictionary(label_type="class")
    classifier.add_and_switch_to_new_task(
        task_name="question 2_CLASS",
        label_dictionary=class_labels,
        label_type="class",
    )

    # One epoch with batch size 1 keeps the run short.
    training_options = {
        "base_path": results_base_path,
        "learning_rate": 0.02,
        "mini_batch_size": 1,
        "max_epochs": 1,
    }
    ModelTrainer(classifier, imdb_corpus).train(**training_options)

    # The trained model must accept a fresh sentence without raising.
    probe = Sentence("This is great!")
    classifier.predict(probe)
def test_init_tars_and_switch(tasks_base_path):
    """Check the label-dictionary size after each TARS task switch.

    The classifier is created with a two-class task and then switched to
    tasks whose labels are given as a single string, a list, a set and a
    label dictionary built from the corpus.

    Args:
        tasks_base_path: Base directory of the test corpora; the ``imdb``
            corpus is loaded from it.
    """
    imdb = ClassificationCorpus(tasks_base_path / "imdb")

    model = TARSClassifier(
        task_name='2_CLASS',
        label_dictionary=imdb.make_label_dictionary(label_type='class'),
        label_type='class',
    )

    def label_count():
        # Size of the label dictionary of the currently selected task.
        return len(model.get_current_label_dictionary())

    # Task supplied at construction time: two classes.
    assert label_count() == 2

    # A single label given as a plain string.
    model.add_and_switch_to_new_task('1_CLASS', 'one class', "testlabel")
    assert label_count() == 1

    # Three labels given as a list.
    model.add_and_switch_to_new_task('3_CLASS', ['list 1', 'list 2', 'list 3'], "testlabel")
    assert label_count() == 3

    # Four labels given as a set.
    model.add_and_switch_to_new_task('4_CLASS', {'set 1', 'set 2', 'set 3', 'set 4'}, "testlabel")
    assert label_count() == 4

    # Two labels given as a label dictionary built from the corpus.
    corpus_labels = imdb.make_label_dictionary(label_type='class')
    model.add_and_switch_to_new_task('2_CLASS_AGAIN', corpus_labels, "testlabel")
    assert label_count() == 2
def run_splits(word_embeddings, embeddings_name, splits_root='<path_to_splits>',
               num_splits=5, max_epochs=150):
    """Train one text classifier per data split.

    For every split ``i`` in ``1..num_splits`` the corpus is read from
    ``<splits_root>/split_<i>/`` (files ``train.csv``, ``dev.csv`` and
    ``test.csv``) and the training output is written to a sub-folder of that
    split named after ``embeddings_name``.

    Args:
        word_embeddings: Word embeddings handed to ``DocumentLSTMEmbeddings``.
        embeddings_name: Name of the output sub-folder inside each split
            folder.
        splits_root: Directory that contains the ``split_<i>`` folders. The
            default is the original placeholder and must be replaced with a
            real path.
        num_splits: Number of splits to process, numbered from 1.
        max_epochs: Maximum number of epochs passed to the trainer.
    """
    for split_no in range(1, num_splits + 1):
        print('##########')
        print('Split', str(split_no))
        print('##########')

        data_folder = f'{splits_root}/split_{split_no}/'
        corpus = ClassificationCorpus(
            data_folder,
            test_file='test.csv',
            dev_file='dev.csv',
            train_file='train.csv',
        )

        document_embeddings = DocumentLSTMEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256,
        )
        classifier = TextClassifier(
            document_embeddings,
            label_dictionary=corpus.make_label_dictionary(),
            multi_label=False,
        )

        trainer = ModelTrainer(classifier, corpus)
        # data_folder already ends with '/', the extra separator is kept so the
        # output path string is the same as before.
        trainer.train(f'{data_folder}/{embeddings_name}', max_epochs=max_epochs)
def train_model(data_dir, max_epochs):
    """Train a GloVe + RNN text classifier and report progress via ``st``.

    Args:
        data_dir: Folder passed to ``ClassificationCorpus``.
        max_epochs: Maximum number of training epochs.

    The trainer writes its output to the ``model-saves`` folder.
    """
    st.write('Creating word corpus for training...')
    training_corpus = ClassificationCorpus(data_dir)
    labels = training_corpus.make_label_dictionary()
    st.write('Done')

    st.write('Load and create Embeddings for text data...')
    # Only GloVe is active; the original listed the 'news-forward' and
    # 'news-backward' Flair embeddings as disabled alternatives.
    token_embeddings = [WordEmbeddings('glove')]
    doc_embeddings = DocumentRNNEmbeddings(
        token_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )
    st.write('Done')

    st.write('Preparing')
    text_classifier = TextClassifier(doc_embeddings, label_dictionary=labels)
    training_options = dict(
        learning_rate=0.1,
        mini_batch_size=32,
        anneal_factor=0.5,
        patience=8,
        max_epochs=max_epochs,
        checkpoint=True,
    )
    ModelTrainer(text_classifier, training_corpus).train('model-saves', **training_options)
    st.write('Model Training Finished!')
def train(self,
          learning_rate: float = 0.1,
          mini_batch_size: int = 16,
          anneal_factor: float = 0.5,
          patience: int = 5,
          max_epochs: int = 10):
    """Build the corpus files and train a GloVe pooled-embedding classifier.

    ``self.make_corpus()`` is called first; the corpus is then read from
    ``self.output_data_path`` (``train.txt``, ``dev.txt``, ``test.txt``) and
    the training output is written to ``self.model_path``.

    Args:
        learning_rate: Learning rate forwarded to the trainer.
        mini_batch_size: Mini-batch size forwarded to the trainer.
        anneal_factor: Anneal factor forwarded to the trainer.
        patience: Patience forwarded to the trainer.
        max_epochs: Maximum number of epochs forwarded to the trainer.

    Returns:
        None.
    """
    self.make_corpus()

    split_files = {
        'train_file': 'train.txt',
        'dev_file': 'dev.txt',
        'test_file': 'test.txt',
    }
    corpus = ClassificationCorpus(self.output_data_path, **split_files)
    labels = corpus.make_label_dictionary()

    pooled_glove = DocumentPoolEmbeddings([WordEmbeddings('glove')])
    model = TextClassifier(pooled_glove, label_dictionary=labels)

    ModelTrainer(model, corpus).train(
        self.model_path,
        learning_rate=learning_rate,
        mini_batch_size=mini_batch_size,
        anneal_factor=anneal_factor,
        patience=patience,
        max_epochs=max_epochs,
    )
def test_train_tars(tasks_base_path):
    """Smoke-test a one-epoch TARS training run followed by a prediction.

    Args:
        tasks_base_path: Base directory of the test corpora; the
            ``imdb_underscore`` corpus is loaded from it.

    Model artifacts are written to the fixed folder ``resources/taggers/trec``.
    """
    imdb_corpus = ClassificationCorpus(tasks_base_path / "imdb_underscore")

    # TARS can hold several tasks, so one has to be registered and selected
    # before the model can be trained.
    classifier = TARSClassifier(embeddings="sshleifer/tiny-distilroberta-base")
    class_labels = imdb_corpus.make_label_dictionary(label_type='class')
    classifier.add_and_switch_to_new_task(
        task_name="question 2_CLASS",
        label_dictionary=class_labels,
        label_type='class',
    )

    model_trainer = ModelTrainer(classifier, imdb_corpus)
    training_options = {
        'base_path': 'resources/taggers/trec',  # where the model artifacts go
        'learning_rate': 0.02,
        'mini_batch_size': 1,
        # 'mini_batch_chunk_size': 4 could be set here if the transformer is
        # too much for the machine.
        'max_epochs': 1,  # stop after a single epoch
    }
    model_trainer.train(**training_options)

    # The trained model must accept a fresh sentence without raising.
    probe = Sentence("This is great!")
    classifier.predict(probe)
def test_text_classifier_multi(results_base_path, tasks_base_path):
    """Train, save, reload and evaluate a multi-label text classifier.

    Args:
        results_base_path: Directory the trainer writes to; ``final-model.pt``
            is loaded back from it.
        tasks_base_path: Base directory of the test corpora.
    """
    flair.set_seed(123)

    corpus_dir = tasks_base_path / "trivial" / "trivial_text_classification_multi"
    corpus = ClassificationCorpus(corpus_dir, label_type="city")
    city_labels = corpus.make_label_dictionary(label_type="city")

    pooled_embeddings = DocumentPoolEmbeddings([turian_embeddings], fine_tune_mode="linear")
    model: TextClassifier = TextClassifier(
        document_embeddings=pooled_embeddings,
        label_dictionary=city_labels,
        label_type="city",
        multi_label=True,
    )

    ModelTrainer(model, corpus).train(
        results_base_path,
        mini_batch_size=2,
        max_epochs=50,
        shuffle=True,
    )

    # The freshly trained model must cope with a single sentence, a mixed
    # batch and a batch that only holds an empty sentence.
    sentence = Sentence("this is Berlin")
    sentence_empty = Sentence(" ")
    model.predict(sentence)
    model.predict([sentence, sentence_empty])
    model.predict([sentence_empty])

    # Reload the model from disk.
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    # Check that the reloaded model predicts both labels of a two-label sentence.
    sentence = Sentence("this is Berlin")
    sentence_double = Sentence("this is Berlin and pizza")
    loaded_model.predict([sentence, sentence_double])

    predicted_labels = sentence_double.labels
    for predicted in predicted_labels:
        assert predicted.value is not None
        assert 0.0 <= predicted.score <= 1.0
        assert type(predicted.score) is float
    values = [predicted.value for predicted in predicted_labels]

    assert "Berlin" in values
    assert "pizza" in values

    # The reloaded model is expected to fit the test split perfectly.
    result: Result = loaded_model.evaluate(corpus.test, gold_label_type="city")
    report = result.classification_report
    print(report)
    assert report["micro avg"]["f1-score"] == 1.0

    del loaded_model
def train():
    """Train a single-label RNN classifier on the corpus in ``sst_folder``.

    Uses GloVe plus the fast forward/backward Flair embeddings and writes the
    training output to ``model_path``. Both ``sst_folder`` and ``model_path``
    are read from the enclosing module.
    """
    # NOTE(review): the training file is named 'sst_dev.csv' - confirm that
    # this is the intended training split.
    corpus: Corpus = ClassificationCorpus(
        sst_folder,
        test_file='test.csv',
        dev_file='dev.csv',
        train_file='sst_dev.csv',
    )
    labels = corpus.make_label_dictionary()

    glove = WordEmbeddings('glove')
    candidates = [
        glove,
        FlairEmbeddings('news-forward-fast'),
        FlairEmbeddings('news-backward-fast'),
    ]
    # Keep only truthy entries, so an optional embedding may be left empty.
    token_embeddings = [embedding for embedding in candidates if embedding]

    # Document embedding built on top of the stacked word embeddings.
    doc_embeddings = DocumentRNNEmbeddings(
        token_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )

    model = TextClassifier(doc_embeddings, label_dictionary=labels, multi_label=False)
    ModelTrainer(model, corpus).train(model_path, max_epochs=10, train_with_dev=False)
def predict(args):
    """Predict a label for every test sentence and write the results to a file.

    The model is loaded from ``args.model_dir`` / ``args.model_file``. The test
    corpus is read with ``ClassificationCorpus`` when ``args.one_per_line`` is
    set, otherwise with ``FlyClassificationCorpus``. For each test sentence
    that received at least one label, its first label is written to
    ``args.output_file`` as ``"<value> <score>"`` with four decimals.

    Args:
        args: Parsed options. Reads ``model_dir``, ``model_file``,
            ``one_per_line``, ``data_dir``, ``test_file``, ``output_file`` and,
            for the non-one-per-line format, ``comment_symbol`` and
            ``label_symbol``.

    Raises:
        AssertionError: If ``args.one_per_line`` is false and
            ``args.label_symbol`` is ``None``.
    """
    model = TextClassifier.load(os.path.join(args.model_dir, args.model_file))
    logger.info(f'Model: "{model}"')

    if args.one_per_line:
        corpus: Corpus = ClassificationCorpus(
            args.data_dir,
            test_file=args.test_file,
        )
    else:
        assert args.label_symbol is not None
        corpus = FlyClassificationCorpus(
            args.data_dir,
            test_file=args.test_file,
            comment_symbol=args.comment_symbol,
            label_symbol=args.label_symbol,
        )

    # The context manager closes the output file even when prediction fails;
    # the original left the handle open.
    with io.open(args.output_file, "w", encoding="utf-8", errors="ignore") as fout:
        logger.info("Saving to %s", args.output_file)
        start_time = time.time()
        # Indexed access is kept on purpose: the test split is only relied on
        # to support len() and item lookup.
        for i in range(len(corpus.test)):
            sentence = corpus.test[i]
            model.predict(sentence)
            if sentence.labels:
                top = sentence.labels[0]
                fout.write(f"{top.value} {top.score:.4f}\n")
                # Flush per line so partial results are visible during long runs.
                fout.flush()

    logger.info("End of prediction: time %.1f min", (time.time() - start_time) / 60)
class ClassificationCorpusAnalysis(CorpusAnalysis):
    """Analysis helpers for a text-classification corpus.

    The corpus is either passed in directly or loaded from ``path``, as a
    ``CSVClassificationCorpus`` when a column map is given and as a
    ``ClassificationCorpus`` otherwise.
    """

    def __init__(self, path: Union[Path, str], column_name_map: dict = None,
                 corpus: Corpus = None, **corpus_params):
        """Load (or adopt) the corpus and collect all of its sentences.

        Args:
            path: Existing corpus location; also the folder figures are saved
                into by ``class_distribution``.
            column_name_map: Column map for ``CSVClassificationCorpus``. When
                falsy, ``ClassificationCorpus`` is used instead.
            corpus: Ready-made corpus. When truthy, nothing is loaded from
                ``path``.
            **corpus_params: Extra keyword arguments for the corpus
                constructor.

        Raises:
            AssertionError: If ``path`` does not exist.
        """
        if isinstance(path, str):
            path = Path(path)
        assert path.exists()
        self.path = path

        if corpus:
            self.corpus = corpus
        elif column_name_map:
            self.corpus = CSVClassificationCorpus(self.path, column_name_map, **corpus_params)
        else:
            self.corpus = ClassificationCorpus(self.path, **corpus_params)

        self.sentences = self.corpus.get_all_sentences()
        print(self.corpus)

    def class_distribution(self, multiclass: bool = False, nr_classes: int = 10,
                           savefig_file=None, **kwargs):
        """Plot how many sentences each class has.

        Args:
            multiclass: When false, draw a pie chart of the ``nr_classes`` most
                frequent classes and fold the remaining ones into an
                ``'others'`` slice. When true, draw a bar chart of the
                ``nr_classes`` most frequent classes.
            nr_classes: Number of most frequent classes to show.
            savefig_file: Optional file name; when given, the figure is saved
                under ``self.path`` at 600 dpi before it is shown.
            **kwargs: Forwarded to the pandas plotting call.
        """
        class_count = Corpus._get_class_to_count(self.sentences)
        class_count = pd.DataFrame.from_dict(
            class_count, orient='index', columns=['count'],
        ).sort_values('count', ascending=False)

        # Copy so that adding the 'others' row does not touch the full table.
        class_count_top = class_count[:nr_classes].copy()

        if not multiclass:
            if nr_classes < len(class_count):
                # Sum everything beyond the top classes into one slice.
                class_count_top.loc['others'] = class_count[nr_classes:].sum()
            class_count_top.plot.pie(y='count', **kwargs)
            plt.legend(labels=class_count_top.index, bbox_to_anchor=(1, 0, 0.1, 1),
                       loc='center right')
        else:
            class_count_top.plot.bar(y='count', **kwargs)
            plt.gca().yaxis.grid(True, linestyle='--')

        plt.tight_layout()
        if savefig_file:
            plt.savefig(self.path / savefig_file, dpi=600)
        plt.show()

    def example_document_for_classes(self, ):
        """Placeholder; not implemented yet."""
        # Todo!
        pass
def train_sentiment_model(rootdir, train, dev, test, num_epochs, device, outputdir):
    """Train a single-label RNN sentiment classifier.

    Args:
        rootdir: Folder that holds the corpus files.
        train: Name of the training file inside ``rootdir``.
        dev: Name of the dev file inside ``rootdir``.
        test: Name of the test file inside ``rootdir``.
        num_epochs: Maximum number of training epochs.
        device: Device string handed to ``torch.device``; the result is
            assigned to ``flair.device``.
        outputdir: Folder the trainer writes to.
    """
    flair.device = torch.device(device)

    corpus = ClassificationCorpus(
        rootdir,
        train_file=train,
        dev_file=dev,
        test_file=test,
        in_memory=False,
    )
    label_dict = corpus.make_label_dictionary()

    # The two 'multi-forward' / 'multi-backward' Flair embeddings that used to
    # be constructed here were never passed to the model, so they are no
    # longer loaded.
    optional_embedding = ELMoEmbeddings('original')
    # filter(None, ...) drops falsy entries, so the optional embedding may be
    # left empty.
    word_embeddings = list(filter(None, [
        optional_embedding,
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ]))

    # Document embedding built on top of the stacked word embeddings.
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )

    classifier = TextClassifier(document_embeddings, label_dictionary=label_dict,
                                multi_label=False)
    trainer = ModelTrainer(classifier, corpus)
    # Per the original author's note, training can take several hours and
    # produces final-model.pt and best-model.pt.
    trainer.train(outputdir, max_epochs=num_epochs)
def train(args):
    """Train a text classifier from command-line style options.

    The corpus is read with ``ClassificationCorpus`` when ``args.one_per_line``
    is set, otherwise with ``FlyClassificationCorpus``. Embeddings, model
    initialisation and the optimizer come from the ``utils`` helpers, and the
    training output is written to ``args.model_dir``.

    Args:
        args: Parsed options; see the attribute reads below for the fields
            that are used.

    Raises:
        AssertionError: If ``args.one_per_line`` is false and
            ``args.label_symbol`` is ``None``.
    """
    start_time = time.time()

    if not args.one_per_line:
        assert args.label_symbol is not None
        corpus: Corpus = FlyClassificationCorpus(
            args.data_dir,
            train_file=args.train_file,
            dev_file=args.dev_file,
            comment_symbol=args.comment_symbol,
            label_symbol=args.label_symbol,
        )
    else:
        corpus: Corpus = ClassificationCorpus(
            args.data_dir,
            train_file=args.train_file,
            dev_file=args.dev_file,
        )

    labels = corpus.make_label_dictionary()
    vocabulary = corpus.make_vocab_dictionary().get_items()
    token_embeddings = utils.init_embeddings(vocabulary, args)

    rnn_options = dict(
        hidden_size=args.hidden_size,
        use_attn=args.use_attn,
        num_heads=args.num_heads,
        scaling=args.scaling,
        pooling_operation=args.pooling_operation,
        use_sent_query=args.use_sent_query,
    )
    doc_embeddings = DocumentRNNEmbeddings([token_embeddings], **rnn_options)

    model = TextClassifier(doc_embeddings, label_dictionary=labels)
    utils.init_model(model, args)

    optimizer = utils.optim_method(args.optim)
    trainer: ModelTrainer = ModelTrainer(model, corpus, optimizer)
    training_options = dict(
        mini_batch_size=args.mini_batch_size,
        max_epochs=args.max_epochs,
        anneal_factor=args.anneal_factor,
        learning_rate=args.learning_rate,
        patience=args.patience,
        min_learning_rate=args.min_learning_rate,
        embeddings_storage_mode=args.embeddings_storage_mode,
    )
    trainer.train(args.model_dir, **training_options)

    elapsed_minutes = (time.time() - start_time) / 60
    logger.info("End of training: time %.1f min", elapsed_minutes)
def __init__(self, path: Union[Path, str], column_name_map: dict = None, corpus: Corpus = None, **corpus_params):
    """Load (or adopt) the corpus and collect all of its sentences.

    Args:
        path: Existing corpus location; strings are converted to ``Path``.
        column_name_map: Column map for ``CSVClassificationCorpus``. When
            falsy, ``ClassificationCorpus`` is used instead.
        corpus: Ready-made corpus. When truthy, nothing is loaded from
            ``path``.
        **corpus_params: Extra keyword arguments for the corpus constructor.

    Raises:
        AssertionError: If ``path`` does not exist.
    """
    path = Path(path) if isinstance(path, str) else path
    assert path.exists()
    self.path = path

    # A supplied corpus wins; otherwise the column map decides the loader.
    if corpus:
        chosen_corpus = corpus
    elif column_name_map:
        chosen_corpus = CSVClassificationCorpus(self.path, column_name_map, **corpus_params)
    else:
        chosen_corpus = ClassificationCorpus(self.path, **corpus_params)
    self.corpus = chosen_corpus

    self.sentences = self.corpus.get_all_sentences()
    print(self.corpus)
def _train_model(self) -> None:
    """Train a single-label RNN classifier on this instance's data files.

    Only the base names of ``self.path_to_train``, ``self.path_to_dev`` and
    ``self.path_to_test`` are used; the files are looked up in
    ``__path_to_base__``, which is also where the training output is written.
    """
    split_files = {
        'test_file': os.path.basename(self.path_to_test),
        'dev_file': os.path.basename(self.path_to_dev),
        'train_file': os.path.basename(self.path_to_train),
    }
    corpus = ClassificationCorpus(Path(__path_to_base__), **split_files)

    token_embeddings = [
        ELMoEmbeddings('original'),
        FlairEmbeddings('news-forward-fast'),
        FlairEmbeddings('news-backward-fast'),
    ]
    doc_embeddings = DocumentRNNEmbeddings(
        token_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )

    labels = corpus.make_label_dictionary()
    model = TextClassifier(doc_embeddings, label_dictionary=labels, multi_label=False)

    ModelTrainer(model, corpus).train(__path_to_base__, max_epochs=10)
def test_init_tars_and_switch(tasks_base_path):
    """Check the label-dictionary size after each TARS task switch.

    The classifier starts with a two-class task and is then switched to tasks
    whose labels are given as a string, a list, a set and a label dictionary
    built from the corpus.

    Args:
        tasks_base_path: Base directory of the test corpora; the ``imdb``
            corpus is loaded from it.
    """
    corpus = ClassificationCorpus(tasks_base_path / "imdb")

    initial_labels = corpus.make_label_dictionary(label_type="class")
    tars = TARSClassifier(
        task_name="2_CLASS",
        label_dictionary=initial_labels,
        label_type="class",
    )
    assert len(tars.get_current_label_dictionary()) == 2

    # (task name, labels, expected dictionary size) for label specs that are
    # plain literals: a single string, a list and a set.
    literal_tasks = (
        ("1_CLASS", "one class", 1),
        ("3_CLASS", ["list 1", "list 2", "list 3"], 3),
        ("4_CLASS", {"set 1", "set 2", "set 3", "set 4"}, 4),
    )
    for task_name, labels, expected_size in literal_tasks:
        tars.add_and_switch_to_new_task(task_name, labels, "testlabel")
        assert len(tars.get_current_label_dictionary()) == expected_size

    # Finally a task whose labels come as a label dictionary.
    dictionary_labels = corpus.make_label_dictionary(label_type="class")
    tars.add_and_switch_to_new_task("2_CLASS_AGAIN", dictionary_labels, "testlabel")
    assert len(tars.get_current_label_dictionary()) == 2
def load_corpus():
    """Load the k16 corpus together with a fixed binary label dictionary.

    Returns:
        A ``(corpus, label_dictionary)`` tuple. The dictionary has no unknown
        entry, is flagged single-label and holds the items ``'0'`` and ``'1'``.
        The corpus is read from ``datasets/constrained_classification/k16``
        with explicit train and dev file names.
    """
    binary_labels: Dictionary = Dictionary(add_unk=False)
    binary_labels.multi_label = False
    for item in ('0', '1'):
        binary_labels.add_item(item)

    # Folder that holds the corpus files; only the train and dev file names
    # are given explicitly.
    corpus_dir = 'datasets/constrained_classification/k16'
    loaded: Corpus = ClassificationCorpus(
        corpus_dir,
        dev_file='fasttext.valid',
        train_file='fasttext.train',
    )

    return loaded, binary_labels
# Export the frame returned by utils.mgdb.read_mongo('raw_data_test') as a
# space-separated file with the columns 'label' and 'text', without header
# or index.
test_data = utils.mgdb.read_mongo('raw_data_test')
test_data.to_csv(path.join(data_folder, 'test.txt'),
                 sep=' ',
                 index=False,
                 header=False,
                 columns=['label', 'text'])
# Same export for the dev data.
dev_data = utils.mgdb.read_mongo('raw_data_dev')
dev_data.to_csv(path.join(data_folder, 'dev.txt'),
                sep=' ',
                index=False,
                header=False,
                columns=['label', 'text'])
#%%
# NOTE(review): the files above are written to data_folder while the corpus is
# read from the literal 'data/splitted_data' - presumably the same directory;
# confirm.
corpus: Corpus = ClassificationCorpus('data/splitted_data')
# Fail early when the train or test split came back empty.
if len(corpus.train) == 0 or len(corpus.test) == 0:
    raise Exception('Creating corpus failed')
#%%
# GloVe word embeddings wrapped in an RNN document embedding.
word_embeddings = [WordEmbeddings('glove')]
document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)
label_dict = corpus.make_label_dictionary()
# Folder handed to ClassificationCorpus below.
data_folder = 'content_folder'
from flair.data import Corpus
from flair.datasets import ClassificationCorpus
# No file names are passed, so the corpus loader decides which files in
# data_folder it reads.
corpus: Corpus = ClassificationCorpus(data_folder)
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings
from flair.trainers import ModelTrainer
from flair.models import TextClassifier
label_dict = corpus.make_label_dictionary()
# GloVe word embeddings wrapped in an RNN document embedding.
word_embeddings = [WordEmbeddings('glove')]
document_embeddings = DocumentRNNEmbeddings(word_embeddings, hidden_size=256)
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)
trainer = ModelTrainer(classifier, corpus)
# Training output is written to '/content/data'.
trainer.train('/content/data',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=5,
              max_epochs=150)
articles_train, train=True) tokens_valid, labels_valid = return_annotated_articles( params["dev_labels_file"], articles_valid, train=True) tokens_test, labels_test = return_annotated_articles( params["test_labels_file"], articles_test, train=True) write_to_file_flair_corpus(params['data_bert_format_dir'] + 'train.txt', tokens, labels) write_to_file_flair_corpus(params['data_bert_format_dir'] + 'dev.txt', tokens_valid, labels_valid) write_to_file_flair_corpus(params['data_bert_format_dir'] + 'test.txt', tokens_test, labels_test) # init a corpus using column format, data folder and the names of the train, dev and test files corpus = ClassificationCorpus(params['data_bert_format_dir'], train_file='train.txt', test_file='test.txt', dev_file='dev.txt') corpus.filter_empty_sentences() print(corpus) label_dictionary = corpus.make_label_dictionary() print(label_dictionary) flat_labels = [item for sublist in labels for item in sublist] class_weights = compute_class_weight('balanced', np.unique(flat_labels), flat_labels) unique_labels = np.unique(flat_labels) weights = {} for i in range(len(unique_labels)):
dev_data['preprocessed'] = tweet_preprocessing.preprocess_data( dev_data['content'], 'embedding') if B_TEST_PHASE is True: test_data['preprocessed'] = tweet_preprocessing.preprocess_data( test_data['content'], 'embedding') utils.csv2ftx(train_data.content, train_data.sentiment, S_DATASET, 'train', 'flair') utils.csv2ftx(dev_data.content, dev_data.sentiment, S_DATASET, 'dev', 'flair') utils.csv2ftx(test_data.content, test_data.sentiment, S_DATASET, 'test', 'flair') corpus = Corpus = ClassificationCorpus( '../dataset/flair/', train_file='intertass_{}_train.txt'.format(S_DATASET), dev_file='intertass_{}_dev.txt'.format(S_DATASET), test_file='intertass_{}_test.txt'.format(S_DATASET)) # class_weights = compute_class_weight('balanced', [0, 1, 2, 3], y=train_data.sentiment) # dict_weights = dict() # for i, label in enumerate(class_weights): # dict_weights.update({str(label): class_weights[i]}) # word_embeddings = [BertEmbeddings('bert-base-multilingual-cased')] word_embeddings = [ BertEmbeddings('dccuchile/bert-base-spanish-wwm-cased') ] document_embeddings = DocumentRNNEmbeddings( word_embeddings,