# Flair text-classification tests (flair 0.3-era API; imports are assumed --
# module paths may differ between releases).
import shutil

from flair.data import Sentence
from flair.data_fetcher import NLPTask, NLPTaskDataFetcher
from flair.embeddings import (CharLMEmbeddings, DocumentLSTMEmbeddings,
                              TokenEmbeddings, WordEmbeddings)
from flair.models import TextClassifier
from flair.trainers.text_classification_trainer import TextClassifierTrainer


def test_train_charlm_nocache_load_use_classifier():
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
    label_dict = corpus.make_label_dictionary()

    # Character-level language model embeddings, loaded without the embedding cache.
    charlm_embedding: TokenEmbeddings = CharLMEmbeddings('news-forward-fast', use_cache=False)
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [charlm_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)
    trainer = TextClassifierTrainer(model, corpus, label_dict, test_mode=False)
    trainer.train('./results', max_epochs=2)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load_from_file('./results/final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree('./results')
def test_train_load_use_classifier(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB, base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [glove_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)
    trainer = TextClassifierTrainer(model, corpus, label_dict, test_mode=True)
    trainer.train(str(results_base_path), max_epochs=2)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load_from_file(results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
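# The test above receives `results_base_path` and `tasks_base_path` as pytest
# fixtures defined outside this file. A minimal conftest.py sketch that would
# provide them -- the exact locations are assumptions, not the project's
# actual fixtures:

from pathlib import Path

import pytest


@pytest.fixture
def results_base_path(tmpdir_factory):
    # Fresh scratch directory for model checkpoints; the test removes it itself.
    return Path(str(tmpdir_factory.mktemp('results')))


@pytest.fixture
def tasks_base_path():
    # Assumed location of the small bundled test corpora (e.g. an IMDB sample).
    return Path(__file__).parent / 'resources' / 'tasks'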
# Relation-classification training scripts. Standard-library and flair imports
# assumed by the two train() variants below; RelativeOffsetEmbeddings,
# ConceptEmbeddings, DocumentCNNEmbeddings and the dataset_loader registry are
# project-specific and assumed importable from the surrounding package.
import logging
import math
import os
import random
from typing import List

from flair.data import TaggedCorpus
from flair.embeddings import CharacterEmbeddings, TokenEmbeddings, WordEmbeddings


def train(data_dir: str,
          model_dir: str,
          dataset_format: str = 'macss',
          num_filters: int = 150,
          word_embeddings: str = 'de-fasttext',
          offset_embedding_dim: int = 50,
          learning_rate: float = .1,
          batch_size: int = 32,
          max_epochs: int = 50,
          dropout: float = .5,
          use_char_embeddings: bool = False,
          seed: int = 0,
          dev_size: float = .1,
          test_size: float = .2):
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(message)s',
                        datefmt='%d-%b-%y %H:%M:%S')
    logging.info(f'Training config: {locals()}')

    if dataset_format not in ['macss', 'semeval']:
        raise ValueError(f"Dataset format '{dataset_format}' not supported.")

    corpus: TaggedCorpus = dataset_loader[dataset_format](data_dir, dev_size, seed)
    label_dictionary = corpus.make_label_dictionary()

    logging.info(f'Corpus: {corpus}')
    corpus.print_statistics()
    logging.info(f'Size of label dictionary: {len(label_dictionary)}')
    logging.info(f'Labels: {label_dictionary.get_items()}')

    # Word embeddings plus one relative-offset embedding per entity.
    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings(word_embeddings),
        RelativeOffsetEmbeddings('offset_e1', max_len=200, embedding_dim=offset_embedding_dim),
        RelativeOffsetEmbeddings('offset_e2', max_len=200, embedding_dim=offset_embedding_dim),
    ]

    if use_char_embeddings:
        embedding_types.append(CharacterEmbeddings())

    document_embeddings: DocumentCNNEmbeddings = DocumentCNNEmbeddings(
        embedding_types, num_filters=num_filters, dropout=dropout)

    classifier: TextClassifier = TextClassifier(document_embeddings=document_embeddings,
                                                label_dictionary=label_dictionary,
                                                multi_label=False)

    trainer: TextClassifierTrainer = TextClassifierTrainer(classifier, corpus, label_dictionary)
    trainer.train(model_dir,
                  learning_rate=learning_rate,
                  mini_batch_size=batch_size,
                  max_epochs=max_epochs)
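# Both train() variants resolve their corpus reader through a `dataset_loader`
# registry defined elsewhere in the project. A plausible sketch -- the loader
# names and bodies below are assumptions for illustration:

def load_macss_corpus(data_dir: str, dev_size: float, seed: int) -> TaggedCorpus:
    ...  # read the MACSS relation data and split off a dev set of `dev_size`


def load_semeval_corpus(data_dir: str, dev_size: float, seed: int) -> TaggedCorpus:
    ...  # read the SemEval relation data the same way


dataset_loader = {
    'macss': load_macss_corpus,
    'semeval': load_semeval_corpus,
    # The cross-validation script below additionally expects a 'macss_tdt'
    # entry whose loader takes (data_path, train_file, dev_file, test_file).
}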
def train(data_dir: str,
          model_dir: str,
          dataset_format: str = 'macss_tdt',
          num_filters: int = 150,
          word_embeddings: str = 'de-fasttext',
          offset_embedding_dim: int = 100,
          learning_rate: float = .1,
          batch_size: int = 32,
          max_epochs: int = 1,
          dropout: float = .5,
          use_char_embeddings: bool = False,
          seed: int = 0,
          dev_size: float = .1,
          test_size: float = .2,
          concept_embedding_dim: int = 100):
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(message)s',
                        datefmt='%d-%b-%y %H:%M:%S')

    if dataset_format not in ['macss_tdt']:
        raise ValueError(f"Dataset format '{dataset_format}' not supported.")

    # Build k cross-validation folds: each fold holds out 25% of the data,
    # split 60/40 into test and dev; the remainder is the training set.
    with open('all_data.txt', encoding='utf8') as f:
        all_data = f.read().split('\n')

    fold_size = math.floor(len(all_data) * 25 / 100)
    k_folds = math.floor(len(all_data) / fold_size)
    random.shuffle(all_data)

    config_name = '1_Some_Setting_Name'

    for i in range(k_folds):
        data_path = 'resources/' + config_name + '/' + str(i + 1)

        fold_start = fold_size * i
        fold_end = fold_size * (i + 1)
        test_dev_set = all_data[fold_start:fold_end]
        train_set = all_data[:fold_start] + all_data[fold_end:]

        random.shuffle(test_dev_set)
        num_test = math.floor(len(test_dev_set) * 60 / 100)
        test_set = test_dev_set[:num_test]
        dev_set = test_dev_set[num_test:]

        os.makedirs(data_path, exist_ok=True)
        os.system('cp -r ./Data/vocabulary/ ' + data_path)

        with open(data_path + '/train.txt', 'w') as train_txt:
            train_txt.write('\n'.join(train_set))
        with open(data_path + '/test.txt', 'w') as test_txt:
            test_txt.write('\n'.join(test_set))
        with open(data_path + '/dev.txt', 'w') as dev_txt:
            dev_txt.write('\n'.join(dev_set))

        corpus: TaggedCorpus = dataset_loader[dataset_format](
            data_path, 'train.txt', 'dev.txt', 'test.txt')
        label_dictionary = corpus.make_label_dictionary()  # rel-type

        # Comment out the embeddings that you don't need
        embedding_types: List[TokenEmbeddings] = [
            # mEx fine-tuned word embeddings
            # WordEmbeddings('../../Resources/mex-ft-wiki-de-finetuned-biomedical.gensim'),

            # Default German FastText word embeddings
            # WordEmbeddings('../../Resources/ft-wiki-de.gensim'),

            # Relative offset embeddings
            RelativeOffsetEmbeddings('offset_e1', max_len=200, embedding_dim=offset_embedding_dim),
            RelativeOffsetEmbeddings('offset_e2', max_len=200, embedding_dim=offset_embedding_dim),

            # Concept embeddings
            ConceptEmbeddings('concept_1', max_len=200, embedding_dim=concept_embedding_dim),
            ConceptEmbeddings('concept_2', max_len=200, embedding_dim=concept_embedding_dim),
        ]

        if use_char_embeddings:
            embedding_types.append(CharacterEmbeddings())

        document_embeddings: DocumentCNNEmbeddings = DocumentCNNEmbeddings(
            embedding_types, num_filters=num_filters, dropout=dropout)

        classifier: TextClassifier = TextClassifier(
            document_embeddings=document_embeddings,
            label_dictionary=label_dictionary,
            multi_label=False)

        trainer: TextClassifierTrainer = TextClassifierTrainer(
            classifier, corpus, label_dictionary)
        trainer.train(data_path,
                      learning_rate=learning_rate,
                      mini_batch_size=batch_size,
                      max_epochs=max_epochs,
                      use_tensorboard=False,
                      embeddings_in_memory=False)
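# Hypothetical entry point for the cross-validation script above. Note that
# `data_dir` and `model_dir` are part of the signature, but this variant reads
# 'all_data.txt' from the working directory and writes its fold directories
# (data splits and trained models) under 'resources/'.
if __name__ == '__main__':
    train(data_dir='Data/', model_dir='resources/models/')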