def train_LM(file_path, model_path, is_forward_lm=True):
    """Train a small character-level flair language model.

    Args:
        file_path: Location handed to ``TextCorpus`` as the corpus path. It is
            also used as a plain string prefix for the dictionary file
            (``file_path + 'mappings'``), so no path separator is inserted
            between the two parts.
        model_path: Output location passed to ``LanguageModelTrainer.train``.
        is_forward_lm: Direction flag forwarded to both the corpus and the
            language model.
    """
    from flair.data import Dictionary
    from flair.models import LanguageModel
    from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

    # Plain concatenation on purpose: the caller decides whether `file_path`
    # ends with a separator.
    mappings_path = file_path + 'mappings'
    char_dictionary = Dictionary.load_from_file(mappings_path)

    # The corpus is read at the character level, in the requested direction.
    text_corpus = TextCorpus(
        file_path,
        char_dictionary,
        is_forward_lm,
        character_level=True,
    )

    # Single-layer model with a hidden size of 128.
    lm = LanguageModel(
        char_dictionary,
        is_forward_lm,
        hidden_size=128,
        nlayers=1,
    )

    lm_trainer = LanguageModelTrainer(lm, text_corpus)
    lm_trainer.train(
        model_path,
        sequence_length=100,
        mini_batch_size=32,
        max_epochs=10,
    )
def test_dictionary_save_and_load():
    """Check that a saved ``Dictionary`` reloads with the same number of items.

    The dictionary is written to ``dictionary.txt`` in the current working
    directory. The file is removed in a ``finally`` clause so that a failing
    assertion (or a failing save/load) does not leave it behind.
    """
    dictionary = Dictionary(add_unk=False)
    for label in ('class_1', 'class_2', 'class_3'):
        dictionary.add_item(label)

    file_path = 'dictionary.txt'
    try:
        dictionary.save(file_path)
        loaded_dictionary = Dictionary.load_from_file(file_path)

        assert len(dictionary) == len(loaded_dictionary)
        assert len(dictionary.get_items()) == len(loaded_dictionary.get_items())
    finally:
        # Guarded so that a failed save() is not masked by a second error
        # raised while cleaning up a file that was never created.
        if os.path.exists(file_path):
            os.remove(file_path)
def main():
    """Tag sentences read from standard input with a previously trained tagger.

    Command-line arguments:
        --command: must be ``interpret``.
        --model_output_dirpath: directory holding ``tag_dictionary.pickle``,
            ``params.json`` and the model loaded by ``load_model``. Required;
            the path is joined with file names immediately, so omitting it
            could only end in a ``TypeError``.
        --data_folder, --device, --loglevel: accepted for command-line
            compatibility; not read by this function.

    One sentence is read per input line. Reading stops at end of input or at
    the first line that is empty after stripping whitespace. For each token
    the text, a literal ``O``, the predicted tag value and its score are
    printed, followed by a blank line after each sentence. The tag layer is
    selected by the module-level ``tag_type``.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument("--command", choices=["interpret"], required=True)
    parser.add_argument("--model_output_dirpath", required=True)
    parser.add_argument("--data_folder", default="./data")
    parser.add_argument("--device", default="gpu", choices=["cpu", "gpu"])
    parser.add_argument("--loglevel", default="NOTSET",
                        choices=["CRITICAL", "NOTSET", "INFO"])
    args = parser.parse_args()

    model_output_dirpath = args.model_output_dirpath

    # A missing tag dictionary is only reported; the loaded object itself is
    # not used below.
    tag_dictionary_path = os.path.join(model_output_dirpath, "tag_dictionary.pickle")
    try:
        Dictionary.load_from_file(tag_dictionary_path)
    except FileNotFoundError:
        print("WARN: tag_dictionary is not found at %s" % tag_dictionary_path)

    # The returned params are not needed here; the call is kept so that a
    # problem with params.json still surfaces before the model is loaded.
    load_params(os.path.join(model_output_dirpath, "params.json"))
    tagger = load_model(model_output_dirpath)

    # iter(callable, sentinel): keep reading stripped lines until one is empty
    # (blank line or end of input).
    for line in iter(lambda: sys.stdin.readline().strip(), ""):
        sentence = Sentence(line, use_tokenizer=True)
        tagger.predict([sentence], mini_batch_size=1, verbose=True)
        for token in sentence.tokens:
            tag = token.get_tag(tag_type)
            print(token.text, 'O', tag.value, tag.score)
        print()
def _load_trained_tagger(model_output_dirpath, warn):
    """Load the saved params and tagger from a trained model directory.

    Args:
        model_output_dirpath: directory containing ``tag_dictionary.pickle``,
            ``params.json`` and the model read by ``load_model``.
        warn: one-argument callable (e.g. ``print`` or ``log.warning``) used
            to report a missing tag dictionary file.

    Returns:
        A ``(params, tagger)`` tuple.
    """
    tag_dictionary_path = os.path.join(model_output_dirpath, "tag_dictionary.pickle")
    try:
        # Only the presence of the file matters here; the loaded dictionary is
        # not used by the evaluation commands.
        # NOTE(review): the ".pickle" suffix suggests this file is unpickled;
        # only point this at model directories you trust.
        Dictionary.load_from_file(tag_dictionary_path)
    except FileNotFoundError:
        warn("WARN: tag_dictionary is not found at %s" % tag_dictionary_path)

    params = load_params(os.path.join(model_output_dirpath, "params.json"))
    tagger = load_model(model_output_dirpath)
    return params, tagger


def main():
    """Command-line entry point for training and evaluating the tagger.

    ``--command`` selects what is run:

    * ``hyperparameter_search`` / ``find_learning_rate``: make sure the
      ``hyperparameter_search`` directory exists, run
      ``select_hyperparameters`` (first command only), then build a model and
      trainer and call ``find_learning_rate``.
    * ``train``: build a model, save the tag dictionary and params into the
      model output directory, then train.
    * ``resume_train``: reload ``params.json`` and call ``train`` without a
      tagger instance.
    * ``evaluate``: evaluate the saved tagger on ``corpus.test`` and print the
      metric as TSV. Requires ``--model_output_dirpath``.
    * ``evaluate_from_stdin``: tag sentences read line by line from standard
      input until end of input or the first blank line.
    * ``evaluate_zor_cumleler``: tag each line of ``data/zor_cumleler.txt``
      and count sentences with at least one predicted span (``TP``) versus
      none (``FN``).

    When ``--model_output_dirpath`` is omitted (for every command except
    ``evaluate``) it defaults to ``./models/<model_name>`` and is created.
    The tag layer is selected by the module-level ``tag_type``.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument("--command", choices=[
        "hyperparameter_search",
        "find_learning_rate",
        "train",
        "resume_train",
        "evaluate",
        "evaluate_from_stdin",
        "evaluate_zor_cumleler"
    ], required=True)
    parser.add_argument("--embedding_type", choices=["bert", "flair", "char"])
    parser.add_argument("--model_name", default="default_model_name")
    parser.add_argument("--bert_model_dirpath_or_name", default="bert-base-multilingual-cased")
    parser.add_argument("--model_output_dirpath", default=None)
    parser.add_argument("--other_test_file_for_evaluation", default=None)
    parser.add_argument("--optimizer", default="sgd", choices=["sgd", "adam", "adamw"])
    parser.add_argument("--hidden_size", default=256, type=int)
    parser.add_argument("--dropout", default=0.5, type=float)
    parser.add_argument("--learning_rate", default=0.05, type=float)
    parser.add_argument("--max_epochs", default=10, type=int)
    parser.add_argument("--mini_batch_size", default=16, type=int)
    parser.add_argument("--data_folder", default="./data")
    parser.add_argument("--device", default="gpu", choices=["cpu", "gpu"])
    parser.add_argument("--loglevel", default="NOTSET", choices=["CRITICAL", "NOTSET", "INFO"])
    args = parser.parse_args()

    command = args.command
    embedding_type = args.embedding_type
    model_name = args.model_name
    bert_model_dirpath_or_name = args.bert_model_dirpath_or_name  # e.g. "../outputs/bert_model/"
    other_test_file_for_evaluation = args.other_test_file_for_evaluation
    optimizer = args.optimizer
    hidden_size = args.hidden_size
    dropout = args.dropout
    learning_rate = args.learning_rate
    max_epochs = args.max_epochs
    mini_batch_size = args.mini_batch_size
    data_folder = args.data_folder
    device = args.device
    loglevel = args.loglevel

    model_output_dirpath = args.model_output_dirpath
    if model_output_dirpath is None:
        if command == "evaluate":
            # No default directory is created for "evaluate"; without this
            # check the None path would only fail later inside os.path.join
            # with an opaque TypeError.
            parser.error("--model_output_dirpath is required when --command is 'evaluate'")
        model_output_dirpath = "./models/%s" % model_name
        # Creates "./models" as well when it does not exist yet.
        os.makedirs(model_output_dirpath, exist_ok=True)

    log.setLevel(loglevel)
    if command in ["evaluate", "evaluate_from_stdin", "evaluate_zor_cumleler"]:
        # Evaluation output goes to stdout; raise the log threshold regardless
        # of --loglevel.
        log.setLevel("CRITICAL")

    if device == "cpu":
        flair.device = torch.device('cpu')

    if other_test_file_for_evaluation is None:
        corpus, tag_dictionary = load_standard_corpus(data_folder, tag_type)
    else:
        dirname = os.path.dirname(other_test_file_for_evaluation)
        corpus = load_specific_corpus(
            dirname,
            {
                "train_file": None,
                "dev_file": None,
                "test_file": os.path.basename(other_test_file_for_evaluation)
            })
        # tag dictionary will be loaded from the model
        tag_dictionary = None

    assert corpus, "corpus should be loaded"

    params = {
        "model_name": model_name,
        "embedding_type": embedding_type,
        "tag_type": tag_type,
        "bert_model_dirpath_or_name": bert_model_dirpath_or_name,
        "model_output_dirpath": model_output_dirpath,
        "optimizer": optimizer,
        "hidden_size": hidden_size,
        "dropout": dropout,
        "learning_rate": learning_rate,
        "max_epochs": max_epochs,
        "mini_batch_size": mini_batch_size
    }

    if command in ["hyperparameter_search", "find_learning_rate"]:
        if not os.path.exists("hyperparameter_search"):
            print("Creating the hyperparameter_search directory for hyperparameter selection process...")
            os.mkdir("hyperparameter_search")
        if command == "hyperparameter_search":
            select_hyperparameters(params, corpus)
        # NOTE(review): the learning-rate finder runs for both commands, i.e.
        # also after select_hyperparameters. Confirm this nesting against the
        # intended behavior.
        tagger, _ = create_model(params, tag_dictionary)
        trainer = create_trainer(tagger, corpus, optimizer=optimizers_dict[optimizer])
        find_learning_rate(trainer, params)
    elif command == "train":
        # NOTE(review): tag_dictionary is None when
        # --other_test_file_for_evaluation is given, so the save() below
        # cannot work in that combination.
        tagger, _ = create_model(params, tag_dictionary)
        os.makedirs(params["model_output_dirpath"], exist_ok=True)
        tag_dictionary.save(
            os.path.join(params["model_output_dirpath"], "tag_dictionary.pickle"))
        save_params(
            params,
            os.path.join(params["model_output_dirpath"], "params.json"))
        train(params, tagger, corpus)
    elif command == "resume_train":
        params = load_params(os.path.join(model_output_dirpath, "params.json"))
        train(params, None, corpus)
    elif command == "evaluate":
        # NOTE(review): the log level was forced to CRITICAL above, so a
        # warning sent through log.warning is presumably filtered out.
        params, tagger = _load_trained_tagger(model_output_dirpath, log.warning)
        trainer = create_trainer(tagger, corpus)
        if other_test_file_for_evaluation:
            out_path = other_test_file_for_evaluation + ".evaluation"
        else:
            out_path = os.path.join(params["model_output_dirpath"], "evaluation.txt")
        metric, _ = trainer.evaluate(tagger,
                                     corpus.test,
                                     eval_mini_batch_size=16,
                                     out_path=Path(out_path))
        print(metric.to_tsv())
    elif command == "evaluate_from_stdin":
        params, tagger = _load_trained_tagger(model_output_dirpath, print)
        # iter(callable, sentinel): read stripped lines until one is empty
        # (blank line or end of input).
        for line in iter(lambda: sys.stdin.readline().strip(), ""):
            sentence = Sentence(line, use_tokenizer=True)
            tagger.predict([sentence], mini_batch_size=1, verbose=True)
            for token in sentence.tokens:
                tag = token.get_tag(tag_type)
                print(token.text, 'O', tag.value, tag.score)
            print()
    elif command == "evaluate_zor_cumleler":
        params, tagger = _load_trained_tagger(model_output_dirpath, log.warning)
        # Only TP and FN are updated below; FP and TN remain 0.
        metrics = {"TP": 0, "FP": 0, "TN": 0, "FN": 0}
        with open("data/zor_cumleler.txt") as f:
            for line in iter(lambda: f.readline().strip(), ""):
                sentence = Sentence(line, use_tokenizer=True)
                tagger.predict([sentence], mini_batch_size=1, verbose=False)
                if len(sentence.get_spans(tag_type)) > 0:
                    metrics["TP"] += 1
                else:
                    metrics["FN"] += 1
        print(metrics)
from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus
import pickle

# Direction of the language model: True for a forward LM, False for a
# backward one. The same flag is passed to the corpus and to the model.
is_forward_lm = True

# Dictionary shared by the corpus and the model.
dictionaty = Dictionary.load_from_file(
    '/home/anna/Desktop/markup/learning/dictionary/dict'
)

# Corpus read in the chosen direction, at the character level.
corpus = TextCorpus(
    '/home/anna/Desktop/markup/learning',
    dictionaty,
    is_forward_lm,
    character_level=True,
)

# Language model with one layer and a hidden size of 128.
language_model = LanguageModel(
    dictionaty,
    is_forward_lm,
    hidden_size=128,
    nlayers=1,
)

# Train and write the results under resources/taggers/language_model.
trainer = LanguageModelTrainer(language_model, corpus)
trainer.train(
    'resources/taggers/language_model',
    sequence_length=10,
    mini_batch_size=10,
    max_epochs=10,
)