Code example #1
File: train_LM.py  Project: enp-china/CCSR-NER
def train_LM(file_path, model_path, is_forward_lm=True):
    from flair.data import Dictionary
    from flair.models import LanguageModel
    from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

    dictionary = Dictionary.load_from_file(file_path + 'mappings')

    # get your corpus, process forward and at the character level
    corpus = TextCorpus(file_path,
                        dictionary,
                        is_forward_lm,
                        character_level=True)

    # instantiate your language model, set hidden size and number of layers
    language_model = LanguageModel(dictionary,
                                   is_forward_lm,
                                   hidden_size=128,
                                   nlayers=1)

    # train your language model
    trainer = LanguageModelTrainer(language_model, corpus)

    trainer.train(model_path,
                  sequence_length=100,
                  mini_batch_size=32,
                  max_epochs=10)
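A minimal call of this helper might look as follows; the paths are hypothetical, and the corpus directory is assumed to follow the usual flair TextCorpus layout (a train/ split folder plus valid.txt and test.txt, with the character-level 'mappings' dictionary next to them). Because file_path is concatenated directly with 'mappings', it should end with a path separator.

# hypothetical paths; file_path must end with '/' since it is joined by string concatenation
train_LM('/path/to/corpus/', 'resources/taggers/my_forward_lm', is_forward_lm=True)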
Code example #2
File: test_data.py  Project: azawalich/flair
import os

from flair.data import Dictionary


def test_dictionary_save_and_load():
    dictionary = Dictionary(add_unk=False)
    dictionary.add_item('class_1')
    dictionary.add_item('class_2')
    dictionary.add_item('class_3')
    file_path = 'dictionary.txt'
    dictionary.save(file_path)
    loaded_dictionary = dictionary.load_from_file(file_path)
    assert (len(dictionary) == len(loaded_dictionary))
    assert (len(dictionary.get_items()) == len(loaded_dictionary.get_items()))
    os.remove(file_path)
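As a quick sketch of what the save/load round trip gives you, the reloaded dictionary supports lookups in both directions (get_idx_for_item and get_item_for_index are part of flair.data.Dictionary):

# sketch only: map an item to its index and back on the reloaded dictionary
idx = loaded_dictionary.get_idx_for_item('class_2')
assert loaded_dictionary.get_item_for_index(idx) == 'class_2'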
Code example #3
import os

from flair.data import Dictionary, Sentence
from flair.models import SequenceTagger

# NOTE: load_params, load_model and tag_type are helpers/globals defined
# elsewhere in the surrounding project, not part of flair itself.


def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--command", choices=["interpret"], required=True)

    parser.add_argument("--model_output_dirpath", default=None)

    parser.add_argument("--data_folder", default="./data")

    parser.add_argument("--device", default="gpu", choices=["cpu", "gpu"])
    parser.add_argument("--loglevel",
                        default="NOTSET",
                        choices=["CRITICAL", "NOTSET", "INFO"])

    args = parser.parse_args()

    command = args.command

    model_output_dirpath = args.model_output_dirpath

    try:
        tag_dictionary: Dictionary = Dictionary.load_from_file(
            os.path.join(model_output_dirpath, "tag_dictionary.pickle"))
    except FileNotFoundError:
        print("WARN: tag_dictionary is not found at %s" %
              os.path.join(model_output_dirpath, "tag_dictionary.pickle"))
    params = load_params(os.path.join(model_output_dirpath, "params.json"))
    tagger: SequenceTagger = load_model(model_output_dirpath)

    import sys

    line = sys.stdin.readline().strip()
    while line:
        sentence = Sentence(line, use_tokenizer=True)
        tagger.predict([sentence], mini_batch_size=1, verbose=True)
        # print(sentence)
        # print(sentence.to_tagged_string())
        for token in sentence.tokens:
            tag = token.get_tag(tag_type)
            print(token.text, 'O', tag.value, tag.score)
        print()
        line = sys.stdin.readline().strip()
Code example #4
File: train.py  Project: haozturk/kanarya
import os
from pathlib import Path

import flair
import torch
from flair.data import Dictionary, Sentence
from flair.models import SequenceTagger

# NOTE: log, tag_type, optimizers_dict and the load_*/create_*/train, save_params,
# load_params, load_model, select_hyperparameters and find_learning_rate helpers
# are defined elsewhere in the kanarya project, not part of flair itself.


def main():

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--command",
                        choices=[
                            "hyperparameter_search", "find_learning_rate",
                            "train", "resume_train", "evaluate",
                            "evaluate_from_stdin", "evaluate_zor_cumleler"
                        ],
                        required=True)
    parser.add_argument("--embedding_type", choices=["bert", "flair", "char"])
    parser.add_argument("--model_name", default="default_model_name")
    parser.add_argument("--bert_model_dirpath_or_name",
                        default="bert-base-multilingual-cased")
    parser.add_argument("--model_output_dirpath", default=None)
    parser.add_argument("--other_test_file_for_evaluation", default=None)
    parser.add_argument("--optimizer",
                        default="sgd",
                        choices=["sgd", "adam", "adamw"])
    parser.add_argument("--hidden_size", default=256, type=int)
    parser.add_argument("--dropout", default=0.5, type=float)
    parser.add_argument("--learning_rate", default=0.05, type=float)
    parser.add_argument("--max_epochs", default=10, type=int)
    parser.add_argument("--mini_batch_size", default=16, type=int)

    parser.add_argument("--data_folder", default="./data")

    parser.add_argument("--device", default="gpu", choices=["cpu", "gpu"])
    parser.add_argument("--loglevel",
                        default="NOTSET",
                        choices=["CRITICAL", "NOTSET", "INFO"])

    args = parser.parse_args()

    command = args.command
    embedding_type = args.embedding_type
    model_name = args.model_name
    bert_model_dirpath_or_name = args.bert_model_dirpath_or_name  # "../outputs/bert_model/"

    model_output_dirpath = args.model_output_dirpath
    if model_output_dirpath is None and command != "evaluate":
        if not os.path.exists("./models"):
            os.mkdir("./models")
        model_output_dirpath = "./models/%s" % model_name
        if not os.path.exists(model_output_dirpath):
            os.mkdir(model_output_dirpath)

    other_test_file_for_evaluation = args.other_test_file_for_evaluation

    optimizer = args.optimizer
    hidden_size = args.hidden_size
    dropout = args.dropout
    learning_rate = args.learning_rate
    max_epochs = args.max_epochs
    mini_batch_size = args.mini_batch_size
    data_folder = args.data_folder
    device = args.device
    loglevel = args.loglevel

    log.setLevel(loglevel)
    if command in ["evaluate", "evaluate_from_stdin", "evaluate_zor_cumleler"]:
        log.setLevel("CRITICAL")

    if device == "cpu":
        flair.device = torch.device('cpu')

    if other_test_file_for_evaluation is None:
        corpus, tag_dictionary = load_standard_corpus(data_folder, tag_type)
    else:
        dirname = os.path.dirname(other_test_file_for_evaluation)
        corpus = load_specific_corpus(
            dirname, {
                "train_file": None,
                "dev_file": None,
                "test_file": os.path.basename(other_test_file_for_evaluation)
            })
        # tag dictionary will be loaded from the model
        tag_dictionary = None

    assert corpus, "corpus should be loaded"

    params = {
        "model_name": model_name,
        "embedding_type": embedding_type,
        "tag_type": tag_type,
        "bert_model_dirpath_or_name": bert_model_dirpath_or_name,
        "model_output_dirpath": model_output_dirpath,
        "optimizer": optimizer,
        "hidden_size": hidden_size,
        "dropout": dropout,
        "learning_rate": learning_rate,
        "max_epochs": max_epochs,
        "mini_batch_size": mini_batch_size
    }

    if command in ["hyperparameter_search", "find_learning_rate"]:

        if not os.path.exists("hyperparameter_search"):
            print(
                "Creating the hyperparameter_search directory for hyperparameter selection process..."
            )
            os.mkdir("hyperparameter_search")

        if command == "hyperparameter_search":
            select_hyperparameters(params, corpus)

        tagger, embeddings = create_model(params, tag_dictionary)

        trainer = create_trainer(tagger,
                                 corpus,
                                 optimizer=optimizers_dict[optimizer])

        find_learning_rate(trainer, params)

    elif command == "train":
        tagger, embeddings = create_model(params, tag_dictionary)

        if not os.path.exists(params["model_output_dirpath"]):
            os.mkdir(params["model_output_dirpath"])

        tag_dictionary.save(
            os.path.join(params["model_output_dirpath"],
                         "tag_dictionary.pickle"))

        save_params(
            params, os.path.join(params["model_output_dirpath"],
                                 "params.json"))

        train(params, tagger, corpus)
    elif command == "resume_train":
        params = load_params(os.path.join(model_output_dirpath, "params.json"))
        train(params, None, corpus)
    elif command == "evaluate":
        try:
            tag_dictionary: Dictionary = Dictionary.load_from_file(
                os.path.join(model_output_dirpath, "tag_dictionary.pickle"))
        except FileNotFoundError:
            log.warning(
                "WARN: tag_dictionary is not found at %s" %
                os.path.join(model_output_dirpath, "tag_dictionary.pickle"))
        params = load_params(os.path.join(model_output_dirpath, "params.json"))
        tagger: SequenceTagger = load_model(model_output_dirpath)

        trainer = create_trainer(tagger, corpus)
        if other_test_file_for_evaluation:
            out_path = other_test_file_for_evaluation + ".evaluation"
        else:
            out_path = os.path.join(params["model_output_dirpath"],
                                    "evaluation.txt")
        metric, eval_loss = trainer.evaluate(tagger,
                                             corpus.test,
                                             eval_mini_batch_size=16,
                                             out_path=Path(out_path))

        print(metric.to_tsv())
    elif command == "evaluate_from_stdin":
        try:
            tag_dictionary: Dictionary = Dictionary.load_from_file(
                os.path.join(model_output_dirpath, "tag_dictionary.pickle"))
        except FileNotFoundError:
            print("WARN: tag_dictionary is not found at %s" %
                  os.path.join(model_output_dirpath, "tag_dictionary.pickle"))
        params = load_params(os.path.join(model_output_dirpath, "params.json"))
        tagger: SequenceTagger = load_model(model_output_dirpath)

        import sys
        line = sys.stdin.readline().strip()
        while line:
            sentence = Sentence(line, use_tokenizer=True)
            tagger.predict([sentence], mini_batch_size=1, verbose=True)
            # print(sentence)
            # print(sentence.to_tagged_string())
            for token in sentence.tokens:
                tag = token.get_tag(tag_type)
                print(token.text, 'O', tag.value, tag.score)
            print()
            line = sys.stdin.readline().strip()

    elif command == "evaluate_zor_cumleler":
        try:
            tag_dictionary: Dictionary = Dictionary.load_from_file(
                os.path.join(model_output_dirpath, "tag_dictionary.pickle"))
        except FileNotFoundError:
            log.warning(
                "WARN: tag_dictionary is not found at %s" %
                os.path.join(model_output_dirpath, "tag_dictionary.pickle"))
        params = load_params(os.path.join(model_output_dirpath, "params.json"))
        tagger: SequenceTagger = load_model(model_output_dirpath)

        metrics = {"TP": 0, "FP": 0, "TN": 0, "FN": 0}

        with open("data/zor_cumleler.txt") as f:
            line = f.readline().strip()
            while line:
                sentence = Sentence(line, use_tokenizer=True)
                tagger.predict([sentence], mini_batch_size=1, verbose=False)
                if len(sentence.get_spans(tag_type)) > 0:
                    metrics["TP"] += 1
                else:
                    metrics["FN"] += 1
                line = f.readline().strip()
        print(metrics)
Code example #5
from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus
import pickle

# are you training a forward or backward LM?
is_forward_lm = True

dictionary = Dictionary.load_from_file('/home/anna/Desktop/markup/learning/dictionary/dict')

# get your corpus, process forward and at the character level
corpus = TextCorpus('/home/anna/Desktop/markup/learning', dictionary, is_forward_lm, character_level=True)

# instantiate your language model, set hidden size and number of layers
language_model = LanguageModel(dictionary, is_forward_lm, hidden_size=128, nlayers=1)

# train your language model
trainer = LanguageModelTrainer(language_model, corpus)

trainer.train('resources/taggers/language_model', sequence_length=10, mini_batch_size=10, max_epochs=10)
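Once training finishes, the resulting checkpoint can be used as contextual string embeddings in downstream taggers; a minimal sketch, assuming the trainer wrote its best checkpoint as best-lm.pt in the output folder:

from flair.embeddings import FlairEmbeddings

# assumption: best-lm.pt is the checkpoint produced under 'resources/taggers/language_model'
char_lm_embeddings = FlairEmbeddings('resources/taggers/language_model/best-lm.pt')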