Example #1
def build_conll03en_corpus(base_path: str):
    document_as_sequence = False
    corpus = ColumnCorpus(
        base_path,
        column_format={0: "text", 1: "pos", 2: "np", 3: "ner"},
        train_file="train.txt",
        dev_file="dev.txt",
        test_file="test.txt",
        tag_to_bioes="ner",
        document_separator_token="-DOCSTART-" if document_as_sequence else None,
    )
    tag_type = "ner"
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    return corpus, tag_type, tag_dictionary
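A minimal driver for the helper above; the base path is a placeholder and must contain the three files named in the function:

corpus, tag_type, tag_dictionary = build_conll03en_corpus("resources/conll03")
print(corpus)
print(tag_dictionary.idx2item)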
def train_ner(device_category):
    """
    Train the sequence labeling model for the given device category.
    """
    columns = {0: 'text', 1: 'ner'}

    # `root_path` is a module-level project path defined elsewhere in the source
    training_file = os.path.join(
        root_path, 'part_extraction/data/{}.conll'.format(device_category))
    data_folder = os.path.join(root_path, 'part_extraction/data')

    corpus = ColumnCorpus(data_folder, columns, train_file=training_file)

    print(len(corpus.train))
    tag_type = 'ner'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    embedding_types = [
        WordEmbeddings('glove'),

        # comment in this line to use character embeddings
        # CharacterEmbeddings(),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ]

    embeddings = StackedEmbeddings(embeddings=embedding_types)

    tagger = SequenceTagger(hidden_size=256,
                            embeddings=embeddings,
                            tag_dictionary=tag_dictionary,
                            tag_type=tag_type,
                            use_crf=True)

    trainer = ModelTrainer(tagger, corpus)

    # 7. start training; `ner_models` is a module-level output directory
    # defined elsewhere in the source project
    trainer.train(ner_models,
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=150)

    trainer.model.save('{}/{}.pt'.format(ner_models, device_category))
    def handle(self, *args, **options):
        file = options.get('file') or 'annotated_sentences'
        model_folder = options.get('model_folder') or 'model-var'
        columns = {0: 'text', 1: 'var'}
        data_folder = 'data/txt'

        corpus = ColumnCorpus(data_folder, columns,
                              train_file=f'{file}.txt')
        
        tag_type = 'var'

        tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

        embedding_types = [
            WordEmbeddings('glove'),

            # comment in this line to use character embeddings
            # CharacterEmbeddings(),

            # comment in these lines to use flair embeddings
            # FlairEmbeddings('news-forward'),
            # FlairEmbeddings('news-backward'),
            TransformerWordEmbeddings('bert-base-uncased'),
        ]

        embeddings = StackedEmbeddings(embeddings=embedding_types)

        tagger = SequenceTagger(hidden_size=256,
                                embeddings=embeddings,
                                tag_dictionary=tag_dictionary,
                                tag_type=tag_type,
                                use_crf=True)

        trainer = ModelTrainer(tagger, corpus)

        trainer.train(f'data/models/taggers/{model_folder}',
                    learning_rate=0.1,
                    mini_batch_size=32,
                    max_epochs=150)

        
        self.stdout.write(self.style.SUCCESS(f'Successfully trained model on dataset file {file}.txt.'))
Example #4
class ner_trainer(object):
    def __init__(self, dataFolder, trainFile):
        self.tag_type = 'ner'
        self.dataFolder = dataFolder
        self.trainFile = trainFile
        self.columns = {0: 'text', 1: 'pos', 2: 'empty', 3: 'ner'}

    def LoadConll03(self, dataFolder, trainFile, testFile=None, devFile=None):
        self.corpus = ColumnCorpus(dataFolder, self.columns, trainFile,
                                   testFile, devFile)
        self.tag_dictionary = self.corpus.make_tag_dictionary(
            tag_type=self.tag_type)
        #print(self.corpus.train[0].to_tagged_string('ner'))
        print(self.tag_dictionary.idx2item)

    def LoadEmbeddings(self, embedding_types):
        self.embedding_types = embedding_types
        self.embeddings = StackedEmbeddings(embeddings=self.embedding_types)

    def train(self,
              model_path,
              learning_rate=0.1,
              batch_size=32,
              epochs=150,
              hidden_size=256,
              use_crf=True):
        self.tagger = SequenceTagger(hidden_size=hidden_size,
                                     embeddings=self.embeddings,
                                     tag_dictionary=self.tag_dictionary,
                                     tag_type=self.tag_type,
                                     use_crf=use_crf)

        self.trainer = ModelTrainer(self.tagger, self.corpus)

        self.trainer.train(model_path,
                           learning_rate=learning_rate,
                           mini_batch_size=batch_size,
                           max_epochs=epochs)
def train(args, tag_type):
    '''
    Training script to be run for training the NER model.

    Parameters:
    -----------
    args: arguments passed to the parser on the CLI
    tag_type: the tag type to predict (e.g. 'ner')
    '''
    data_dir = args.input_dir + '/data'
    corpus = ColumnCorpus(data_folder=data_dir,
                          column_format={
                              0: 'text',
                              1: 'ner'
                          },
                          train_file=args.train_file,
                          test_file=args.test_file,
                          dev_file=args.dev_file)

    # print(corpus.train[0])
    # print(corpus)

    # tag_type = 'ner'

    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    # print(tag_dictionary)

    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),
        FlairEmbeddings(args.flair_model_name_or_path_forward),
        FlairEmbeddings(args.flair_model_name_or_path_backward),
    ]
    if args.character_embeddings:
        embedding_types.insert(1, CharacterEmbeddings())

    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    # initialize sequence tagger

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=True)

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    if args.train_or_predict == "continue_train":
        print("continue training")
        checkpoint = '/Users/titashneogi/workspace/NLP/NER/data/flair/cumulative_model/checkpoint.pt'
        trainer = ModelTrainer.load_checkpoint(checkpoint, corpus)

    # start training
    trainer.train(args.model_dir,
                  learning_rate=args.train_learning_rate,
                  mini_batch_size=args.per_gpu_batch_size,
                  max_epochs=args.num_train_epochs,
                  embeddings_storage_mode=args.embeddings_storage_mode)

    model = SequenceTagger.load(args.model_dir + '/final-model.pt')
    if args.predict_file:
        with open(data_dir + '/' + args.predict_file, 'r') as f:
            str_file = f.read()

        sentence = Sentence(str_file)

        model.predict(sentence)
        print(sentence.to_tagged_string())
Example #6
def run_experiment(seed, batch_size, epoch, learning_rate, json_config):
    # Config values
    hf_model = json_config["hf_model"]
    context_size = json_config["context_size"]
    layers = json_config.get("layers", "-1")
    use_crf = json_config.get("use_crf", False)
    task_name = json_config["task_name"]

    # Dataset-related
    data_folder = json_config["data_folder"]
    train_file = json_config["train_file"]
    dev_file = json_config["dev_file"]
    test_file = json_config["test_file"]

    # Set seed for reproducibility
    set_seed(seed)

    if context_size == 0:
        context_size = False

    print("FLERT Context:", context_size)
    print("Layers:", layers)
    print("Use CRF:", use_crf)

    # Configuration
    column_format = {0: "text", 1: "ner"}

    # Corpus
    corpus = ColumnCorpus(data_folder=data_folder,
                          column_format=column_format,
                          train_file=train_file,
                          dev_file=dev_file,
                          test_file=test_file,
                          tag_to_bioes="ner",
                         )

    # Corpus configuration
    tag_type = "ner"
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    # Embeddings
    embeddings = TransformerWordEmbeddings(
        model=hf_model,
        layers=layers,
        subtoken_pooling="first",
        fine_tune=True,
        use_context=context_size,
    )

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        use_crf=use_crf,
        use_rnn=False,
        reproject_embeddings=False,
    )

    # Trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.fine_tune(
        f"histo-flert-fine-tuning-{task_name}-{hf_model}-bs{batch_size}-ws{context_size}-e{epoch}-lr{learning_rate}-layers{layers}-crf{use_crf}-{seed}",
        learning_rate=learning_rate,
        mini_batch_size=batch_size,
        max_epochs=epoch,
        shuffle=True,
        embeddings_storage_mode='none',
        weight_decay=0.,
        use_final_model_for_eval=False,
    )
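For reference, a config with the keys this function actually reads might look like the following; all values are placeholders, and "layers" and "use_crf" are optional with defaults "-1" and False:

json_config = {
    "hf_model": "bert-base-cased",
    "context_size": 64,
    "task_name": "histo",
    "data_folder": "data",
    "train_file": "train.txt",
    "dev_file": "dev.txt",
    "test_file": "test.txt",
}
run_experiment(seed=1, batch_size=16, epoch=10, learning_rate=5e-5, json_config=json_config)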
class NER_experiment:
    def __init__(self, dataset_name, output_path="/kaggle/working"):

        input_path = f"/kaggle/input/{dataset_name}"

        self.tag_type = "ner"
        self.corpus = ColumnCorpus(
            data_folder=input_path, column_format={0: "text", 1: "ner"}
        )

        self.tag_dictionary = self.corpus.make_tag_dictionary(tag_type=self.tag_type)

        self.dataset_name = dataset_name
        self.output_path = output_path

    def build_embedding(self, lang, embedding_codes: List[str]) -> None:

        self.tic = time.time()
        self.embedding_name: str = "-".join(embedding_codes)
        self.lang = lang

        embedding_types: List[TokenEmbeddings] = []

        for code in embedding_codes:

            code = code.lower()
            assert code in [
                "bpe",
                "bert",
                "flair",
                "ft",
                "char",
                "ohe",
                "elmo",
            ], f"{code} - Invalid embedding code"

            if code == "ohe":
                embedding_types.append(OneHotEmbeddings(corpus=self.corpus))
            elif code == "ft":
                embedding_types.append(WordEmbeddings(self.lang))
            elif code == "bpe":
                embedding_types.append(BytePairEmbeddings(self.lang))
            elif code == "bert":
                embedding_types.append(
                    TransformerWordEmbeddings(
                        model=self.huggingface_ref[self.lang],
                        pooling_operation="first",
                        layers="-1",
                        fine_tune=False,
                    )
                )
            elif code == "char":
                embedding_types.append(CharacterEmbeddings())
            elif code == "flair":
                embedding_types.append(FlairEmbeddings(f"{self.lang}-forward"))
                embedding_types.append(FlairEmbeddings(f"{self.lang}-backward"))
            elif code == "elmo":
                embedding_types.append(
                    ELMoEmbeddings(model="large", embedding_mode="all")
                )

        self.embedding: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types
        )

        self.tagger: SequenceTagger = SequenceTagger(
            hidden_size=256,
            embeddings=self.embedding,
            tag_dictionary=self.tag_dictionary,
            tag_type=self.tag_type,
            use_crf=True,
        )

        self.trainer: ModelTrainer = ModelTrainer(self.tagger, self.corpus)

    def train(self, max_epochs=100, storage_mode="cpu") -> None:

        self.trainer.train(
            base_path=self.output_path,
            learning_rate=0.1,
            mini_batch_size=32,
            max_epochs=max_epochs,
            embeddings_storage_mode=storage_mode,
        )

    def output_results(self) -> None:

        # plot training history
        self.history = pd.read_csv(f"{self.output_path}/loss.tsv", sep="\t")
        self._plot_history()

        dest = self.output_path + "/results.csv"

        out = [
            self.dataset_name,
            self.embedding_name,
            self.lang,
            self.history.EPOCH.iloc[-1],
            round(time.time() - self.tic),
        ]

        out.extend([round(x, 4) for x in self._extract_from_log().values()])
        out = [str(x) for x in out]

        out_names = [
            "dataset",
            "embedding",
            "lang",
            "epochs",
            "duration",
            "precision",
            "recall",
            "accuracy",
            "f1",
        ]

        # write header to file
        if not os.path.exists(dest):
            with open(dest, "w") as f:
                print(",".join(out_names), file=f)
        # write results to file
        with open(dest, "a") as f:
            print(",".join(out), file=f)

    def _extract_from_log(self) -> Dict[str, float]:

        with open(self.output_path + "/training.log", "r") as log_file:
            log: List[str] = log_file.readlines()[-5:-1]
        out = {m: [] for m in ["precision", "recall", "accuracy", "f1"]}
        weights = []

        for entity_class in log:

            label, *result_str = entity_class.split()
            results = [float(result_str[i]) for i in range(1, 23, 3)]

            weights.append(
                np.sum(results[:4])
            )  # tp, fp, fn, tn (ignore)  -->  count instances/class

            for value, k in zip(results[4:], out):  # pr, rec, acc, f1
                out[k].append(value)

        # support-weighted average over classes: precision, recall, accuracy, F1
        return {m: np.average(v, weights=weights) for m, v in out.items()}

    def _plot_history(self) -> None:

        fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(9, 8))
        self.history.plot(
            "EPOCH", ["TRAIN_LOSS", "DEV_LOSS", "LEARNING_RATE"], ax=axes[0]
        )
        self.history.plot(
            "EPOCH", ["DEV_RECALL", "DEV_PRECISION", "DEV_F1"], ax=axes[1]
        )
        fig.savefig(f"{self.output_path}/{self.dataset_name}_{self.embedding_name}.png")

    # wrapper
    def run(self, lang, emb, max_epochs, storage):

        self.build_embedding(lang, emb)
        self.train(max_epochs, storage)
        self.output_results()

    # https://huggingface.co/models references
    huggingface_ref = {
        "fr": "camembert-base",
        "nl": "wietsedv/bert-base-dutch-cased",
        "en": "bert-base-cased",
        "multi": "bert-base-multilingual-cased",
    }
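A usage sketch, assuming a Kaggle dataset in the expected two-column format (the dataset name is hypothetical):

experiment = NER_experiment("my-conll-dataset")
experiment.run(lang="en", emb=["ft", "flair"], max_epochs=100, storage="cpu")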
from flair.datasets import ColumnCorpus
from flair.embeddings import WordEmbeddings, StackedEmbeddings, TransformerWordEmbeddings, CharacterEmbeddings, FlairEmbeddings
from typing import List
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

columns = {0: "text", 1: "pos", 2: "ner"}
data_folder = "data/causenet/"

corpus = ColumnCorpus(data_folder,
                      columns,
                      train_file="train.txt",
                      test_file="test.txt",
                      dev_file="val.txt")

tag_type = "ner"
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

embedding_types = [
    CharacterEmbeddings(),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward')
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

tagger: SequenceTagger = SequenceTagger(hidden_size=128,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        dropout=0.25,
                                        rnn_layers=4)
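A training step to round off the script, following the same pattern as the other examples on this page (the output path is hypothetical):

trainer: ModelTrainer = ModelTrainer(tagger, corpus)
trainer.train("resources/taggers/causenet-ner",
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=150)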
Example #9
    f"The training corpus contains {len(corpus.train)} (pretty long) sample sentences."
)
print(
    f"The validation corpus contains {len(corpus.dev)} (pretty long) sample sentences."
)
print(
    f"The testing corpus contains {len(corpus.test)} (pretty long) sample sentences."
)

hidden_size = 128
embeddings = StackedEmbeddings(embeddings=[
    WordEmbeddings("es"),
    FlairEmbeddings("es-forward"),
    FlairEmbeddings("es-backward")
])  # TransformerWordEmbeddings()
dictionary = corpus.make_tag_dictionary(task)
tagger = SequenceTagger(hidden_size, embeddings, dictionary, task)

trainer = ModelTrainer(tagger, corpus, optimizer=torch.optim.AdamW)
trainer.train(
    base_path=f"taggers/{task}-stacked",
    train_with_dev=False,
    max_epochs=30,
    learning_rate=0.001,
    mini_batch_size=32,
    weight_decay=0.,
    embeddings_storage_mode="none",
    scheduler=OneCycleLR,  # torch.optim.lr_scheduler.OneCycleLR
)

# trainer = ModelTrainer(tagger, corpus)
def run_experiment(data_folder, task_name, model_name, run_id, use_context):
    # Set seed for reproducibility
    set_seed(int(run_id))

    if use_context == 0:
        use_context = False

    print("FLERT Context:", use_context)

    if task_name in ["lft", "onb"]:

        # Configuration
        column_format = {0: "token", 1: "ner"}

        # We use official data from Riedl and Padó
        train_file = f"enp_DE.{task_name}.mr.tok.train.bio"
        dev_file = f"enp_DE.{task_name}.mr.tok.dev.bio"
        test_file = f"enp_DE.{task_name}.mr.tok.test.bio"

        # Corpus
        corpus = ColumnCorpus(
            data_folder=data_folder,
            column_format=column_format,
            train_file=train_file,
            dev_file=dev_file,
            test_file=test_file,
            tag_to_bioes="ner",
        )

    else:
        raise ValueError(f"Unsupported task_name: {task_name}")

    # Corpus configuration
    tag_type = "ner"
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    # Embeddings
    embedding_types: List[TokenEmbeddings] = [
        TransformerWordEmbeddings(model=model_name,
                                  layers="all",
                                  layer_mean=True,
                                  use_context=use_context)
    ]

    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        use_crf=True,
    )

    # Trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train(
        f"resources/taggers/ner-{task_name}-{model_name}-context{use_context}-{run_id}",
        learning_rate=0.1,
        mini_batch_size=16,
        max_epochs=200,
        shuffle=True,
    )
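A sample invocation; all values are placeholders, and the model name can be any suitable Hugging Face checkpoint for these German newspaper corpora:

run_experiment(data_folder="data/historic-ner",
               task_name="lft",
               model_name="bert-base-german-cased",
               run_id="1",
               use_context=64)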
Example #11
def run_experiments(input_dir: Path, output_dir: Path):
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # retrieve corpus using column format, data folder and the names of the train, dev and test files
    corpus = ColumnCorpus(
        input_dir,
        {0: 'text', 1: 'dep', 2: 'aspect'},
        train_file='Laptops_poria-train.conll',
        # train_file='Restaurants_poria-train.conll',
        test_file='Laptops_poria-test.conll',
        # test_file='Restaurants_poria-test.conll',
        dev_file='Laptops_poria-train.conll'
        # dev_file='Restaurants_poria-train.conll'
    )

    # 2. what tag do we want to predict?
    tag_type = 'aspect'

    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    all_embedding_to_test = {
        # 'glove+aspects': [
        #     WordEmbeddings('glove'),
        #     WordEmbeddings(
        #         (output_dir / 'aspect_2_aspect_graph-en_core_web_lg.en_core_web_lg.word2vec_format.bin').as_posix()
        #     ),
        # ],
        # 'glove': [
        #     WordEmbeddings('glove'),
        # ],
        # 'charlmembedding': [
        #     FlairEmbeddings('news-forward'),
        #     FlairEmbeddings('news-backward'),
        # ],
        # 'glove-simple-char': [
        #     WordEmbeddings('glove'),
        #     CharacterEmbeddings(),
        # ],
        'bert+aspects': [
            BertEmbeddings('bert-large-cased'),
            WordEmbeddings(
                (output_dir / 'aspect_2_aspect_graph-en_core_web_lg.en_core_web_lg.word2vec_format.bin').as_posix()
            )
        ],
        'bert': [
            BertEmbeddings('bert-large-cased'),
        ],
        # 'elmo': [
        #     ELMoEmbeddings('original')
        # ]
    }

    for name, embeddings_to_stack in tqdm(
            all_embedding_to_test.items(),
            desc='Different embeddings stacked',
            total=len(all_embedding_to_test)
    ):
        results_folder = Path(DEFAULT_OUTPUT_PATH / f'sequence-tagging/aspects/laptops-{name}')
        embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embeddings_to_stack)

        # 5. initialize sequence tagger
        tagger: SequenceTagger = SequenceTagger(
            hidden_size=256,
            embeddings=embeddings,
            tag_dictionary=tag_dictionary,
            tag_type=tag_type,
            use_crf=True
        )

        trainer: ModelTrainer = ModelTrainer(tagger, corpus)

        # 7. start training
        trainer.train(
            results_folder.as_posix(),
            learning_rate=0.1,
            mini_batch_size=32,
            max_epochs=150
        )

        # 8. plot training curves (optional)
        plotter = Plotter()
        plotter.plot_training_curves(results_folder / 'loss.tsv')
        plotter.plot_weights(results_folder / 'weights.txt')
Example #12
    def train_all(self):
        ext = self.config.split('.')[-1]
        if ext == "yml":
            with open(self.config, "r") as config_file:
                datastore = yaml.safe_load(config_file)
        elif ext == "json":
            with open(self.config, "r") as config_file:
                datastore = json.load(config_file)
        else:
            print("Need a json or yaml file as config")
            sys.exit(1)

        columns = {
            int(datastore["dataset_reader"]["position_text"]): "text",
            int(datastore["dataset_reader"]["position_ner"]): "ner",
        }

        # focus_on = datastore["dataset_reader"]["focus_on"]

        if bool(datastore["dataset_reader"]["only_train"]):

            log.info("Reading data from {}".format(datastore["dataset_reader"]["data_folder"]))

            all_corpus = ColumnCorpusTrain(
                datastore["dataset_reader"]["data_folder"],
                columns,
                train_file=datastore["dataset_reader"]["train_name"],
            )

            tag_type = "ner"
            tag_dictionary = all_corpus[0].make_tag_dictionary(tag_type=tag_type)

        else:

            iobes_corpus = ColumnCorpus(
                datastore["dataset_reader"]["data_folder"],
                columns,
                train_file=datastore["dataset_reader"]["train_name"],
                dev_file=datastore["dataset_reader"]["dev_name"],
                test_file=datastore["dataset_reader"]["test_name"],
            )

            tag_type = "ner"
            tag_dictionary = iobes_corpus.make_tag_dictionary(tag_type=tag_type)

            try:
                train_ratio = float(datastore["dataset_reader"]["train_ratio"])
                iobes_corpus = Corpus(iobes_corpus.train[0:int(len(iobes_corpus.train) * train_ratio)],
                                      iobes_corpus.dev, iobes_corpus.test)
                log.info("Using only {}% of the train dataset".format(train_ratio * 100))
            except (KeyError, ValueError):
                pass

        embed_list = []
        word_char = []
        char_word = []
        for embed in datastore["embeddings"]["embeddings_list"]:

            if embed == "bpe":
                embed_list.append(BytePairEmbeddings(datastore["embeddings"]["lang"]))
            elif embed == "fasttext":
                embed_list.append(WordEmbeddings(datastore["embeddings"]["lang"]))
            elif embed == "flair" and datastore["embeddings"]["lang"] == "en":
                embed_list.append(FlairEmbeddings("news-forward"))
                embed_list.append(FlairEmbeddings("news-backward"))
            elif embed == "bert-base-uncased":
                if datastore["embeddings"]["lang"] == "en":
                    embed_list.append(BertEmbeddings("bert-base-uncased"))
            elif embed == "bert-base-cased":
                if datastore["embeddings"]["lang"] == "en":
                    embed_list.append(BertEmbeddings("bert-base-cased"))
            elif embed == "bert-large-uncased":
                if datastore["embeddings"]["lang"] == "en":
                    embed_list.append(BertEmbeddings("bert-large-uncased"))
            elif embed == "bert-large-cased":
                if datastore["embeddings"]["lang"] == "en":
                    embed_list.append(BertEmbeddings("bert-large-cased"))
            elif embed == "elmo-small":
                if datastore["embeddings"]["lang"] == "en":
                    embed_list.append(ELMoEmbeddings("small"))
            elif embed == "elmo-medium":
                if datastore["embeddings"]["lang"] == "en":
                    embed_list.append(ELMoEmbeddings("medium"))
            elif embed == "elmo-original":
                if datastore["embeddings"]["lang"] == "en":
                    embed_list.append(ELMoEmbeddings("original"))
            elif embed == "bert-base-chinese":
                if datastore["embeddings"]["lang"] == "zh":
                    embed_list.append(emb.BertEmbeddingsChinese("bert-base-chinese"))
            else:
                split_name = embed.split(".")
                ext = split_name[-1]
                kind = split_name[-2]

                if ext == "pt":  # Flair type

                    extra_index = 0
                    try:
                        extra_index = int(datastore["embeddings"]["extra_index"])
                    except (KeyError, ValueError):
                        pass

                    if kind == "char":
                        embed_list.append(emb.FlairEmbeddingsChar(embed, extra_index=extra_index))
                    elif kind == "char-seg":
                        embed_list.append(emb.FlairEmbeddingsWordLevelCharSeg(embed, extra_index=extra_index))

                if ext == "vec":  # Char type
                    if kind == "char-seg":
                        embed_list.append(emb.WordEmbeddingsVecCharSeg(embed))
                    elif kind == "char":
                        embed_list.append(emb.WordEmbeddingsVecFirst(embed))
                    elif kind == "word":
                        embed_list.append(emb.WordEmbeddingsVecWord(embed))
                    elif kind == "bichar":
                        embed_list.append(emb.WordEmbeddingsVecBichar(embed))
                if ext == "bin":
                    if kind == "word":
                        embed_list.append(emb.WordEmbeddingsBinWord(embed))
                    elif kind == "bichar":
                        embed_list.append(emb.WordEmbeddingsBinBichar(embed))

        try:
            if bool(datastore["embeddings"]["ner_embed"]):
                print("Generating NER embeddings..")
                embed_list.append(
                    emb.nerEmbedding(
                        generateNerEmbFromTrain(
                            iobes_corpus.train, tag_dictionary.get_items()
                        )
                    )
                )
        except KeyError:
            pass
        try:
            if bool(datastore["embeddings"]["one_hot"]):
                print("Generating one-hot embeddings..")
                embed_list.append(emb.OneHotEmbeddings(iobes_corpus))
        except KeyError:
            pass
        try:
            if datastore["embeddings"]["embeddings_ngram_list"] is not None:
                embed_list.append(
                    emb.WordEmbeddingsVecNGramList(
                        datastore["embeddings"]["embeddings_ngram_list"]
                    )
                )
        except KeyError:
            pass

        if len(word_char) == 1 and len(char_word) == 1:
            embed_list.append(emb.WordEmbeddingsVecWordChar(word_char[0], char_word[0]))

        embedding_types: List[TokenEmbeddings] = embed_list

        embeddings: emb.StackedEmbeddingsNew = emb.StackedEmbeddingsNew(
            embeddings=embedding_types
        )

        if bool(datastore["dataset_reader"]["only_train"]):
            score = []
            for i in range(len(all_corpus)):

                tagger: SequenceTagger = SequenceTagger(
                    hidden_size=int(datastore["model"]["hidden_size"]),
                    embeddings=embeddings,
                    tag_dictionary=tag_dictionary,
                    tag_type=tag_type,
                    use_crf=bool(datastore["model"]["use_crf"]),
                    dropout=float(datastore["model"]["dropout"]),
                    word_dropout=float(datastore["model"]["word_dropout"]),
                    locked_dropout=float(datastore["model"]["locked_dropout"]),
                    rnn_layers=int(datastore["model"]["rnn_layers"]),
                )

                folder = datastore["train_config"]["folder"] + "/" + str(i)
                best = Path(folder + "/checkpoint.pt")
                iobes_corpus = all_corpus[i]
                if not best.exists():
                    best = Path(folder + "/best-model.pt")

                if best.exists():
                    trainer = ModelTrainer.load_checkpoint(
                        tagger.load_checkpoint(best), iobes_corpus
                    )
                else:
                    trainer: ModelTrainer = ModelTrainer(tagger, iobes_corpus)

                # 7. start training

                result = trainer.train(
                    folder,
                    learning_rate=float(datastore["train_config"]["learning_rate"]),
                    anneal_factor=float(datastore["train_config"]["anneal_factor"]),
                    min_learning_rate=float(datastore["train_config"]["min_learning_rate"]),
                    mini_batch_size=int(datastore["train_config"]["batch_size"]),
                    max_epochs=int(datastore["train_config"]["epoch"]),
                    save_final_model=bool(datastore["train_config"]["save_final_model"]),
                    checkpoint=bool(datastore["train_config"]["checkpoint"]),
                    param_selection_mode=bool(
                        datastore["train_config"]["param_selection_mode"]
                    ),
                    patience=int(datastore["train_config"]["patience"]),
                    monitor_test=bool(datastore["train_config"]["monitor_test"]),
                    embeddings_storage_mode=str(datastore["train_config"]["embeddings_storage_mode"]),
                    shuffle=bool(datastore["train_config"]["shuffle"]),
                )

                plotter = Plotter()
                if bool(datastore["train_config"]["save_plot_training_curve"]):
                    curve = folder + "/loss.tsv"
                    plotter.plot_training_curves(curve)
                if bool(datastore["train_config"]["save_plot_weights"]):
                    weight = folder + "/weights.txt"
                    plotter.plot_weights(weight)

                score.append(result["test_score"])

            print(score, "\nMean:", round(sum(score) / len(score), 2))


        else:

            tagger: SequenceTagger = SequenceTagger(
                hidden_size=int(datastore["model"]["hidden_size"]),
                embeddings=embeddings,
                tag_dictionary=tag_dictionary,
                tag_type=tag_type,
                use_crf=bool(datastore["model"]["use_crf"]),
                dropout=float(datastore["model"]["dropout"]),
                word_dropout=float(datastore["model"]["word_dropout"]),
                locked_dropout=float(datastore["model"]["locked_dropout"]),
                rnn_layers=int(datastore["model"]["rnn_layers"]),
            )

            folder = datastore["train_config"]["folder"]
            best = Path(folder + "/checkpoint.pt")
            if not best.exists():
                best = Path(folder + "/best-model.pt")

            if best.exists():
                trainer = ModelTrainer.load_checkpoint(
                    tagger.load_checkpoint(best), iobes_corpus
                )
            else:
                trainer: ModelTrainer = ModelTrainer(tagger, iobes_corpus)

            # 7. start training

            trainer.train(
                folder,
                learning_rate=float(datastore["train_config"]["learning_rate"]),
                anneal_factor=float(datastore["train_config"]["anneal_factor"]),
                min_learning_rate=float(datastore["train_config"]["min_learning_rate"]),
                mini_batch_size=int(datastore["train_config"]["batch_size"]),
                max_epochs=int(datastore["train_config"]["epoch"]),
                save_final_model=bool(datastore["train_config"]["save_final_model"]),
                checkpoint=bool(datastore["train_config"]["checkpoint"]),
                param_selection_mode=bool(
                    datastore["train_config"]["param_selection_mode"]
                ),
                patience=int(datastore["train_config"]["patience"]),
                monitor_test=bool(datastore["train_config"]["monitor_test"]),
                embeddings_storage_mode=str(datastore["train_config"]["embeddings_storage_mode"]),
                shuffle=bool(datastore["train_config"]["shuffle"]),
            )

            plotter = Plotter()
            if bool(datastore["train_config"]["save_plot_training_curve"]):
                curve = folder + "/loss.tsv"
                plotter.plot_training_curves(curve)
            if bool(datastore["train_config"]["save_plot_weights"]):
                weight = folder + "/weights.txt"
                plotter.plot_weights(weight)
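For orientation, a minimal config covering the keys train_all reads, written as the equivalent Python dict (all values are placeholders; optional keys such as train_ratio, extra_index, ner_embed, one_hot, and embeddings_ngram_list are omitted):

datastore = {
    "dataset_reader": {"position_text": 0, "position_ner": 1, "only_train": False,
                       "data_folder": "data", "train_name": "train.txt",
                       "dev_name": "dev.txt", "test_name": "test.txt"},
    "embeddings": {"embeddings_list": ["fasttext", "flair"], "lang": "en"},
    "model": {"hidden_size": 256, "use_crf": True, "dropout": 0.0,
              "word_dropout": 0.05, "locked_dropout": 0.5, "rnn_layers": 1},
    "train_config": {"folder": "out", "learning_rate": 0.1, "anneal_factor": 0.5,
                     "min_learning_rate": 0.0001, "batch_size": 32, "epoch": 150,
                     "save_final_model": True, "checkpoint": True,
                     "param_selection_mode": False, "patience": 3,
                     "monitor_test": False, "embeddings_storage_mode": "cpu",
                     "shuffle": True, "save_plot_training_curve": True,
                     "save_plot_weights": False},
}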
Example #13
        default=10,
        help="maximum number of epochs to train. Terminates training if this number is surpassed.",
    )
    parser.add_argument("--batch_size", type=int, default=32)

    args = parser.parse_args()

    assert not (
        args.word_embedding is None and args.flair_embedding is None and args.transformer_embedding is None
    ), "At least one embedding needs to be specified"

    columns = {0: "text", 1: "deletion"}
    corpus = ColumnCorpus(args.data_dir, columns, train_file="train.txt", dev_file="dev.txt")

    target_tag = "deletion"
    tag_dict = corpus.make_tag_dictionary(tag_type=target_tag)

    embeddings = []
    if args.word_embedding is not None:
        embeddings.append(WordEmbeddings(args.word_embedding))
    if args.flair_embedding is not None:
        for direction in ["forward", "backward"]:
            embeddings.append(FlairEmbeddings(args.flair_embedding.replace("X", direction)))
    if args.transformer_embedding is not None:
        embeddings.append(TransformerWordEmbeddings(args.transformer_embedding, fine_tune=args.finetune, layers="-1"))

    embeddings = StackedEmbeddings(embeddings=embeddings)
    tagger = SequenceTagger(
        hidden_size=args.hidden_size,
        embeddings=embeddings,
        tag_dictionary=tag_dict,
        tag_type=target_tag,
    )
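A training call to complete the example, consistent with the rest of the page (the output path is hypothetical; the epoch count mirrors the parser default shown above):

    trainer = ModelTrainer(tagger, corpus)
    trainer.train("resources/taggers/deletion",
                  learning_rate=0.1,
                  mini_batch_size=args.batch_size,
                  max_epochs=10)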