def build_conll03en_corpus(base_path: str):
    document_as_sequence = False
    corpus = ColumnCorpus(
        base_path,
        column_format={0: "text", 1: "pos", 2: "np", 3: "ner"},
        train_file="train.txt",
        dev_file="dev.txt",
        test_file="test.txt",
        tag_to_bioes="ner",
        document_separator_token=None if not document_as_sequence else "-DOCSTART-",
    )
    tag_type = "ner"
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    return corpus, tag_type, tag_dictionary
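# Usage sketch for the helper above; the corpus directory is a placeholder
# and must contain train.txt/dev.txt/test.txt in CoNLL-03 column format.
corpus, tag_type, tag_dictionary = build_conll03en_corpus("resources/tasks/conll_03")
print(len(corpus.train), tag_dictionary.idx2item)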
def train_ner(device_category):
    """Train the sequence labeling model for one device category."""
    columns = {0: 'text', 1: 'ner'}
    data_folder = os.path.join(root_path, 'part_extraction/data')
    # ColumnCorpus resolves train_file relative to data_folder, so pass the file name only
    corpus = ColumnCorpus(data_folder, columns,
                          train_file='{}.conll'.format(device_category))
    print(len(corpus.train))

    tag_type = 'ner'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    embedding_types = [
        WordEmbeddings('glove'),
        # comment in this line to use character embeddings
        # CharacterEmbeddings(),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ]
    embeddings = StackedEmbeddings(embeddings=embedding_types)

    tagger = SequenceTagger(hidden_size=256,
                            embeddings=embeddings,
                            tag_dictionary=tag_dictionary,
                            tag_type=tag_type,
                            use_crf=True)

    trainer = ModelTrainer(tagger, corpus)

    # 7. start training; `root_path` and `ner_models` (the output directory)
    # are assumed to be defined at module level
    trainer.train(ner_models,
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=150)
    trainer.model.save('{}/{}.pt'.format(ner_models, device_category))
def handle(self, *args, **options):
    file = options.get('file') or 'annotated_sentences'
    model_folder = options.get('model_folder') or 'model-var'

    columns = {0: 'text', 1: 'var'}
    data_folder = 'data/txt'
    corpus = ColumnCorpus(data_folder, columns, train_file=f'{file}.txt')

    tag_type = 'var'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    embedding_types = [
        WordEmbeddings('glove'),
        # comment in this line to use character embeddings
        # CharacterEmbeddings(),
        # comment in these lines to use flair embeddings
        # FlairEmbeddings('news-forward'),
        # FlairEmbeddings('news-backward'),
        TransformerWordEmbeddings('bert-base-uncased'),
    ]
    embeddings = StackedEmbeddings(embeddings=embedding_types)

    tagger = SequenceTagger(hidden_size=256,
                            embeddings=embeddings,
                            tag_dictionary=tag_dictionary,
                            tag_type=tag_type,
                            use_crf=True)

    trainer = ModelTrainer(tagger, corpus)
    trainer.train(f'data/models/taggers/{model_folder}',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=150)

    self.stdout.write(self.style.SUCCESS(f'Successfully trained model on {file}.txt'))
class ner_trainer(object):
    def __init__(self, dataFolder, trainFile):
        self.tag_type = 'ner'
        self.dataFolder = dataFolder
        self.trainFile = trainFile
        self.columns = {0: 'text', 1: 'pos', 2: 'empty', 3: 'ner'}

    def LoadConll03(self, dataFolder, trainFile, testFile=None, devFile=None):
        self.corpus = ColumnCorpus(dataFolder, self.columns,
                                   trainFile, testFile, devFile)
        self.tag_dictionary = self.corpus.make_tag_dictionary(tag_type=self.tag_type)
        # print(self.corpus.train[0].to_tagged_string('ner'))
        print(self.tag_dictionary.idx2item)

    def LoadEmbeddings(self, embedding_types):
        self.embedding_types = embedding_types
        self.embeddings = StackedEmbeddings(embeddings=self.embedding_types)

    def train(self, model_path, learning_rate=0.1, batch_size=32,
              epochs=150, hidden_size=256, use_crf=True):
        self.tagger = SequenceTagger(hidden_size=hidden_size,
                                     embeddings=self.embeddings,
                                     tag_dictionary=self.tag_dictionary,
                                     tag_type=self.tag_type,
                                     use_crf=use_crf)
        self.trainer = ModelTrainer(self.tagger, self.corpus)
        self.trainer.train(model_path,
                           learning_rate=learning_rate,
                           mini_batch_size=batch_size,
                           max_epochs=epochs)
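# Minimal usage sketch for ner_trainer; the folder, file names and output
# path are placeholders, and any Flair token embeddings can be passed in.
t = ner_trainer('resources/tasks/conll_03', 'train.txt')
t.LoadConll03('resources/tasks/conll_03', 'train.txt', 'test.txt', 'dev.txt')
t.LoadEmbeddings([WordEmbeddings('glove'),
                  FlairEmbeddings('news-forward'),
                  FlairEmbeddings('news-backward')])
t.train('resources/taggers/conll03-ner')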
def train(args, tag_type):
    '''
    Training script to be run for training the NER model.

    Parameters
    ----------
    args : arguments passed to the parser on the CLI
    tag_type : the tag type to predict (e.g. 'ner')
    '''
    data_dir = args.input_dir + '/data'
    corpus = ColumnCorpus(data_folder=data_dir,
                          column_format={0: 'text', 1: 'ner'},
                          train_file=args.train_file,
                          test_file=args.test_file,
                          dev_file=args.dev_file)
    # print(corpus.train[0])
    # print(corpus)

    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    # print(tag_dictionary)

    if args.character_embeddings:
        embedding_types: List[TokenEmbeddings] = [
            WordEmbeddings('glove'),
            CharacterEmbeddings(),
            FlairEmbeddings(args.flair_model_name_or_path_forward),
            FlairEmbeddings(args.flair_model_name_or_path_backward),
        ]
    else:
        embedding_types: List[TokenEmbeddings] = [
            WordEmbeddings('glove'),
            FlairEmbeddings(args.flair_model_name_or_path_forward),
            FlairEmbeddings(args.flair_model_name_or_path_backward),
        ]

    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    # initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=True)

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    if args.train_or_predict == "continue_train":
        print("continue training")
        checkpoint = '/Users/titashneogi/workspace/NLP/NER/data/flair/cumulative_model/checkpoint.pt'
        trainer = ModelTrainer.load_checkpoint(checkpoint, corpus)

    # start training
    trainer.train(args.model_dir,
                  learning_rate=args.train_learning_rate,
                  mini_batch_size=args.per_gpu_batch_size,
                  max_epochs=args.num_train_epochs,
                  embeddings_storage_mode=args.embeddings_storage_mode)

    model = SequenceTagger.load(args.model_dir + '/final-model.pt')

    if args.predict_file:
        # join with the data directory so relative predict files resolve correctly
        with open(os.path.join(data_dir, args.predict_file), 'r') as f:
            str_file = f.read()
        sentence = Sentence(str_file)
        model.predict(sentence)
        print(sentence.to_tagged_string())
def run_experiment(seed, batch_size, epoch, learning_rate, json_config):
    # Config values
    hf_model = json_config["hf_model"]
    context_size = json_config["context_size"]
    layers = json_config.get("layers", "-1")
    use_crf = json_config.get("use_crf", False)
    task_name = json_config["task_name"]

    # Dataset-related
    data_folder = json_config["data_folder"]
    train_file = json_config["train_file"]
    dev_file = json_config["dev_file"]
    test_file = json_config["test_file"]

    # Set seed for reproducibility
    set_seed(seed)

    if context_size == 0:
        context_size = False

    print("FLERT Context:", context_size)
    print("Layers:", layers)
    print("Use CRF:", use_crf)

    # Corpus
    column_format = {0: "text", 1: "ner"}
    corpus = ColumnCorpus(data_folder=data_folder,
                          column_format=column_format,
                          train_file=train_file,
                          dev_file=dev_file,
                          test_file=test_file,
                          tag_to_bioes="ner",
                          )

    # Corpus configuration
    tag_type = "ner"
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    # Embeddings
    embeddings = TransformerWordEmbeddings(
        model=hf_model,
        layers=layers,
        subtoken_pooling="first",
        fine_tune=True,
        use_context=context_size,
    )

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        use_crf=use_crf,
        use_rnn=False,
        reproject_embeddings=False,
    )

    # Trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.fine_tune(
        f"histo-flert-fine-tuning-{task_name}-{hf_model}-bs{batch_size}-ws{context_size}-e{epoch}-lr{learning_rate}-layers{layers}-crf{use_crf}-{seed}",
        learning_rate=learning_rate,
        mini_batch_size=batch_size,
        max_epochs=epoch,
        shuffle=True,
        embeddings_storage_mode='none',
        weight_decay=0.,
        use_final_model_for_eval=False,
    )
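# Illustrative configuration for run_experiment above; every value below is
# a placeholder, not a setting from the original experiments.
example_config = {
    "hf_model": "bert-base-cased",
    "context_size": 64,
    "layers": "-1",
    "use_crf": False,
    "task_name": "demo",
    "data_folder": "data/demo",
    "train_file": "train.txt",
    "dev_file": "dev.txt",
    "test_file": "test.txt",
}
run_experiment(seed=1, batch_size=16, epoch=10, learning_rate=5e-5,
               json_config=example_config)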
class NER_experiment:
    def __init__(self, dataset_name, output_path="/kaggle/working"):
        input_path = f"/kaggle/input/{dataset_name}"
        self.tag_type = "ner"
        self.corpus = ColumnCorpus(
            data_folder=input_path, column_format={0: "text", 1: "ner"}
        )
        self.tag_dictionary = self.corpus.make_tag_dictionary(tag_type=self.tag_type)
        self.dataset_name = dataset_name
        self.output_path = output_path

    def build_embedding(self, lang, embedding_codes: List[str]) -> None:
        self.tic = time.time()
        self.embedding_name: str = "-".join(embedding_codes)
        self.lang = lang
        embedding_types: List[TokenEmbeddings] = []
        for code in embedding_codes:
            code = code.lower()
            assert code in [
                "bpe", "bert", "flair", "ft", "char", "ohe", "elmo",
            ], f"{code} - Invalid embedding code"
            if code == "ohe":
                embedding_types.append(OneHotEmbeddings(corpus=self.corpus))
            elif code == "ft":
                embedding_types.append(WordEmbeddings(self.lang))
            elif code == "bpe":
                embedding_types.append(BytePairEmbeddings(self.lang))
            elif code == "bert":
                embedding_types.append(
                    TransformerWordEmbeddings(
                        model=self.huggingface_ref[self.lang],
                        pooling_operation="first",
                        layers="-1",
                        fine_tune=False,
                    )
                )
            elif code == "char":
                embedding_types.append(CharacterEmbeddings())
            elif code == "flair":
                embedding_types.append(FlairEmbeddings(f"{self.lang}-forward"))
                embedding_types.append(FlairEmbeddings(f"{self.lang}-backward"))
            elif code == "elmo":
                embedding_types.append(
                    ELMoEmbeddings(model="large", embedding_mode="all")
                )
        self.embedding: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types
        )
        self.tagger: SequenceTagger = SequenceTagger(
            hidden_size=256,
            embeddings=self.embedding,
            tag_dictionary=self.tag_dictionary,
            tag_type=self.tag_type,
            use_crf=True,
        )
        self.trainer: ModelTrainer = ModelTrainer(self.tagger, self.corpus)

    def train(self, max_epochs=100, storage_mode="cpu") -> None:
        self.trainer.train(
            base_path=self.output_path,
            learning_rate=0.1,
            mini_batch_size=32,
            max_epochs=max_epochs,
            embeddings_storage_mode=storage_mode,
        )

    def output_results(self) -> None:
        # plot training history
        self.history = pd.read_csv(f"{self.output_path}/loss.tsv", sep="\t")
        self._plot_history()

        dest = self.output_path + "/results.csv"
        out = [
            self.dataset_name,
            self.embedding_name,
            self.lang,
            self.history.EPOCH.iloc[-1],
            round(time.time() - self.tic),
        ]
        out.extend([round(x, 4) for x in self._extract_from_log().values()])
        out = [str(x) for x in out]
        out_names = [
            "dataset", "embedding", "lang", "epochs", "duration",
            "precision", "recall", "accuracy", "f1",
        ]
        # write header to file
        if not os.path.exists(dest):
            with open(dest, "w") as f:
                print(",".join(out_names), file=f)
        # write results to file
        with open(dest, "a") as f:
            print(",".join(out), file=f)

    def _extract_from_log(self) -> Dict[str, float]:
        log: List[str] = open(self.output_path + "/training.log", "r").readlines()[-5:-1]
        out = {m: [] for m in ["precision", "recall", "accuracy", "f1"]}
        weights = []
        for entity_class in log:
            label, *result_str = entity_class.split()
            results = [float(result_str[i]) for i in range(1, 23, 3)]
            # tp, fp, fn, tn (ignored) --> count instances per class
            weights.append(np.sum(results[:4]))
            for value, k in zip(results[4:], out):  # pr, rec, acc, f1
                out[k].append(value)
        # support-weighted average of precision, recall, acc (ignored), F1
        return {m: np.average(v, weights=weights) for m, v in out.items()}

    def _plot_history(self) -> None:
        fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(9, 8))
        self.history.plot(
            "EPOCH", ["TRAIN_LOSS", "DEV_LOSS", "LEARNING_RATE"], ax=axes[0]
        )
        self.history.plot(
            "EPOCH", ["DEV_RECALL", "DEV_PRECISION", "DEV_F1"], ax=axes[1]
        )
        fig.savefig(f"{self.output_path}/{self.dataset_name}_{self.embedding_name}.png")

    # wrapper
    def run(self, lang, emb, max_epochs, storage):
        self.build_embedding(lang, emb)
        self.train(max_epochs, storage)
        self.output_results()

    # https://huggingface.co/models references
    huggingface_ref = {
        "fr": "camembert-base",
        "nl": "wietsedv/bert-base-dutch-cased",
        "en": "bert-base-cased",
        "multi": "bert-base-multilingual-cased",
    }
from flair.datasets import ColumnCorpus
from flair.embeddings import WordEmbeddings, StackedEmbeddings, TransformerWordEmbeddings, CharacterEmbeddings, FlairEmbeddings
from typing import List
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

columns = {0: "text", 1: "pos", 2: "ner"}
data_folder = "data/causenet/"
corpus = ColumnCorpus(data_folder, columns,
                      train_file="train.txt",
                      test_file="test.txt",
                      dev_file="val.txt")

tag_type = "ner"
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

embedding_types = [
    CharacterEmbeddings(),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward')
]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

tagger: SequenceTagger = SequenceTagger(hidden_size=128,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        dropout=0.25,
                                        rnn_layers=4)
f"The training corpus contains {len(corpus.train)} (pretty long) sample sentences." ) print( f"The validation corpus contains {len(corpus.dev)} (pretty long) sample sentences." ) print( f"The testing corpus contains {len(corpus.test)} (pretty long) sample sentences." ) hidden_size = 128 embeddings = StackedEmbeddings(embeddings=[ WordEmbeddings("es"), FlairEmbeddings("es-forward"), FlairEmbeddings("es-backward") ]) # TransformerWordEmbeddings() dictionary = corpus.make_tag_dictionary(task) tagger = SequenceTagger(hidden_size, embeddings, dictionary, task) trainer = ModelTrainer(tagger, corpus, optimizer=torch.optim.AdamW) trainer.train( base_path=f"taggers/{task}-stacked", train_with_dev=False, max_epochs=30, learning_rate=0.001, mini_batch_size=32, weight_decay=0., embeddings_storage_mode="none", scheduler=OneCycleLR, ) # trainer = ModelTrainer(tagger, corpus)
def run_experiment(data_folder, task_name, model_name, run_id, use_context):
    # Set seed for reproducibility
    set_seed(int(run_id))

    if use_context == 0:
        use_context = False

    print("FLERT Context:", use_context)

    if task_name in ["lft", "onb"]:
        # Configuration
        column_format = {0: "token", 1: "ner"}

        # We use official data from Riedl and Padó
        train_file = f"enp_DE.{task_name}.mr.tok.train.bio"
        dev_file = f"enp_DE.{task_name}.mr.tok.dev.bio"
        test_file = f"enp_DE.{task_name}.mr.tok.test.bio"

        # Corpus
        corpus = ColumnCorpus(
            data_folder=data_folder,
            column_format=column_format,
            train_file=train_file,
            dev_file=dev_file,
            test_file=test_file,
            tag_to_bioes="ner",
        )

    # Corpus configuration
    tag_type = "ner"
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    # Embeddings
    embedding_types: List[TokenEmbeddings] = [
        TransformerWordEmbeddings(model=model_name,
                                  layers="all",
                                  layer_mean=True,
                                  use_context=use_context)
    ]

    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        use_crf=True,
    )

    # Trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train(
        f"resources/taggers/ner-{task_name}-{model_name}-context{use_context}-{run_id}",
        learning_rate=0.1,
        mini_batch_size=16,
        max_epochs=200,
        shuffle=True,
    )
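# Usage sketch; the data folder and model name are placeholders. task_name
# must be "lft" or "onb" so the Riedl and Padó files above are found.
run_experiment(data_folder="data/newseye",
               task_name="lft",
               model_name="bert-base-german-cased",
               run_id=1,
               use_context=64)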
def run_experiments(input_dir: Path, output_dir: Path):
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # retrieve corpus using column format, data folder and the names of the train, dev and test files
    # note: the dev set reuses the training file here
    corpus = ColumnCorpus(
        input_dir,
        {0: 'text', 1: 'dep', 2: 'aspect'},
        train_file='Laptops_poria-train.conll',
        # train_file='Restaurants_poria-train.conll',
        test_file='Laptops_poria-test.conll',
        # test_file='Restaurants_poria-test.conll',
        dev_file='Laptops_poria-train.conll'
        # dev_file='Restaurants_poria-train.conll'
    )

    # 2. what tag do we want to predict?
    tag_type = 'aspect'

    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    all_embedding_to_test = {
        # 'glove+aspects': [
        #     WordEmbeddings('glove'),
        #     WordEmbeddings(
        #         (output_dir / 'aspect_2_aspect_graph-en_core_web_lg.en_core_web_lg.word2vec_format.bin').as_posix()
        #     ),
        # ],
        # 'glove': [
        #     WordEmbeddings('glove'),
        # ],
        # 'charlmembedding': [
        #     FlairEmbeddings('news-forward'),
        #     FlairEmbeddings('news-backward'),
        # ],
        # 'glove-simple-char': [
        #     WordEmbeddings('glove'),
        #     CharacterEmbeddings(),
        # ],
        'bert+aspects': [
            BertEmbeddings('bert-large-cased'),
            WordEmbeddings(
                (output_dir / 'aspect_2_aspect_graph-en_core_web_lg.en_core_web_lg.word2vec_format.bin').as_posix()
            )
        ],
        'bert': [
            BertEmbeddings('bert-large-cased'),
        ],
        # 'elmo': [
        #     ELMoEmbeddings('original')
        # ]
    }

    for name, embeddings_to_stack in tqdm(
            all_embedding_to_test.items(),
            desc='Different embeddings stacked',
            total=len(all_embedding_to_test)
    ):
        results_folder = DEFAULT_OUTPUT_PATH / f'sequence-tagging/aspects/laptops-{name}'
        embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embeddings_to_stack)

        # 5. initialize sequence tagger
        tagger: SequenceTagger = SequenceTagger(
            hidden_size=256,
            embeddings=embeddings,
            tag_dictionary=tag_dictionary,
            tag_type=tag_type,
            use_crf=True
        )

        trainer: ModelTrainer = ModelTrainer(tagger, corpus)

        # 7. start training
        trainer.train(
            results_folder.as_posix(),
            learning_rate=0.1,
            mini_batch_size=32,
            max_epochs=150
        )

        # 8. plot training curves (optional)
        plotter = Plotter()
        plotter.plot_training_curves(results_folder / 'loss.tsv')
        plotter.plot_weights(results_folder / 'weights.txt')
def train_all(self):
    config_ext = self.config.split('.')[-1]
    with open(self.config, "r") as config_file:
        if config_ext == "yml":
            datastore = yaml.safe_load(config_file)
        elif config_ext == "json":
            datastore = json.loads(config_file.read())
        else:
            print("Need a json or yaml file as config")
            sys.exit(0)

    columns = {
        int(datastore["dataset_reader"]["position_text"]): "text",
        int(datastore["dataset_reader"]["position_ner"]): "ner",
    }
    # focus_on = datastore["dataset_reader"]["focus_on"]

    if bool(datastore["dataset_reader"]["only_train"]):
        log.info("Reading data from {}".format(datastore["dataset_reader"]["data_folder"]))
        all_corpus = ColumnCorpusTrain(
            datastore["dataset_reader"]["data_folder"],
            columns,
            train_file=datastore["dataset_reader"]["train_name"],
        )
        tag_type = "ner"
        tag_dictionary = all_corpus[0].make_tag_dictionary(tag_type=tag_type)
    else:
        iobes_corpus = ColumnCorpus(
            datastore["dataset_reader"]["data_folder"],
            columns,
            train_file=datastore["dataset_reader"]["train_name"],
            dev_file=datastore["dataset_reader"]["dev_name"],
            test_file=datastore["dataset_reader"]["test_name"],
        )
        tag_type = "ner"
        tag_dictionary = iobes_corpus.make_tag_dictionary(tag_type=tag_type)

        try:
            train_ratio = float(datastore["dataset_reader"]["train_ratio"])
            iobes_corpus = Corpus(
                iobes_corpus.train[0:int(len(iobes_corpus.train) * train_ratio)],
                iobes_corpus.dev,
                iobes_corpus.test,
            )
            log.info("Using only {}% of the train dataset".format(train_ratio * 100))
        except Exception:  # train_ratio is optional
            pass

    embed_list = []
    word_char = []
    char_word = []
    for embed in datastore["embeddings"]["embeddings_list"]:
        if embed == "bpe":
            embed_list.append(BytePairEmbeddings(datastore["embeddings"]["lang"]))
        elif embed == "fasttext":
            embed_list.append(WordEmbeddings(datastore["embeddings"]["lang"]))
        elif embed == "flair" and datastore["embeddings"]["lang"] == "en":
            embed_list.append(FlairEmbeddings("news-forward"))
            embed_list.append(FlairEmbeddings("news-backward"))
        elif embed in ("bert-base-uncased", "bert-base-cased",
                       "bert-large-uncased", "bert-large-cased"):
            if datastore["embeddings"]["lang"] == "en":
                embed_list.append(BertEmbeddings(embed))
        elif embed in ("elmo-small", "elmo-medium", "elmo-original"):
            if datastore["embeddings"]["lang"] == "en":
                embed_list.append(ELMoEmbeddings(embed.split("-")[-1]))
        elif embed == "bert-base-chinese":
            if datastore["embeddings"]["lang"] == "zh":
                embed_list.append(emb.BertEmbeddingsChinese("bert-base-chinese"))
        else:
            # custom embedding files: the extension and the part before it
            # encode the embedding kind, e.g. "my-model.char.pt"
            split_name = embed.split(".")
            ext = split_name[-1]
            kind = split_name[-2]
            if ext == "pt":  # Flair-type language model
                extra_index = 0
                try:
                    extra_index = int(datastore["embeddings"]["extra_index"])
                except Exception:  # extra_index is optional
                    pass
                if kind == "char":
                    embed_list.append(emb.FlairEmbeddingsChar(embed, extra_index=extra_index))
                elif kind == "char-seg":
                    embed_list.append(emb.FlairEmbeddingsWordLevelCharSeg(embed, extra_index=extra_index))
            if ext == "vec":
                if kind == "char-seg":
                    embed_list.append(emb.WordEmbeddingsVecCharSeg(embed))
                elif kind == "char":
                    embed_list.append(emb.WordEmbeddingsVecFirst(embed))
                elif kind == "word":
                    embed_list.append(emb.WordEmbeddingsVecWord(embed))
                elif kind == "bichar":
                    embed_list.append(emb.WordEmbeddingsVecBichar(embed))
            if ext == "bin":
                if kind == "word":
                    embed_list.append(emb.WordEmbeddingsBinWord(embed))
                elif kind == "bichar":
                    embed_list.append(emb.WordEmbeddingsBinBichar(embed))

    try:
        if bool(datastore["embeddings"]["ner_embed"]):
            print("Generate NER embeddings..")
            embed_list.append(
                emb.nerEmbedding(
                    generateNerEmbFromTrain(iobes_corpus.train, tag_dictionary.get_items())
                )
            )
    except Exception:  # ner_embed is optional
        pass

    try:
        if bool(datastore["embeddings"]["one_hot"]):
            print("Generate one hot embeddings..")
            embed_list.append(emb.OneHotEmbeddings(iobes_corpus))
    except Exception:  # one_hot is optional
        pass

    try:
        if datastore["embeddings"]["embeddings_ngram_list"] is not None:
            embed_list.append(
                emb.WordEmbeddingsVecNGramList(
                    datastore["embeddings"]["embeddings_ngram_list"]
                )
            )
    except Exception:  # embeddings_ngram_list is optional
        pass

    if len(word_char) == 1 and len(char_word) == 1:
        embed_list.append(emb.WordEmbeddingsVecWordChar(word_char[0], char_word[0]))

    embedding_types: List[TokenEmbeddings] = embed_list
    embeddings: emb.StackedEmbeddingsNew = emb.StackedEmbeddingsNew(
        embeddings=embedding_types
    )

    if bool(datastore["dataset_reader"]["only_train"]):
        score = []
        for i in range(len(all_corpus)):
            tagger: SequenceTagger = SequenceTagger(
                hidden_size=int(datastore["model"]["hidden_size"]),
                embeddings=embeddings,
                tag_dictionary=tag_dictionary,
                tag_type=tag_type,
                use_crf=bool(datastore["model"]["use_crf"]),
                dropout=float(datastore["model"]["dropout"]),
                word_dropout=float(datastore["model"]["word_dropout"]),
                locked_dropout=float(datastore["model"]["locked_dropout"]),
                rnn_layers=int(datastore["model"]["rnn_layers"]),
            )
            folder = datastore["train_config"]["folder"] + "/" + str(i)
            iobes_corpus = all_corpus[i]

            # resume from a checkpoint if one exists
            best = Path(folder + "/checkpoint.pt")
            if not best.exists():
                best = Path(folder + "/best-model.pt")
            if best.exists():
                trainer = ModelTrainer.load_checkpoint(
                    tagger.load_checkpoint(best), iobes_corpus
                )
            else:
                trainer: ModelTrainer = ModelTrainer(tagger, iobes_corpus)

            # 7. start training
            result = trainer.train(
                folder,
                learning_rate=float(datastore["train_config"]["learning_rate"]),
                anneal_factor=float(datastore["train_config"]["anneal_factor"]),
                min_learning_rate=float(datastore["train_config"]["min_learning_rate"]),
                mini_batch_size=int(datastore["train_config"]["batch_size"]),
                max_epochs=int(datastore["train_config"]["epoch"]),
                save_final_model=bool(datastore["train_config"]["save_final_model"]),
                checkpoint=bool(datastore["train_config"]["checkpoint"]),
                param_selection_mode=bool(datastore["train_config"]["param_selection_mode"]),
                patience=int(datastore["train_config"]["patience"]),
                monitor_test=bool(datastore["train_config"]["monitor_test"]),
                embeddings_storage_mode=str(datastore["train_config"]["embeddings_storage_mode"]),
                shuffle=bool(datastore["train_config"]["shuffle"]),
            )

            plotter = Plotter()
            if bool(datastore["train_config"]["save_plot_training_curve"]):
                plotter.plot_training_curves(folder + "/loss.tsv")
            if bool(datastore["train_config"]["save_plot_weights"]):
                plotter.plot_weights(folder + "/weights.txt")

            score.append(result["test_score"])

        print(score, "\nMean:", round(sum(score) / len(score), 2))
    else:
        tagger: SequenceTagger = SequenceTagger(
            hidden_size=int(datastore["model"]["hidden_size"]),
            embeddings=embeddings,
            tag_dictionary=tag_dictionary,
            tag_type=tag_type,
            use_crf=bool(datastore["model"]["use_crf"]),
            dropout=float(datastore["model"]["dropout"]),
            word_dropout=float(datastore["model"]["word_dropout"]),
            locked_dropout=float(datastore["model"]["locked_dropout"]),
            rnn_layers=int(datastore["model"]["rnn_layers"]),
        )
        folder = datastore["train_config"]["folder"]

        # resume from a checkpoint if one exists
        best = Path(folder + "/checkpoint.pt")
        if not best.exists():
            best = Path(folder + "/best-model.pt")
        if best.exists():
            trainer = ModelTrainer.load_checkpoint(
                tagger.load_checkpoint(best), iobes_corpus
            )
        else:
            trainer: ModelTrainer = ModelTrainer(tagger, iobes_corpus)

        # 7. start training
        trainer.train(
            folder,
            learning_rate=float(datastore["train_config"]["learning_rate"]),
            anneal_factor=float(datastore["train_config"]["anneal_factor"]),
            min_learning_rate=float(datastore["train_config"]["min_learning_rate"]),
            mini_batch_size=int(datastore["train_config"]["batch_size"]),
            max_epochs=int(datastore["train_config"]["epoch"]),
            save_final_model=bool(datastore["train_config"]["save_final_model"]),
            checkpoint=bool(datastore["train_config"]["checkpoint"]),
            param_selection_mode=bool(datastore["train_config"]["param_selection_mode"]),
            patience=int(datastore["train_config"]["patience"]),
            monitor_test=bool(datastore["train_config"]["monitor_test"]),
            embeddings_storage_mode=str(datastore["train_config"]["embeddings_storage_mode"]),
            shuffle=bool(datastore["train_config"]["shuffle"]),
        )

        plotter = Plotter()
        if bool(datastore["train_config"]["save_plot_training_curve"]):
            plotter.plot_training_curves(folder + "/loss.tsv")
        if bool(datastore["train_config"]["save_plot_weights"]):
            plotter.plot_weights(folder + "/weights.txt")
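# Illustrative config for train_all above, written as the Python dict that
# yaml.safe_load/json.loads would return; the keys mirror what the method
# reads, and all values are placeholders.
example_datastore = {
    "dataset_reader": {
        "position_text": 0,
        "position_ner": 1,
        "only_train": False,
        "data_folder": "data/ner",
        "train_name": "train.txt",
        "dev_name": "dev.txt",
        "test_name": "test.txt",
    },
    "embeddings": {
        "lang": "en",
        "embeddings_list": ["fasttext", "flair"],
    },
    "model": {
        "hidden_size": 256,
        "use_crf": True,
        "dropout": 0.0,
        "word_dropout": 0.05,
        "locked_dropout": 0.5,
        "rnn_layers": 1,
    },
    "train_config": {
        "folder": "resources/taggers/ner",
        "learning_rate": 0.1,
        "anneal_factor": 0.5,
        "min_learning_rate": 0.0001,
        "batch_size": 32,
        "epoch": 150,
        "save_final_model": True,
        "checkpoint": True,
        "param_selection_mode": False,
        "patience": 3,
        "monitor_test": False,
        "embeddings_storage_mode": "cpu",
        "shuffle": True,
        "save_plot_training_curve": False,
        "save_plot_weights": False,
    },
}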
parser.add_argument(
    "--max_epochs",  # flag name inferred; the start of this argparse setup is truncated in the original
    type=int,
    default=10,
    help="maximum number of epochs to train. Terminates training if this number is surpassed.",
)
parser.add_argument("--batch_size", type=int, default=32)
args = parser.parse_args()

assert not (
    args.word_embedding is None
    and args.flair_embedding is None
    and args.transformer_embedding is None
), "At least 1 embedding needs to be specified"

columns = {0: "text", 1: "deletion"}
corpus = ColumnCorpus(args.data_dir, columns, train_file="train.txt", dev_file="dev.txt")

target_tag = "deletion"
tag_dict = corpus.make_tag_dictionary(tag_type=target_tag)

embeddings = []
if args.word_embedding is not None:
    embeddings.append(WordEmbeddings(args.word_embedding))
if args.flair_embedding is not None:
    for direction in ["forward", "backward"]:
        embeddings.append(FlairEmbeddings(args.flair_embedding.replace("X", direction)))
if args.transformer_embedding is not None:
    embeddings.append(TransformerWordEmbeddings(args.transformer_embedding,
                                                fine_tune=args.finetune,
                                                layers="-1"))
embeddings = StackedEmbeddings(embeddings=embeddings)

tagger = SequenceTagger(
    hidden_size=args.hidden_size,
    embeddings=embeddings,
    tag_dictionary=tag_dict,
    tag_type=target_tag,  # assumption: the original call is truncated here; the tag type is required, any further options are unknown
)