def test_run_ner(self):
    stream_handler = logging.StreamHandler(sys.stdout)
    logger.addHandler(stream_handler)

    # with so little data, distributed training needs more epochs to get the score on par with 0/1 gpu
    epochs = 7 if get_gpu_count() > 1 else 2

    tmp_dir = self.get_auto_remove_tmp_dir()
    testargs = f"""
        run_ner.py
        --model_name_or_path bert-base-uncased
        --train_file tests/fixtures/tests_samples/conll/sample.json
        --validation_file tests/fixtures/tests_samples/conll/sample.json
        --output_dir {tmp_dir}
        --overwrite_output_dir
        --do_train
        --do_eval
        --warmup_steps=2
        --learning_rate=2e-4
        --per_device_train_batch_size=2
        --per_device_eval_batch_size=2
        --num_train_epochs={epochs}
    """.split()

    if torch_device != "cuda":
        testargs.append("--no_cuda")

    with patch.object(sys, "argv", testargs):
        run_ner.main()
        result = get_results(tmp_dir)
        self.assertGreaterEqual(result["eval_accuracy"], 0.75)
        self.assertGreaterEqual(result["eval_precision"], 0.75)
        self.assertLess(result["eval_loss"], 0.5)
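The test above relies on a get_results() helper to read back the metrics the example script saves to disk. A minimal sketch, assuming run_ner.py writes its final metrics to an all_results.json file in the output directory (as the current transformers example scripts do); the exact file name is an assumption here:

import json
import os


def get_results(output_dir):
    # Load the metrics dict that the example script dumped at the end of the run.
    results = {}
    path = os.path.join(output_dir, "all_results.json")  # assumed metrics file name
    if os.path.exists(path):
        with open(path, "r") as f:
            results = json.load(f)
    else:
        raise ValueError(f"can't find {path}")
    return results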
def test_run_ner(self):
    stream_handler = logging.StreamHandler(sys.stdout)
    logger.addHandler(stream_handler)

    tmp_dir = self.get_auto_remove_tmp_dir()
    testargs = f"""
        run_ner.py
        --model_name_or_path bert-base-uncased
        --train_file tests/fixtures/tests_samples/conll/sample.json
        --validation_file tests/fixtures/tests_samples/conll/sample.json
        --output_dir {tmp_dir}
        --overwrite_output_dir
        --do_train
        --do_eval
        --warmup_steps=2
        --learning_rate=2e-4
        --per_gpu_train_batch_size=2
        --per_gpu_eval_batch_size=2
        --num_train_epochs=2
    """.split()

    if torch_device != "cuda":
        testargs.append("--no_cuda")

    with patch.object(sys, "argv", testargs):
        result = run_ner.main()
        self.assertGreaterEqual(result["eval_accuracy_score"], 0.75)
        self.assertGreaterEqual(result["eval_precision"], 0.75)
        self.assertLess(result["eval_loss"], 0.5)
def test_run_ner(self):
    stream_handler = logging.StreamHandler(sys.stdout)
    logger.addHandler(stream_handler)

    testargs = """
        --model_name distilbert-base-german-cased
        --output_dir ./examples/tests_samples/temp_dir
        --overwrite_output_dir
        --data_dir ./examples/tests_samples/GermEval
        --labels ./examples/tests_samples/GermEval/labels.txt
        --max_seq_length 128
        --num_train_epochs 6
        --logging_steps 1
        --do_train
        --do_eval
    """.split()

    with patch.object(sys, "argv", ["run.py"] + testargs):
        result = run_ner.main()
        self.assertLess(result["loss"], 1.5)
try:
    # Copy (by hard link) the dev data to match what run_ner.py expects.
    os.link(os.path.join(args.dev, "dev.txt"), os.path.join(args.train, "dev.txt"))
except FileExistsError:
    print(os.path.join(args.dev, "dev.txt"), "already exists; skip hard linking")

label = ["--label", os.path.join(args.label, "label.txt")] if args.label else []
model_dir = ("--output_dir", args.model_dir)

# Recommended additional args: --model_type, --model_name_or_path, --num_train_epochs
sys.argv = [*cmd_opts, *data_dir, *model_dir, *label, *train_args]
print(sys.argv)
run_ner.main()

# After training, model artifacts go to model.tar.gz, while tensorboard data goes to output.tar.gz.
# To recap, here's the directory structure after training:
#
# /opt/ml
# ├── code
# │   ├── [content of sourcedir.tar.gz]
# │   └── runs
# │       └── Feb27_08-45-03_49c1d0103e10
# │           └── events.out.tfevents.1582793103.49c1d0103e10.292.0
# ├── input
# │   └── train
# │       ├── cached_dev_bert-base-cased_128
# │       ├── cached_train_bert-base-cased_128
# │       ├── dev.txt
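The fragment above assumes args already parsed from the SageMaker environment. A minimal sketch of what such a parser might look like; the channel names (train, dev, label) and the SM_* environment variables follow SageMaker conventions, but the exact argument set is an assumption, not this script's verbatim parser:

import argparse
import os


def parse_sagemaker_args():
    parser = argparse.ArgumentParser()
    # SageMaker mounts each input channel under /opt/ml/input/data/<channel>
    # and exposes the paths via SM_CHANNEL_<CHANNEL> environment variables.
    parser.add_argument("--train", default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--dev", default=os.environ.get("SM_CHANNEL_DEV"))
    parser.add_argument("--label", default=os.environ.get("SM_CHANNEL_LABEL"))
    # Artifacts written to model_dir are packaged into model.tar.gz after training.
    parser.add_argument("--model_dir", default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
    # Leftover argv entries can be passed through to run_ner.py as train_args.
    return parser.parse_known_args()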
import json
import os
import random

from numpy import array
from sklearn.model_selection import KFold

import run_ner


def main(json_config):
    working_dir = "CrossVal_Files/Rotation/Train_file_swap"
    train_set_directory = "CrossVal_Files/Rotation/Train"
    test_set_directory = "CrossVal_Files/Rotation/Test"
    test_swap_directory = "CrossVal_Files/Rotation/Test_file_swap"
    test_pred_directory = "CrossVal_Files/Rotation/Test_predictions"

    train_files = os.listdir(train_set_directory)
    test_files = os.listdir(test_set_directory)
    only_predict = False

    # assign training data
    data = array(train_files)
    kfold = KFold(n_splits=10, shuffle=False)

    # hyperparameter search space to iterate through:
    # {
    #     "per_gpu_batch_size": [16, 32],
    #     "learning_rate": [2e-5, 3e-5, 5e-5],
    #     "num_epochs": [2, 3, 4],
    #     # weight decay?  "weight_decay": (0, 0.3),
    #     # warmup steps?  "warmup_steps": (0, 500),
    # }

    if not only_predict:
        count = 0
        for train, dev in kfold.split(data):
            # only do it 5 times for now
            count += 1
            if count > 5:
                print(f"Count reached {count}.\nTerminating...")
                break

            # Epoch count
            print("\n\n*=====*")
            print(f" COUNT {count}")
            print("*=====*\n\n")

            # Training
            print("\nTraining\n")

            # set config
            with open(json_config) as json_file:
                json_data = json.load(json_file)
                json_data["do_train"] = True
                json_data["do_eval"] = True
                json_data["do_predict"] = True
                print("\nSet Training to True...")
            with open(json_config, "w") as json_out:
                json.dump(json_data, json_out)

            # construct text files from file splits
            string_train_body = ""
            string_dev_body = ""
            for txt_file in data[train]:
                with open(f"{train_set_directory}/{txt_file}", "r", encoding="utf-8") as f:
                    string_train_body = string_train_body + f.read()
            for txt_file in data[dev]:
                with open(f"{train_set_directory}/{txt_file}", "r", encoding="utf-8") as f:
                    string_dev_body = string_dev_body + f.read()

            # write to the files that are used in the training
            print("\nWriting new training files...")
            with open(f"{working_dir}/train.txt", "w", encoding="utf-8") as whole_train_file:
                whole_train_file.write(string_train_body)
            with open(f"{working_dir}/dev.txt", "w", encoding="utf-8") as whole_dev_file:
                whole_dev_file.write(string_dev_body)

            # Print how long the files are
            print(f"\nStatistics:\n"
                  f"Length Training set: {len(string_train_body)}\n"
                  f"Length Test set: {len(string_dev_body)}\n"
                  f"Excerpt Test set:\n{string_dev_body[:100]}...")

            # Set seed in JSON
            print("\nWriting json config...")
            with open(json_config) as json_file:
                json_data = json.load(json_file)
                json_data["seed"] = random.randint(0, 200)
                print(f"\nCurrent Config:\n{json_data}\n")
            with open(json_config, "w") as json_out:
                json.dump(json_data, json_out)

            print("Running...\n")
            run_ner.main(json_config)
            print("\nTraining done.\n")

        print("!---DONE---!")

    elif only_predict:
        # Testing: do document-wise prediction
        print("\n!---Testing---!\n")

        # prepare config for prediction
        with open(json_config) as json_file:
            json_data = json.load(json_file)
            json_data["do_train"] = False
            json_data["do_eval"] = False
            json_data["do_predict"] = True
            # json_data["model_name_or_path"] = model_path  # load the newly trained model
            # set data dir to the test directory with only one file in it
            json_data["data_dir"] = test_swap_directory
            print(f"\nModel Config:\n{json_data}\n")
        with open(json_config, "w") as json_out:
            json.dump(json_data, json_out)

        # iterate over the directory with test documents (Rotation -> Test)
        print("Running...\n")
        for file in test_files:
            print("\n\n"
                  f">>>>>>>{file}<<<<<<<<"
                  "\n\n")
            with open(os.path.join(test_set_directory, file), "r", encoding="utf-8") as read_file:
                test_string = read_file.read()
            with open(os.path.join(test_swap_directory, "test.txt"), "w+", encoding="utf-8") as write_file:
                write_file.write(test_string)
            result_string, pred_dict = run_ner.main(json_config)

            # construct dictionary
            entity_types = ["O", "Ort", "Datum", "Strafe_Gesamtfreiheitsstrafe_Dauer",
                            "Strafe_Gesamtsatz_Dauer", "Strafe_Gesamtsatz_Betrag",
                            "Strafe_Tatbestand_Paragraph", "Strafe_Tatbestand_Beschreibung",
                            "Schadensbetrag_Beschreibung", "Schadensbetrag_Betrag",
                            "Vorstrafe_nein", "Gestaendnis_ja", "straferhoehend_taeter",
                            "strafmildernd_taeter", "Taeter_Drogenbezug_ja"]
            whole_dict = {key: {"original": [], "prediction": []} for key in entity_types}

            # populate original dict
            original_dict = {key: [] for key in entity_types}

            # Collect labels
            word_string = ""
            for previous, current in run_ner.pairwise(test_string.splitlines()):
                if not previous:
                    continue
                elif not current:
                    current = "O O"
                try:
                    old_word, old_label = previous.split()
                    new_word, new_label = current.split()
                except ValueError:
                    print("Value Error")
                    print(f"Couldn't unpack {file}")
                    print("Continuing...")
                    continue
                if old_label != "O":
                    if old_label == new_label:
                        # same entity continues: extend the current span
                        word_string = (word_string + " " + old_word) if word_string else old_word
                    else:
                        # entity ends here: close the span and store it
                        word_string = (word_string + " " + old_word) if word_string else old_word
                        original_dict[old_label].append(word_string)
                        word_string = ""

            # put dict items in the whole dictionary
            for key in whole_dict:
                whole_dict[key]["prediction"] = pred_dict[key]
                whole_dict[key]["original"] = original_dict[key]

            with open(os.path.join(test_pred_directory, file), "w+", encoding="utf-8") as write_file:
                write_file.write("===> " + file + "\n")
                write_file.write(result_string + "\n")
                # scan through the dict, make it pretty and assign simple labels
                for key in whole_dict:
                    write_file.write(f"{key}:\n")
                    write_file.write("\tOriginal:\n")
                    for item in whole_dict[key]["original"]:
                        write_file.write(f"\t \"{item}\"\n")
                    write_file.write("\tPrediction:\n")
                    for item in whole_dict[key]["prediction"]:
                        write_file.write(f"\t \"{item}\"\n")
                    write_file.write("\n\n")

        print("\nPredictions done.\n")

        # let ner predict the test file
        # take the results and predictions and save them under the file name in Test_predictions
        # for each binary/document-wise label:
        #     go through the test document, collect labels
        #     then go through the predictions, collect labels
        #     save and print for easy comparison
        #
        # set config to eval & test (only do_train before)
        # load the trained model
        # iterate over the directory with test documents (Rotation -> Test)
        # let it eval & predict on every document
        # go through the doc, fill a parameters dictionary (Gesamtfreiheitsstrafe, Drogen, etc.) with predictions
        # print the parameters dictionary below the evaluation, check against the original documents by scanning
        # through them with simple regex patterns and writing it like
        # "Original Doc: Drogenbezug_ja; Prediction: Drogenbezug_ja",
        # or simply with True, False or NaN for binary parameters and discrete values for numerical parameters

        print("!---DONE---!")
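The prediction loop above walks the CoNLL-style file with run_ner.pairwise, looking at each line together with the one after it. A minimal sketch, assuming the helper follows the standard itertools pairwise recipe ("s -> (s0, s1), (s1, s2), ..."); the script's actual implementation may differ:

import itertools


def pairwise(iterable):
    # Two iterators over the same sequence, the second advanced by one element,
    # zipped so each item is yielded alongside its successor.
    a, b = itertools.tee(iterable)
    next(b, None)
    return zip(a, b)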