def evaluate(self):
    self.clf.eval()
    if self.config[MODEL_TYPE] > 1:
        self.coref_trainer.model.eval()
    with torch.no_grad():
        evaluator = Evaluator(self, self.data_helper, self.config)
        evaluator.eval_parser(self.data_helper.val_trees)
def train_classifier(self, train_loader, dev_loader):
    """Train the classifier, evaluate it on the dev set and save a checkpoint after every epoch."""
    self.optim = Optimizer(self.clf.parameters(), lr=self.config[LR])
    if self.config[EPOCH_START] != 1:
        # Resume from a previously saved checkpoint
        self.load('../data/model/' + self.config[MODEL_NAME] + "_" + str(self.config[EPOCH_START]))
    for epoch in range(1, self.config[NUM_EPOCHS] + 1):
        cost_acc = 0
        self.clf.train()
        print("============ epoch: ", epoch, " ============")
        for i, data in enumerate(train_loader):
            docs, gold_actions = data
            cost_acc += self.sr_parse(docs, gold_actions, self.optim)[1]
            if i % 50 == 0:
                print("Cost on step ", i, "is ", cost_acc)
        print("Total cost for epoch ", epoch, "is ", cost_acc)
        self.clf.eval()
        with torch.no_grad():
            evaluator = Evaluator(self, self.clf.data_helper)
            evaluator.eval_parser(dev_loader, path=None)
        self.save('../data/model/', self.config[MODEL_NAME] + "_" + str(epoch), epoch)
def evaluate_potential_synonyms(self, empolis_mapping_path: str, distance_threshold: float = 0.85):
    """
    Evaluates how well a classifier is able to predict synonyms for the entities of a dataset.

    This requires a different evaluation method: instead of checking whether the classifier
    predicts a single mention correctly, it checks whether the classifier is able to predict all
    synonyms/mentions of the Empolis dataset that are known for an entity. Because this is a
    different kind of evaluation, it is implemented as a separate method.
    """
    with open(empolis_mapping_path, 'r') as f:
        empolis_mapping_synonym_to_entity = json.load(f)

    # Remove all entries that are not part of the current split, because they can't be predicted
    # by the classifier and therefore should not be used for the evaluation
    synonyms_to_remove = []
    for synonym, entity_data in empolis_mapping_synonym_to_entity.items():
        if entity_data['entities'][0] not in self._entities:
            synonyms_to_remove.append(synonym)
    for synonym in synonyms_to_remove:
        del empolis_mapping_synonym_to_entity[synonym]

    # Classify all query samples
    res, identified_mentions = self._get_potential_synonyms(distance_threshold=distance_threshold)

    relevant_synonyms = {}
    for mention in identified_mentions:
        # Check if the mention is known to the Empolis dataset
        ground_truth_entity = empolis_mapping_synonym_to_entity.get(mention, None)
        if ground_truth_entity is not None:
            ground_truth_entity = ground_truth_entity['entities'][0]
            # If it is known, mark it accordingly for all mentions of the respective entity
            if ground_truth_entity not in relevant_synonyms:
                relevant_synonyms[ground_truth_entity] = {mention}
            else:
                relevant_synonyms[ground_truth_entity].update([mention])
        else:
            # Remove all mentions that are not relevant for the evaluation (i.e. not known to the Empolis data)
            for entity in res.keys():
                if mention in res[entity]:
                    del res[entity][mention]

    # Evaluation
    evaluator = Evaluator()
    _, macro, micro = evaluator.evaluate_empolis_synonyms(res, relevant_synonyms)
    # Note: top-1 accuracy is not relevant here because there is no top-x evaluation
    print("\nMacro metrics:"
          "\nPrecision: %.2f%%, Recall: %.2f%%, F1-Score: %.2f%%" % macro)
    print("\nMicro metrics:"
          "\nPrecision: %.2f%%, Recall: %.2f%%, F1-Score: %.2f%%" % micro)
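# Illustrative sketch only, not the project's Evaluator.evaluate_empolis_synonyms: given predicted
# synonym sets `res` ({entity: {mention: score, ...}}) and known synonyms `relevant_synonyms`
# ({entity: set of mentions}), macro metrics average per-entity precision/recall/F1, while micro
# metrics pool the counts across entities first. The function name and return shape are assumptions.
def sketch_synonym_metrics(res, relevant_synonyms):
    per_entity = []
    total_tp = total_pred = total_gold = 0
    for entity, gold in relevant_synonyms.items():
        predicted = set(res.get(entity, {}))
        tp = len(predicted & gold)
        precision = tp / len(predicted) if predicted else 0.0
        recall = tp / len(gold) if gold else 0.0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
        per_entity.append((precision, recall, f1))
        total_tp += tp
        total_pred += len(predicted)
        total_gold += len(gold)
    # Macro: mean of per-entity scores; micro: scores computed from the pooled counts
    macro = tuple(100 * sum(values) / len(per_entity) for values in zip(*per_entity))
    micro_p = total_tp / total_pred if total_pred else 0.0
    micro_r = total_tp / total_gold if total_gold else 0.0
    micro_f1 = 2 * micro_p * micro_r / (micro_p + micro_r) if micro_p + micro_r else 0.0
    return macro, (100 * micro_p, 100 * micro_r, 100 * micro_f1)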
def evaluate_datasplit(self, split: str, num_results: int = 1, eval_mode: str = 'mentions',
                       empolis_mapping_path: str = None, empolis_distance_threshold: float = 0.85):
    """
    Evaluate the given data split. `split` has to be one of: train, test, val.
    """
    assert split in ['train', 'test', 'val'], "The given evaluation split is not a valid split."
    assert split == self._loaded_datasplit, "The evaluation split has not been loaded."
    assert eval_mode in ['mentions', 'samples'], "The evaluation mode is not a valid mode."

    start = datetime.datetime.now()
    empolis_mapping = None
    if empolis_mapping_path is not None:
        with open(empolis_mapping_path, 'r') as f:
            empolis_mapping = json.load(f)

    eval_results = {}
    for sample in self._query_data:
        sentence = sample['sentence']
        mention = sample['mention']
        suggestions = self._classify(mention, sentence=sentence, num_results=num_results)

        if mention == "[NIL]" and empolis_mapping is not None:
            eval_results = self._evaluate_empolis(suggestions, sample, empolis_mapping,
                                                  eval_results, empolis_distance_threshold)
        elif mention != "[NIL]":
            eval_results = self._add_suggestion_to_eval_results(suggestions, sample, eval_results)

    end = datetime.datetime.now()
    print("Classification took: ", end - start)

    # Calculate some metrics
    evaluator = Evaluator()
    top1_accuracy, macro, micro = evaluator.evaluate(eval_results, eval_mode)
    print("\nTop1 Accuracy: %.2f%%" % top1_accuracy)
    print("\nMacro metrics:"
          "\nPrecision: %.2f%%, Recall: %.2f%%, F1-Score: %.2f%%" % macro)
    print("\nMicro metrics:"
          "\nPrecision: %.2f%%, Recall: %.2f%%, F1-Score: %.2f%%" % micro)
        os.path.join(args.output_dir, args.parse_type, "TRAINING", "data_helper.bin"))

if args.train:
    data_helper.load_data_helper(
        os.path.join(args.output_dir, args.parse_type, "TRAINING", "data_helper.bin"))
    data_helper.load_train_data(data_dir=args.data_dir,
                                output_dir=args.output_dir,
                                parse_type=args.parse_type,
                                isFlat=args.isFlat)
    train_model(data_helper)

if args.eval:
    # Evaluate models on the RST-DT test set
    if args.isFlat:
        evaluator = Evaluator(isFlat=args.isFlat,
                              model_dir=os.path.join(args.output_dir, "RN~model"))
    else:
        evaluator = Evaluator(isFlat=args.isFlat,
                              model_dir=os.path.join(args.output_dir, "N~model"))
    evaluator.eval_parser(data_dir=args.data_dir,
                          output_dir=args.output_dir,
                          report=True,
                          bcvocab=brown_clusters,
                          draw=False,
                          isFlat=args.isFlat)

if args.pred:
    if args.isFlat:
        evaluator = Evaluator(isFlat=args.isFlat,
train_dirname = (args.train_dir[:-1] if args.train_dir[-1] == os.sep else args.train_dir).split(os.sep)[-1]
HELPER_PATH = f"..{os.sep}data{os.sep}{train_dirname}_data_helper_rst.bin"
print("Helper path:", HELPER_PATH)

if args.prepare:
    # Create training data
    # coref_model = CorefScore(higher_order=True).to(config[DEVICE])
    coref_model = CorefScore().to(config[DEVICE])
    coref_trainer = Trainer(coref_model, [], [], [], debug=False)
    data_helper.create_data_helper(args.train_dir, config, coref_trainer)
    data_helper.save_data_helper(HELPER_PATH)

if args.train:
    train_model_coref(data_helper, config)

if args.eval:
    # Evaluate models on the RST-DT test set
    data_helper.load_data_helper(HELPER_PATH)
    parser = get_discourse_parser(data_helper, config)
    parser.load('../data/model/' + config[MODEL_NAME])
    print("Evaluating")
    with torch.no_grad():
        evaluator = Evaluator(parser, data_helper, config)
        evaluator.eval_parser(None, path=args.eval_dir, use_parseval=args.use_parseval)
            'train_on': 'patch',
            'patch_size': 512,
            'resize_width': 512,
            'resize_height': 512,
            'attention_blocks': False,
            'guided_attention': False,
            'attention_loss': 'dice',
            'attention_weight': 10,
            'apply_attention_mask': True,
            'n_layers': 18,
        }


if __name__ == '__main__':
    if len(sys.argv) < 2:
        config = Main.default_config()
    else:
        path = Path(sys.argv[1])
        config = load_config(path)

    print("Using config:")
    print(f"\t{config}")

    main = Main(config)
    main.train()

    from eval.evaluation import Evaluator
    evaluator = Evaluator(Path(f"../output/{main.name}"), name=config['name'])
    evaluator.evaluate_model(main.model)
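# Usage sketch: how a custom config file might be produced and passed to this script. The file
# name, the script name, and the assumption that load_config() reads JSON are illustrative only;
# the keys mirror the default config shown above.
import json
from pathlib import Path

custom_config = {
    'name': 'guided_attention_run',   # hypothetical experiment name
    'train_on': 'patch',
    'patch_size': 512,
    'attention_blocks': True,
    'guided_attention': True,
    'attention_loss': 'dice',
    'attention_weight': 10,
    'n_layers': 18,
}
Path('../configs').mkdir(parents=True, exist_ok=True)
Path('../configs/guided_attention_run.json').write_text(json.dumps(custom_config, indent=4))
# Then, assuming the entry point is main.py: python main.py ../configs/guided_attention_run.json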
    parser.add_argument('--eval_dir', help='eval data directory')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()

    # Use brown clusters
    with gzip.open("../data/resources/bc3200.pickle.gz") as fin:
        print('Load Brown clusters for creating features ...')
        brown_clusters = pickle.load(fin)

    data_helper = DataHelper(max_action_feat_num=330000,
                             max_relation_feat_num=300000,
                             min_action_feat_occur=1,
                             min_relation_feat_occur=1,
                             brown_clusters=brown_clusters)

    if args.prepare:
        # Create training data
        data_helper.create_data_helper(data_dir=args.train_dir)
        data_helper.save_data_helper('../data/data_helper.bin')

    if args.train:
        data_helper.load_data_helper('../data/data_helper.bin')
        data_helper.load_train_data(data_dir=args.train_dir)
        train_model(data_helper)

    if args.eval:
        # Evaluate models on the RST-DT test set
        evaluator = Evaluator(model_dir='../data/model')
        evaluator.eval_parser(path=args.eval_dir,
                              report=True,
                              bcvocab=brown_clusters,
                              draw=False)
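# Typical invocations of this script, based on the flags visible above (--prepare, --train,
# --eval, --train_dir, --eval_dir); the script name and the paths are illustrative assumptions:
#
#   python main.py --prepare --train --train_dir ../data/TRAINING
#   python main.py --eval --eval_dir ../data/TEST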
from pathlib import Path

import torch

from eval.evaluation import Evaluator

with torch.no_grad():
    tests = []
    tests.extend(Path(r'../output').glob('*'))
    for path in tests:
        if path.name == 'eval':
            continue
        name = path.name.split('--')[-1]
        evaluator = Evaluator(Path(path), name=name)
        # checkpoint = evaluator.find_best_model('test')
        checkpoint = evaluator.checkpoints[-1]  # Last model
        evaluator.evaluate(checkpoint)
        evaluator.attention_map(checkpoint, 'per_abnormality')
        evaluator.attention_map(checkpoint, 'per_mammogram')