def main_global(args):
    """Train the global model on the train split, select on dev, score on test.

    Args:
        args: parsed argument namespace (batch, bert_fts, trainon, data_dir,
              glove2vocab, ... — consumed by EventDataset / NNClassifier).

    Returns:
        (dev_f1, test_f1) as floats.
    """
    params = {'batch_size': args.batch, 'shuffle': False}

    # Directory layout depends on whether precomputed BERT features are used.
    if args.bert_fts:
        type_dir = "all_bertemb/"
        data_dir_back = args.data_dir + "all_backward_bertemb/"
    else:
        type_dir = "all/"
        data_dir_back = args.data_dir + "all_backward/"

    # Backward pairs feed train/dev only when training on both directions;
    # the test split always receives the backward directory.
    train_dev_back = data_dir_back if args.trainon in ('bothway', 'bothWselect') else ""

    train_data = EventDataset(args.data_dir + type_dir, "train",
                              args.glove2vocab, train_dev_back, args.bert_fts)
    print('train_data: %s in total' % len(train_data))
    train_generator = get_data_loader(train_data, **params)

    dev_data = EventDataset(args.data_dir + type_dir, "dev",
                            args.glove2vocab, train_dev_back, args.bert_fts)
    print('dev_data: %s in total' % len(dev_data))
    dev_generator = get_data_loader(dev_data, **params)

    test_data = EventDataset(args.data_dir + type_dir, "test",
                             args.glove2vocab, data_dir_back, args.bert_fts)
    test_generator = get_data_loader(test_data, **params)

    s_time = time.time()
    model = NNClassifier()
    dev_f1 = model.train_epoch(train_generator, dev_generator, args)
    print('total time escape', time.time() - s_time)

    evaluator = Evaluator(model)
    score = evaluator.get_score(test_generator, args)
    print('final test f1: %.4f' % score)
    return float(dev_f1), float(score)
def parallel_cv(self, split, emb=None, pos_emb=None, args=None):
    """Run one cross-validation fold, training with several fixed seeds.

    Args:
        split: fold index used to locate the fold's data directories.
        emb: word-embedding matrix (defaults to an empty array).
        pos_emb: POS embedding table (defaults to an empty list).
        args: argument namespace; ``args.seed`` is overwritten per run.

    Returns:
        (avg_f1, avg_epoch) averaged over the seed set.
    """
    # None-sentinel defaults: the old ``emb=np.array([])`` / ``pos_emb=[]``
    # were created once at definition time and shared across calls.
    if emb is None:
        emb = np.array([])
    if pos_emb is None:
        pos_emb = []

    params = {'batch_size': args.batch, 'shuffle': False}
    if args.bert_fts:
        type_dir = "cv_bertemb"
    else:
        type_dir = "cv_shuffle" if args.cv_shuffle else 'cv'

    backward_dir = ""
    if args.trainon in ('bothway', 'bothWselect'):
        if args.bert_fts:
            backward_dir = "%scv_backward_bertemb/fold%s/" % (args.data_dir, split)
        else:
            backward_dir = "%scv_backward/fold%s/" % (args.data_dir, split)

    fold_dir = args.data_dir + '%s/fold%s/' % (type_dir, split)
    train_data = EventDataset(fold_dir, "train", args.glove2vocab,
                              backward_dir, args.bert_fts)
    train_generator = get_data_loader(train_data, **params)
    dev_data = EventDataset(fold_dir, "dev", args.glove2vocab,
                            backward_dir, args.bert_fts)
    dev_generator = get_data_loader(dev_data, **params)

    seeds = [0, 10, 20]
    accumu_f1 = 0.
    accumu_epoch = 0.
    for seed in seeds:
        # Plain attribute assignment instead of exec("args.seed=%s" % seed).
        args.seed = seed
        f1, epoch = self._train(train_generator, dev_generator, emb,
                                pos_emb, args, in_cv=True)
        accumu_f1 += f1
        accumu_epoch += epoch

    avg_f1 = accumu_f1 / float(len(seeds))
    avg_epoch = accumu_epoch / float(len(seeds))
    return avg_f1, avg_epoch
def main_local(args):
    """Train the local model on the train split, select on dev, score on test.

    Args:
        args: parsed argument namespace (batch, bert_fts, trainon, data_dir,
              glove2vocab, ... — consumed by EventDataset / NNClassifier).

    Returns:
        (dev_f1, test_f1) as floats.
    """
    params = {'batch_size': args.batch, 'shuffle': False}

    # Directory layout depends on whether precomputed BERT features are used.
    if args.bert_fts:
        type_dir = "all_bertemb/"
        data_dir_back = args.data_dir + "all_backward_bertemb/"
    else:
        type_dir = "all/"
        data_dir_back = args.data_dir + "all_backward/"

    # Backward pairs feed train/dev only when training on both directions;
    # the test split always receives the backward directory.
    train_dev_back = data_dir_back if args.trainon in ('bothway', 'bothWselect') else ""

    train_data = EventDataset(args.data_dir + type_dir, "train",
                              args.glove2vocab, train_dev_back, args.bert_fts)
    print('total train_data %s samples' % len(train_data))
    train_generator = get_data_loader(train_data, **params)

    dev_data = EventDataset(args.data_dir + type_dir, "dev",
                            args.glove2vocab, train_dev_back, args.bert_fts)
    print('total dev_data %s samples' % len(dev_data))
    dev_generator = get_data_loader(dev_data, **params)

    test_data = EventDataset(args.data_dir + type_dir, "test",
                             args.glove2vocab, data_dir_back, args.bert_fts)
    test_generator = get_data_loader(test_data, **params)

    model = NNClassifier()
    dev_f1 = model.train_epoch(train_generator, dev_generator, args)
    evaluator = Evaluator(model)
    score = evaluator.get_score(test_generator, args)
    return float(dev_f1), float(score)
def train_epoch(self, train_data, dev_data, args, test_data=None):
    """Build label maps and embeddings, select hyper-parameters (CV or dev
    selection), optionally refit on train+dev, then train the model.

    Args:
        train_data, dev_data: data loaders for the train / dev splits.
        args: argument namespace; mutated in place with selected params.
        test_data: unused here; kept for interface compatibility.

    Returns:
        Best dev F1 of the final training run.
    """
    # Pick the label inventory for the dataset flavour.
    if args.data_type == "matres":
        label_map = matres_label_map
    elif args.data_type == "tbd":
        label_map = tbd_label_map
    else:
        label_map = new_label_map
    all_labels = list(OrderedDict.fromkeys(label_map.values()))
    self._label_to_id = OrderedDict((lab, i) for i, lab in enumerate(all_labels))
    self._id_to_label = OrderedDict((i, lab) for i, lab in enumerate(all_labels))
    args.label_to_id = self._label_to_id

    if args.joint:
        # Additional label maps for the joint causal task.
        all_labels_c = list(OrderedDict.fromkeys(causal_label_map.values()))
        self._label_to_id_c = OrderedDict((lab, i) for i, lab in enumerate(all_labels_c))
        self._id_to_label_c = OrderedDict((i, lab) for i, lab in enumerate(all_labels_c))

    emb = args.emb_array
    np.random.seed(0)
    # Prepend two random rows to the embedding matrix (extra vocab slots —
    # presumably padding/OOV; TODO confirm against EventDataset indexing).
    emb = np.vstack((np.random.uniform(0, 1, (2, emb.shape[1])), emb))
    assert emb.shape[0] == len(args.glove2vocab)
    # One-hot POS embedding with two extra slots.
    pos_emb = np.zeros((len(args.pos2idx) + 2, len(args.pos2idx) + 2))
    for i in range(pos_emb.shape[0]):
        pos_emb[i, i] = 1.0
    self.args = args

    selected_epoch = 20
    if args.cv:
        best_params, selected_epoch = self.cross_validation(
            emb, pos_emb, copy.deepcopy(args))
        # setattr preserves value objects; the old exec("args.%s=%s") broke
        # on string-valued parameters (they were parsed as identifiers).
        for k, v in best_params.items():
            setattr(args, k, v)
        if args.write:
            self._write_best_params(args, 'global_cv_bestparam_')
    elif args.selectparam:
        best_params, selected_epoch = self.selectparam(emb, pos_emb, args)
        for k, v in best_params.items():
            setattr(args, k, v)
        if args.write:
            self._write_best_params(args, 'global_selectDev_bestparam_')

    if args.refit_all:
        # Retrain on train+dev for the selected number of epochs.
        args.epochs = int(selected_epoch)
        print('refit all.....')
        params = {'batch_size': args.batch, 'shuffle': False}
        if args.bert_fts:
            type_dir = "all_bertemb/"
        else:
            type_dir = 'all/'
        data_dir_back = ""
        if args.trainon in ('bothway', 'bothWselect'):
            if args.bert_fts:
                data_dir_back = args.data_dir + "all_backward_bertemb/"
            else:
                data_dir_back = args.data_dir + "all_backward/"
        t_data = EventDataset(args.data_dir + type_dir, 'train',
                              args.glove2vocab, data_dir_back, args.bert_fts)
        d_data = EventDataset(args.data_dir + type_dir, 'dev',
                              args.glove2vocab, data_dir_back, args.bert_fts)
        t_data.merge_dataset(d_data)
        train_data = get_data_loader(t_data, **params)
        dev_data = []

    best_f1, best_epoch = self._train(train_data, dev_data, emb, pos_emb, args)
    print("Final Epoch Use: %s" % best_epoch)
    print("Final Dev F1: %.4f" % best_f1)
    return best_f1

def _write_best_params(self, args, prefix):
    """Dump all args (minus large arrays) to a best-param file under best_param/."""
    fname = ('best_param/' + prefix + str(args.data_type)
             + '_TrainOn' + str(args.trainon)
             + '_Teston' + str(args.teston)
             + '_uf' + str(args.usefeature)
             + '_trainpos' + str(args.train_pos_emb)
             + '_joint' + str(args.joint)
             + '_devbytrain' + str(args.devbytrain))
    with open(fname, 'w') as file:
        for k, v in vars(args).items():
            if (k != 'emb_array') and (k != 'glove2vocab'):
                file.write(str(k) + ' ' + str(v) + '\n')
def main():
    """CLI entry point: load config + ontology, restore the model checkpoint,
    run inference over the test set, and print/write per-node results.

    Returns 0 in all paths (including the error paths, which only log).
    """
    args = parse_args()
    level = logging.INFO
    if args.debug:
        level = logging.DEBUG
    logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s",
                        datefmt="%d-%m-%Y %H:%M:%S",
                        level=level)

    # load cfg
    if os.path.exists(args.cfg):
        with open(args.cfg) as f:
            # NOTE(review): yaml.load with FullLoader on a local config file;
            # safe_load would be stricter if configs ever come from users.
            cfg = yaml.load(f, Loader=yaml.FullLoader)
        logging.debug(cfg)
    else:
        logging.error(f"Cannot find cfg file: {args.cfg}")
        return 0

    # load ontology (graph path is resolved relative to the cfg file)
    OntReader = OntologyReader(graph_file=os.path.join(
        os.path.dirname(args.cfg), cfg["graph"]),
                               weighting_scheme=cfg["weighting_scheme"],
                               leaf_node_weight=cfg["leaf_node_weight"])

    # init torch — effective batch size scales with the number of GPUs
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        batch_size = torch.cuda.device_count() * args.batch_size
    else:
        batch_size = args.batch_size

    # build model and load checkpoint
    if cfg["model_type"] == "ontology":
        weights = OntReader.get_node_weights(cfg["redundancy_removal"])
        num_classes = len(weights)
    else:  # cfg["model_type"] == "classification"
        num_classes = OntReader.num_leafs
    if torch.cuda.device_count() == 0:
        logging.info(f"Test on CPU with batch_size {batch_size}")
    else:
        logging.info(
            f"Test on {torch.cuda.device_count()} GPU(s) with batch_size {batch_size}"
        )
    model = ResNet50(num_classes=num_classes,
                     model_type=cfg["model_type"],
                     redundancy_removal=cfg["redundancy_removal"])
    model.to(device)
    if torch.cuda.device_count() > 1:
        logging.info(f"Found {torch.cuda.device_count()} GPUs")
        model = nn.DataParallel(model)
    model.eval()
    # NOTE(review): .load is called on the (possibly DataParallel-wrapped)
    # model; plain nn.DataParallel does not forward custom methods — confirm
    # the project ResNet50 / wrapper supports this in the multi-GPU path.
    model.load(device=device,
               path=os.path.join(os.path.dirname(args.cfg),
                                 cfg["model_checkpoint"]))

    # Init testing dataset
    infer_dataset = EventDataset(image_dir=args.image_dir,
                                 testset_path=args.testset)
    infer_dataloader = DataLoader(infer_dataset,
                                  batch_size=batch_size,
                                  num_workers=8)

    # predict event classes for images
    sample_predictions = get_sample_predictions(
        infer_dataloader=infer_dataloader,
        OntReader=OntReader,
        model=model,
        device=device,
        s2l_strategy=args.s2l_strategy)

    # calculate result for all nodes in the ontology
    logging.info("Calculate results ...")
    node_results = get_test_results(sample_predictions=sample_predictions,
                                    OntReader=OntReader)

    # print final results (global results are stored in the root node occurrence (Q1190554))
    if "Q1190554" not in node_results:
        logging.warning("No results written ...")
        return 0
    print_results(node_results["Q1190554"]["metrics"],
                  node_results["Q1190554"]["num_test_images"])

    # write results for each node (one JSON object per line, sorted by support)
    if args.output:
        if not os.path.exists(os.path.dirname(args.output)):
            os.makedirs(os.path.dirname(args.output))
        result_list = []
        for val in node_results.values():
            # calculate mean result (metrics are accumulated sums per node)
            for metric, result in val["metrics"].items():
                val["metrics"][metric] = result / val["num_test_images"]
            result_list.append(val)
        result_list = sorted(result_list,
                             key=lambda x: x["num_test_images"],
                             reverse=True)
        with open(args.output, "w") as jsonfile:
            for result in result_list:
                jsonfile.write(json.dumps(result) + "\n")
        logging.info(f"Results written to: {args.output}")
    return 0