def initial(args, dimension, data_home, n_party, train_dimensions):
    """Build the per-client local-epoch list and feature split, then load data.

    Args:
        args: parsed CLI arguments; reads ``args.dimensions`` ("AVG" or not)
            and ``args.fix_epochs``.
        dimension: total number of features to distribute across clients.
        data_home: dataset root path, forwarded to ``get_dataset``.
        n_party: number of participating clients.
        train_dimensions: optional preset per-client feature-count list;
            falsy to derive a split from ``dimension`` instead.

    Returns:
        Tuple ``(local_epoch, feature, label, index)`` where ``local_epoch``
        is the per-client epoch list and the rest come from ``get_dataset``.
    """
    if train_dimensions:
        if args.dimensions == "AVG":
            local_epoch = [1] * n_party
        else:
            # Every client trains the same fixed number of epochs.
            # (The original code wrapped this single assignment in a
            # no-op `for` loop left over from a random-epoch variant;
            # the loop had no effect and is removed.)
            local_epoch = [args.fix_epochs] * n_party
        print("local_epoch", local_epoch)
        print("train_dimensions=", train_dimensions)
        # Copy so get_dataset cannot mutate the caller's list
        # (it is passed by reference and modified inside).
        train_dimensions_split = train_dimensions.copy()
        print('data_home:', data_home)
        feature, label, index = get_dataset(
            args, n_party, data_home, train_dimensions_split)
    else:
        if args.dimensions == "AVG":
            last = dimension % n_party   # features left over after an even split
            avg = dimension // n_party   # features per client under an even split
            train_dimensions = [avg] * n_party
            # The last client absorbs the remainder so the split sums exactly.
            train_dimensions[n_party - 1] = avg + last
            local_epoch = [1] * n_party
        else:
            local_epoch = []
            low = dimension // n_party - 10   # minimum random features per client
            high = dimension // n_party + 10  # maximum random features per client
            y0 = np.random.randint(low, high, size=n_party - 1)
            ratio = sum(y0) / dimension
            if n_party > 10:
                # Rescale so the sampled counts sum to roughly `dimension`.
                train_dimensions = y0 // ratio
            else:
                train_dimensions = y0
            train_dimensions = train_dimensions.tolist()
            # Last client takes whatever is left so the total is exact.
            train_dimensions.append(dimension - sum(train_dimensions))
            for i in range(n_party):
                train_dimensions[i] = int(train_dimensions[i])  # float -> int
                # Each client trains a random 1-9 local epochs.
                local_epoch.append(np.random.randint(1, 10))
        print("local_epoch", local_epoch)
        print("train_dimensions=", train_dimensions)
        # Copy so get_dataset cannot mutate the caller's list
        # (it is passed by reference and modified inside).
        train_dimensions_split = train_dimensions.copy()
        feature, label, index = get_dataset(
            args, n_party, data_home, train_dimensions_split)
    return local_epoch, feature, label, index
def annotation_parse(annotation_file, class_names):
    """Parse a test annotation txt file into per-image and per-class records.

    Args:
        annotation_file: test annotation txt file
        class_names: list of class names

    Return:
        annotation_records: OrderedDict mapping image path to its boxes, e.g.
            {'/path/to/000001.jpg': {'100,120,200,235': 'dog',
                                     '85,63,156,128': 'car', ...}, ...}
        classes_records: OrderedDict mapping class name to ground-truth items:
            {'car': [['000001.jpg', '100,120,200,235'],
                     ['000002.jpg', '85,63,156,128'], ...], ...}
    """
    annotation_records = OrderedDict()
    # Pre-seed every known class with an empty record list.
    classes_records = OrderedDict({name: [] for name in class_names})
    annotation_lines = get_dataset(annotation_file, shuffle=False)
    # Each annotation line looks like:
    # '/path/to/000001.jpg 100,120,200,235,11 85,63,156,128,14'
    for annotation_line in annotation_lines:
        fields = annotation_line.split(' ')
        image_path, raw_boxes = fields[0], fields[1:]
        per_image_boxes = {}
        for raw_box in raw_boxes:
            # Last comma-separated field is the class id; the rest are coords.
            parts = raw_box.split(',')
            label = class_names[int(parts[-1])]
            coords = ','.join(parts[:-1])
            per_image_boxes[coords] = label
            # Record this ground-truth box under its class.
            entry = [os.path.basename(image_path), coords]
            classes_records.setdefault(label, []).append(entry)
        annotation_records[image_path] = per_image_boxes
    return annotation_records, classes_records
def build_dataloader(args, tokenizer, logger):
    """ Prepare the dataset for training and evaluation """
    # personachat: dict of dataset splits -> list of dialogs.
    # NOTE(review): assumes each dialog has "personality", "utterances"
    # with "history"/"candidates"/"img_list" keys — confirm against get_dataset.
    personachat = get_dataset(tokenizer, args.dataset_path, args.dataset_cache, logger)
    logger.info("Build inputs and labels")
    datasets = {"train": defaultdict(list), "dev": defaultdict(list)}
    for dataset_name, dataset in personachat.items():
        # Candidate count taken from the first utterance of the first dialog,
        # optionally capped by args.num_candidates.
        num_candidates = len(dataset[0]["utterances"][0]["candidates"])
        if args.num_candidates > 0:  # and dataset_name == 'train':
            num_candidates = min(args.num_candidates, num_candidates)
        for dialog in tqdm(dataset):
            persona = dialog["personality"].copy()
            for utterance in dialog["utterances"]:
                # Keep at most 2*max_history exchanges plus the question turn.
                history = utterance["history"][-(2*args.max_history+1):]  # +1 as question
                img_list = utterance["img_list"]
                # Only the last candidate (the gold reply) gets LM labels.
                for j, candidate in enumerate(utterance["candidates"][-num_candidates:]):
                    lm_labels = bool(j == num_candidates-1)
                    instance = build_input_from_segments(persona, history, candidate, img_list, tokenizer, args, lm_labels)
                    for input_name, input_array in instance.items():
                        datasets[dataset_name][input_name].append(input_array)
                # Multiple-choice label: index of the gold candidate.
                datasets[dataset_name]["mc_labels"].append(num_candidates - 1)
        datasets[dataset_name]["n_candidates"] = num_candidates
    logger.info("Pad inputs and convert to Tensor")
    data = {}
    for dataset_name, dataset in datasets.items():
        # Pad with the id of the last special token (the pad token).
        dataset = pad_dataset(dataset, logger, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]))
        data[dataset_name] = dataset
    logger.info("Build train and validation dataloaders")
    train_dataset, valid_dataset = DialoImageDataset(data["train"], args.images_feature_path, "train"), DialoImageDataset(data["dev"], args.images_feature_path, "dev")
    # Distributed runs shard via samplers; shuffle is then handled by the sampler.
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if args.distributed else None
    valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if args.distributed else None
    train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=train_dataset.collate_fn,
                              num_workers=args.num_workers, shuffle=(not args.distributed))
    valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args.valid_batch_size, collate_fn=valid_dataset.collate_fn,
                              num_workers=args.num_workers, shuffle=False)
    logger.info("Train dataset (Batch, Seq length): {}".format(np.array(train_dataset.dataset["input_ids"]).shape))
    logger.info("Valid dataset (Batch, Seq length): {}".format(np.array(valid_dataset.dataset["input_ids"]).shape))
    return train_loader, valid_loader, train_sampler, valid_sampler
log_value('test_variance', variance, epoch) log_value('test_unique_policies', len(policy_set), epoch) # save the model --- agent agent_state_dict = agent.module.state_dict( ) if args.parallel else agent.state_dict() state = { 'agent': agent_state_dict, 'epoch': epoch, 'reward': reward, } torch.save(state, args.cv_dir + '/ckpt_E_%d_R_%.2E' % (epoch, reward)) #--------------------------------------------------------------------------------------------------------# trainset, testset = utils.get_dataset(args.img_size, args.data_dir) trainloader = torchdata.DataLoader(trainset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers) testloader = torchdata.DataLoader(testset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers) agent = utils.get_model(num_actions) # ---- Load the pre-trained model ---------------------- start_epoch = 0 if args.load is not None: checkpoint = torch.load(args.load) agent.load_state_dict(checkpoint['agent'])
rnet_hr_state_dict = rnet_hr.module.state_dict( ) if args.parallel else rnet_hr.state_dict() state = { 'agent': agent_state_dict, 'resnet': rnet_hr_state_dict, 'epoch': epoch, 'reward': reward, 'acc': accuracy } torch.save( state, args.cv_dir + '/ckpt_E_%d_A_%.3f_R_%.2E' % (epoch, accuracy, reward)) #--------------------------------------------------------------------------------------------------------# trainset, testset = utils.get_dataset(args.model, args.data_dir) trainloader = torchdata.DataLoader(trainset, batch_size=args.batch_size, shuffle=True, num_workers=8) testloader = torchdata.DataLoader(testset, batch_size=args.batch_size, shuffle=False, num_workers=8) rnet_hr, rnet_lr, agent = utils.get_model(args.model) rnet_hr.cuda() rnet_lr.eval().cuda() agent.cuda() # Save the configurations into the output folder configure(args.cv_dir + '/log', flush_secs=5)
def lr_scheduler(optim, iter):
    """Warmup/decay schedule applied in place to the first param group.

    iter < 10 : linear warmup from 0 to args.lr
    iter > 30 : hyperbolic decay, lr = args.lr * 30 / iter
    otherwise : constant args.lr

    NOTE(review): `iter` shadows the builtin and `args` is read from module
    scope — assumes `args.lr` is set before the first call.
    """
    if iter < 10:
        optim.param_groups[0]['lr'] = args.lr/10 *iter
    elif iter > 30:
        optim.param_groups[0]['lr'] = args.lr*(30/iter)
    else:
        optim.param_groups[0]['lr'] = args.lr


# --- module-level training setup (names below are used later in the file) ---
make_dir(args.log_path + 'unique_object/' + args.model_type + '/')
make_dir(args.models_path + 'unique_object/' + args.model_type + '/')
# TensorBoard writer keyed by model type and run name.
logger = SummaryWriter(args.log_path + 'unique_object/' + args.model_type + '/' + args.name)
# logger.add_hparams(args.get_dict(), {})
trSet, valSet = get_dataset()
net = get_net()
# Optimizer choice falls back to SGD for any unrecognized name.
if args.optimizer == 'Ranger':
    optimizer = Ranger(net.parameters(), lr=args.lr, alpha=0.5, k=5)
elif args.optimizer == 'Adam':
    optimizer = torch.optim.Adam(net.parameters(), lr=args.lr)
else:
    optimizer = torch.optim.SGD(net.parameters(), lr=args.lr)
# Datasets supply their own collate functions.
trDataloader = DataLoader(trSet, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=trSet.collate_fn)
valDataloader = DataLoader(valSet, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=valSet.collate_fn)
# torch.autograd.set_detect_anomaly(True)
iter_num = 0
def main_per_worker(process_index, ngpus_per_node, args):
    """Per-process training entry point (one worker per GPU).

    Builds model/optimizer/dataloaders, optionally joins a distributed
    process group, then trains forever (checkpointing is presumably handled
    inside Trainer.train — confirm in get_trainer).
    """
    update_config(cfg, args)
    # torch seed
    # NOTE(review): random.random() is a float in [0, 1) — this makes the
    # CUDA seed non-reproducible; confirm whether a fixed int seed was intended.
    torch.cuda.manual_seed(random.random())
    # cudnn
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED
    # Global rank of this process across all machines.
    proc_rank = args.rank * ngpus_per_node + process_index
    # create logger
    logger, output_dir = create_logger(cfg, proc_rank)
    # logger.info(pprint.pformat(args))
    # logger.info(cfg)
    model = get_model(cfg, cfg.MODEL.FILE, cfg.MODEL.NAME)
    optimizer = get_optimizer(cfg, model)
    model, optimizer, last_iter = load_checkpoint(cfg, model, optimizer)
    lr_scheduler = get_lr_scheduler(cfg, optimizer, last_iter)
    train_dataset, eval_dataset = get_dataset(cfg)

    # distribution
    if args.distributed:
        logger.info(f'Init process group: dist_url: {args.dist_url}, '
                    f'world_size: {args.world_size}, '
                    f'machine: {args.rank}, '
                    f'rank:{proc_rank}')
        dist.init_process_group(backend=cfg.DIST_BACKEND,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=proc_rank)
        torch.cuda.set_device(process_index)
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[process_index])
        # Sampler shards the dataset across ranks; per-GPU batch size.
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
        batch_size = cfg.DATASET.IMG_NUM_PER_GPU
    else:
        # Single-process DataParallel path; evaluation assumes rank 0.
        assert proc_rank == 0, ('proc_rank != 0, it will influence '
                                'the evaluation procedure')
        model = torch.nn.DataParallel(model).cuda()
        train_sampler = None
        batch_size = cfg.DATASET.IMG_NUM_PER_GPU * ngpus_per_node
    print('BATCH_SIZE: ', batch_size)
    # Shuffle only when no sampler shards the data (non-distributed case).
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               shuffle=(train_sampler is None),
                                               drop_last=True,
                                               collate_fn=objtrack_collect,
                                               num_workers=cfg.WORKERS,
                                               pin_memory=True,
                                               sampler=train_sampler)
    eval_loader = torch.utils.data.DataLoader(eval_dataset,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              drop_last=False,
                                              collate_fn=objtrack_collect,
                                              num_workers=cfg.WORKERS)
    criterion = get_det_criterion(cfg)
    Trainer = get_trainer(
        cfg,
        model,
        optimizer,
        lr_scheduler,
        criterion,
        output_dir,
        last_iter,
        proc_rank,
    )
    # Train indefinitely; termination is expected to come from outside
    # (e.g. the launcher) or from within Trainer.train.
    while True:
        Trainer.train(train_loader, eval_loader)
dest='yaml_file', default=None, help='experiment configure file name, e.g. configs/fcos_detector.yaml', type=str) parser.add_argument( 'opts', help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER) args = parser.parse_args() if __name__ == '__main__': update_config(cfg, args) model = get_model(cfg, cfg.MODEL.FILE, cfg.MODEL.NAME) resume_path = cfg.MODEL.RESUME_PATH _, eval_dataset = get_dataset(cfg) eval_loader = torch.utils.data.DataLoader( eval_dataset, batch_size=cfg.DATASET.IMG_NUM_PER_GPU, shuffle=False, drop_last=False, collate_fn=objtrack_collect, ) criterion = get_det_criterion(cfg) model = load_eval_model(resume_path, model) model.cuda() model.eval() mAP, aps, pr_curves = eval_fcos_det(cfg,
def main():
    """Knowledge-distillation driver: train a student against a teacher
    (model_type == "student") or train the teacher itself (otherwise),
    validating on a paired same/not-same image set.
    """
    parser = get_parser()
    args = parser.parse_args()
    setup(args)
    # read train data
    train_set = utils.get_dataset(args.train_data_dir)
    nrof_classes = len(train_set)
    # read validation data
    print('unit test directory: %s' % args.unit_test_dir)
    unit_test_paths, unit_actual_issame = utils.get_val_paths(
        os.path.expanduser(args.unit_test_dir))
    nrof_test_img = len(unit_test_paths)
    # Expand pair labels: each is-same flag covers two consecutive images.
    unit_issame_label = np.zeros(nrof_test_img)
    for i in range(len(unit_actual_issame)):
        unit_issame_label[2 * i] = unit_actual_issame[i]
        unit_issame_label[2 * i + 1] = unit_actual_issame[i]
    unit_issame_label = np.asarray(unit_issame_label, dtype=np.int32)
    # Get a list of image paths and their labels
    image_list, label_list = utils.get_image_paths_and_labels(train_set)
    assert len(image_list) > 0, 'The dataset should not be empty'
    print('Total number of train classes: %d' % nrof_classes)
    print('Total number of train examples: %d' % len(image_list))
    print("number of validation examples: %d" % nrof_test_img)
    #ipdb.set_trace()
    # Both loaders resize images to 160x160.
    train_dataset = data_loader.DataLoader(image_list, label_list, [160, 160],
                                           nrof_classes)
    validation_dataset = data_loader.DataLoader(unit_test_paths,
                                                unit_issame_label, [160, 160])
    tf.reset_default_graph()
    if args.model_type == "student":
        teacher_model = None
        if args.load_teacher_from_checkpoint:
            # Restore the teacher and sanity-check it before distillation.
            teacher_model = model.BigModel(args, "teacher", nrof_classes,
                                           nrof_test_img)
            teacher_model.start_session()
            teacher_model.load_model_from_file(
                args.load_teacher_checkpoint_dir)
            print("Verify Teacher State before Training Student")
            teacher_model.run_inference(validation_dataset, unit_actual_issame)
        student_model = model.SmallModel(args, "student", nrof_classes,
                                         nrof_test_img)
        student_model.start_session()
        # teacher_model may be None: the student then trains without distillation.
        student_model.train(train_dataset, validation_dataset,
                            unit_actual_issame, teacher_model)
        # Testing student model on the best model based on validation set
        student_model.load_model_from_file(args.checkpoint_dir)
        student_model.run_inference(validation_dataset, unit_actual_issame)
        if args.load_teacher_from_checkpoint:
            # Confirm the teacher was not disturbed by student training.
            print("Verify Teacher State After Training student Model")
            teacher_model.run_inference(validation_dataset, unit_actual_issame)
            teacher_model.close_session()
        student_model.close_session()
    else:
        teacher_model = model.BigModel(args, "teacher", nrof_classes,
                                       nrof_test_img)
        teacher_model.start_session()
        teacher_model.train(train_dataset, validation_dataset,
                            unit_actual_issame)
        # Testing teacher model on the best model based on validation set
        teacher_model.load_model_from_file(args.checkpoint_dir)
        teacher_model.run_inference(validation_dataset, unit_actual_issame)
        teacher_model.close_session()