def train_net(solver_prototxt, roidb, output_dir, pretrained_model, max_iter, gpus):
    """Train the network on several GPUs, one worker process per GPU.

    :param solver_prototxt: the network prototxt
    :param roidb: the training roidb
    :param output_dir: the output directory to be used for saving the models
    :param pretrained_model: the pre-trained model for fine-tuning
    :param max_iter: maximum number of iterations for solver
    :param gpus: the GPU ids to be used for solving
    :return: None; the call blocks until every worker process has exited
    """
    # One NCCL uid is created here and handed to every worker.
    session_uid = caffe.NCCL.new_uid()
    caffe.init_log(0, True)
    caffe.log('Using devices %s' % str(gpus))

    # Spawn one process per entry in `gpus`; the rank is the index.
    world_size = len(gpus)
    workers = []
    for rank in range(world_size):
        worker_args = (rank, session_uid, gpus, solver_prototxt, roidb,
                       pretrained_model, max_iter, output_dir)
        proc = Process(target=worker, args=worker_args)
        proc.daemon = True
        proc.start()
        workers.append(proc)

    # Wait for all workers before reporting completion.
    for proc in workers:
        proc.join()

    print('done solving!')
def train_net(solver_prototxt, roidb, output_dir, pretrained_model, max_iter, gpus):
    """Run multi-GPU training by launching a `worker` process for each GPU.

    :param solver_prototxt: the network prototxt
    :param roidb: the training roidb
    :param output_dir: the output directory to be used for saving the models
    :param pretrained_model: the pre-trained model for fine-tuning
    :param max_iter: maximum number of iterations for solver
    :param gpus: the GPU ids to be used for solving
    :return: None; returns only after all worker processes have been joined
    """
    # The same NCCL uid is passed to every worker process.
    nccl_uid = caffe.NCCL.new_uid()
    caffe.init_log(0, True)
    caffe.log('Using devices %s' % str(gpus))

    children = []
    num_gpus = len(gpus)
    for rank in range(num_gpus):
        child = Process(
            target=worker,
            args=(rank, nccl_uid, gpus, solver_prototxt, roidb,
                  pretrained_model, max_iter, output_dir),
        )
        child.daemon = True
        child.start()
        children.append(child)

    # Block until every child has finished.
    for child in children:
        child.join()

    print('done solving!')
def train(
        solver,  # solver proto definition
        snapshot,  # solver snapshot to restore
        use_cpu,  # whether to use the cpu
        gpus,  # list of device ids
        timing=False,  # show timing info for compute and communications
):
    """Launch training either on the CPU or on one process per GPU.

    When `use_cpu` is truthy a single child process running `cpu_solve` is
    started and joined. Otherwise one `solve` process is started per entry
    in `gpus`, all sharing one NCCL uid, and the call blocks until every
    process has exited.

    `use_cpu` is evaluated by truthiness rather than compared with `True`.
    """
    caffe.init_log(0, True)
    caffe.log('Using devices %s' % str(gpus))

    if use_cpu:
        # CPU path: a single child process, no NCCL session needed.
        cpu_proc = Process(target=cpu_solve, args=(solver, snapshot, timing))
        cpu_proc.daemon = True
        cpu_proc.start()
        cpu_proc.join()
        return

    # NCCL uses a uid to identify a session
    uid = caffe.NCCL.new_uid()

    procs = []
    for rank in range(len(gpus)):
        proc = Process(target=solve,
                       args=(solver, snapshot, gpus, timing, uid, rank))
        proc.daemon = True
        proc.start()
        procs.append(proc)

    # Wait for every GPU worker to finish.
    for proc in procs:
        proc.join()
def show_time():
    """Log the per-layer forward and backward timings and the solver total.

    Reads `net`, `fprop`, `bprop` and `total` from the enclosing scope.
    Forward timings are listed in layer order, backward timings in reverse
    layer order, and the whole report is emitted as one `caffe.log` call.
    """
    layer_count = len(net.layers)
    parts = ['\n']

    for idx in range(layer_count):
        parts.append('forw %3d %8s ' % (idx, net._layer_names[idx]))
        parts.append(': %.2f\n' % fprop[idx].ms)

    # Backward pass is reported from the last layer down to the first.
    for idx in reversed(range(layer_count)):
        parts.append('back %3d %8s ' % (idx, net._layer_names[idx]))
        parts.append(': %.2f\n' % bprop[idx].ms)

    parts.append('solver total: %.2f\n' % total.ms)
    caffe.log(''.join(parts))
def train(
        solver,  # solver proto definition
        pretrained_model,  # pretrained model for initialization
        snapshot,  # solver snapshot to restore
        gpus,  # list of device ids
        timing=False,  # show timing info for compute and communications
):
    """Log the device list and run `solve` directly in the current process.

    No child processes are created and no NCCL uid is generated here; the
    last positional argument passed to `solve` is the constant 0
    (presumably the rank -- confirm against `solve`).
    """
    # Note: caffe.init_log() is not called before logging in this variant.
    device_msg = 'Using devices %s' % str(gpus)
    caffe.log(device_msg)

    solve(solver, pretrained_model, snapshot, gpus, timing, 0)
def show_time():
    """Log layer timings on every iteration that is a multiple of `display`.

    Reads `solver`, `display`, `fprop`, `bprop`, `total` and `allrd` from
    the enclosing scope. Nothing is logged on other iterations.
    """
    if solver.iter % display != 0:
        return

    layer_count = len(solver.net.layers)
    parts = ['\n']

    for idx in range(layer_count):
        parts.append('forw %3d %8s ' % (idx, solver.net._layer_names[idx]))
        parts.append(': %.2f\n' % fprop[idx].ms)

    # Backward timings are listed from the last layer to the first.
    for idx in reversed(range(layer_count)):
        parts.append('back %3d %8s ' % (idx, solver.net._layer_names[idx]))
        parts.append(': %.2f\n' % bprop[idx].ms)

    parts.append('solver total: %.2f\n' % total.ms)
    parts.append('allreduce: %.2f\n' % allrd.ms)
    caffe.log(''.join(parts))
def show_time():
    """Emit a timing report when `solver.iter` is a multiple of `display`.

    The report lists forward time per layer, backward time per layer in
    reverse order, the solver total and the allreduce time. `solver`,
    `display`, `fprop`, `bprop`, `total` and `allrd` come from the
    enclosing scope.
    """
    if solver.iter % display != 0:
        return

    num_layers = len(solver.net.layers)
    report = ['\n']

    for layer in range(num_layers):
        report.append('forw %3d %8s ' % (layer, solver.net._layer_names[layer]))
        report.append(': %.2f\n' % fprop[layer].ms)

    # Walk the layers backwards for the backward-pass timings.
    for layer in reversed(range(num_layers)):
        report.append('back %3d %8s ' % (layer, solver.net._layer_names[layer]))
        report.append(': %.2f\n' % bprop[layer].ms)

    report.append('solver total: %.2f\n' % total.ms)
    report.append('allreduce: %.2f\n' % allrd.ms)
    caffe.log(''.join(report))
def train_model_multi_gpu(solver_prototxt, pretrained_model, gpus, timing=False):
    """Start one `solve` process per GPU and wait for all of them to exit.

    Every process receives the solver prototxt, the pretrained model, the
    full `gpus` list, the `timing` flag, a shared NCCL uid and its own rank
    (its index into `gpus`).
    """
    nccl_uid = caffe.NCCL.new_uid()
    caffe.init_log()
    caffe.log('Using devices %s' % str(gpus))

    workers = []
    num_gpus = len(gpus)
    for rank in range(num_gpus):
        solve_args = (solver_prototxt, pretrained_model, gpus, timing,
                      nccl_uid, rank)
        proc = Process(target=solve, args=solve_args)
        proc.daemon = True
        proc.start()
        workers.append(proc)

    # Block until every worker has finished.
    for proc in workers:
        proc.join()
def train_net_multi_gpu(solver_prototxt, roidb, output_dir, pretrained_model, max_iter, gpus):
    """Train a Fast R-CNN network with one `solve` process per GPU.

    All processes share a single NCCL uid; each gets its rank (index into
    `gpus`). The processes are explicitly marked non-daemonic, and the call
    returns once all of them have been joined.
    """
    nccl_uid = caffe.NCCL.new_uid()
    caffe.init_log()
    caffe.log('Using devices %s' % str(gpus))

    workers = []
    num_gpus = len(gpus)
    for rank in range(num_gpus):
        solve_args = (solver_prototxt, roidb, pretrained_model, gpus,
                      nccl_uid, rank, output_dir, max_iter)
        proc = Process(target=solve, args=solve_args)
        proc.daemon = False
        proc.start()
        workers.append(proc)

    # Wait for every training process to finish.
    for proc in workers:
        proc.join()
def train(
        solver,  # solver proto definition
        initialization,  # weights or solver snapshot to restore from
        datasets,
        gpus  # list of device ids
):
    """Run `solve` in one child process per GPU and wait for completion.

    Each child receives the solver, the initialization, the datasets, the
    full `gpus` list, a shared NCCL uid and its own rank.
    """
    # NCCL uses a uid to identify a session
    session_uid = caffe.NCCL.new_uid()

    # Note: caffe.init_log() is not called before logging in this variant.
    caffe.log('Using devices %s' % str(gpus))

    children = []
    world_size = len(gpus)
    for rank in range(world_size):
        child = Process(
            target=solve,
            args=(solver, initialization, datasets, gpus, session_uid, rank),
        )
        child.daemon = True
        child.start()
        children.append(child)

    # Block until all children have exited.
    for child in children:
        child.join()
def train_net_multi_gpu(solver_prototxt, roidb, output_dir, pretrained_model, max_iters, gpus):
    """Train a Fast R-CNN network on several GPUs and return a path list.

    A multiprocessing queue is seeded with ``{'path_list': []}`` and handed
    to every `solve` process together with a shared NCCL uid and the
    process rank. After all processes have been joined, one item is taken
    from the queue and its ``'path_list'`` entry is returned.
    """
    nccl_uid = caffe.NCCL.new_uid()
    caffe.init_log()
    caffe.log('Using devices %s' % str(gpus))

    workers = []

    # Seed the shared queue with an empty result record.
    result_queue = multiprocessing.Queue()
    result_queue.put({'path_list': []})

    num_gpus = len(gpus)
    for rank in range(num_gpus):
        solve_args = (solver_prototxt, roidb, pretrained_model, gpus,
                      nccl_uid, rank, output_dir, max_iters, result_queue)
        proc = Process(target=solve, args=solve_args)
        proc.daemon = False
        proc.start()
        workers.append(proc)

    # NOTE(review): the queue is read only after every worker is joined;
    # if workers put large items this ordering can block -- confirm
    # against how `solve` uses the queue.
    for proc in workers:
        proc.join()

    return result_queue.get()['path_list']
def train(
        solver,  # solver proto definition
        snapshot,  # solver snapshot to restore
        gpus,  # list of device ids
        timing=False,  # show timing info for compute and communications
):
    """Start one `solve` process per GPU and block until all have exited.

    Each process is given the solver, the snapshot, the full `gpus` list,
    the `timing` flag, a shared NCCL uid and its own rank.
    """
    # NCCL uses a uid to identify a session
    session_uid = caffe.NCCL.new_uid()

    caffe.init_log()
    caffe.log('Using devices %s' % str(gpus))

    children = []
    world_size = len(gpus)
    for rank in range(world_size):
        child = Process(
            target=solve,
            args=(solver, snapshot, gpus, timing, session_uid, rank),
        )
        child.daemon = True
        child.start()
        children.append(child)

    # Wait for every child process.
    for child in children:
        child.join()
def train(
        solver,  # solver proto definition
        snapshot,  # solver snapshot to restore
        weight,  # caffemodel to load
        gpus,  # list of device ids
        timing=False,  # show timing info for compute and communications
):
    """Launch a `solve` process for each GPU and wait for all of them.

    Every process receives the solver, the snapshot, the weight file, the
    full `gpus` list, the `timing` flag, a shared NCCL uid and its rank.
    """
    # NCCL uses a uid to identify a session
    session_uid = caffe.NCCL.new_uid()

    # Note: caffe.init_log() is not called before logging in this variant.
    caffe.log('Using devices %s' % str(gpus))

    children = []
    world_size = len(gpus)
    for rank in range(world_size):
        child = Process(
            target=solve,
            args=(solver, snapshot, weight, gpus, timing, session_uid, rank),
        )
        child.daemon = True
        child.start()
        children.append(child)

    # Block until all children have exited.
    for child in children:
        child.join()
solver.net.copy_from(_weights) solver.net.layers[0].get_gpu_id(gpus[rank]) nccl = caffe.NCCL(solver, uid) nccl.bcast() solver.add_callback(nccl) if solver.param.layer_wise_reduce: solver.net.after_backward(nccl) for _ in range(max_iter): solver.step(1) if __name__ == '__main__': uid = caffe.NCCL.new_uid() caffe.init_log() caffe.log('Using devices %s' % str(gpus)) procs = [] for rank in range(len(gpus)): p = Process(target=solve, args=(solver_prototxt, gpus, uid, rank, max_iter)) p.daemon = False p.start() procs.append(p) for p in procs: p.join()
assert cfg.TEST.HAS_RPN base_imgs = '../DATA_imgs/' folders = ['scene_img_abstract_v002_train2015', 'scene_img_abstract_v002_val2015'] for data_path in folders: image_ids = load_image_ids(base_imgs+data_path+'/') print(len(image_ids), 'len_image_ids') random.seed(10) random.shuffle(image_ids) # Split image ids between gpus image_ids = [image_ids[i::len(gpus)] for i in range(len(gpus))] caffe.init_log() caffe.log('Using devices %s' % str(gpus)) procs = [] for i,gpu_id in enumerate(gpus): outfile = base_p+'outputs/'+data_path+'-features.tsv' outfile = '%s.%d' % (outfile, gpu_id) p = Process(target=generate_tsv, args=(gpu_id, prototxt, caffemodel, image_ids[i], outfile)) p.daemon = True p.start() procs.append(p) for p in procs: p.join()
def beam_decode(
        model,  # net proto definition
        vocab_file,  # model vocab text file
        weights,  # pretrained weights to use
        gpu,  # device id
        outfile,  # json output
):
    """Decode captions with a Caffe net, write them as JSON, then score them.

    The vocabulary is read one word per line from `vocab_file`. The net is
    run repeatedly; on each forward pass the 'image_id', 'caption' and
    'log_prob' blobs are read. For an unrolled net the first beam entry of
    each image is stored as that image's caption, and decoding stops after
    the first pass in which an already-seen image id shows up again. The
    collected captions are written to `outfile` sorted by image id and then
    passed to `CaptionScorer().score`.

    Progress output uses single-argument `print(...)` calls and the beam
    size uses floor division, so the function behaves the same on Python 2
    and Python 3.
    """
    with open(vocab_file) as f:
        vocab = [word.strip() for word in f]
    print('Loaded {:,} words into caption vocab'.format(len(vocab)))

    caffe.init_log(0, 1)
    caffe.log('Using device %s' % str(gpu))
    caffe.set_device(int(gpu))
    caffe.set_mode_gpu()

    net = caffe.Net(model, weights, caffe.TEST)
    print('Loaded proto {} with weights {}'.format(model, weights))
    net.layers[0].load_dataset()

    id_to_caption = {}
    iteration = 0
    while True:
        ending = False
        net.forward()
        image_ids = net.blobs['image_id'].data
        captions = net.blobs['caption'].data
        scores = net.blobs['log_prob'].data
        batch_size = image_ids.shape[0]

        if captions.shape[0] == batch_size:
            # Decoding a compact net
            # NOTE(review): this branch neither stores captions nor sets
            # `ending`, so the loop does not appear to terminate for
            # compact nets -- confirm intended behavior.
            beam_size = captions.shape[2]
            for n in range(batch_size):
                if iteration == 0:
                    print("\nhttp://mscoco.org/explore/?id=%d" % image_ids[n][0])
                for b in range(beam_size):
                    cap = translate(vocab, captions[n][0][b])
                    score = scores[n][0][b]
                    if iteration == 0:
                        print('[%d] %.2f %s' % (b, score, cap))
        else:
            # Decoding an unrolled net
            # Floor division keeps beam_size an int for range() below.
            beam_size = captions.shape[0] // batch_size
            if iteration == 0:
                print("Beam size: %d" % beam_size)
            for n in range(batch_size):
                image_id = int(image_ids[n][0])
                if iteration == 0:
                    print("\nhttp://mscoco.org/explore/?id=%d" % image_id)
                for b in range(beam_size):
                    cap = translate(vocab, captions[n * beam_size + b])
                    score = scores[n * beam_size + b]
                    if b == 0:
                        # Only the first beam entry is kept per image; a
                        # repeated image id marks this pass as the last.
                        if image_id in id_to_caption:
                            ending = True
                        else:
                            id_to_caption[image_id] = cap
                    if iteration == 0:
                        print('[%d] %.2f %s' % (b, score, cap))

        iteration += 1
        if iteration % 1000 == 0:
            print('Iteration: %d' % iteration)
        if ending:
            break

    output = [
        {'image_id': image_id, 'caption': id_to_caption[image_id]}
        for image_id in sorted(id_to_caption)
    ]
    with open(outfile, 'w') as f:
        json.dump(output, f)
    print('Generated %d outputs, saving to %s' % (len(output), outfile))

    s = CaptionScorer()
    s.score(outfile)
def _partseg_solve(solver, args, uid, rank, exp_dir, category):
    """Per-process training worker used by `partseg_train`.

    Defined at module level (rather than as a closure) so it can be used as
    a `Process` target under any multiprocessing start method. Configures
    Caffe for this rank, builds the solver from the solver definition
    `solver`, optionally loads initial weights and solver state, joins the
    NCCL session identified by `uid`, and steps for `max_iter` iterations.
    """
    if args.cpu:
        caffe.set_mode_cpu()
    else:
        caffe.set_mode_gpu()
        # NOTE(review): device selection is placed on the GPU branch;
        # confirm against the intended layout.
        caffe.set_device(args.gpus[rank])
    caffe.set_solver_count(len(args.gpus))
    caffe.set_solver_rank(rank)
    caffe.set_multiprocess(True)

    solver = caffe.get_solver(solver)

    # `init_model` is either a .caffemodel path or a value used to build
    # '<category>_iter_<value>.caffemodel' inside exp_dir.
    if args.init_model:
        if args.init_model.endswith('.caffemodel'):
            solver.net.copy_from(args.init_model)
        else:
            solver.net.copy_from(os.path.join(
                exp_dir,
                '{}_iter_{}.caffemodel'.format(category, args.init_model)))

    # Same convention for the solver state.
    if args.init_state:
        if args.init_state.endswith('.solverstate'):
            solver.restore(args.init_state)
        else:
            solver.restore(os.path.join(
                exp_dir,
                '{}_iter_{}.solverstate'.format(category, args.init_state)))

    nccl = caffe.NCCL(solver, uid)
    nccl.bcast()
    if solver.param.layer_wise_reduce:
        solver.net.after_backward(nccl)
    print(rank)
    solver.step(solver.param.max_iter)


def partseg_train(network, exp_dir, category, args):
    """Build the net and solver definitions and train on all `args.gpus`.

    If `network` is 'seq', train and deploy prototxts are generated with
    `models.partseg_seq` under `exp_dir`; otherwise `network` must be a
    path ending in '.prototxt'. A solver definition is then created with
    `create_solver.standard_solver`, and one process per entry in
    `args.gpus` is started; the call blocks until all of them exit.

    Raises:
        AssertionError: if `network` is neither 'seq' nor a '.prototxt' path.
    """
    if network == 'seq':
        batch_norm = True
        conv_weight_filler = 'xavier'
        # Keyword arguments shared by the train and deploy definitions.
        shared_kwargs = dict(
            arch_str=args.arch,
            skip_str=args.skips,
            dataset=args.dataset,
            dataset_params=args.dataset_params,
            category=category,
            feat_dims_str=args.feat,
            lattice_dims_str=args.lattice,
            sample_size=args.sample_size,
            batchnorm=batch_norm,
        )
        network = models.partseg_seq(
            batch_size=args.batch_size,
            conv_weight_filler=conv_weight_filler,
            save_path=os.path.join(exp_dir, category + '_net.prototxt'),
            **shared_kwargs)
        models.partseg_seq(
            deploy=True,
            save_path=os.path.join(exp_dir, category + '_net_deploy.prototxt'),
            **shared_kwargs)
    else:
        assert network.endswith('.prototxt'), 'Please provide a valid prototxt file'
        print('Using network defined at {}'.format(network))

    random_seed = 0
    debug_info = False

    solver = create_solver.standard_solver(
        network,
        network,
        os.path.join(exp_dir, category) + '_' + args.prefix,
        base_lr=args.base_lr,
        gamma=args.lr_decay,
        stepsize=args.stepsize,
        test_iter=args.test_iter,
        test_interval=args.test_interval,
        max_iter=args.num_iter,
        snapshot=args.snapshot_interval,
        solver_type=args.solver_type,
        weight_decay=args.weight_decay,
        iter_size=args.iter_size,
        debug_info=debug_info,
        random_seed=random_seed,
        save_path=os.path.join(exp_dir, category + '_solver.prototxt'))

    # Multiple GPUs: one process per device, all sharing one NCCL uid.
    uid = caffe.NCCL.new_uid()
    caffe.init_log(0, True)
    caffe.log('Using devices %s' % str(args.gpus))

    procs = []
    for rank in range(len(args.gpus)):
        p = Process(target=_partseg_solve,
                    args=(solver, args, uid, rank, exp_dir, category))
        p.daemon = True
        p.start()
        procs.append(p)

    # Wait for every worker to finish.
    for p in procs:
        p.join()
total.stop() show_time() if __name__ == '__main__': import argparse parser = argparse.ArgumentParser() parser.add_argument("net_file", help="network model proto definition.") parser.add_argument("-g", "--gpu", type=int, default=0, help="Gpu Id.") parser.add_argument("-i", "--iters", type=int, default=10, help="Number of test iterations") parser.add_argument("-d", "--dataset", help="ImageDataset JSON file") args = parser.parse_args() # caffe.init_log() caffe.log('Using GPU# %s' % str(args.gpu)) # init caffe caffe.set_device(args.gpu) caffe.set_mode_gpu() net = caffe.Net(args.net_file, caffe.TRAIN) if args.dataset is not None: print 'Loading dataset from {}'.format(args.dataset) dataset = rac.datasets.ImageDataset.from_json(args.dataset) print 'Loaded {} dataset with {} annotations'.format(dataset.name(), dataset.num_of_images()) net.layers[0].add_dataset(dataset) net.layers[0].generate_datum_ids() print 'Will now run Fwd and Bkwd for {} times'.format(args.iters)
force_boxes_json = json.load(open(_A.force_boxes))["annotations"] # Keep a map of image ID to force boxes. force_boxes_map = {} for annotation in force_boxes_json: if annotation["image_id"] not in force_boxes_map: force_boxes_map[annotation["image_id"]] = [annotation] else: force_boxes_map[annotation["image_id"]].append(annotation) # Make an H5 dataset to also store predicted classes if external boxes are provided. classes_dset = output_h5.create_dataset( "classes", (len(image_ids), ), h5py.special_dtype(vlen=np.uint32)) caffe.init_log() caffe.log("Using device {}".format(_A.gpu_id)) caffe.set_mode_gpu() caffe.set_device(_A.gpu_id) net = caffe.Net(_A.prototxt, caffe.TEST, weights=_A.caffemodel) for index, (image_id, image_file) in enumerate(tqdm(image_ids)): if _A.force_boxes is not None: # Get force_boxes if provided through args. force_boxes_annotations = force_boxes_map[image_id] force_boxes = np.asarray( [a["bbox"] for a in force_boxes_annotations], dtype=np.float32) else: force_boxes = None
# gpus = [int(i) for i in gpu_list] print('Using config:') pprint.pprint(cfg) assert cfg.TEST.HAS_RPN image_dict = load_image_dict(args.data_split) #import ipdb; ipdb.set_trace() # # Split image dictionary between gpus # image_dicts = [] # for x in range(len(gpus)): # image_dicts.append( dict(image_dict.items()[x::len(gpus)]) ) caffe.init_log() caffe.log('Using device %s' % str(gpu_id)) generate_h5(gpu_id, args.prototxt, args.caffemodel, image_dict, args.outfile) # 74 seconds, 48MB # Time required: ~ 17 days # Memory require: ~ 984 GB # procs = [] # for i,gpu_id in enumerate(gpus): # outfile = '%s.%d' % (args.outfile, gpu_id) # p = Process(target=generate_h5, # args=(gpu_id, args.prototxt, args.caffemodel, image_dicts[i], outfile)) # p.daemon = True # p.start() # procs.append(p) # for p in procs:
if rank == 0: logging.info('curr_iter: {}, step_iters: {}'.format( curr_iter, step_iters)) solver.snapshot() curr_iter += step_iters if __name__ == "__main__": solver_proto = 'models/multigpu/solver.prototxt' weights_file = 'data/imagenet_models/VGG16.v2.caffemodel' cfg_file = 'experiments/faster_rcnn_end2end.yml' gpus = [3, 4, 5, 6] # caffe caffe.init_log(0, True) caffe.log('Using device {}'.format(str(gpus))) uid = caffe.NCCL.new_uid() # cfg cfg_from_file(cfg_file) assert (cfg.TRAIN.HAS_RPN \ and cfg.TRAIN.BBOX_REG \ and cfg.TRAIN.BBOX_NORMALIZE_TARGETS \ and cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED) # roidb imdb_name = 'ftdata_train' imdb = get_imdb(imdb_name) if cfg.TRAIN.USE_FLIPPED: print 'Appending horizontally-flipped training examples...' imdb.append_flipped_images()