def solve(proto, snapshot, weight, gpus, timing, uid, rank):
    """Per-process training entry point for multi-GPU SGD synced via NCCL.

    proto    -- path to the solver prototxt
    snapshot -- optional solverstate to resume from (takes precedence)
    weight   -- optional caffemodel to initialize weights from
    gpus     -- list of GPU ids; this process uses gpus[rank]
    timing   -- when true, rank 0 runs the `time` helper instead of
                registering the NCCL callback
    uid      -- NCCL unique id shared by all worker processes
    rank     -- this process's 0-based rank
    """
    # Bind this process to its GPU and register it with Caffe's
    # multi-process solver machinery.
    caffe.set_mode_gpu()
    caffe.set_device(gpus[rank])
    caffe.set_solver_count(len(gpus))
    caffe.set_solver_rank(rank)
    caffe.set_multiprocess(True)

    sgd = caffe.SGDSolver(proto)
    # A solverstate resume takes precedence over raw weights.
    if snapshot and len(snapshot):
        sgd.restore(snapshot)
    elif weight and len(weight):
        sgd.net.copy_from(weight)

    comm = caffe.NCCL(sgd, uid)
    comm.bcast()

    # Rank 0 optionally runs the timing harness; all other ranks just
    # register NCCL as the gradient-exchange callback.
    if timing and rank == 0:
        time(sgd, comm)
    else:
        sgd.add_callback(comm)

    if sgd.param.layer_wise_reduce:
        sgd.net.after_backward(comm)

    sgd.step(sgd.param.max_iter)
def solve(proto, snapshot, gpus, uid, rank):

    print 'Loading solver to GPU: ' + str(rank)

    caffe.set_mode_gpu()
    caffe.set_device(gpus[rank])
    caffe.set_solver_count(len(gpus))
    caffe.set_solver_rank(rank)
    caffe.set_multiprocess(True)

    solver = caffe.SGDSolver(proto)
    if snapshot and len(snapshot) != 0:
        print 'Loading snapshot from : ' + snapshot + '  to GPU: ' + str(rank)
        #solver.restore(snapshot)
        solver.net.copy_from(snapshot)

    nccl = caffe.NCCL(solver, uid)
    nccl.bcast()
    if timing and rank == 0:
        print 'Timing ON'
        time(solver, nccl)
    else:
        solver.add_callback(nccl)

    if plotting and rank == 0:
        print 'Plotting ON'
        plot(solver, nccl)

    if solver.param.layer_wise_reduce:
        solver.net.after_backward(nccl)

    print 'Starting solver for GPU: ' + str(rank)
    solver.step(solver.param.max_iter)
# --- Esempio n. 3 (scraper artifact; commented out so the file parses) ---
def solve(proto, pretrained_model, gpus, timing, uid, rank):
    caffe.set_mode_gpu()
    caffe.set_device(gpus[rank])
    caffe.set_solver_count(len(gpus))
    caffe.set_solver_rank(rank)
    caffe.set_multiprocess(True)

    solverW = SolverWrapper(proto, rank, pretrained_model)
    solver = solverW.getSolver()

    nccl = caffe.NCCL(solver, uid)
    nccl.bcast()

    print 'timing:', timing, rank, solver.param.layer_wise_reduce

    if timing and rank == 0:
        time(solver, nccl)
    else:
        solver.add_callback(nccl)

    if solver.param.layer_wise_reduce:
        solver.net.after_backward(nccl)

    cnt = 0
    while cnt < solver.param.max_iter:
        solver.step(1)
        print 'rank', rank, ' conv5_3:', solver.net.params[
            'conv_stage3_block2_branch2c'][0].data[0][0][0]
        cnt += 1
# --- Esempio n. 4 (scraper artifact; commented out so the file parses) ---
def solve(proto, snapshot, gpus, timing, uid, rank):
    """NCCL-synchronized SGD training worker (one process per GPU).

    proto    -- path to the solver prototxt
    snapshot -- optional solverstate to resume from
    gpus     -- list of GPU ids; this process uses gpus[rank]
    timing   -- run the `time` helper on rank 0 instead of registering
                the NCCL callback
    uid      -- NCCL unique id shared by all workers
    rank     -- this process's 0-based rank

    Cleanup: removed fossilized merge-conflict markers that had been
    left commented out around the device/mode setup; the resolved order
    (set_device before set_mode_gpu) is kept.
    """
    caffe.set_device(gpus[rank])
    caffe.set_mode_gpu()
    caffe.set_solver_count(len(gpus))
    caffe.set_solver_rank(rank)
    caffe.set_multiprocess(True)

    solver = caffe.SGDSolver(proto)
    if snapshot and len(snapshot) != 0:
        solver.restore(snapshot)

    nccl = caffe.NCCL(solver, uid)
    nccl.bcast()

    if timing and rank == 0:
        time(solver, nccl)
    else:
        solver.add_callback(nccl)

    if solver.param.layer_wise_reduce:
        solver.net.after_backward(nccl)
    solver.step(solver.param.max_iter)
# --- Esempio n. 5 (scraper artifact; commented out so the file parses) ---
def solve(proto, snapshot, weights, gpus, timing, uid, rank):
    """NCCL-synchronized training worker with RCNN-style data sharding.

    proto    -- path to the solver prototxt
    snapshot -- optional solverstate to resume from (takes precedence)
    weights  -- optional caffemodel to initialize weights from
    gpus     -- list of GPU ids; this process uses gpus[rank]
    timing   -- run the `time` helper on rank 0 instead of registering
                the NCCL callback
    uid      -- NCCL unique id shared by all workers
    rank     -- this process's 0-based rank
    """

    # Per-process GPU binding and multi-process registration.
    caffe.set_mode_gpu()
    caffe.set_device(gpus[rank])
    caffe.set_solver_count(len(gpus))
    caffe.set_solver_rank(rank)
    caffe.set_multiprocess(True)

    solver = caffe.SGDSolver(proto)
    if snapshot and len(snapshot) != 0:
        solver.restore(snapshot)
    elif weights and len(weights) != 0:
        # Python 2 print statement: the parenthesized string is formatted
        # first, then printed.  (Would break as-is under Python 3.)
        print('Loading pretrained model ' 'weights from {:s}').format(weights)
        solver.net.copy_from(weights)

    # For RCNNDataLayer, split the dataset across gpus
    if solver.net.layers[0].type == "Python":
        solver.net.layers[0].load_dataset(rank, len(gpus))

    nccl = caffe.NCCL(solver, uid)
    nccl.bcast()

    if timing and rank == 0:
        time(solver, nccl)
    else:
        solver.add_callback(nccl)

    if solver.param.layer_wise_reduce:
        solver.net.after_backward(nccl)

    solver.step(solver.param.max_iter)
def solve(proto, roidb, pretrained_model, gpus, uid, rank, output_dir, max_iter):
    """Faster-RCNN-style training worker: steps one iteration at a time,
    reports speed every 200 iterations (rank 0 only) and snapshots at
    every cfg.TRAIN.SNAPSHOT_ITERS boundary (rank 0 only).
    """
    # Per-process GPU binding and multi-process registration.
    caffe.set_mode_gpu()
    caffe.set_device(gpus[rank])
    caffe.set_solver_count(len(gpus))
    caffe.set_solver_rank(rank)
    caffe.set_multiprocess(True)
    cfg.GPU_ID = gpus[rank]

    solverW = SolverWrapper(proto, roidb, output_dir,rank,pretrained_model)
    solver = solverW.getSolver()
    nccl = caffe.NCCL(solver, uid)
    nccl.bcast()
    solver.add_callback(nccl)

    if solver.param.layer_wise_reduce:
        solver.net.after_backward(nccl)
    count = 0
    timer = Timer()
    while count < max_iter:
        timer.tic()
        solver.step(1)
        timer.toc()
        count += 1
        # if count % (solver.param.display) == 0:
        if count % 200 == 0:
            if rank == 0:
                print 'iter: {}, speed: {:.3f}s / iter'.format(count, timer.average_time)
        if count % cfg.TRAIN.SNAPSHOT_ITERS == 0:
            if rank == 0:
                # Snapshot only in the main process.
                solverW.snapshot()
def solve(proto,
          roidb,
          pretrained_model,
          gpus,
          uid,
          rank,
          output_dir,
          max_iter,
          previous_state=None):
    """Training worker that advances in snapshot-sized chunks.

    Rank 0 writes a snapshot after every cfg.TRAIN.SNAPSHOT_ITERS
    solver steps; `previous_state` is forwarded to SolverWrapper for
    resuming.
    """
    # Per-process GPU binding and multi-process registration.
    caffe.set_mode_gpu()
    caffe.set_device(gpus[rank])
    caffe.set_solver_count(len(gpus))
    caffe.set_solver_rank(rank)
    caffe.set_multiprocess(True)
    cfg.GPU_ID = gpus[rank]

    wrapper = SolverWrapper(proto, roidb, output_dir, rank, pretrained_model,
                            previous_state)
    sgd = wrapper.getSolver()
    comm = caffe.NCCL(sgd, uid)
    comm.bcast()
    sgd.add_callback(comm)

    if sgd.param.layer_wise_reduce:
        sgd.net.after_backward(comm)

    done = 0
    while done < max_iter:
        sgd.step(cfg.TRAIN.SNAPSHOT_ITERS)
        if rank == 0:
            wrapper.snapshot()
        done = done + cfg.TRAIN.SNAPSHOT_ITERS
# --- Esempio n. 8 (scraper artifact; commented out so the file parses) ---
def solve(proto, roidb, pretrained_model, gpus, uid, rank, output_dir,
          max_iter, reload):
    """Single-step training worker with periodic rank-0 snapshots.

    Steps one iteration at a time so the snapshot condition is checked
    exactly on multiples of cfg.TRAIN.SNAPSHOT_ITERS.
    """
    # GPU binding for this worker process.
    caffe.set_device(gpus[rank])
    caffe.set_mode_gpu()
    caffe.set_solver_count(len(gpus))

    caffe.set_solver_rank(rank)

    caffe.set_multiprocess(True)
    cfg.GPU_ID = gpus[rank]

    wrapper = SolverWrapper(solver_prototxt=proto,
                            roidb=roidb,
                            output_dir=output_dir,
                            gpu_id=rank,
                            pretrained_model=pretrained_model,
                            reload=reload)
    sgd = wrapper.get_solver()
    comm = caffe.NCCL(sgd, uid)
    comm.bcast()
    sgd.add_callback(comm)

    if sgd.param.layer_wise_reduce:
        sgd.net.after_backward(comm)

    while sgd.iter < max_iter:
        sgd.step(1)
        if sgd.iter % cfg.TRAIN.SNAPSHOT_ITERS == 0 and rank == 0:
            # Snapshot only on the main rank.
            wrapper.snapshot()
def solve(proto, snapshot, gpus, timing, uid, rank):
    """Training worker that advances in 100-iteration chunks, logging
    progress to stderr.  Rank 1 sleeps one second per chunk (presumably
    a debugging aid to skew the ranks — kept as-is).
    """
    # Per-process GPU binding and multi-process registration.
    caffe.set_mode_gpu()
    caffe.set_device(gpus[rank])
    caffe.set_solver_count(len(gpus))
    caffe.set_solver_rank(rank)
    caffe.set_multiprocess(True)

    sgd = caffe.SGDSolver(proto)
    if snapshot and len(snapshot):
        sgd.restore(snapshot)

    comm = caffe.NCCL(sgd, uid)
    comm.bcast()

    if timing and rank == 0:
        time(sgd, comm)
    else:
        sgd.add_callback(comm)

    # Layer-wise reduce deliberately disabled in the original:
    # if solver.param.layer_wise_reduce:
    #     solver.net.after_backward(nccl)
    while sgd.iter < sgd.param.max_iter:
        sgd.step(100)
        sys.stderr.write("rank: {} iter: {}\n".format(rank, sgd.iter))
        if rank == 1:
            sleep(1)
# --- Esempio n. 10 (scraper artifact; commented out so the file parses) ---
def solve(gpus, uid, rank, solver_proto, roidb, weights=None, snapshot=None):
    """Training worker that steps to each snapshot boundary, snapshotting
    on rank 0.

    gpus         -- list of GPU ids; this process uses gpus[rank]
    uid          -- NCCL unique id shared by all workers
    rank         -- this process's 0-based rank
    solver_proto -- path to the solver prototxt
    roidb        -- training roidb handed to the input data layer
    weights      -- optional caffemodel to initialize weights from
    snapshot     -- optional solverstate to resume from

    Bug fix: ``curr_iter += step_iters`` was inside the ``rank == 0``
    branch, so every non-zero rank never advanced its loop counter and
    looped forever.  The counter is now advanced on all ranks; only the
    logging and snapshotting stay rank-0-only.
    """
    cfg.GPU_ID = gpus[rank]
    # setting for current process
    caffe.set_mode_gpu()
    caffe.set_device(gpus[rank])
    caffe.set_solver_count(len(gpus))
    caffe.set_solver_rank(rank)
    caffe.set_multiprocess(True)

    solver = caffe.SGDSolver(solver_proto)
    logging.info('uid: {}, rank: {}, layer_wise_reduce: {}'.format(
        uid, rank, solver.param.layer_wise_reduce))
    max_iter = solver.param.max_iter
    snapshot_iters = solver.param.snapshot
    if snapshot:
        solver.restore(snapshot)
    if weights:
        solver.net.copy_from(weights)
    solver.net.layers[0].set_roidb(roidb, rank)

    nccl = caffe.NCCL(solver, uid)
    solver.add_callback(nccl)
    if solver.param.layer_wise_reduce:
        solver.net.after_backward(nccl)

    nccl.bcast()
    curr_iter = solver.iter
    while curr_iter < max_iter:
        # Step exactly up to the next snapshot boundary.
        step_iters = snapshot_iters - curr_iter % snapshot_iters
        solver.step(step_iters)
        if rank == 0:
            logging.info('curr_iter: {}, step_iters: {}'.format(
                curr_iter, step_iters))
            solver.snapshot()
        # All ranks must advance, or non-zero ranks spin forever.
        curr_iter += step_iters
# --- Esempio n. 11 (scraper artifact; commented out so the file parses) ---
    def solve2(solver, args, uid, rank):
        """NCCL training worker (CPU or GPU per args.cpu).

        solver -- path to the solver definition; note the parameter is
                  rebound below to the constructed solver object
        args   -- parsed CLI options (cpu, gpus, init_model, init_state)
        uid    -- NCCL unique id shared by all workers
        rank   -- this process's 0-based rank

        NOTE(review): relies on `exp_dir` and `category` from an
        enclosing scope not visible here — confirm they are defined
        where this function is created.
        """
        if args.cpu:
            caffe.set_mode_cpu()
        else:
            caffe.set_mode_gpu()
        caffe.set_device(args.gpus[rank])
        caffe.set_solver_count(len(args.gpus))
        caffe.set_solver_rank(rank)
        caffe.set_multiprocess(True)
        
        # Rebinds the `solver` parameter from a path to a solver object.
        solver = caffe.get_solver(solver)

        # Either a direct .caffemodel path, or an iteration tag resolved
        # against the experiment directory.
        if args.init_model:
            if args.init_model.endswith('.caffemodel'):
                solver.net.copy_from(args.init_model)
            else:
                solver.net.copy_from(os.path.join(exp_dir, '{}_iter_{}.caffemodel'.format(category, args.init_model)))

        # Same convention for resuming a solverstate.
        if args.init_state:
            if args.init_state.endswith('.solverstate'):
                solver.restore(args.init_state)
            else:
                solver.restore(os.path.join(exp_dir, '{}_iter_{}.solverstate'.format(category, args.init_state)))

        nccl = caffe.NCCL(solver, uid)
        nccl.bcast()
        if solver.param.layer_wise_reduce:
            solver.net.after_backward(nccl)
        print(rank)
        #pdb.set_trace()
        solver.step(solver.param.max_iter)
# --- Esempio n. 12 (scraper artifact; commented out so the file parses) ---
def worker(rank, uid, gpus, solver_prototxt, roidb, pretrained_model, max_iter,
           output_dir):
    """
    Training worker
    :param rank: The process rank
    :param uid: The caffe NCCL uid
    :param solver_prototxt: Solver prototxt
    :param roidb: Training roidb
    :param pretrained_model: Pretrained model
    :param gpus: GPUs to be used for training
    :param max_iter: Maximum number of training iterations
    :param output_dir: Output directory used for saving models
    :return:
    """

    # Setup caffe
    caffe.set_device(gpus[rank])
    caffe.set_mode_gpu()
    caffe.set_solver_count(len(gpus))
    caffe.set_solver_rank(rank)
    caffe.set_multiprocess(True)
    cfg.GPU_ID = gpus[rank]

    # Setup Solver
    solverW = SolverWrapper(solver_prototxt=solver_prototxt,
                            roidb=roidb,
                            output_dir=output_dir,
                            gpu_id=rank,
                            pretrained_model=pretrained_model)
    solver = solverW.get_solver()
    nccl = caffe.NCCL(solver, uid)
    nccl.bcast()
    solver.add_callback(nccl)

    if solver.param.layer_wise_reduce:
        solver.net.after_backward(nccl)

    # Train the model for the specified number of iterations
    while solver.iter < max_iter:
        solver.step(1)
        #A = solver.net.blobs['conv4_3']
        #print(A.data.shape)
        #[(k, v.data.shape) for k, v in solver.net.blobs.items()]
        # NOTE(review): snapshots at SNAPSHOT multiples or at
        # max_iter - 1 (one before the loop's last value) — confirm the
        # off-by-one is intended.
        if (solver.iter % cfg.TRAIN.SNAPSHOT == 0
                or solver.iter == max_iter - 1) and rank == 0:
            # Snapshot only in the main process
            solverW.snapshot()
# --- Esempio n. 13 (scraper artifact; commented out so the file parses) ---
def solve(proto, roidb, pretrained_model, gpus, uid, rank, output_dir,
          max_iter):
    """Faster-RCNN training worker that logs RPN/FRCN losses and accuracy.

    NOTE(review): this code relies on Python 2 integer division
    (SNAPSHOT_ITERS / display_step); under Python 3 the quotient would
    be a float and solver.step would receive a non-integer.  Also note
    `count` advances by SNAPSHOT_ITERS per loop pass although each pass
    only steps SNAPSHOT_ITERS / display_step iterations — confirm this
    bookkeeping is intended.
    """
    caffe.set_mode_gpu()
    caffe.set_device(gpus[rank])
    caffe.set_solver_count(len(gpus))
    caffe.set_solver_rank(rank)
    caffe.set_multiprocess(True)
    cfg.GPU_ID = gpus[rank]

    solverW = SolverWrapper(proto, roidb, output_dir, rank, pretrained_model)
    solver = solverW.getSolver()
    nccl = caffe.NCCL(solver, uid)
    nccl.bcast()
    solver.add_callback(nccl)

    if solver.param.layer_wise_reduce:
        solver.net.after_backward(nccl)
    net = solver.net
    count = 0
    rpn_loss_cls = 0
    rpn_loss_bbox = 0
    frcn_loss_cls = 0
    frcn_loss_bbox = 0
    # 'accuarcy' matches the (misspelled) blob name in the net definition.
    accuarcy = 0
    timer = Timer()
    display_step = 500
    while count < max_iter:
        timer.tic()
        solver.step(cfg.TRAIN.SNAPSHOT_ITERS / display_step)
        timer.toc()
        # Read the latest loss/accuracy blobs after the chunk of steps.
        rpn_loss_cls = net.blobs['rpn_cls_loss'].data
        rpn_loss_bbox = net.blobs['rpn_loss_bbox'].data
        frcn_loss_cls = net.blobs['loss_cls'].data
        frcn_loss_bbox = net.blobs['loss_bbox'].data
        accuarcy = net.blobs['accuarcy'].data
        if solver.iter % (cfg.TRAIN.SNAPSHOT_ITERS / display_step) == 0:
            print 'speed: {:.3f}s / iter'.format(
                timer.average_time / (cfg.TRAIN.SNAPSHOT_ITERS / display_step))
            print 'rpn_loss_cls:' + str(
                rpn_loss_cls) + ',rpn_loss_bbox:' + str(
                    rpn_loss_bbox) + ',frcn_loss_cls:' + str(
                        frcn_loss_cls) + ',frcn_loss_bbox:' + str(
                            frcn_loss_bbox) + ',accuarcy' + str(accuarcy)
        if (rank == 0) and (solver.iter % cfg.TRAIN.SNAPSHOT_ITERS == 0):
            # Snapshot only on the main rank.
            solverW.snapshot()
        count = count + cfg.TRAIN.SNAPSHOT_ITERS
    def __init__(self,
                 solver_prototxt,
                 roidb,
                 output_dir,
                 nccl_uid,
                 rank,
                 bbox_means=None,
                 bbox_stds=None,
                 pretrained_model=None):
        """Initialize the SolverWrapper.

        Builds the SGD solver, optionally loads pretrained weights,
        wires up NCCL callbacks for multi-GPU training, and hands the
        roidb to the input data layer.

        solver_prototxt  -- path to the solver definition
        roidb            -- training roidb for the input layer
        output_dir       -- where snapshots are written
        nccl_uid         -- NCCL unique id shared by all workers
        rank             -- this process's 0-based rank
        bbox_means/stds  -- bbox target normalization stats (used only
                            when cfg.TRAIN.BBOX_REG is on)
        pretrained_model -- optional caffemodel to initialize from
        """
        self.output_dir = output_dir
        self.rank = rank
        if cfg.TRAIN.BBOX_REG:
            self.bbox_means, self.bbox_stds = bbox_means, bbox_stds
        if (cfg.TRAIN.HAS_RPN and cfg.TRAIN.BBOX_REG
                and cfg.TRAIN.BBOX_NORMALIZE_TARGETS):
            # RPN can only use precomputed normalization because there are no
            # fixed statistics to compute a priori
            assert cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED

        self.solver = caffe.SGDSolver(solver_prototxt)

        # The effective global batch (solvers x ims/batch x iter_size)
        # must equal the configured REAL_BATCH_SIZE.
        assert caffe.solver_count() * cfg.TRAIN.IMS_PER_BATCH * self.solver.param.iter_size == \
            cfg.TRAIN.REAL_BATCH_SIZE, "{} vs {}". \
            format(caffe.solver_count() * cfg.TRAIN.IMS_PER_BATCH * self.solver.param.iter_size, cfg.TRAIN.REAL_BATCH_SIZE)

        if pretrained_model is not None:
            # Python 2 print statement: the string is formatted, then printed.
            print('Loading pretrained model '
                  'weights from {:s}').format(pretrained_model)
            self.solver.net.copy_from(pretrained_model)

        nccl = caffe.NCCL(self.solver, nccl_uid)
        nccl.bcast()
        self.solver.add_callback(nccl)
        # This wrapper requires layer-wise reduction to be enabled.
        assert self.solver.param.layer_wise_reduce
        if self.solver.param.layer_wise_reduce:
            self.solver.net.after_backward(nccl)
        self.nccl = nccl  # hold the reference to nccl

        # Re-parse the prototxt to keep a SolverParameter copy around.
        self.solver_param = caffe_pb2.SolverParameter()
        with open(solver_prototxt, 'rt') as f:
            pb2.text_format.Merge(f.read(), self.solver_param)

        self.solver.net.layers[0].set_roidb(roidb)
# --- Esempio n. 15 (scraper artifact; commented out so the file parses) ---
def solve(proto, initialization, datasets, gpus, uid, rank):
    caffe.set_mode_gpu()
    caffe.set_device(gpus[rank])
    caffe.set_solver_count(len(gpus))
    caffe.set_solver_rank(rank)
    caffe.set_multiprocess(True)

    solver = caffe.SGDSolver(proto)

    if initialization is not None:
        assert osp.exists(
            initialization
        ), 'Path to weights/solverstate does not exist: {}'.format(
            initialization)
        if initialization.endswith('.solverstate'):
            print 'Restoring solverstate from {}'.format(initialization)
            solver.restore(initialization)
        elif initialization.endswith('.caffemodel'):
            print 'Initializing weights from {}'.format(initialization)
            solver.net.copy_from(initialization)
        else:
            raise ValueError(
                'ERROR: {} is not supported for initailization'.format(
                    initialization))
    else:
        warnings.warn(
            "Warning: No initialization provided. Training from scratch.")

    for dataset in datasets:
        solver.net.layers[0].add_dataset(dataset)
    solver.net.layers[0].print_params()
    solver.net.layers[0].generate_datum_ids()

    nccl = caffe.NCCL(solver, uid)
    nccl.bcast()

    solver.add_callback(nccl)

    if solver.param.layer_wise_reduce:
        solver.net.after_backward(nccl)
    solver.step(solver.param.max_iter)
# --- Esempio n. 16 (scraper artifact; commented out so the file parses) ---
    def start(self, rank):
        """Configure Caffe for this worker and build the Adam solver.

        Chooses GPU or CPU mode based on whether `self.gpus` is empty,
        loads the solver graph (from a .json path via text_format, or by
        calling self.solver_graph()), writes it to a temp file, and
        constructs the solver.  If a NCCL uid was supplied, registers
        NCCL callbacks for multi-process training.

        rank -- this process's 0-based rank.
        """
        self.rank = rank

        if len(self.gpus) > 0:
            self.device = self.gpus[rank]
            if debug:
                s = 'solver gpu %d' % self.gpus[self.rank] + \
                    ' pid %d' % os.getpid() + ' size %d' % self.size + \
                    ' rank %d' % self.rank
                print(s, file = sys.stderr)
            caffe.set_mode_gpu()
            caffe.set_device(self.device)
            caffe.set_solver_count(self.size)
            caffe.set_solver_rank(self.rank)
            caffe.set_multiprocess(True)
        else:
            print('solver cpu', file = sys.stderr)
            caffe.set_mode_cpu()

        # NOTE(review): despite the .json suffix check, the file is parsed
        # with protobuf text_format — confirm the extension convention.
        if self.cmd.graph.endswith('.json'):
            with open(self.cmd.graph, mode = 'r') as f:
                graph = caffe_pb2.SolverParameter()
                text_format.Merge(f.read(), graph)
                self.graph = graph
        else:
            self.graph = self.solver_graph()

        # Serialize the solver parameter to a temp file because the
        # solver constructor takes a path, not a message.
        import tempfile
        with tempfile.NamedTemporaryFile(mode = 'w+', delete = False) as f:
            text_format.PrintMessage(self.graph, f)
            tmp = f.name
        self.caffe = caffe.AdamSolver(tmp)

        # Multi-process NCCL wiring only when a uid was provided.
        if self.uid:
            self.nccl = caffe.NCCL(self.caffe, self.uid)
            self.nccl.bcast()
            self.caffe.add_callback(self.nccl)
            if self.caffe.param.layer_wise_reduce:
                self.caffe.net.after_backward(self.nccl)
# --- Esempio n. 17 (scraper artifact; commented out so the file parses) ---
def solve_step(proto, snapshot, gpus, timing, uid, rank):
    """NCCL training worker that steps one iteration at a time.

    NOTE(review): the loss/accuracy accumulators set up below are never
    updated in the visible loop body — this looks like a truncated copy
    of the fuller solve_step variant later in this file; confirm against
    the original source.
    """
    caffe.set_mode_gpu()
    caffe.set_device(gpus[rank])
    caffe.set_solver_count(len(gpus))
    caffe.set_solver_rank(rank)
    caffe.set_multiprocess(True)

    solver = caffe.SGDSolver(proto)
    if snapshot and len(snapshot) != 0:
        solver.restore(snapshot)

    nccl = caffe.NCCL(solver, uid)
    nccl.bcast()

    if timing and rank == 0:
        time(solver, nccl)
    else:
        solver.add_callback(nccl)

    if solver.param.layer_wise_reduce:
        solver.net.after_backward(nccl)

    niter = solver.param.max_iter
    display = solver.param.display
    test_iter = 950
    test_interval = 200
    # Initialization of per-interval accumulator arrays
    # (zeros/ceil presumably come from numpy/math star imports — verify).
    train_loss = zeros(int(ceil(niter // display)))
    test_loss = zeros(int(ceil(niter // test_interval)))
    test_acc = zeros(int(ceil(niter // test_interval)))
    # Helper variables
    _train_loss = 0;
    _test_loss = 0;
    _accuracy = 0;
    _max_accuracy = 0;
    _max_accuracy_iter = 0;
    # Run the solve
    for it in range(niter):
        solver.step(1)
def solve(proto, gpus, uid, rank, max_iter):
    """Training worker that loads weights on rank 0 only and then runs
    max_iter single-iteration steps.
    """
    # Per-process GPU binding and multi-process registration.
    caffe.set_mode_gpu()
    caffe.set_device(gpus[rank])
    caffe.set_solver_count(len(gpus))
    caffe.set_solver_rank(rank)
    caffe.set_multiprocess(True)

    sgd = caffe.SGDSolver(proto)
    # Only rank 0 loads the pretrained weights; the nccl.bcast() below
    # presumably propagates them to the other ranks — verify.
    if rank == 0:
        # solver.restore(_snapshot)
        sgd.net.copy_from(_weights)

    # Tell the input (python) layer which GPU this process owns.
    sgd.net.layers[0].get_gpu_id(gpus[rank])

    comm = caffe.NCCL(sgd, uid)
    comm.bcast()
    sgd.add_callback(comm)

    if sgd.param.layer_wise_reduce:
        sgd.net.after_backward(comm)

    for _step in range(max_iter):
        sgd.step(1)
# --- Esempio n. 19 (scraper artifact; commented out so the file parses) ---
def worker(rank, uid, gpus, solver_prototxt, roidb, pretrained_model, max_iter,
           output_dir):
    """
    Training worker
    :param rank: The process rank
    :param uid: The caffe NCCL uid
    :param solver_prototxt: Solver prototxt
    :param roidb: Training roidb
    :param pretrained_model: Pretrained model
    :param gpus: GPUs to be used for training
    :param max_iter: Maximum number of training iterations
    :param output_dir: Output directory used for saving models
    :return:

    Fixes: the Tensorboard submission guard used a bare ``except:``,
    which also swallows KeyboardInterrupt/SystemExit; it now catches
    ``Exception`` only.  ``target_layers`` is materialized as a list so
    it can safely be re-iterated every training step (``filter`` is a
    one-shot iterator on Python 3).
    """

    # Setup caffe
    cfg.RANK = rank
    cfg.GPU_ID = gpus[rank]  # Will be used in gpu_nms
    caffe.set_device(cfg.GPU_ID)
    caffe.set_random_seed(cfg.RNG_SEED + rank)
    caffe.set_mode_gpu()
    caffe.set_solver_count(len(gpus))
    caffe.set_solver_rank(rank)
    caffe.set_multiprocess(True)

    # Setup Solver
    solverW = SolverWrapper(
        solver_prototxt=str(solver_prototxt),
        roidb=roidb,
        output_dir=str(output_dir),
        rank=rank,
        pretrained_model=str(pretrained_model))
    solver = solverW.get_solver()
    nccl = caffe.NCCL(solver, uid)
    nccl.bcast()
    solver.add_callback(nccl)

    if solver.param.layer_wise_reduce:
        solver.net.after_backward(nccl)

    # Train the model for the specified number of iterations.
    # Materialized once: this collection is re-iterated on every step.
    target_layers = [name for name in solver.net.layer_dict.keys()
                     if name.startswith('target_layer')]

    if rank == 0:
        t = Timer()

    while solver.iter < max_iter:
        # Let every target layer know the current iteration first.
        for n in target_layers:
            solver.net.layer_dict[n].set_iter(solver.iter)
        if rank == 0:
            t.tic()
        solver.step(1)
        if (solver.iter % cfg.TRAIN.SNAPSHOT == 0
                or solver.iter == max_iter) and rank == 0:
            # Snapshot only in the main process
            solverW.snapshot(solver.iter == max_iter)
        if rank == 0:
            t.toc()
            eta_in_s = int((max_iter - solver.iter) * t.average_time)
            try:
                for loss_name, loss_val in solver.net.blobs.items():
                    if 'loss' not in loss_name:
                        continue
                    tb.sess.add_scalar_value(
                        loss_name, float(loss_val.data), step=solver.iter)
                for n in target_layers:
                    tb.sess.add_scalar_value(
                        n + '_accuracy',
                        float(solver.net.layer_dict[n].accuracy),
                        step=solver.iter)
                tb.sess.add_scalar_value(
                    "speed", 1. / t.average_time, step=solver.iter)
                tb.sess.add_scalar_value(
                    "ETA (min)", eta_in_s / 60., step=solver.iter)
            except Exception:
                # Best-effort telemetry: training must not die because
                # the Tensorboard sink is unavailable.
                logger.warning('Failed to submit data to Tensorboard')
            sys.stdout.write('\r{}, Speed: {:5f} iter/sec, ETA: {:8s}'.format(
                ', '.join([
                    '{}: {:5f}'.format(i[0], i[1].data)
                    for i in solver.net.blobs.items() if 'loss' in i[0]
                ] + [
                    '{}: {:5f}'.format(
                        n +
                        '_accuracy', float(solver.net.layer_dict[n].accuracy))
                    for n in target_layers
                ]), 1. / t.average_time,
                str(datetime.timedelta(seconds=eta_in_s))))
            sys.stdout.flush()
# --- Esempio n. 20 (scraper artifact; commented out so the file parses) ---
def solve_step(proto, snapshot, gpus, timing, uid, rank):
    """NCCL training worker with periodic test-net evaluation, best-model
    saving, and a final matplotlib plot of train loss / test accuracy.

    NOTE(review): this code relies on Python 2 semantics — print
    statements and integer division in `it / test_interval` indexing.
    """
    caffe.set_mode_gpu()
    caffe.set_device(gpus[rank])
    caffe.set_solver_count(len(gpus))
    caffe.set_solver_rank(rank)
    caffe.set_multiprocess(True)

    solver = caffe.SGDSolver(proto)
    if snapshot and len(snapshot) != 0:
        solver.restore(snapshot)

    nccl = caffe.NCCL(solver, uid)
    nccl.bcast()

    if timing and rank == 0:
        time(solver, nccl)
    else:
        solver.add_callback(nccl)

    if solver.param.layer_wise_reduce:
        solver.net.after_backward(nccl)


    #solver = caffe.SGDSolver('/home/zhujiagang/temporal-segment-networks/models/ucf101/gating_three_solver.prototxt')
    #solver.restore('/home/zhujiagang/temporal-segment-networks/models/ucf101_split_1_gating_three_iter_200.solverstate')
    # Equivalent to max_iter in the solver file: the maximum number of iterations.
    niter = solver.param.max_iter
    display = solver.param.display
    test_iter = 950
    test_interval = 200
    # Initialize per-interval accumulator arrays.
    train_loss = zeros(int(ceil(niter // display)))
    test_loss = zeros(int(ceil(niter // test_interval)))
    test_acc = zeros(int(ceil(niter // test_interval)))
    # Helper running totals.
    _train_loss = 0;
    _test_loss = 0;
    _accuracy = 0;
    _max_accuracy = 0;
    _max_accuracy_iter = 0;
    # Run the solve.
    for it in range(niter):
        solver.step(1)
        _train_loss += solver.net.blobs['rgb_flow_gating_loss'].data
        if it % display == 0:
            # Average the training loss over the display window.
            train_loss[it // display] = _train_loss / display
            _train_loss = 0

        if it % test_interval == 0:
            print '\n my test, train iteration', it
            # Evaluate the test net for test_iter forward passes.
            for test_it in range(test_iter):
                #print '\n my test, test iteration \n', test_it
                solver.test_nets[0].forward()
                _test_loss += solver.test_nets[0].blobs['rgb_flow_gating_loss'].data
                _accuracy += solver.test_nets[0].blobs['rgb_flow_gating_accuracy'].data
            test_loss[it / test_interval] = _test_loss / test_iter
            test_acc[it / test_interval] = _accuracy / test_iter
            # Save the model whenever test accuracy reaches a new maximum.
            if _max_accuracy < test_acc[it / test_interval]:
                _max_accuracy = test_acc[it / test_interval]
                _max_accuracy_iter = it
                solver.net.save('/home/zhujiagang/temporal-segment-networks/models/ucf101_split_1_gating_three_iter_' + str(it) + '.caffemodel')
                print '\nnewly max: _max_accuracy and _max_accuracy_iter', _max_accuracy, _max_accuracy_iter
            print '\n_max_accuracy and _max_accuracy_iter', _max_accuracy, _max_accuracy_iter
            _test_loss = 0
            _accuracy = 0

    print '\nplot the train loss and test accuracy\n'
    print '\n_max_accuracy and _max_accuracy_iter\n', _max_accuracy, _max_accuracy_iter

    _, ax1 = plt.subplots()
    ax2 = ax1.twinx()

    # train loss -> green
    ax1.plot(display * arange(len(train_loss)), train_loss, 'g')
    # test loss -> yellow
    ax1.plot(test_interval * arange(len(test_loss)), test_loss, 'y')
    # test accuracy -> red
    ax2.plot(test_interval * arange(len(test_acc)), test_acc, 'r')

    ax1.set_xlabel('iteration')
    ax1.set_ylabel('loss')
    ax2.set_ylabel('accuracy')
    plt.show()
def caffe_loop(gpus, uid, rank, avg_guys, proc_comm):
    """Main loop for each GPU process.

    At the bottom is the main process which creates each GPU process (this guy). We set up all the parameters here and
    then run the Caffe loop. NCCL links each GPU process implicitly. So, you will not see semaphores or other similars,
    but NCCL is doing this in the background when Caffe is called. So for example, all processes will sync up when
    Caffe step is called (in PrefetchTrain).

    Args:
        gpus:      list of GPU device ids; this process drives gpus[rank].
        uid:       NCCL unique id shared by all worker processes; falsy means
                   single-process mode and the NCCL setup is skipped.
        rank:      this process's index into gpus. Rank 0 alone runs testing,
                   filter visualization, plotting and the final snapshot.
        avg_guys:  shared sequence with one smoothed-loss slot per rank, used so
                   rank 0 can report the loss averaged over all GPUs.
        proc_comm: shared sequence used for cross-process signalling;
                   proc_comm[2] is set True on error so every process exits.
    """
    global MP_COND
    global TRAIN_LOOP
    global FINISH_EXIT

    # Where is this project located?
    project_home = '/home/mundhenk/selfsupervised/'
    # Path to training image set
    path_prefix = '/home/mundhenk/images/patches_84h_110x110_13x13-blur-ab_compact/'

    # Condition is a label used for graphing, display purposes and saving snap shots
    # This can be any valid string, but must by file name friendly.
    condition = 'my_awesome_selfsupervised_run'
    # Base for where a lot of files are kept or go such as network files
    caffe_data_dir = project_home + '/caffe_data/'
    # Where to save figures
    fig_root = project_home + '/figures/'
    # where to save this project
    proj_snapshot_dir = project_home + '/py_proj/'
    # where to save moab files
    log_dir = project_home + '/moab_output/'
    # extra profile to run to set enviroment on node
    profile = project_home + '/scripts/profile.sh'
    # Your caffe network prototxt file
    network_file_name = caffe_data_dir + '/train_val_AlexNet-Custom_triple.prototxt'

    # Name of a caffemodel to use to initialize our weights from
    weight_file = ''

    # Alexnet layer names from the network prototxt file
    start_layer_vis = 'conv1'  # Visualize This layer
    softmax_layer = 'softmax_plain'  # For testing, we need this guy
    loss_layer = 'loss'  # Your loss layer
    # Are we using a batch normalized network schedule. For plain CaffeNet, set to False
    use_batch_norm_sched = True
    # Re-init project files?
    init_new = False

    # ImageNet mean gray
    image_mean = [104.0, 117.0, 123.0]  # ImageNET
    # Given a 110x110 size patch, what are the range of scales we can resize it to before cropping out 96x96?
    ra_max_size = 128  # Goes to a max size corresponding to an image of 448x448
    ra_min_size = 96  # Goes to a min size corresponding to an image of 171x171
    # Training batch size. The script will auto resize this when using more than one GPU
    train_batch_size = 128
    # Testing batch size.
    test_batch_size = 20
    # How many classes you will test over.
    bin_num = 20
    # The actual size of the patchs (96x96)
    patch_size = 96
    # Tells us where to center crop during testing
    patch_marg_1 = 7
    patch_marg_2 = 110
    # How many iters should we wait to display info?
    display_iters = 20
    # How many iters should we wait to test the network
    test_iters = 5000
    # Smoothing parameter over displayed loss
    loss_lambda = 20
    # Stride over the testing data set so we only use a subset.
    test_skip = 199
    # How often to snapshot the solver state
    snapshot_interval = 5000

    # training and testing list files
    test_list_file = path_prefix + 'val/val_list.nfl.npz'
    train_list_file = path_prefix + 'train/train_list.nfl.npz'

    # *******************************************************************************************************************
    # *******************************************************************************************************************
    # Dont edit after here
    # *******************************************************************************************************************
    # *******************************************************************************************************************

    # check to make sure files and dirs exist
    if PrefetchTrain.check_file(train_list_file, rank) == 0: return
    if PrefetchTrain.check_file(test_list_file, rank) == 0: return
    if PrefetchTrain.check_file(profile, rank) == 0: return

    if PrefetchTrain.check_dir(path_prefix, rank) == 0: return
    if PrefetchTrain.check_dir(project_home, rank) == 0: return
    if PrefetchTrain.check_dir(caffe_data_dir, rank) == 0: return

    # Create some directories if needed
    PrefetchTrain.check_create_dir(log_dir, rank)
    PrefetchTrain.check_create_dir(fig_root, rank)

    solver_file_name, snapshot_file, do_exit = PrefetchTrain.instantiate_slurm(
        proj_snapshot_dir,
        network_file_name,
        condition,
        log_dir,
        profile,
        snapshot_interval,
        use_batch_norm_sched,
        rank,
        MP_COND,
        proc_comm,
        init_new=init_new)

    # We just init-ed the whole thing. Now we exit
    if do_exit:
        return

    fig_model = condition
    fig_name_err = fig_root + fig_model + '.err.png'
    fig_name_sqr = fig_root + fig_model + '.sqr.jpg'
    fig_prop = 'b--'
    '''
    We will now configure a bunch of things before we run the main loop. NCCL needs some things to be in a
    particular order. Some tasks are reserved for a single process alone. These always run on the first GPU
    in the list.
    '''

    batch_toggle = 0

    print('GPU:{} Set Caffe Device'.format(gpus[rank]))

    print('GPU:{} Set Device'.format(gpus[rank]))
    caffe.set_device(
        gpus[rank])  ### THIS ALWAYS HAS TO COME BEFORE OTHER CAFFE SETTERS!!!

    # Set up multi processing
    if uid:
        print('GPU:{} Set Solver Count to {}'.format(gpus[rank], len(gpus)))
        caffe.set_solver_count(len(gpus))
        print('GPU:{} Set Solver Rank to {}'.format(gpus[rank], rank))
        caffe.set_solver_rank(rank)
        print('GPU:{} Set Multiprocess'.format(gpus[rank]))
        caffe.set_multiprocess(True)

    # Use GPU like a civilized human being
    print('GPU:{} Set to Use GPU'.format(gpus[rank]))
    caffe.set_mode_gpu()

    # resize the training batch size by number of GPU's we are using
    # (Python 2 integer division: 128 is assumed divisible by len(gpus))
    train_batch_size /= len(gpus)

    print('GPU:{} New Train Batch Size {}'.format(gpus[rank],
                                                  train_batch_size))

    print('GPU:{} Load Network and Files'.format(gpus[rank]))
    print("GPU:{} Solver: {}".format(gpus[rank], solver_file_name))

    # Create the Caffe solver and read the solver file so we can use some of its parameters
    solver = caffe.SGDSolver(solver_file_name)
    solver_params = PrefetchTrain.read_proto_solver_file(solver_file_name)
    max_iters = solver_params.max_iter

    print("GPU:{} Adjusted Batch Size For Each GPU : {}".format(
        gpus[rank], train_batch_size))

    # This script does not support iters.
    assert (solver_params.iter_size < 2)

    # Open our training and testing lists, but don't do anything with them yet.
    # Only rank 0 ever tests, so only it opens the test list.
    print("GPU:{} Loading: {}".format(gpus[rank], test_list_file))
    if rank == 0:
        test_list_in = open(test_list_file)
    print("GPU:{} Loading: {}".format(gpus[rank], train_list_file))
    train_list_in = open(train_list_file)

    # Do we have a weight file? If so, use it.
    if weight_file != '':
        print('GPU:{} Loading weight file: {} '.format(gpus[rank],
                                                       weight_file))
        solver.net.copy_from(weight_file)

    # Do we have a snapshot file? If so, use it.
    if snapshot_file != '':
        print('GPU:{} Loading Snapshot file: {}'.format(
            gpus[rank], snapshot_file))
        solver.restore(snapshot_file)

    if uid:
        # Create NCCL callback. bcast() syncs initial weights across ranks;
        # the callback then all-reduces gradients on every step.
        nccl = caffe.NCCL(solver, uid)
        nccl.bcast()
        solver.add_callback(nccl)

        if solver.param.layer_wise_reduce:
            solver.net.after_backward(nccl)

    print("GPU:{} Network and Files Loaded".format(gpus[rank]))

    # reshape our training blobs
    solver.net.blobs['data_1'].reshape(train_batch_size, 3, patch_size,
                                       patch_size)
    solver.net.blobs['data_2'].reshape(train_batch_size, 3, patch_size,
                                       patch_size)
    solver.net.blobs['data_3'].reshape(train_batch_size, 3, patch_size,
                                       patch_size)
    solver.net.blobs['label'].reshape(train_batch_size, 1, 1, 1)

    print("GPU:{} Network Train Blobs Set".format(gpus[rank]))

    # reshape testing blobs, but only process will do this.
    if rank == 0:
        solver.test_nets[0].blobs['data_1'].reshape(test_batch_size, 3,
                                                    patch_size, patch_size)
        solver.test_nets[0].blobs['data_2'].reshape(test_batch_size, 3,
                                                    patch_size, patch_size)
        solver.test_nets[0].blobs['data_3'].reshape(test_batch_size, 3,
                                                    patch_size, patch_size)
        solver.test_nets[0].blobs['label'].reshape(test_batch_size, 1, 1, 1)

        print("GPU:{} Network Test Blobs Set".format(gpus[rank]))

        # One transformer per input branch: HWC->CHW transpose plus mean-pixel
        # subtraction, matching the test net's blob shapes.
        test_transformer_1 = caffe.io.Transformer(
            {'data_1': solver.test_nets[0].blobs['data_1'].data.shape})
        test_transformer_1.set_transpose('data_1', (2, 0, 1))
        test_transformer_1.set_mean('data_1',
                                    np.float32(image_mean))  # mean pixel
        test_transformer_2 = caffe.io.Transformer(
            {'data_2': solver.test_nets[0].blobs['data_2'].data.shape})
        test_transformer_2.set_transpose('data_2', (2, 0, 1))
        test_transformer_2.set_mean('data_2',
                                    np.float32(image_mean))  # mean pixel
        test_transformer_3 = caffe.io.Transformer(
            {'data_3': solver.test_nets[0].blobs['data_3'].data.shape})
        test_transformer_3.set_transpose('data_3', (2, 0, 1))
        test_transformer_3.set_mean('data_3',
                                    np.float32(image_mean))  # mean pixel

        print("GPU:{} Network Test Transformer Set".format(gpus[rank]))

    # Set up our training parameters object
    tp = PrefetchTrain.TrainParams(solver, patch_size, patch_marg_1,
                                   patch_marg_2, train_batch_size,
                                   test_batch_size, bin_num, image_mean,
                                   loss_layer, softmax_layer)

    # copy a few more items over into our training parameters object
    tp.path_prefix = path_prefix
    tp.test_skip = test_skip
    tp.test_iters = test_iters
    tp.ra_patch_size = patch_size
    tp.ra_max_size = ra_max_size
    tp.ra_min_size = ra_min_size

    # Process and load our training data set
    print("GPU:{} Parse nfl context train list".format(gpus[rank]))
    NFL = NumpyFileList.CompactList()
    NFL.load(train_list_in)
    train_image_file = NFL

    train_list_in.close()

    # process and load our testing data set. Only one GPU will do this.
    if rank == 0:
        print("GPU:{} Parse nfl context test list".format(gpus[rank]))
        NFL = NumpyFileList.CompactList()
        NFL.load(test_list_in)
        test_image_file = NFL

        test_list_in.close()

    print("GPU:{} Lists Parsed".format(gpus[rank]))

    # Once we launch the threads, we need to exit gently
    TRAIN_LOOP = True

    # Init the two main loader threads and return handles
    f, r = PrefetchTrain.train_batch_triple_init(train_image_file, tp)

    # set some things we need to set.
    loss_avg = 0.0
    cstart = 0.0

    print("GPU:{} PREFETCH TRAIN".format(gpus[rank]))

    start_iter = True
    layer_loss = 0

    vis_fig = False
    vis_ax = False

    plot_fig = False
    plot_ax = False

    print("GPU:{} START LOOP".format(gpus[rank]))
    '''
    This is our main training loop. From here on out we will stay in this loop until exit. Most of the code here is for
    display and control. train_batch_triple is the only thing that needs to be called to train the network. 
    '''
    while True:

        i = int(solver.iter)
        display = False

        # Do we compute display timing data this iteration?
        if (i % display_iters == 0 or start_iter):
            cend = time.time()
            timer = cend - cstart
            cstart = cend

            # It's annoying and useless to print stats like this on the first iter
            if not start_iter:
                t = timer / float(display_iters)
                # Only once process prints this stuff out.
                if rank == 0:
                    print("GPU:{} ({}) {} ".format(gpus[rank], i, condition))
                    print("GPU:{} Average TIME {}".format(gpus[rank], t))

                display = True

        # run the actual training step on Caffe. Get back a run handle r and performance data
        layer_loss, _, _, batch_toggle, r, do_exit = PrefetchTrain.train_batch_triple(
            batch_toggle, f, tp, r)

        if do_exit == True: proc_comm[2] = True

        # compute a running average over loss (exponential smoothing with
        # weight loss_lambda on the history)
        if start_iter:
            loss_avg = layer_loss
        else:
            loss_avg = (layer_loss + loss_avg * loss_lambda) / (1.0 +
                                                                loss_lambda)

        avg_guys[rank] = loss_avg

        # Update the figure showing the first layer filters. Only one process does this.
        if display and rank == 0:
            # check if we have an x server connection to output to
            if PrefetchTrain.check_X_is_running():
                vis_fig, vis_ax = PrefetchTrain.vis_square(
                    solver.net.params[start_layer_vis][0].data, condition,
                    vis_fig, vis_ax, True, fig_name_sqr)

        # when we reach the right iteration, we will test the network and plot the performance.
        # BUG FIX: this used to be `(rank == 0) or i == int(max_iters)`, which let
        # every rank fall into the test branch at max_iters and crash with a
        # NameError, since test_image_file and the test transformers are only
        # created on rank 0. Only rank 0 may test.
        if rank == 0:

            if (i != 0 and i % test_iters == 0) or i == int(max_iters):
                print("TESTING")
                # Get weights over
                solver.test_nets[0].share_with(solver.net)

                # Run the test network
                correct_p, do_exit = PrefetchTrain.test_batch_context_triple(
                    test_image_file, test_transformer_1, test_transformer_2,
                    test_transformer_3, tp)

                # Plot the results of the test.
                plot_fig, plot_ax = PrefetchTrain.mr_plot(correct_p,
                                                          i,
                                                          fig_prop,
                                                          plot_fig,
                                                          plot_ax,
                                                          fig_name_err,
                                                          condition,
                                                          tp=tp)

                if do_exit == True: proc_comm[2] = True

        # one process will collect and display loss over all GPU processes.
        if display:
            #print("GPU:{} Average LOSS {}".format(gpus[rank],loss_avg))
            if rank == 0:
                avg = 0.0
                for ar in avg_guys:
                    avg += ar

                avg /= len(avg_guys)

                # BUG FIX: previously printed `ar` (the last rank's loss) here
                # instead of the computed all-GPU average `avg`.
                print("GPU:{} ALL Average LOSS {}".format(gpus[rank], avg))

        # Exit when maximum iteration is reached.
        if i == int(max_iters):
            print("GPU:{} Reaches Maxed Iters".format(gpus[rank]))
            break

        # Exit on ctrl-c
        if FINISH_EXIT:
            print("GPU:{} Got CTRL-C. Exiting ...".format(gpus[rank]))
            break

        if proc_comm[2] == True:
            print("GPU:{} Got ERROR. Exiting ...".format(gpus[rank]))
            return

        start_iter = False

    # When we exit, we always save the current state. Only one process does this.
    if rank == 0:
        # just in case
        solver.snapshot()

        print('done : Saving and exiting ...')