Example #1
    def step(self, grad_scale=1):
        def bk(g):
            return g.backward()

        # run backward() on every sub-optimizer in parallel, one worker per device;
        # each call is assumed to return the squared L2 norm of that shard's gradients
        l2norm_square = parallel_apply(
            [bk for _ in self.sub_optimizers],
            self.sub_optimizers,
            devices=[g.device for g in self.sub_optimizers])
        l2norm = sum(l2norm_square)**0.5

        # abort the step on inf/nan gradients
        if str(l2norm) in ['inf', 'nan']:
            return False

        if grad_scale != 1:
            l2norm *= grad_scale
        # clipping coefficient: shrink gradients when the (scaled) norm exceeds grad_clip_norm
        coef = self.grad_clip_norm / (l2norm + 1e-6)

        if coef < 1:
            grad_scale = grad_scale * coef
        if grad_scale != 1:
            for n, p in self.named_parameters:
                if p.grad is not None:
                    p.grad.mul_(grad_scale)

        def st(g):
            return g.step(l2norm)

        parallel_apply([st for _ in self.sub_optimizers],
                       self.sub_optimizers,
                       devices=[g.device for g in self.sub_optimizers])

        return True
Example #2
    def forward(self, x_gate, x_experts):
        x = self.sparse_gate(x_gate)
        selected_experts = (x != 0).nonzero()  # (N, 2) tensor of (batch index, expert index) pairs where the gate is nonzero
        inputs_for_experts = []
        batch_indices_for_experts = []

        for i in range(len(self.experts)):
            expert_was_selected = selected_experts[:, 1] == i
            batch_index_for_expert = selected_experts[expert_was_selected, 0]
            batch_indices_for_experts.append(batch_index_for_expert)
            inputs = None if len(batch_index_for_expert) == 0 else x_experts[batch_index_for_expert]
            inputs_for_experts.append(inputs)

        experts_to_run = []
        inputs_to_feed = []
        batch_indices_to_scatter = []
        expert_run_to_orig_index = []
        for i, (expert, inputs, batch_index) in enumerate(
            zip(self.experts, inputs_for_experts, batch_indices_for_experts)):
            if len(batch_index) > 0:
                experts_to_run.append(expert)
                inputs_to_feed.append(inputs.unsqueeze(0))
                batch_indices_to_scatter.append(batch_index)
                expert_run_to_orig_index.append(i)

        if self._parallel_apply:
            res = parallel_apply(experts_to_run, inputs_to_feed)
            # If many experts were selected, the scatter-and-sum of their outputs can itself run in parallel
            if self._parallel_sum:
                def scatter_batch(r, indices_to_scatter, i):
                    output = x.new_full((x_gate.shape[0],) + r.shape[1:], 0)
                    attention = x[(indices_to_scatter, expert_run_to_orig_index[i]) + (None,) * (len(r.shape) - 1)]
                    output[indices_to_scatter] += attention * r
                    return output

                output_ = parallel_apply([scatter_batch] * len(res),
                                         [(r, idx, i) for i, (r, idx) in enumerate(zip(res, batch_indices_to_scatter))])
                output = torch.sum(torch.stack(output_, dim=0), dim=0)
            else:
                output = x.new_full((x_gate.shape[0],) + res[0].shape[1:], 0)
                for i, (indices_to_scatter, r) in enumerate(zip(batch_indices_to_scatter, res)):
                    attention = x[(indices_to_scatter, expert_run_to_orig_index[i]) + (None,) * (len(r.shape) - 1)]
                    output[indices_to_scatter] += attention * r
        else:
            res = []
            for expert, inputs in zip(experts_to_run, inputs_to_feed):
                res.append(expert(inputs.squeeze(0)))
            output = x.new_full((x_gate.shape[0],) + res[0].shape[1:], 0)
            for i, (indices_to_scatter, r) in enumerate(zip(batch_indices_to_scatter, res)):
                attention = x[(indices_to_scatter, expert_run_to_orig_index[i]) + (None,) * (len(r.shape) - 1)]
                output[indices_to_scatter] += attention * r

        self.last_experts = x.cpu().detach()
        self.output_mean = output.mean().cpu().detach()
        # self.output_std = output.mean().cpu().detach()
        return output
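Note on the pattern above: parallel_apply dispatches to arbitrary callables, not just nn.Module instances, and each entry of the input list is the argument tuple for the matching callable (a bare tensor is wrapped into a one-element tuple). That is what lets this example hand it plain functions such as scatter_batch. A minimal sketch of that calling convention, assuming two visible GPUs (the helpers square and shift are illustrative, not part of the example above):

import torch
from torch.nn.parallel import parallel_apply

def square(t):
    return t * t          # runs in its own worker with cuda:0 as the current device

def shift(t, offset):
    return t + offset     # second positional argument comes from the argument tuple

x0 = torch.randn(4, device="cuda:0")
x1 = torch.randn(4, device="cuda:1")
outs = parallel_apply([square, shift],      # any callables
                      [(x0,), (x1, 1.0)],   # one argument tuple per callable
                      devices=[0, 1])       # device context for each worker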
Example #3
    def test_parallel_apply_passes_exception(self):
        # we define and instantiate a module that will throw a KeyError
        class TestModule(nn.Module):
            def forward(self, *args):
                return {}['wonderful']

        l1 = TestModule().to("cuda", torch.float)
        # and check that parallel_apply passes on the exception
        # (we can use a single device twice for this test)
        with self.assertRaisesRegex(
                KeyError, 'Caught KeyError in replica \\d '
                'on device 0.\nOriginal Traceback'
                '[\\s\\S]+wonderful'):
            dp.parallel_apply(modules=(l1, l1), inputs=(None, None))
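The test above relies on parallel_apply re-raising, in the calling thread, any exception thrown inside a replica, with the replica index and device prepended to the message. A small hedged sketch of observing that behaviour outside a test harness (the Broken module is illustrative):

import torch
import torch.nn as nn
from torch.nn.parallel import parallel_apply

class Broken(nn.Module):
    def forward(self, x):
        return {}['missing']      # KeyError raised inside the worker thread

m = Broken().to("cuda", torch.float)
x = torch.randn(2, 3, device="cuda")
try:
    parallel_apply((m, m), ((x,), (x,)))
except KeyError as err:
    print("replica failed:", err)  # message embeds the original replica traceback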
Example #4
def data_parallel(f,
                  input,
                  params,
                  stats,
                  mode,
                  device_ids,
                  output_device=None):
    if output_device is None:
        output_device = device_ids[0]

    if len(device_ids) == 1:
        return f(input, params, stats, mode)

    def replicate(param_dict, g):
        replicas = [{} for d in device_ids]
        for k, v in param_dict.iteritems():
            for i, u in enumerate(g(v)):
                replicas[i][k] = u
        return replicas

    params_replicas = replicate(params, lambda x: Broadcast(device_ids)(x))
    stats_replicas = replicate(stats, lambda x: comm.broadcast(x, device_ids))

    replicas = [
        lambda x, p=p, s=s, mode=mode: f(x, p, s, mode)
        for i, (p, s) in enumerate(zip(params_replicas, stats_replicas))
    ]
    inputs = scatter(input, device_ids)
    outputs = parallel_apply(replicas, inputs)
    return gather(outputs, output_device)
Example #5
    def calc_distill_loss(self):
        losses = []
        for i, netA in enumerate(self.netAs):
            assert isinstance(netA, SuperConv2d)
            n = self.mapping_layers[i]
            netA_replicas = replicate(netA, self.gpu_ids)
            kwargs = tuple([{
                'config': {
                    'channel': netA.out_channels
                }
            } for idx in self.gpu_ids])
            Sacts = parallel_apply(
                netA_replicas,
                tuple([
                    self.Sacts[key] for key in sorted(self.Sacts.keys())
                    if n in key
                ]), kwargs)
            Tacts = [
                self.Tacts[key] for key in sorted(self.Tacts.keys())
                if n in key
            ]
            loss = [F.mse_loss(Sact, Tact) for Sact, Tact in zip(Sacts, Tacts)]
            loss = gather(loss, self.gpu_ids[0]).sum()
            setattr(self, 'loss_G_distill%d' % i, loss)
            losses.append(loss)
        return sum(losses)
Example #6
    def forward(self, inputs, im_info, gt_boxes, num_boxes, Ms, Ns):

        #tensors,_=scatter_kwargs([inputs,im_info,gt_boxes,num_boxes], {}, self.device_ids)

        inputs_multi = comm.scatter(inputs, self.device_ids)
        im_info = comm.scatter(im_info, self.device_ids)
        gt_boxes = comm.scatter(gt_boxes, self.device_ids)
        num_boxes = comm.scatter(num_boxes, self.device_ids)
        #im_info, gt_boxes, num_boxes

        tensors = parallel_apply(self.modules, [(v, ) for v in inputs_multi],
                                 devices=self.device_ids)
        out = []

        for i, tensor in enumerate(tensors):
            with torch.cuda.device(tensor.get_device()):
                tensors[i] = tensors[i].view(
                    tensors[i].size(0),
                    tensors[i].size(1) * tensors[i].size(2),
                    tensors[i].size(3), tensors[i].size(4))
                tensors[i] = tensors[i][:, :, :Ms, :Ns]
                tensors[i] = tensors[i].contiguous()
                tensors[i] = Variable(tensors[i])
                out.append([
                    tensors[i], im_info[i].cuda(), gt_boxes[i].cuda(),
                    num_boxes[i].cuda()
                ])

        return out  #tensors,im_info, gt_boxes, num_boxes
Example #7
def data_parallel(batch_group: List[TensorDict], model: Model,
                  cuda_devices: List) -> Dict[str, torch.Tensor]:
    """
    Performs a forward pass using multiple GPUs.  This is a simplification
    of torch.nn.parallel.data_parallel to support the allennlp model
    interface.
    """
    assert len(batch_group) <= len(cuda_devices)

    moved = [
        nn_util.move_to_device(batch, device)
        for batch, device in zip(batch_group, cuda_devices)
    ]

    used_device_ids = cuda_devices[:len(moved)]
    # Counterintuitively, it appears replicate expects the source device id to be the first element
    # in the device id list. See torch.cuda.comm.broadcast_coalesced, which is called indirectly.
    replicas = replicate(model, used_device_ids)

    # We pass all our arguments as kwargs. Create a list of empty tuples of the
    # correct shape to serve as (non-existent) positional arguments.
    inputs = [()] * len(batch_group)
    outputs = parallel_apply(replicas, inputs, moved, used_device_ids)

    # Only the 'loss' is needed.
    # a (num_gpu, ) tensor with loss on each GPU
    losses = gather([output['loss'].unsqueeze(0) for output in outputs],
                    used_device_ids[0], 0)
    return {'loss': losses.mean()}
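This helper, and the near-identical variants later in this listing, all follow the same move-to-device, replicate, parallel_apply, gather recipe from torch.nn.parallel. A stripped-down sketch of that recipe for a model called purely with keyword arguments (model and batches are placeholders; batches is assumed to hold one kwargs dict per device, already moved there):

import torch
from torch.nn.parallel import replicate, parallel_apply, gather

def multi_gpu_loss(model, batches, device_ids):
    used = device_ids[:len(batches)]
    replicas = replicate(model, used)        # source device must be first in the list
    inputs = [()] * len(batches)             # no positional arguments
    outputs = parallel_apply(replicas, inputs, batches, used)
    losses = gather([out['loss'].unsqueeze(0) for out in outputs], used[0], 0)
    return losses.mean()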
Example #8
def data_parallel(f, input, params, stats, mode, device_ids, output_device=None):
    if output_device is None:
        output_device = device_ids[0]

    if len(device_ids) == 1: # only 1 device 
        return f(input, params, stats, mode)
    
    # function inside data_parallel 
    def replicate(param_dict, g):
        replicas = [{} for d in device_ids]  # replicas, list of n_devices dict
        for k,v in param_dict.iteritems():  # v is parameter
            for i,u in enumerate(g(v)):
                replicas[i][k] = u
        return replicas
    
    # broadcast parameters 
    params_replicas = replicate(params, lambda x: Broadcast(device_ids)(x))
    # broadcast stats 
    stats_replicas = replicate(stats, lambda x: comm.broadcast(x, device_ids))

    replicas = [lambda x,p=p,s=s,mode=mode: f(x,p,s,mode)
            for i,(p,s) in enumerate(zip(params_replicas, stats_replicas))]

    inputs = scatter(input, device_ids)

    outputs = parallel_apply(replicas, inputs)

    return gather(outputs, output_device)
Example #9
def data_parallel(f,
                  input,
                  params,
                  stats,
                  mode,
                  device_ids,
                  output_device=None):
    assert isinstance(device_ids, list)
    if output_device is None:
        output_device = device_ids[0]

    if len(device_ids) == 1:
        return f(input, params, stats, mode)

    params_all = Broadcast.apply(device_ids, *params.values())
    params_replicas = [{
        k: params_all[i + j * len(params)]
        for i, k in enumerate(params.keys())
    } for j in range(len(device_ids))]
    stats_replicas = [
        dict(zip(stats.keys(), p))
        for p in comm.broadcast_coalesced(list(stats.values()), device_ids)
    ]

    replicas = [
        partial(f, params=p, stats=s, mode=mode)
        for p, s in zip(params_replicas, stats_replicas)
    ]
    inputs = scatter([input], device_ids)
    outputs = parallel_apply(replicas, inputs)
    return gather(outputs, output_device)
Example #10
    def forward(self, inputs):
        inputs_multi = comm.scatter(inputs, self.device_ids)
        tensors = parallel_apply(self.modules, [(v, ) for v in inputs_multi],
                                 devices=self.device_ids)
        out = []

        for i, tensor in enumerate(tensors):
            with torch.cuda.device(tensor.get_device()):
                tensors[i] = torch.autograd.Variable(tensors[i])
                out.append([tensors[i]])

        return out
Example #11
    def forward(self, inputs, **kwargs):
        outputs = parallel_apply(self.nets,
                                 [torch.cat(tup, dim=1) for tup in inputs],
                                 devices=list(range(len(self.nets))))
        #outputs = []

        #for net in self.nets:
        #out = net.forward(flat_inputs, **kwargs)
        #outputs.append(out)

        flat_outputs = torch.cat(outputs, dim=1)

        return flat_outputs
Example #12
    def _data_parallel(self, batch):
        """
        Do the forward pass using multiple GPUs.  This is a simplification
        of torch.nn.parallel.data_parallel to support the allennlp model
        interface.
        """
        inputs, module_kwargs = scatter_kwargs((), batch, self._cuda_devices, 0)
        used_device_ids = self._cuda_devices[:len(inputs)]
        replicas = replicate(self._model, used_device_ids)
        outputs = parallel_apply(replicas, inputs, module_kwargs, used_device_ids)

        # Only the 'loss' is needed.
        # a (num_gpu, ) tensor with loss on each GPU
        losses = gather([output['loss'].unsqueeze(0) for output in outputs], used_device_ids[0], 0)
        return {'loss': losses.mean()}
Example #13
    def _data_parallel(self, batch):
        """
        Do the forward pass using multiple GPUs.  This is a simplification
        of torch.nn.parallel.data_parallel to support the allennlp model
        interface.
        """
        inputs, module_kwargs = scatter_kwargs((), batch, self._cuda_devices, 0)
        used_device_ids = self._cuda_devices[:len(inputs)]
        replicas = replicate(self._model, used_device_ids)
        outputs = parallel_apply(replicas, inputs, module_kwargs, used_device_ids)

        # Only the 'loss' is needed.
        # a (num_gpu, ) tensor with loss on each GPU
        losses = gather([output['loss'] for output in outputs], used_device_ids[0], 0)
        return {'loss': losses.mean()}
Example #14
    def test_parallel_apply(self):
        l1 = nn.Linear(10, 5).float().cuda(0)
        l2 = nn.Linear(10, 5).float().cuda(1)
        i1 = Variable(torch.randn(2, 10).float().cuda(0))
        i2 = Variable(torch.randn(2, 10).float().cuda(1))
        expected1 = l1(i1).data
        expected2 = l2(i2).data
        inputs = (i1, i2)
        modules = (l1, l2)
        expected_outputs = (expected1, expected2)
        outputs = dp.parallel_apply(modules, inputs)
        for out, expected in zip(outputs, expected_outputs):
            self.assertEqual(out.data, expected)

        inputs = (i1, Variable(i2.data.new()))
        expected_outputs = (expected1, expected2.new())
Example #15
    def test_parallel_apply(self):
        l1 = nn.Linear(10, 5).to("cuda:0", torch.float)
        l2 = nn.Linear(10, 5).to("cuda:1", torch.float)
        i1 = torch.randn(2, 10, device="cuda:0", dtype=torch.float)
        i2 = torch.randn(2, 10, device="cuda:1", dtype=torch.float)
        expected1 = l1(i1)
        expected2 = l2(i2)
        modules = (l1, l2)
        expected_outputs = (expected1, expected2)

        # each input can be either a collection of positional arguments
        #                       or an object representing the single argument
        for inputs in [((i1, ), (i2, )), (i1, i2)]:
            outputs = dp.parallel_apply(modules, inputs, None)
            for out, expected in zip(outputs, expected_outputs):
                self.assertEqual(out, expected)
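As the comment in this test points out, each element of inputs may be either a tuple of positional arguments or a single object standing in for the lone argument; parallel_apply wraps a non-tuple entry into a one-element tuple before calling the module. A self-contained sketch of the two equivalent spellings (layer and tensor names mirror the test above):

import torch
import torch.nn as nn
from torch.nn.parallel import parallel_apply

l1 = nn.Linear(10, 5).to("cuda:0")
l2 = nn.Linear(10, 5).to("cuda:1")
i1 = torch.randn(2, 10, device="cuda:0")
i2 = torch.randn(2, 10, device="cuda:1")

out_a = parallel_apply((l1, l2), ((i1,), (i2,)))  # explicit argument tuples
out_b = parallel_apply((l1, l2), (i1, i2))        # bare tensors, auto-wrapped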
Example #16
def data_parallel(batch, model: Model, cuda_devices: List) -> Dict[str, torch.Tensor]:
    """
    Performs a forward pass using multiple GPUs.  This is a simplification
    of torch.nn.parallel.data_parallel to support the allennlp model
    interface.
    """
    inputs, module_kwargs = scatter_kwargs((), batch, cuda_devices, 0)

    used_device_ids = cuda_devices[:len(inputs)]
    replicas = replicate(model, used_device_ids)
    outputs = parallel_apply(replicas, inputs, module_kwargs, used_device_ids)

    # Only the 'loss' is needed.
    # a (num_gpu, ) tensor with loss on each GPU
    losses = gather([output['loss'].unsqueeze(0) for output in outputs], used_device_ids[0], 0)
    return {'loss': losses.mean()}
Example #17
def allen_data_parallel(batch_group: List[TensorDict], model: Model,
                        cuda_devices: List) -> Dict[str, torch.Tensor]:
    """
    Performs a forward pass using multiple GPUs.  This is a simplification
    of torch.nn.parallel.data_parallel to support the allennlp model
    interface.
    """
    assert len(batch_group) <= len(cuda_devices)

    moved = [
        move_to_device(batch, device)
        for batch, device in zip(batch_group, cuda_devices)
    ]

    used_device_ids = cuda_devices[:len(moved)]
    # Counterintuitively, it appears replicate expects the source device id to be the first element
    # in the device id list. See torch.cuda.comm.broadcast_coalesced, which is called indirectly.
    replicas = nnP.replicate(model, used_device_ids)

    # We pass all our arguments as kwargs. Create a list of empty tuples of the
    # correct shape to serve as (non-existent) positional arguments.
    inputs = [()] * len(batch_group)
    outputs = nnP.parallel_apply(replicas, inputs, moved, used_device_ids)

    # Only the 'loss' is needed.
    # a (num_gpu, ) tensor with loss on each GPU
    if LOSS_KEY in outputs[0]:
        result = {
            LOSS_KEY:
            nnP.gather([output[LOSS_KEY].unsqueeze(0) for output in outputs],
                       target_device=used_device_ids[0],
                       dim=0).mean()
        }
    else:
        result = {}

    for key in outputs[0]:
        if key == 'tags':
            result[key] = list(chain([output[key] for output in outputs]))
        elif key != LOSS_KEY:
            result[key] = [
                nnP.gather([output[key]],
                           target_device=used_device_ids[0],
                           dim=0) for output in outputs
            ]
    return result
Example #18
def data_parallel(f, input, params, mode, device_ids, output_device=None):
    assert isinstance(device_ids, list)
    if output_device is None:
        output_device = device_ids[0]

    if len(device_ids) == 1:
        return f(input, params, mode)

    params_all = Broadcast.apply(device_ids, *params.values())
    params_replicas = [{k: params_all[i + j*len(params)] for i, k in enumerate(params.keys())}
                       for j in range(len(device_ids))]

    replicas = [partial(f, params=p, mode=mode)
                for p in params_replicas]
    inputs = scatter([input], device_ids)
    outputs = parallel_apply(replicas, inputs)
    return gather(outputs, output_device)
Example #19
def data_parallel(f, input, params, mode, device_ids, output_device=None):
    device_ids = list(device_ids)
    if output_device is None:
        output_device = device_ids[0]

    if len(device_ids) == 1:
        return f(input, params, mode)

    params_all = Broadcast.apply(device_ids, *params.values())
    params_replicas = [{
        k: params_all[i + j * len(params)]
        for i, k in enumerate(params.keys())
    } for j in range(len(device_ids))]

    replicas = [partial(f, params=p, mode=mode) for p in params_replicas]
    inputs = scatter([input], device_ids)
    outputs = parallel_apply(replicas, inputs)
    return gather(outputs, output_device)
Example #20
	def forward(self, *inputs, **kwargs):
		inputs = scatter(inputs, self.device_ids, dim=0)
		kwargs = scatter(kwargs, self.device_ids, dim=0)
		replicas = replicate(self.network, self.device_ids[:len(inputs)])
		outputs = parallel_apply(replicas, inputs, kwargs)
		outputs = list(zip(*outputs))

		res = []
		for i in range(len(outputs)):
			buf = []
			for j in range(len(outputs[i])):
				if isinstance(outputs[i][j], int):
					if outputs[i][j]<0:
						buf.append(outputs[i][j])
				else:
					buf.append(outputs[i][j].to(self.device_ids[0]))
			res.append(buf)
		return res
Example #21
def parallel_chain_loss(model, inputs, den_graph):
    """
    inputs: list of input tuple ((mfcc, inputs), supervision) on different gpus
    """
    from torch.nn.parallel import replicate, parallel_apply, gather
    model = ForwardParallelChain(model, den_graph, args)
    device_ids = list(range(torch.cuda.device_count()))
    assert len(inputs) == len(device_ids)
    output_device = device_ids[0]
    used_device_ids = device_ids[:len(inputs)]
    replicas = replicate(model, used_device_ids)
    model_kwargs = None
    outputs = parallel_apply(replicas, inputs, model_kwargs, used_device_ids)
    dim = 0
    ret = gather(outputs, output_device, dim)
    loss = ret[:, 0]
    weights = ret[:, -1]
    numerator = loss * weights
    results = ChainResults()
    results.data = ret[:, 1:].sum(dim=0)
    return numerator.sum() / weights.sum(), results
Example #22
def main(args):
    def log_string(str):
        #        logger.info(str)
        print(str)

    '''HYPER PARAMETER'''
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    '''CREATE DIR'''
    timestr = str(datetime.datetime.now().strftime('%Y-%m-%d_%H-%M'))
    experiment_dir = Path('./log/')
    experiment_dir.mkdir(exist_ok=True)
    experiment_dir = experiment_dir.joinpath('part_seg')
    experiment_dir.mkdir(exist_ok=True)
    if args.log_dir is None:
        experiment_dir = experiment_dir.joinpath(timestr)
    else:
        experiment_dir = experiment_dir.joinpath(args.log_dir)
    experiment_dir.mkdir(exist_ok=True)
    checkpoints_dir = experiment_dir.joinpath('checkpoints/')
    checkpoints_dir.mkdir(exist_ok=True)
    log_dir = experiment_dir.joinpath('logs/')
    log_dir.mkdir(exist_ok=True)
    '''LOG'''
    args = parse_args()
    logger = logging.getLogger("Model")
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler = logging.FileHandler('%s/%s.txt' % (log_dir, args.model))
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    log_string('PARAMETER ...')
    log_string(args)

    root = '/media/feihu/Storage/kitti_point_cloud/semantic_kitti/'
    #    file_list = '/media/feihu/Storage/kitti_point_cloud/semantic_kitti/train2.list'
    val_list = '/media/feihu/Storage/kitti_point_cloud/semantic_kitti/val2.list'
    #    TRAIN_DATASET = KittiDataset(root = root, file_list=file_list, npoints=args.npoint, training=True, augment=True)
    #    trainDataLoader = torch.utils.data.DataLoader(TRAIN_DATASET, batch_size=args.batch_size, shuffle=True, drop_last=True, num_workers=2)
    TEST_DATASET = KittiDataset(root=root,
                                file_list=val_list,
                                npoints=args.npoint,
                                training=False,
                                augment=False)
    testDataLoader = torch.utils.data.DataLoader(TEST_DATASET,
                                                 batch_size=args.batch_size,
                                                 shuffle=False,
                                                 drop_last=True,
                                                 num_workers=2)
    #    log_string("The number of training data is: %d" % len(TRAIN_DATASET))
    log_string("The number of test data is: %d" % len(TEST_DATASET))
    #    num_classes = 16

    num_devices = args.num_gpus  #torch.cuda.device_count()
    #    assert num_devices > 1, "Cannot detect more than 1 GPU."
    #    print(num_devices)
    devices = list(range(num_devices))
    target_device = devices[0]

    #    MODEL = importlib.import_module(args.model)

    net = UNet(4, 20, nPlanes)

    #    net = MODEL.get_model(num_classes, normal_channel=args.normal)
    net = net.to(target_device)

    try:
        checkpoint = torch.load(
            str(experiment_dir) + '/checkpoints/best_model.pth')
        start_epoch = checkpoint['epoch']
        net.load_state_dict(checkpoint['model_state_dict'])
        log_string('Use pretrain model')
    except:
        log_string('No existing model, starting training from scratch...')
        quit()

    if 1:

        with torch.no_grad():
            net.eval()
            evaluator = iouEval(num_classes, ignore)

            evaluator.reset()
            #            for iteration, (points, target, ins, mask) in tqdm(enumerate(testDataLoader), total=len(testDataLoader), smoothing=0.9):
            for iteration, (points, target, ins,
                            mask) in enumerate(testDataLoader):
                evaone = iouEval(num_classes, ignore)
                evaone.reset()
                cur_batch_size, NUM_POINT, _ = points.size()

                if iteration > 128:
                    break

                inputs, targets, masks = [], [], []
                coords = []
                for i in range(num_devices):
                    start = int(i * (cur_batch_size / num_devices))
                    end = int((i + 1) * (cur_batch_size / num_devices))
                    with torch.cuda.device(devices[i]):
                        pc = points[start:end, :, :].to(devices[i])
                        #feas = points[start:end,:,3:].to(devices[i])
                        targeti = target[start:end, :].to(devices[i])
                        maski = mask[start:end, :].to(devices[i])

                        locs, feas, label, maski, offsets = input_layer(
                            pc, targeti, maski, scale.to(devices[i]),
                            spatialSize.to(devices[i]), True)
                        #                        print(locs.size(), feas.size(), label.size(), maski.size(), offsets.size())
                        org_coords = locs[1]
                        label = Variable(label, requires_grad=False)

                        inputi = ME.SparseTensor(feas.cpu(), locs[0].cpu())
                        inputs.append([inputi.to(devices[i]), org_coords])
                        targets.append(label)
                        masks.append(maski)

                replicas = parallel.replicate(net, devices)
                outputs = parallel.parallel_apply(replicas,
                                                  inputs,
                                                  devices=devices)

                seg_pred = outputs[0].cpu()
                mask = masks[0].cpu()
                target = targets[0].cpu()
                loc = locs[0].cpu()
                for i in range(1, num_devices):
                    seg_pred = torch.cat((seg_pred, outputs[i].cpu()), 0)
                    mask = torch.cat((mask, masks[i].cpu()), 0)
                    target = torch.cat((target, targets[i].cpu()), 0)

                seg_pred = seg_pred[target > 0, :]
                target = target[target > 0]
                _, seg_pred = seg_pred.data.max(1)  #[1]

                target = target.data.numpy()

                evaluator.addBatch(seg_pred, target)

                evaone.addBatch(seg_pred, target)
                cur_accuracy = evaone.getacc()
                cur_jaccard, class_jaccard = evaone.getIoU()
                print('%.4f %.4f' % (cur_accuracy, cur_jaccard))

            m_accuracy = evaluator.getacc()
            m_jaccard, class_jaccard = evaluator.getIoU()

            log_string('Validation set:\n'
                       'Acc avg {m_accuracy:.3f}\n'
                       'IoU avg {m_jaccard:.3f}'.format(m_accuracy=m_accuracy,
                                                        m_jaccard=m_jaccard))
            # print also classwise
            for i, jacc in enumerate(class_jaccard):
                if i not in ignore:
                    log_string(
                        'IoU class {i:} [{class_str:}] = {jacc:.3f}'.format(
                            i=i,
                            class_str=class_strings[class_inv_remap[i]],
                            jacc=jacc))
Example #23
    def forward(self, x, label, **kwargs):
        if self.gpus is None:
            # cpu mode, normal fc layer
            x = classify(x, self.weight, label, simple_output=True, **kwargs)
            with torch.no_grad():
                acc = accuracy(x, label)
            x = F.log_softmax(x, dim=1)
            label = label.unsqueeze(-1)
            loss = torch.gather(x, 1, label)
            loss = -loss.mean()
            return loss, acc
        else:
            weight_scattered = (w.to(i)
                                for w, i in zip(self.weights, self.gpus))
            feat_copies = [x.to(i) for i in self.gpus]
            labels_scattered = []
            for i in range(len(self.weights)):
                labels_new = label.clone()
                labels_new[(labels_new >= self.weight_idx[i + 1]) |
                           (labels_new < self.weight_idx[i])] = -1
                labels_new = labels_new - self.weight_idx[i]
                labels_scattered.append(labels_new)
            kwargs_scattered = scatter(kwargs, self.gpus)
            input_scattered = list(
                zip(feat_copies, weight_scattered, labels_scattered))
            modules = [classify] * len(self.weights)
            results_scattered = parallel_apply(modules, input_scattered,
                                               kwargs_scattered, self.gpus)

            logits = [i[0] for i in results_scattered]
            xexps = [i[1] for i in results_scattered]
            sums = [i[2] for i in results_scattered]
            argmaxs = [i[3] for i in results_scattered]
            maxs = [i[4] for i in results_scattered]

            sums = gather(sums, 0, dim=1)
            sums = sums.sum(dim=1, keepdim=True)
            sums_scattered = [sums.to(i) for i in self.gpus]
            loss_input_scattered = list(
                zip(logits, xexps, labels_scattered, sums_scattered))
            loss_results_scattered = parallel_apply(
                [nllDistributed] * len(self.gpus), loss_input_scattered, None,
                self.gpus)
            loss_results_scattered = [i.sum() for i in loss_results_scattered]

            loss_results_scattered = [i.to(0) for i in loss_results_scattered]
            loss = sum(loss_results_scattered)
            loss = loss / x.shape[0]

            for i in range(len(argmaxs)):
                argmaxs[i] = argmaxs[i] + self.weight_idx[i]
            maxs = [i.to(0) for i in maxs]
            maxs = torch.stack(maxs, dim=1)

            _, max_split = torch.max(maxs, dim=1)
            idx = torch.arange(0, maxs.size(0), dtype=torch.long)
            argmaxs = [i.to(0) for i in argmaxs]
            argmaxs = torch.stack(argmaxs, dim=1)
            predicted = argmaxs[idx, max_split]

            total = label.size(0)
            predicted = predicted.cpu()
            label = label.cpu()
            correct = (predicted == label).sum().item()
            acc = correct / total

            return loss, acc
Example #24
def main(args):
    def log_string(str):
        logger.info(str)
        print(str)

    '''HYPER PARAMETER'''
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    '''CREATE DIR'''
    timestr = str(datetime.datetime.now().strftime('%Y-%m-%d_%H-%M'))
    experiment_dir = Path('./log/')
    experiment_dir.mkdir(exist_ok=True)
    experiment_dir = experiment_dir.joinpath('part_seg')
    experiment_dir.mkdir(exist_ok=True)
    if args.log_dir is None:
        experiment_dir = experiment_dir.joinpath(timestr)
    else:
        experiment_dir = experiment_dir.joinpath(args.log_dir)
    experiment_dir.mkdir(exist_ok=True)
    checkpoints_dir = experiment_dir.joinpath('checkpoints/')
    checkpoints_dir.mkdir(exist_ok=True)
    log_dir = experiment_dir.joinpath('logs/')
    log_dir.mkdir(exist_ok=True)
    '''LOG'''
    args = parse_args()
    logger = logging.getLogger("Model")
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler = logging.FileHandler('%s/%s.txt' % (log_dir, args.model))
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    log_string('PARAMETER ...')
    log_string(args)

    root = '/media/feihu/Storage/kitti_point_cloud/semantic_kitti/'
    file_list = '/media/feihu/Storage/kitti_point_cloud/semantic_kitti/train2.list'
    val_list = '/media/feihu/Storage/kitti_point_cloud/semantic_kitti/val2.list'
    TRAIN_DATASET = KittiDataset(root=root,
                                 file_list=file_list,
                                 npoints=args.npoint,
                                 training=True,
                                 augment=True)
    trainDataLoader = torch.utils.data.DataLoader(TRAIN_DATASET,
                                                  batch_size=args.batch_size,
                                                  shuffle=True,
                                                  drop_last=True,
                                                  num_workers=2)
    TEST_DATASET = KittiDataset(root=root,
                                file_list=val_list,
                                npoints=args.npoint,
                                training=False,
                                augment=False)
    testDataLoader = torch.utils.data.DataLoader(TEST_DATASET,
                                                 batch_size=args.batch_size,
                                                 shuffle=False,
                                                 drop_last=True,
                                                 num_workers=2)
    log_string("The number of training data is: %d" % len(TRAIN_DATASET))
    log_string("The number of test data is: %d" % len(TEST_DATASET))
    #    num_classes = 16
    '''MODEL LOADING'''

    shutil.copy('models/%s.py' % args.model, str(experiment_dir))
    shutil.copy('models/pointnet_util.py', str(experiment_dir))

    num_devices = args.num_gpus  #torch.cuda.device_count()
    #    assert num_devices > 1, "Cannot detect more than 1 GPU."
    #    print(num_devices)
    devices = list(range(num_devices))
    target_device = devices[0]

    #    MODEL = importlib.import_module(args.model)

    net = FusionNet(args.npoint, 4, 20, nPlanes)

    #    net = MODEL.get_model(num_classes, normal_channel=args.normal)
    net = net.to(target_device)

    def weights_init(m):
        classname = m.__class__.__name__
        if classname.find('Conv2d') != -1:
            if m.weight is not None:
                torch.nn.init.xavier_normal_(m.weight.data)
            if m.bias is not None:
                torch.nn.init.constant_(m.bias.data, 0.0)
        elif classname.find('Linear') != -1:
            if m.weight is not None:
                torch.nn.init.xavier_normal_(m.weight.data)
            if m.bias is not None:
                torch.nn.init.constant_(m.bias.data, 0.0)

    try:
        checkpoint = torch.load(
            str(experiment_dir) + '/checkpoints/best_model.pth')
        start_epoch = checkpoint['epoch']
        net.load_state_dict(checkpoint['model_state_dict'])
        log_string('Use pretrain model')
    except:
        log_string('No existing model, starting training from scratch...')
        start_epoch = 0
        net = net.apply(weights_init)

    if args.optimizer == 'Adam':
        optimizer = torch.optim.Adam(net.parameters(),
                                     lr=args.learning_rate,
                                     betas=(0.9, 0.999),
                                     eps=1e-08,
                                     weight_decay=args.decay_rate)
    else:
        optimizer = torch.optim.SGD(net.parameters(),
                                    lr=1e-1,
                                    momentum=0.9,
                                    weight_decay=1e-4,
                                    nesterov=True)
#        optimizer = torch.optim.SGD(net.parameters(), lr=args.learning_rate, momentum=0.9)

    def bn_momentum_adjust(m, momentum):
        if isinstance(m, torch.nn.BatchNorm2d) or isinstance(
                m, torch.nn.BatchNorm1d):
            m.momentum = momentum

    LEARNING_RATE_CLIP = 1e-5
    MOMENTUM_ORIGINAL = 0.1
    MOMENTUM_DECCAY = 0.5
    MOMENTUM_DECCAY_STEP = 20 / 2  # args.step_size

    best_acc = 0
    global_epoch = 0
    best_class_avg_iou = 0
    best_inctance_avg_iou = 0

    #    criterion = MODEL.get_loss()
    criterion = nn.CrossEntropyLoss()
    criterions = parallel.replicate(criterion, devices)

    # The raw version of the parallel_apply
    #    replicas = parallel.replicate(net, devices)
    #    input_coding = scn.InputLayer(dimension, torch.LongTensor(spatialSize), mode=4)

    for epoch in range(start_epoch, args.epoch):
        log_string('Epoch %d (%d/%s):' %
                   (global_epoch + 1, epoch + 1, args.epoch))
        '''Adjust learning rate and BN momentum'''

        #        lr = max(args.learning_rate * (args.lr_decay ** (epoch // args.step_size)), LEARNING_RATE_CLIP)
        #        lr = args.learning_rate * \
        #            math.exp((1 - epoch) * args.lr_decay)

        #        log_string('Learning rate:%f' % lr)

        #        for param_group in optimizer.param_groups:
        #            param_group['lr'] = lr
        #        for param_group in optimizer.param_groups:
        #            param_group['lr'] = lr

        mean_correct = []
        if 1:
            momentum = MOMENTUM_ORIGINAL * (MOMENTUM_DECCAY
                                            **(epoch // MOMENTUM_DECCAY_STEP))
            if momentum < 0.01:
                momentum = 0.01
            print('BN momentum updated to: %f' % momentum)
            net = net.apply(lambda x: bn_momentum_adjust(x, momentum))
        '''learning one epoch'''
        net.train()

        #        for iteration, data in tqdm(enumerate(trainDataLoader), total=len(trainDataLoader), smoothing=0.9):
        for iteration, data in enumerate(trainDataLoader):
            # adjust learning rate.
            if (iteration) % 320 == 0:
                lr_count = epoch * 6 + (iteration) / 320
                lr = args.learning_rate * math.exp(
                    (1 - lr_count) * args.lr_decay)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr

                log_string('Learning rate:%f' % lr)

            optimizer.zero_grad()
            if iteration > 1920:
                break
            points, target, ins, mask = data
            #            print(torch.max(points[:, :, :3], 1)[0])
            #            print(torch.min(points[:, :, :3], 1)[0])

            valid = mask > 0
            total_points = valid.sum()
            orgs = points
            points = points.data.numpy()
            #            print(total_points)
            inputs, targets, masks = [], [], []
            coords = []
            for i in range(num_devices):
                start = int(i * (args.batch_size / num_devices))
                end = int((i + 1) * (args.batch_size / num_devices))
                batch = provider.transform_for_sparse(
                    points[start:end, :, :3], points[start:end, :, 3:],
                    target[start:end, :].data.numpy(),
                    mask[start:end, :].data.numpy(), scale, spatialSize)
                batch['x'][1] = batch['x'][1].type(torch.FloatTensor)
                batch['x'][0] = batch['x'][0].type(torch.IntTensor)
                batch['y'] = batch['y'].type(torch.LongTensor)

                org_xyz = orgs[start:end, :, :3].transpose(1, 2).contiguous()
                org_feas = orgs[start:end, :, 3:].transpose(1, 2).contiguous()

                label = Variable(batch['y'], requires_grad=False)
                maski = batch['mask'].type(torch.IntTensor)
                #                print(torch.max(batch['x'][0], 0)[0])
                #                print(torch.min(batch['x'][0], 0)[0])
                #                locs, feas = input_layer(batch['x'][0].to(devices[i]), batch['x'][1].to(devices[i]))
                locs, feas = input_layer(batch['x'][0].cuda(),
                                         batch['x'][1].cuda())
                #                print(locs.size(), feas.size(), batch['x'][0].size())

                #               print(inputi.size(), batch['x'][1].size())

                with torch.cuda.device(devices[i]):
                    org_coords = batch['x'][0].to(devices[i])
                    inputi = ME.SparseTensor(feas.cpu(), locs).to(
                        devices[i])  #input_coding(batch['x'])
                    org_xyz = org_xyz.to(devices[i])
                    org_feas = org_feas.to(devices[i])
                    maski = maski.to(devices[i])
                    inputs.append(
                        [inputi, org_coords, org_xyz, org_feas, maski])
                    targets.append(label.to(devices[i]))
#                    masks.append(maski.contiguous().to(devices[i]))

            replicas = parallel.replicate(net, devices)
            predictions = parallel.parallel_apply(replicas,
                                                  inputs,
                                                  devices=devices)

            count = 0
            #            print("end ...")
            results = []
            labels = []
            match = 0

            for i in range(num_devices):
                #               temp = predictions[i]['output1'].F#.view(-1, num_classes)
                temp = predictions[i]
                #                temp = output_layer(locs, predictions[i]['output1'].F, coords[i])
                temp = temp[targets[i] > 0, :]
                results.append(temp)

                temp = targets[i]
                temp = temp[targets[i] > 0]
                labels.append(temp)
                #               print(prediction2[i].size(), prediction1[i].size(), targets[i].size())
                outputi = results[
                    i]  #prediction2[i].contiguous().view(-1, num_classes)
                num_points = labels[i].size(0)
                count += num_points

                _, pred_choice = outputi.data.max(1)  #[1]
                #                print(pred_choice)
                correct = pred_choice.eq(labels[i].data).cpu().sum()
                match += correct.item()
                mean_correct.append(correct.item() / num_points)
#            print(prediction2, labels)
            losses = parallel.parallel_apply(criterions,
                                             tuple(zip(results, labels)),
                                             devices=devices)
            loss = parallel.gather(losses, target_device, dim=0).mean()
            loss.backward()
            optimizer.step()
            #            assert(count1 == count2 and total_points == count1)
            log_string(
                "===> Epoch[{}]({}/{}) Valid points:{}/{} Loss: {:.4f} Accuracy: {:.4f}"
                .format(epoch, iteration, len(trainDataLoader), count,
                        total_points, loss.item(), match / count))
#            sys.stdout.flush()
        train_instance_acc = np.mean(mean_correct)
        log_string('Train accuracy is: %.5f' % train_instance_acc)

        #        continue

        with torch.no_grad():
            net.eval()
            evaluator = iouEval(num_classes, ignore)

            evaluator.reset()
            for iteration, (points, target, ins,
                            mask) in tqdm(enumerate(testDataLoader),
                                          total=len(testDataLoader),
                                          smoothing=0.9):
                cur_batch_size, NUM_POINT, _ = points.size()
                #                points, label, target, mask = points.float().cuda(), label.long().cuda(), target.long().cuda(), mask.float().cuda()
                if iteration > 192:
                    break
                if 0:
                    points = points.data.numpy()
                    points[:, :, 0:3], norm = provider.pc_normalize(
                        points[:, :, :3], mask.data.numpy())
                    points = torch.Tensor(points)
                orgs = points
                points = points.data.numpy()
                inputs, targets, masks = [], [], []
                coords = []
                for i in range(num_devices):
                    start = int(i * (cur_batch_size / num_devices))
                    end = int((i + 1) * (cur_batch_size / num_devices))
                    batch = provider.transform_for_test(
                        points[start:end, :, :3], points[start:end, :, 3:],
                        target[start:end, :].data.numpy(),
                        mask[start:end, :].data.numpy(), scale, spatialSize)
                    batch['x'][1] = batch['x'][1].type(torch.FloatTensor)
                    batch['x'][0] = batch['x'][0].type(torch.IntTensor)
                    batch['y'] = batch['y'].type(torch.LongTensor)

                    org_xyz = orgs[start:end, :, :3].transpose(1,
                                                               2).contiguous()
                    org_feas = orgs[start:end, :,
                                    3:].transpose(1, 2).contiguous()

                    label = Variable(batch['y'], requires_grad=False)
                    maski = batch['mask'].type(torch.IntTensor)
                    locs, feas = input_layer(batch['x'][0].cuda(),
                                             batch['x'][1].cuda())
                    #                print(locs.size(), feas.size(), batch['x'][0].size())

                    #               print(inputi.size(), batch['x'][1].size())
                    with torch.cuda.device(devices[i]):
                        org_coords = batch['x'][0].to(devices[i])
                        inputi = ME.SparseTensor(feas.cpu(), locs).to(
                            devices[i])  #input_coding(batch['x'])
                        org_xyz = org_xyz.to(devices[i])
                        org_feas = org_feas.to(devices[i])
                        maski = maski.to(devices[i])
                        inputs.append(
                            [inputi, org_coords, org_xyz, org_feas, maski])
                        targets.append(label.to(devices[i]))
#                        masks.append(maski.contiguous().to(devices[i]))

                replicas = parallel.replicate(net, devices)
                outputs = parallel.parallel_apply(replicas,
                                                  inputs,
                                                  devices=devices)

                #                net = net.eval()
                #                seg_pred = classifier(points, to_categorical(label, num_classes))
                seg_pred = outputs[0].cpu()
                #                mask = masks[0].cpu()
                target = targets[0].cpu()
                loc = locs[0].cpu()
                for i in range(1, num_devices):
                    seg_pred = torch.cat((seg_pred, outputs[i].cpu()), 0)
                    #                    mask = torch.cat((mask, masks[i].cpu()), 0)
                    target = torch.cat((target, targets[i].cpu()), 0)

                seg_pred = seg_pred[target > 0, :]
                target = target[target > 0]
                _, seg_pred = seg_pred.data.max(1)  #[1]

                target = target.data.numpy()

                evaluator.addBatch(seg_pred, target)

            # when the loop is done, print the evaluation
            m_accuracy = evaluator.getacc()
            m_jaccard, class_jaccard = evaluator.getIoU()

            log_string('Validation set:\n'
                       'Acc avg {m_accuracy:.3f}\n'
                       'IoU avg {m_jaccard:.3f}'.format(m_accuracy=m_accuracy,
                                                        m_jaccard=m_jaccard))
            # print also classwise
            for i, jacc in enumerate(class_jaccard):
                if i not in ignore:
                    log_string(
                        'IoU class {i:} [{class_str:}] = {jacc:.3f}'.format(
                            i=i,
                            class_str=class_strings[class_inv_remap[i]],
                            jacc=jacc))

        log_string('Epoch %d test Accuracy: %f  mean avg mIOU: %f' %
                   (epoch + 1, m_accuracy, m_jaccard))
        if (m_jaccard >= best_class_avg_iou):
            #            logger.info('Save model...')
            log_string('Saving model...')
            savepath = str(checkpoints_dir) + '/best_model.pth'
            log_string('Saving at %s' % savepath)
            state = {
                'epoch': epoch,
                'train_acc': train_instance_acc,
                'test_acc': m_accuracy,
                'class_avg_iou': m_jaccard,
                'model_state_dict': net.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }
            torch.save(state, savepath)


#            log_string('Saving model....')

        if m_accuracy > best_acc:
            best_acc = m_accuracy
        if m_jaccard > best_class_avg_iou:
            best_class_avg_iou = m_jaccard

        log_string('Best accuracy is: %.5f' % best_acc)
        log_string('Best class avg mIOU is: %.5f' % best_class_avg_iou)

        global_epoch += 1
Example #25
def train(pipeline_model, data_loader, val_data_loader, config):
    # Set up the train flag for batch normalization
    pipeline_model.train()

    num_devices = torch.cuda.device_count()
    num_devices = min(config.max_ngpu, num_devices)
    devices = list(range(num_devices))
    target_device = devices[0]
    pipeline_model.to(target_device)
    if num_devices > 1:
        pipeline_model = ME.MinkowskiSyncBatchNorm.convert_sync_batchnorm(
            pipeline_model, devices)

    # Configuration
    writer = SummaryWriter(logdir=config.log_dir)
    data_timer, iter_timer = Timer(), Timer()
    data_time_avg, iter_time_avg = AverageMeter(), AverageMeter()
    meters = collections.defaultdict(AverageMeter)
    hists = pipeline_model.initialize_hists()

    optimizer = pipeline_model.initialize_optimizer(config)
    scheduler = pipeline_model.initialize_scheduler(optimizer, config)

    writer = SummaryWriter(logdir=config.log_dir)

    # Train the network
    logging.info('===> Start training')
    best_val, best_val_iter, curr_iter, epoch, is_training = 0, 0, 1, 1, True

    if config.resume:
        if osp.isfile(config.resume):
            logging.info("=> loading checkpoint '{}'".format(config.resume))
            state = torch.load(config.resume)
            curr_iter = state['iteration'] + 1
            epoch = state['epoch']
            pipeline_model.load_state_dict(state['state_dict'])
            if config.resume_optimizer:
                curr_iter = state['iteration'] + 1
                scheduler = pipeline_model.initialize_scheduler(
                    optimizer, config, last_step=curr_iter)
                pipeline_model.load_optimizer(optimizer, state['optimizer'])
            if 'best_val' in state:
                best_val = state['best_val']
                best_val_iter = state['best_val_iter']
            logging.info("=> loaded checkpoint '{}' (epoch {})".format(
                config.resume, state['epoch']))
        else:
            logging.info("=> no checkpoint found at '{}'".format(
                config.resume))

    data_iter = data_loader.__iter__()
    while is_training:
        for iteration in range(len(data_loader)):
            pipeline_model.reset_gradient(optimizer)
            iter_timer.tic()

            pipelines = parallel.replicate(pipeline_model, devices)

            # Get training data
            data_timer.tic()
            inputs = []
            for pipeline, device in zip(pipelines, devices):
                with torch.cuda.device(device):
                    while True:
                        datum = pipeline.load_datum(data_iter, has_gt=True)
                        num_boxes = sum(box.shape[0]
                                        for box in datum['bboxes_coords'])
                        if config.skip_empty_boxes and num_boxes == 0:
                            continue
                        break
                    inputs.append(datum)
            data_time_avg.update(data_timer.toc(False))

            outputs = parallel.parallel_apply(pipelines,
                                              [(x, True) for x in inputs],
                                              devices=devices)
            losses = parallel.parallel_apply(
                [pipeline.loss for pipeline in pipelines],
                tuple(zip(inputs, outputs)),
                devices=devices)
            losses = parallel.gather(losses, target_device)
            losses = dict([(k, v.mean()) for k, v in losses.items()])

            meters, hists = pipeline_model.update_meters(meters, hists, losses)

            # Compute and accumulate gradient
            losses['loss'].backward()

            # Update number of steps
            pipeline_model.step_optimizer(losses, optimizer, scheduler,
                                          iteration)

            iter_time_avg.update(iter_timer.toc(False))

            if curr_iter >= config.max_iter:
                is_training = False
                break

            if curr_iter % config.stat_freq == 0 or curr_iter == 1:
                lrs = ', '.join([
                    '{:.3e}'.format(x) for x in scheduler['default'].get_lr()
                ])
                debug_str = "===> Epoch[{}]({}/{}): LR: {}\n".format(
                    epoch, curr_iter, len(data_loader), lrs)
                debug_str += log_meters(meters, log_perclass_meters=False)
                debug_str += f"\n    data time: {data_time_avg.avg:.3f}"
                debug_str += f"    iter time: {iter_time_avg.avg:.3f}"
                logging.info(debug_str)

                # Reset timers
                data_time_avg.reset()
                iter_time_avg.reset()

                # Write logs
                update_writer(writer, meters, curr_iter, 'training')
                writer.add_scalar('training/learning_rate',
                                  scheduler['default'].get_lr()[0], curr_iter)

                # Reset meters
                reset_meters(meters, hists)

            # Save current status; save before validation to prevent occasional memory overflow
            if curr_iter % config.save_freq == 0:
                checkpoint(pipeline_model, optimizer, epoch, curr_iter, config,
                           best_val, best_val_iter)

            if config.heldout_save_freq > 0 and curr_iter % config.heldout_save_freq == 0:
                checkpoint(pipeline_model,
                           optimizer,
                           epoch,
                           curr_iter,
                           config,
                           best_val,
                           best_val_iter,
                           heldout_save=True)

            # Validation
            if curr_iter % config.val_freq == 0:
                if num_devices > 1:
                    unconvert_sync_batchnorm(pipeline_model)
                best_val, best_val_iter = validate(pipeline_model,
                                                   val_data_loader, config,
                                                   writer, curr_iter, best_val,
                                                   best_val_iter, optimizer,
                                                   epoch)
                if num_devices > 1:
                    pipeline_model = ME.MinkowskiSyncBatchNorm.convert_sync_batchnorm(
                        pipeline_model, devices)

            if curr_iter % config.empty_cache_freq == 0:
                # Clear cache
                torch.cuda.empty_cache()

            # End of iteration
            curr_iter += 1

        epoch += 1

    # Explicit memory cleanup
    if hasattr(data_iter, 'cleanup'):
        data_iter.cleanup()

    # Save the final model
    if num_devices > 1:
        unconvert_sync_batchnorm(pipeline_model)
    validate(pipeline_model, val_data_loader, config, writer, curr_iter,
             best_val, best_val_iter, optimizer, epoch)
    if num_devices > 1:
        pipeline_model = ME.MinkowskiSyncBatchNorm.convert_sync_batchnorm(
            pipeline_model, devices)
    checkpoint(pipeline_model, optimizer, epoch, curr_iter, config, best_val,
               best_val_iter)
Example #26
    def forward(self,
                x,
                y=None,
                nsamples=1,
                sample_from='posterior',
                reduce='none',
                samples=None):

        if self.use_posterior:
            assert (y is not None)

        x_unet = x.to(self.devices['unet'])
        modules = [self.unet]
        inputs = [x_unet]
        devices = [self.devices['unet']]

        if self.training or sample_from == 'prior':
            x_prior = Variable(x.data.to(self.devices['prior_net']))
            modules.append(self.prior_net)
            inputs.append(x_prior)
            devices.append(self.devices['prior_net'])

        if self.training or self.use_posterior:
            x_posterior = Variable(x.data.to(self.devices['posterior_net']))
            y_posterior = Variable(y.data.to(self.devices['posterior_net']))
            posterior_in = torch.cat([x_posterior, y_posterior], dim=1)
            modules.append(self.posterior_net)
            inputs.append(posterior_in)
            devices.append(self.devices['posterior_net'])

        if self.n_unique_devices > 1:
            output = parallel_apply(modules, inputs, devices=devices)
        else:
            output = [
                module(module_input)
                for module, module_input in zip(modules, inputs)
            ]

        # Sample
        # Prior
        if sample_from == 'prior' or self.training:

            prior_params = output[1]
            if self.visualize:
                attn_blocks_prior = prior_params[2]
            prior_means = prior_params[0]
            prior_log_vars = prior_params[1]
            prior_samples = self.sample(means=prior_means,
                                        log_vars=prior_log_vars,
                                        nsamples=nsamples,
                                        sample_from='prior')

        # Posterior
        if sample_from == 'posterior' or self.training:
            # If training, a sample from the prior has also been drawn
            if self.training:
                posterior_params = output[2]
            # In eval mode, a sample from the prior has not been drawn
            else:
                posterior_params = output[1]
            posterior_means = posterior_params[0]
            posterior_log_vars = posterior_params[1]
            posterior_samples = self.sample(means=posterior_means,
                                            log_vars=posterior_log_vars,
                                            nsamples=nsamples,
                                            sample_from='posterior')

        if samples is None:
            # Use the posterior sample during training or when explicitly
            # sampling from the posterior; otherwise fall back to the prior.
            samples = (posterior_samples
                       if (self.training or sample_from == 'posterior')
                       else prior_samples)

        if self.visualize:
            unet_features = output[0][0].to(self.devices['output'])
            attn_blocks_unet = output[0][1]
        else:
            unet_features = output[0].to(self.devices['output'])
        samples = samples.to(self.devices['output'])
        out = self.comb(unet_features, samples, reduce=reduce)

        if self.training or self.use_posterior:
            return out, prior_means, prior_log_vars, posterior_means, posterior_log_vars
        elif sample_from == 'prior':
            if self.visualize:
                return out, prior_means, prior_log_vars, attn_blocks_unet, attn_blocks_prior
            return out, prior_means, prior_log_vars
        else:
            return out, posterior_means, posterior_log_vars
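A hedged usage sketch of the forward method above, assuming use_posterior and visualize are disabled and that model, x and y are a constructed instance and suitably shaped tensors (none of this is shown in the source)::

    # Training: prior and posterior parameters are both returned, e.g. for a KL term.
    model.train()
    out, p_mu, p_logvar, q_mu, q_logvar = model(x, y=y, nsamples=1)

    # Inference: draw the latent sample from the prior only.
    model.eval()
    out, p_mu, p_logvar = model(x, sample_from='prior')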
Beispiel #27
0
    def parallel_apply(self, replicas, inputs, kargs=None):
        if kargs is not None:
            kargs = tuple(kargs for _ in inputs)
        return parallel_apply(replicas, inputs, kargs,
                              self.device_ids[:len(replicas)])
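The wrapper above only broadcasts a single keyword-argument dict to every replica and then defers to torch.nn.parallel.parallel_apply. A minimal standalone sketch of that underlying call, assuming two visible GPUs and a made-up model::

    import torch
    import torch.nn as nn
    from torch.nn.parallel import replicate, parallel_apply

    device_ids = [0, 1]                                  # assumption: two GPUs
    model = nn.Linear(8, 4).to(device_ids[0])
    replicas = replicate(model, device_ids)              # one model copy per GPU
    # One positional-argument tuple per replica, already on the matching device.
    inputs = [(torch.randn(2, 8, device=f'cuda:{d}'),) for d in device_ids]
    # kwargs_tup plays the role of the kargs broadcast in the wrapper above.
    outputs = parallel_apply(replicas, inputs, kwargs_tup=None, devices=device_ids)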
Beispiel #28
0
    def parallel_apply(self, replicas, inputs, kwargs):
        return parallel_apply(replicas, inputs, kwargs,
                              self.device_ids[:len(replicas)])
Beispiel #29
0
        # Get new data
        inputs, all_labels = [], []
        for i in range(num_devices):
            coordinates, features, labels = generate_input(config.file_name,
                                                           voxel_size=0.05)
            with torch.cuda.device(devices[i]):
                inputs.append(
                    ME.SparseTensor(features - 0.5,
                                    coords=coordinates).to(devices[i]))
            all_labels.append(labels.long().to(devices[i]))

        # The raw version of the parallel_apply
        st = time()
        replicas = parallel.replicate(net, devices)
        outputs = parallel.parallel_apply(replicas, inputs, devices=devices)

        # Extract features from the sparse tensors to use a pytorch criterion
        out_features = [output.F for output in outputs]
        losses = parallel.parallel_apply(criterions,
                                         tuple(zip(out_features, all_labels)),
                                         devices=devices)
        loss = parallel.gather(losses, target_device, dim=0).mean()
        t = time() - st
        min_time = min(t, min_time)
        print('Iteration: ', iteration, ', Loss: ', loss.item(), ', Time: ', t,
              ', Min time: ', min_time)

        # Gradient
        loss.backward()
        optimizer.step()
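The snippet above references names prepared outside the shown code (devices, target_device, criterions, min_time, net, optimizer, iteration). A hedged sketch of how the per-device pieces could be set up; everything here except the variable names it defines is an assumption::

    import torch
    import torch.nn as nn

    num_devices = torch.cuda.device_count()          # assumption
    devices = [torch.device(f'cuda:{i}') for i in range(num_devices)]
    target_device = devices[0]
    # One loss module per device so parallel.parallel_apply can evaluate each
    # criterion next to the output it receives.
    criterions = [nn.CrossEntropyLoss().to(d) for d in devices]
    min_time = float('inf')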
Beispiel #30
0
    def train(
        self,
        train_dataset,
        *,
        progress_bar=True,
        resume=False,
        device=None,
    ):
        """
        A simplified training loop::

            for epoch in range(1, ...):
                for example in train_iterator:
                    model_out = self.model(example)
                    review = self.model.review(example, model_out)
                    review = maybe_add_loss_from_losses(review)
                    review['loss'].backward()
                    self.optimizer.step()
                    add_review_to_tensorboardX(review)

        The remaining code takes care of calling validation and saving the
        result to tensorboard (if a validation_hook is registered), saving
        checkpoints, cleaning up checkpoints that are stale (not best according
        to the metric and not last) and displaying a progress bar.
        The code is designed so that many aspects can be customized
        (e.g. see test_runtime_tests.py DictTrainer for a multi-model trainer).

        Args:
            train_dataset:
                The train_dataset is a Python iterable (e.g. tuple, list, ...)
                that can be consumed multiple times (i.e. not a generator).

                Usually it will be a paderbox.database.BaseIterator that is
                returned from a database in paderbox.database.
            progress_bar: flag whether to show a progress bar or not.
            resume:
                Whether to resume a training or start a fresh one.
            device:
                Defines the device which shall be used ('cpu', 0, 1, ...).
                If None, it selects device 0 if CUDA is available and 'cpu'
                if CUDA is not available.
        """

        if torch.cuda.is_available():
            if device is None:
                device = 0
        else:
            if device is None:
                warnings.warn(
                    'CUDA is not available in this environment! The training '
                    'will run on the CPU! This might be caused by a damaged '
                    'installation or a version mismatch between PyTorch and '
                    'your CUDA installation.')
                device = 'cpu'
            elif device != 'cpu':
                raise RuntimeError(
                    'CUDA is not available in this environment, but you set '
                    'device to use a GPU! This might be caused by a damaged '
                    'installation or a version mismatch between PyTorch and '
                    'your CUDA installation.')

        if resume:
            assert resume is True, resume
            self.load_checkpoint()
        else:
            assert not self.checkpoint_dir.exists(), \
                'A checkpoint directory already exists. If you want to ' \
                'restart the training, set resume to True.'
            self.iteration = 0
            self.epoch = 0
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = False

        # Change model to train mode (e.g. activate dropout)
        self.model.train()

        if isinstance(device, (tuple, list)):
            assert all([isinstance(d, int) for d in device]), device
            # multiple devices, e.g. [0, 1], [0, 1, 2, 3], ...
            # torch.nn.parallel.DataParallel moves everything to the first GPU.
            # We then do the same thing.
            self.to(device[0])
            device = list(device)
        else:
            self.to(device)
            device = [device]

        # Reset all gradients
        self.optimizer_zero_grad()

        self.writer = self.writer_cls(str(self.storage_dir))
        hooks = [*self.hooks]
        if progress_bar:
            try:
                max_it_len = len(train_dataset)
            except TypeError:
                # TypeError: object of type '...' has no len()
                max_it_len = None
            hooks.append(ProgressBarHook(self._stop_trigger, max_it_len))
        hooks = sorted(hooks, key=lambda h: h.priority, reverse=True)

        if len(device) >= 2:
            import textwrap
            print(
                'WARNING: You called padertorch.Trainer.train with multiple\n'
                + textwrap.indent(
                    'devices. With this the trainer will use data parallel to\n'
                    'utilize the multiple GPUs to speed up your training.\n'
                    'We observed some problems with some versions of pytorch.\n'
                    'In 1.4 the performance on a NN was quite bad and according to\n'
                    'https://github.com/pytorch/pytorch/issues/33552\n'
                    'this was because the RNNs get no gradients.\n'
                    'In 1.5 the training got stuck; the reason is unclear at the '
                    'moment.\n'
                    'With Pytorch <= 1.3 we have not tested the code.\n'
                    f'Your pytorch version is: {torch.__version__}',
                    ' ' * len('WARNING: ')))

        assert self.virtual_minibatch_size % len(device) == 0, (
            self.virtual_minibatch_size, device)
        assert len(device) > 0, (self.virtual_minibatch_size, device)

        # ================ MAIN TRAINING LOOP! ===================
        try:
            train_iterable = None
            while True:
                new_epoch = False
                if train_iterable is None:
                    new_epoch = True

                    # Call pre_step between the epochs.
                    # We call it here, so it is done, before the iteration
                    # over the train_dataset starts.
                    for hook in hooks:
                        hook.pre_step(self)

                    train_iterable = iter(train_dataset)

                optimize = True
                with self.train_timer['time_per_iteration'] as timer:
                    for minibatch_index in range(self.virtual_minibatch_size //
                                                 len(device)):
                        with self.train_timer['time_per_data_loading']:
                            example = list(
                                itertools.islice(train_iterable, len(device)))
                            if len(example) == 0:
                                train_iterable = None
                                self.epoch += 1
                                if minibatch_index == 0:
                                    optimize = False
                                break  # end minibatch loop

                        if new_epoch:
                            new_epoch = False
                        elif minibatch_index == 0:
                            # Call pre_step after getting the next example,
                            # to correctly detect the next epoch
                            with timer.pause():
                                for hook in hooks:
                                    hook.pre_step(self)

                        if len(device) == 1:
                            assert len(example) == 1, (len(example), example)
                            example = example[0]

                            loss, example, model_output, review = \
                                self.train_step(self.model, example, device[0])

                            with timer.pause():
                                for hook in hooks:
                                    hook.post_step(self, example, model_output,
                                                   review)

                            # Release pytorch object to reduce memory footprint
                            del example
                            del model_output
                            del review

                            with self.train_timer['time_per_backward']:
                                loss.backward(retain_graph=False)
                            del loss

                        else:
                            # The data parallel idea here follows the idea from
                            # torch.nn.parallel.DataParallel.
                            # We also use the same functions
                            # (i.e. replicate, parallel_apply and gather).
                            #
                            # The difference is that we need no scatter,
                            # because we simply use multiple examples and
                            # gather only needs to be applied to the loss.

                            # Move copies of the model to each GPU
                            with self.train_timer['time_per_replicate']:
                                replicas = replicate(self.model,
                                                     device[:len(example)])

                            # Use threads to call train_step. Each thread
                            # processes one example on one GPU.
                            with self.train_timer['time_per_parallel_apply']:
                                outputs = parallel_apply(
                                    [self.train_step] * len(example),
                                    list(
                                        zip(
                                            replicas,
                                            example,
                                            device[:len(example)],
                                        )),
                                )
                            del replicas

                            # Take the sum of all losses. Since they are on
                            # different GPUs, use gather.
                            with self.train_timer['time_per_gather']:
                                loss = gather([
                                    loss.view(1) for loss, _, _, _ in outputs
                                ], device[0]).sum()

                            with timer.pause():
                                for _, example, model_output, review in outputs:
                                    for hook in hooks:
                                        hook.post_step(self, example,
                                                       model_output, review)

                            # Release pytorch object to reduce memory footprint
                            del example
                            del model_output
                            del review

                            with self.train_timer['time_per_backward']:
                                loss.backward(retain_graph=False)
                            del loss

                    # Only the summary hook will use optimizer_summary
                    if optimize:
                        with self.train_timer['time_per_optimize']:
                            optimizer_summary = self.optimizer_step()
                            for hook in hooks:
                                hook.post_optimize(self, optimizer_summary)
                            del optimizer_summary

                        self.iteration += 1

        except StopTraining:
            pass
        finally:
            try:
                for hook in hooks:
                    hook.close(self)
            except Exception:
                print('Exception in finally. May hide actual exception!!!\n'
                      'You may comment out this finally block for debugging.')
                raise
            self.writer.close()
            self.writer = None
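The multi-GPU branch above mirrors torch.nn.parallel.DataParallel's replicate / parallel_apply / gather pattern but needs no scatter, because every device gets its own example and only the loss is gathered. A minimal standalone sketch of that pattern with a toy model and a toy train step (two GPUs assumed; nothing here is padertorch API)::

    import torch
    import torch.nn as nn
    from torch.nn.parallel import replicate, parallel_apply, gather

    device = [0, 1]                                  # assumption: two GPUs
    model = nn.Linear(16, 1).to(device[0])

    def train_step(replica, example, dev):
        # Toy stand-in for self.train_step: one example on one replica/GPU.
        out = replica(example.to(dev))
        loss = out.pow(2).mean()
        return loss, example, out, {'loss': loss}

    examples = [torch.randn(4, 16) for _ in device]
    replicas = replicate(model, device[:len(examples)])
    outputs = parallel_apply(
        [train_step] * len(examples),
        list(zip(replicas, examples, device[:len(examples)])),
        devices=device[:len(examples)])
    loss = gather([l.view(1) for l, _, _, _ in outputs], device[0]).sum()
    loss.backward()   # gradients flow back through the replicas into model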
Beispiel #31
0
def parallel_tensor_dict(
    tensor_dicts: List[Mapping],
    model: Model,
    device_ids: List,
    loss_key='loss',
    atom_types=(str, )) -> Dict[str, torch.Tensor]:
    """
    Performs a forward pass using multiple GPUs.  This is a simplification
    of torch.nn.parallel.data_parallel to support the allennlp model
    interface.
    """
    if len(tensor_dicts) > len(device_ids):
        raise ValueError(
            "the number of tensor dicts must not exceed the number of device ids"
        )

    # region 1 - copy data and model to multiple GPUS

    # NOTE: there can be fewer tensor dicts than device ids; in that case only
    # a prefix of the provided device ids is actually used
    moved = [
        move_tensor_dict_to_device(tensor_dict, device_id)
        for tensor_dict, device_id in zip(tensor_dicts, device_ids)
    ]
    used_device_ids = device_ids[:len(moved)]

    # must replicate the model to the GPUs every time, because its parameters have been updated
    replicas = nnP.replicate(model, used_device_ids)

    # endregion

    # region 2 - get the outputs

    # outputs is a list with one result dictionary per GPU
    outputs = nnP.parallel_apply(
        replicas,
        [()] * len(tensor_dicts),  # no positional arguments
        moved,  # the tensor dicts as keyword arguments
        used_device_ids)

    # endregion

    # region 3 - gather the results on the first GPU

    result = {}
    for k, v in outputs[0].items():
        if k == loss_key:  # special treatment for the loss key
            result[k] = nnP.gather(
                [output[k].unsqueeze(0) for output in outputs],
                target_device=used_device_ids[0],
                dim=0).mean()
        else:
            if isinstance(v, torch.Tensor):
                result[k] = [
                    nnP.gather([output[k]],
                               target_device=used_device_ids[0],
                               dim=0) for output in outputs
                ]
            elif gx.iterable__(v, atom_types=atom_types):
                # Flatten the per-GPU iterables into a single tuple.
                result[k] = tuple(
                    chain.from_iterable(output[k] for output in outputs))
            else:
                result[k] = tuple(output[k] for output in outputs)

    # endregion

    return result
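A hedged usage sketch of parallel_tensor_dict; the model, the per-GPU tensor dicts and the device ids are hypothetical, and the only requirements taken from the code are that each tensor dict is fed to the model as keyword arguments and that the model's output dict contains a 'loss' entry::

    device_ids = [0, 1]                      # assumption: two GPUs
    batches = [batch_a, batch_b]             # hypothetical tensor dicts, one per GPU
    result = parallel_tensor_dict(batches, model, device_ids)
    result['loss'].backward()                # mean loss, gathered on device_ids[0]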