Example 1
 def wrapper(*args, **kwargs):
     frame = currentframe()
     v = getargvalues(frame)
     argspec = getfullargspec(func)
     formal_arg_names = argspec.args
     s = "{'op':'%s'," % v.locals["func"].__name__
     for idx, val in enumerate(v.locals["args"]):
         name = "" + formal_arg_names[idx]
         if name == "self" and isinstance(val, torch.Tensor):
             s += ", shape = %s" % str(tuple(val.shape))
         if isinstance(val, torch.Tensor):
             name += "_tensor"
             value = {
                 'shape': tuple(val.size()),
                 'type': str(val.dtype).split(".")[-1]
             }
             val = value
         #   name += "'"
         s += "'%s':%s," % (name, str(val))
     num_def = len(argspec.defaults)
     defaults = dict(zip(argspec.args[-num_def:], argspec.defaults))
     overrides = {k: str(v) for k, v in v.locals["kwargs"].items()}
     defaults.update(overrides)
     s += "%s}" % str(defaults).strip("{}")
     nvtx.range_push(s)
     result = func(*args, **kwargs)
     nvtx.range_pop()
     return result
Example 2
def push_nvtx_model_config(config):
    """
    Helper function to dump the passed in dict config as an nvtx
    marker with "model_config" key
    """
    nvtx_msg = json.dumps({"model_config": config})
    nvtx.range_push(nvtx_msg)
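A range pushed by this helper stays open until something later calls nvtx.range_pop(). The sketch below is not part of the original source: pop_nvtx_model_config and the names in the usage comment (model, batch) are hypothetical, shown only to illustrate how a push/pop pair keeps the NVTX range stack balanced.

import torch.cuda.nvtx as nvtx

def pop_nvtx_model_config():
    """
    Close the range opened by push_nvtx_model_config so the NVTX stack
    stays balanced.
    """
    nvtx.range_pop()

# Possible usage:
# push_nvtx_model_config({"batch_size": 32, "precision": "fp16"})
# output = model(batch)  # work recorded under the "model_config" range
# pop_nvtx_model_config()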
Example 3
def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            nvtx.range_push("Copy to device")
            data, target = data.to(device), target.to(device)
            nvtx.range_pop()
            # Copy to device

            nvtx.range_push("Test forward pass")
            output = model(data)
            nvtx.range_pop()  # Test forward pass

            test_loss += F.nll_loss(
                output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(
                dim=1,
                keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print(
        '\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            test_loss, correct, len(test_loader.dataset),
            100. * correct / len(test_loader.dataset)))
Example 4
            def wrapper(input,
                        weight,
                        bias=None,
                        stride=1,
                        padding=0,
                        output_padding=0,
                        groups=1,
                        dilation=1):

                input_dict = {
                    'shape': tuple(input.size()),
                    'type': str(input.dtype).split(".")[-1]
                }
                weight_dict = {
                    'shape': tuple(weight.size()),
                    'type': str(weight.dtype).split(".")[-1]
                }
                # Interpolate numbers as strings because some can be one-elem tuples as well
                nvtx_str = "{'op':'conv_transpose%sd', 'input_tensor':%s, 'weight_tensor':%s, 'stride':%s, 'padding':%s, 'output_padding':%s, 'groups':%s, 'dilation':%s}" % (
                    dim_count, str(input_dict), str(weight_dict), str(stride),
                    str(padding), str(output_padding), str(groups),
                    str(dilation))
                nvtx.range_push(nvtx_str)
                op = fun(input, weight, bias, stride, padding, dilation,
                         groups)
                nvtx.range_pop()
                return op
Example 5
 def __init__(self):
     nvtx.range_push("Toymodel_layer_stack")
     super(ToyModel, self).__init__()
     self.net1 = torch.nn.Linear(100, 100).to('cuda:0')
     self.relu = torch.nn.ReLU()
     self.net2 = torch.nn.Linear(100, 50).to('cpu')
     nvtx.range_pop()
Example 6
def range_push(msg: str) -> None:
    r"""Annotates the start of a range for profiling. Requires HABITAT_PROFILING
    environment variable to be set, otherwise the function is a no-op. Pushes a
    range onto a stack of nested ranges. Every range_push should have a
    corresponding range_pop. Attached profilers can capture the time spent in
    ranges."
    """
    if enable_profiling:
        nvtx.range_push(msg)
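The docstring ties the no-op behavior to the HABITAT_PROFILING environment variable. A minimal sketch of how the enable_profiling flag and the symmetric pop wrapper could be wired is shown below; this wiring is an assumption, not code from the original project.

import os
from torch.cuda import nvtx

# Assumption: the flag is derived once from the environment at import time.
enable_profiling = bool(os.environ.get("HABITAT_PROFILING"))

def range_pop() -> None:
    r"""Annotates the end of the most recently pushed range. A no-op unless
    HABITAT_PROFILING is set, mirroring range_push above.
    """
    if enable_profiling:
        nvtx.range_pop()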
Example 7
 def define_graph(self):
     nvtx.range_push("Reading JPEG files into host memory")
     jpegs, labels = self.input()  # read in jpeg files
     nvtx.range_pop()
     nvtx.range_push("Start mixed decoding process")
     # images = self.decode(jpegs) # Do decoding process
     decode = ops.ImageDecoder(device="mixed", output_type=types.RGB)
     images = decode(jpegs)
     nvtx.range_pop()
     return (images, labels)
Example 8
    def new_iter(self, *args, **kwargs):

        # Push trace marker
        nvtx.range_push(traceMarker("DataLoader"))

        # First pass is for creating the dataloader + returning the first data
        cadena = argMarker(mod, "DataLoader", args, kwargs)
        nvtx.range_push(cadena)

        for x in old_iter(self, *args, **kwargs):

            # Pop tracemarker
            nvtx.range_pop()

            # Dataloader stop, Model start
            nvtx.range_pop()

            yield x

            # Push trace marker
            nvtx.range_push(traceMarker("DataLoader"))

            # Model stop, dataloader start
            cadena = argMarker(mod, "DataLoader", args, kwargs)
            nvtx.range_push(cadena)

        # Pop the last iteration before returning
        nvtx.range_pop()
        nvtx.range_pop()
Example 9
def trainStandardMethod(model):
    model.train(True)
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001)

    one_hot_indices = torch.LongTensor(batch_size) \
                           .random_(0, num_classes) \
                           .view(batch_size, 1)

    for i in range(num_batches):
        nvtx.range_push("Batch" + str(i))
        # generate random inputs and labels
        inputs = torch.randn(batch_size, 3, image_w, image_h)
        labels = torch.zeros(batch_size, num_classes) \
                      .scatter_(1, one_hot_indices, 1)
        nvtx.range_push("Copy to device")
        inputs = inputs.to('cuda:0')
        # labels = labels.to('cuda:0')
        nvtx.range_pop()
        # run forward pass
        nvtx.range_push("Forward pass")
        optimizer.zero_grad()
        outputs = model(inputs)
        nvtx.range_pop()

        # run backward pass
        # labels = labels.to(outputs.device)
        nvtx.range_push("Backward pass")
        loss_fn(outputs, labels).backward()
        torch.cuda.synchronize('cuda:0')
        optimizer.step()
        nvtx.range_pop()
        nvtx.range_pop()
Example 10
def range_push(msg: str) -> None:
    r"""Annotates the start of a range for profiling. Requires HABITAT_PROFILING
    environment variable to be set, otherwise the function is a no-op. Pushes a
    range onto a stack of nested ranges. Every range_push should have a
    corresponding range_pop. Attached profilers can capture the time spent in
    ranges."
    """
    if not _enable_profiling:
        return

    nvtx.range_push(msg)
    _helper.range_depth += 1
    max_depth = 64
    # In practice, there is little need to go deeper than 5 or 10. By asserting
    # here, we'll catch improper range_push/range_pop usage. Specifically,
    # we'll (eventually) catch an unmatched range_push.
    assert _helper.range_depth < max_depth
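This variant additionally tracks nesting depth so that an unmatched range_push eventually trips the assertion. A symmetric range_pop that undoes the bookkeeping might look like the sketch below; it reuses the _enable_profiling, _helper, and nvtx names from the excerpt and is an assumed counterpart, not the project's actual implementation.

def range_pop() -> None:
    r"""Annotates the end of the most recently pushed range."""
    if not _enable_profiling:
        return

    nvtx.range_pop()
    # Mirror the bookkeeping in range_push; a negative depth would indicate
    # an unmatched range_pop, so assert on it here as well.
    _helper.range_depth -= 1
    assert _helper.range_depth >= 0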
Example 11
def oneStepTrain(model):
    model = vgg19().to('cuda:0')
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001)
    one_hot_indices = torch.LongTensor(batch_size) \
                           .random_(0, num_classes) \
                           .view(batch_size, 1)

    optimizer.zero_grad()
    nvtx.range_push("Copy to device")
    inputs = torch.randn(batch_size, 3, image_w, image_h).to('cuda:0')
    nvtx.range_pop()
    outputs = model(inputs)
    labels = torch.zeros(batch_size,
                         num_classes).scatter_(1, one_hot_indices, 1)
    labels = labels.to(outputs.device)
    nvtx.range_push("backward pass")
    loss_fn(outputs, labels).backward()
    nvtx.range_pop()
    optimizer.step()
Example 12
    def wrapper_func(*args, **kwargs):

        # Push trace marker
        nvtx.range_push(traceMarker(fn_name))

        # Push module marker
        if s:
            m = modMarker(mod, fn_name, args)
            nvtx.range_push(m)

        # Create and push argument marker
        cadena = argMarker(mod, fn_name, args, kwargs)
        nvtx.range_push(cadena)

        # Call the original function
        result = func(*args, **kwargs)

        # Pop argument marker
        nvtx.range_pop()

        # Pop module marker
        if s:
            nvtx.range_pop()

        # Pop trace marker
        nvtx.range_pop()

        return result
Example 13
 def forward(self, x):
     nvtx.range_push("net1")
     x1 = self.net1(x)
     nvtx.range_pop()
     nvtx.range_push("relu1")
     x2 = self.relu(x1) 
     nvtx.range_pop()
     nvtx.range_push("Copy to cpu")
     x2 = x2.to('cpu')
     nvtx.range_pop()
     nvtx.range_push("net2")
     x3 = self.net2(x2)
     # x = self.relu(self.net1(x))
     nvtx.range_pop()
     # return self.net2(x.to('cpu'))
     return x3
Example 14
    def wrapper_func(*args, **kwargs):

        global wrappers_enabled
        traceMarker_str = ""
        input_callid_list = []

        if config.capture_input_ops:
            dlprof.capture_inputs(input_callid_list, *args)

        if wrappers_enabled:
            # Push trace marker
            traceMarker_str = traceMarker(fn_name)
            nvtx.range_push(traceMarker_str)

            # Push module marker
            if s:
                m = modMarker(mod, fn_name, args)
                nvtx.range_push(m)

            # Create and push argument marker
            #
            # Disable wrappers while getting the argMarker in case it
            # ends up executing another wrapped function
            wrappers_enabled = False
            if config.capture_input_ops:
                cadena = argMarker(mod, fn_name, args, kwargs, dlprof.call_id,
                                   input_callid_list)
            else:
                cadena = argMarker(mod, fn_name, args, kwargs)
            nvtx.range_push(cadena)
            wrappers_enabled = True

        # Call the original function
        result = func(*args, **kwargs)

        if wrappers_enabled:
            # Pop argument marker
            nvtx.range_pop()

            # Pop module marker
            if s:
                nvtx.range_pop()

            # Pop trace marker
            nvtx.range_pop()

        if config.capture_input_ops:
            dlprof.capture_outputs(dlprof.call_id, result)
            # Store the callid -> op_name mapping
            if traceMarker_str != "":
                traceMarker_str = traceMarker_str.replace("\'", "\"")
                traceMarker_dict = json.loads(traceMarker_str)
                dlprof.call_id_to_op_map[
                    dlprof.call_id] = traceMarker_dict['funcStack']
            dlprof.call_id = dlprof.call_id + 1

        return result
Example 15
    def run_step(self):
        """
        Implement the standard training logic described above.
        """
        assert self.model.training, "[SimpleTrainer] model was changed to eval mode!"
        start = time.perf_counter()
        """
        If you want to do something with the data, you can wrap the dataloader.
        """
        nvtx.range_push("Data loading")
        # data = next(self._data_loader_iter)
        dali_data = next(self._dali_data_loader_iter)

        d_data = []
        for i in range(0, 12):
            img = dali_data[0]['image'][i]
            seg = dali_data[0]['sem_seg'][i][0].cpu().long()
            d_data.append({
                'file_name': "",
                'height': 1024,
                'width': 2048,
                'image': img,
                'sem_seg': seg
            })

        nvtx.range_pop()
        data_time = time.perf_counter() - start
        """
        If you want to do something with the losses, you can wrap the model.
        """
        nvtx.range_push("Forward pass")
        loss_dict = self.model(d_data)
        losses = sum(loss_dict.values())
        nvtx.range_pop()
        """
        If you need to accumulate gradients or do something similar, you can
        wrap the optimizer with your custom `zero_grad()` method.
        """
        nvtx.range_push("Backward pass")
        self.optimizer.zero_grad()
        losses.backward()
        nvtx.range_pop()

        # self._write_metrics(loss_dict, data_time)
        """
        If you need gradient clipping/scaling or other processing, you can
        wrap the optimizer with your custom `step()` method. But it is
        suboptimal as explained in https://arxiv.org/abs/2006.15704 Sec 3.2.4
        """
        self.optimizer.step()
Example 16
    def train(self, start_iter: int, max_iter: int):
        """
        Args:
            start_iter, max_iter (int): See docs above
        """
        logger = logging.getLogger(__name__)
        logger.info("Starting training from iteration {}".format(start_iter))

        self.iter = self.start_iter = start_iter
        self.max_iter = max_iter

        with EventStorage(start_iter) as self.storage:
            try:
                self.before_train()
                for self.iter in range(start_iter, max_iter):
                    nvtx.range_push("Batch " + str(self.iter))
                    nvtx.range_push("Before step")
                    self.before_step()
                    nvtx.range_pop()

                    nvtx.range_push("Run step")
                    self.run_step()
                    nvtx.range_pop()

                    nvtx.range_push("After step")
                    self.after_step()
                    nvtx.range_pop()

                    nvtx.range_pop()
                # self.iter == max_iter can be used by `after_train` to
                # tell whether the training successfully finished or failed
                # due to exceptions.
                self.iter += 1
            except Exception:
                logger.exception("Exception during training:")
                raise
            finally:
                self.after_train()
Example 17
    def forward(self, numerical_input, categorical_inputs):
        """

        Args:
            numerical_input (Tensor): with shape [batch_size, num_numerical_features]
            categorical_inputs (Tensor): with shape [batch_size, num_categorical_features]
        """
        batch_size = numerical_input.size()[0]

        # Put indices on the same device as corresponding embedding
        device_indices = []
        for embedding_id, _ in enumerate(self.embeddings):
            device_indices.append(categorical_inputs[:, embedding_id].to(self._embedding_device_map[embedding_id]))

        nvtx.range_push("layer:Bottom_MLP")
        bottom_mlp_output = self.bottom_mlp(numerical_input)
        nvtx.range_pop()

        # embedding_outputs will be a list of (26 in the case of Criteo) fetched embeddings with shape
        # [batch_size, embedding_size]
        embedding_outputs = []
        for embedding_id, embedding in enumerate(self.embeddings):
            if self._hash_indices:
                device_indices[embedding_id] = device_indices[embedding_id] % embedding.num_embeddings

            nvtx.range_push("layer:Embedding_{}".format(embedding_id))
            embedding_outputs.append(embedding(device_indices[embedding_id]).to(self._base_device))
            nvtx.range_pop()

        nvtx.range_push("layer:Interaction")
        interaction_output = self._interaction(bottom_mlp_output, embedding_outputs, batch_size)
        nvtx.range_pop()

        nvtx.range_push("layer:Top_MLP")
        top_mlp_output = self.top_mlp(interaction_output)
        nvtx.range_pop()

        return top_mlp_output
Example 18
def worker(gpu, ngpus_per_node, args):
    env_device, train_device = args_initialize(gpu, ngpus_per_node, args)
    train_csv_file, train_csv_writer, eval_csv_file, eval_csv_writer, summary_writer = log_initialize(args, train_device)
    train_env, test_env, observation = env_initialize(args, env_device)

    model = ActorCritic(args.num_stack, train_env.action_space, normalize=args.normalize, name=args.env_name)
    model, optimizer = model_initialize(args, model, train_device)

    shape = (args.num_steps + 1, args.num_ales, args.num_stack, *train_env.observation_space.shape[-2:])
    states = torch.zeros(shape, device=train_device, dtype=torch.float32)
    states[0, :, -1] = observation.to(device=train_device, dtype=torch.float32)

    shape = (args.num_steps + 1, args.num_ales)
    values = torch.zeros(shape, device=train_device, dtype=torch.float32)
    logits = torch.zeros((args.num_steps + 1, args.num_ales, train_env.action_space.n), device=train_device, dtype=torch.float32)
    returns = torch.zeros(shape, device=train_device, dtype=torch.float32)

    shape = (args.num_steps, args.num_ales)
    rewards = torch.zeros(shape, device=train_device, dtype=torch.float32)
    masks = torch.zeros(shape, device=train_device, dtype=torch.float32)
    actions = torch.zeros(shape, device=train_device, dtype=torch.long)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    final_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    episode_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    final_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)

    if args.use_gae:
        gae = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)

    maybe_npy = lambda a: a.numpy() if args.use_openai else a

    num_frames_per_iter = args.num_ales * args.num_steps
    args.num_minibatches = num_frames_per_iter / args.batch_size
    total_steps = math.ceil(args.t_max / (args.world_size * num_frames_per_iter))

    decay = 1.0 / total_steps
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=args.ppo_epoch, gamma=1.0 - decay)

    iterator = range(total_steps)
    if args.rank == 0:
        iterator = tqdm(iterator)
        total_time = 0
        evaluation_offset = 0

    train_stream = torch.cuda.Stream()

    torch.cuda.synchronize()

    for update in iterator:

        T = args.world_size * update * num_frames_per_iter
        if (args.rank == 0) and (T >= evaluation_offset):
            evaluation_offset += args.evaluation_interval
            eval_lengths, eval_rewards = test(args, model, test_env)

            lmean, lmedian, lmin, lmax, lstd = gen_data(eval_lengths)
            rmean, rmedian, rmin, rmax, rstd = gen_data(eval_rewards)
            length_data = '(length) min/max/mean/median: {lmin:4.1f}/{lmax:4.1f}/{lmean:4.1f}/{lmedian:4.1f}'.format(lmin=lmin, lmax=lmax, lmean=lmean, lmedian=lmedian)
            reward_data = '(reward) min/max/mean/median: {rmin:4.1f}/{rmax:4.1f}/{rmean:4.1f}/{rmedian:4.1f}'.format(rmin=rmin, rmax=rmax, rmean=rmean, rmedian=rmedian)
            print('[training time: {}] {}'.format(format_time(total_time), ' --- '.join([length_data, reward_data])))

            if eval_csv_writer and eval_csv_file:
                eval_csv_writer.writerow([T, total_time, rmean, rmedian, rmin, rmax, rstd, lmean, lmedian, lmin, lmax, lstd])
                eval_csv_file.flush()

            if args.plot:
                summary_writer.add_scalar('eval/rewards_mean', rmean, T, walltime=total_time)
                summary_writer.add_scalar('eval/lengths_mean', lmean, T, walltime=total_time)

        start_time = time.time()

        with torch.no_grad():

            for step in range(args.num_steps):
                nvtx.range_push('train:step')
                value, logit = model(states[step])

                # store values and logits
                values[step], logits[step] = value.squeeze(-1), logit.squeeze(-1)

                # convert actions to numpy and perform next step
                probs = torch.clamp(F.softmax(logit, dim=1), min = 0.00001, max = 0.99999)
                probs_action = probs.multinomial(1).to(env_device)
                observation, reward, done, info = train_env.step(maybe_npy(probs_action))

                if args.use_openai:
                    # convert back to pytorch tensors
                    observation = torch.from_numpy(observation)
                    reward = torch.from_numpy(reward)
                    done = torch.from_numpy(done.astype(np.uint8))
                else:
                    observation = observation.squeeze(-1).unsqueeze(1)

                # move back to training memory
                observation = observation.to(device=train_device)
                reward = reward.to(device=train_device, dtype=torch.float32)
                done = done.to(device=train_device, dtype=torch.bool)
                probs_action = probs_action.to(device=train_device, dtype=torch.long)

                not_done = 1.0 - done.float()

                # update rewards and actions
                actions[step].copy_(probs_action.view(-1))
                masks[step].copy_(not_done)
                rewards[step].copy_(reward.sign())

                # update next observations
                states[step + 1, :, :-1].copy_(states[step, :, 1:])
                states[step + 1] *= not_done.view(-1, *[1] * (observation.dim() - 1))
                states[step + 1, :, -1].copy_(observation.view(-1, *states.size()[-2:]))

                # update episodic reward counters
                episode_rewards += reward
                final_rewards[done] = episode_rewards[done]
                episode_rewards *= not_done

                episode_lengths += not_done
                final_lengths[done] = episode_lengths[done]
                episode_lengths *= not_done
                nvtx.range_pop()

            returns[-1] = values[-1] = model(states[-1])[0].data.squeeze(-1)

            if args.use_gae:
                gae.zero_()
                for step in reversed(range(args.num_steps)):
                    delta = rewards[step] + (args.gamma * values[step + 1] * masks[step]) - values[step]
                    gae = delta + (args.gamma * args.tau * masks[step] * gae)
                    returns[step] = gae + values[step]
            else:
                for step in reversed(range(args.num_steps)):
                    returns[step] = rewards[step] + (args.gamma * returns[step + 1] * masks[step])

            log_probs = F.log_softmax(logits[:-1].view(-1, train_env.action_space.n), dim=1)
            action_log_probs = log_probs.gather(1, actions.view(-1).unsqueeze(-1))
            advantages = returns[:-1].view(-1).unsqueeze(-1) - values[:-1].view(-1).unsqueeze(-1)
            advantages = (advantages - advantages.mean()) / (advantages.std() + float(np.finfo(np.float32).eps))

        total_value_loss = 0.0
        total_policy_loss = 0.0
        total_dist_entropy = 0.0

        nvtx.range_push('train:loader')
        states_view = states[:-1].view(-1, *states.size()[-3:])
        actions_view = actions.view(-1)
        returns_view = returns[:-1].view(-1)
        train_dataset = torch.utils.data.TensorDataset(states_view, actions_view, action_log_probs, returns_view, advantages)

        train_sampler = None
        if args.distributed:
            train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)

        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
                                                   num_workers=0, pin_memory=False, sampler=train_sampler)
        nvtx.range_pop()

        with torch.cuda.stream(train_stream):
            for epoch in range(args.ppo_epoch):
                nvtx.range_push('train:epoch_step')

                if args.distributed:
                    train_sampler.set_epoch(epoch)

                prefetcher = data_prefetcher(train_loader)
                local_states, local_actions, local_action_log_probs, local_returns, local_advantages = prefetcher.next()

                while local_states is not None:
                    batch_values, batch_logits = model(local_states)
                    batch_log_probs = F.log_softmax(batch_logits, dim=1)
                    batch_action_log_probs = batch_log_probs.gather(1, local_actions.unsqueeze(-1))

                    batch_probs = F.softmax(batch_logits, dim=1)
                    batch_dist_entropy = -(batch_log_probs * batch_probs).sum(-1).mean()

                    ratio = torch.exp(batch_action_log_probs - local_action_log_probs)
                    surrogate1 = ratio * local_advantages
                    surrogate2 = torch.clamp(ratio, 1.0 - args.clip_epsilon, 1.0 + args.clip_epsilon) * local_advantages
                    batch_policy_loss = -torch.min(surrogate1, surrogate2).mean()
                    batch_value_loss = F.mse_loss(local_returns.unsqueeze(-1), batch_values) / 2.0

                    loss = batch_value_loss * args.value_loss_coef + batch_policy_loss - batch_dist_entropy * args.entropy_coef
                    optimizer.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                    optimizer.step()

                    total_value_loss += batch_value_loss.item()
                    total_policy_loss += batch_policy_loss.item()
                    total_dist_entropy += batch_dist_entropy.item()

                    local_states, local_actions, local_action_log_probs, local_returns, local_advantages = prefetcher.next()
                scheduler.step()
                nvtx.range_pop()

        torch.cuda.synchronize()

        states[0].copy_(states[-1])

        if args.rank == 0:
            iter_time = time.time() - start_time
            total_time += iter_time

            value_loss = total_value_loss / (args.ppo_epoch * args.num_minibatches)
            policy_loss = total_policy_loss / (args.ppo_epoch * args.num_minibatches)
            dist_entropy = total_dist_entropy / (args.ppo_epoch * args.num_minibatches)

            if args.plot:
                summary_writer.add_scalar('train/rewards_mean', final_rewards.mean().item(), T, walltime=total_time)
                summary_writer.add_scalar('train/lengths_mean', final_lengths.mean().item(), T, walltime=total_time)
                summary_writer.add_scalar('train/learning_rate', scheduler.get_lr()[0], T, walltime=total_time)
                summary_writer.add_scalar('train/value_loss', value_loss, T, walltime=total_time)
                summary_writer.add_scalar('train/policy_loss', policy_loss, T, walltime=total_time)
                summary_writer.add_scalar('train/entropy', dist_entropy, T, walltime=total_time)

            progress_data = callback(args, model, T, iter_time, final_rewards, final_lengths,
                                     value_loss, policy_loss, dist_entropy, train_csv_writer, train_csv_file)
            iterator.set_postfix_str(progress_data)

    if args.plot and (args.rank == 0):
        summary_writer.close()

    if args.use_openai:
        train_env.close()
    if args.use_openai_test_env:
        test_env.close()
Example 19
def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    nvtx.range_push("Data loading");
    for batch_idx, (data, target) in enumerate(train_loader):
        nvtx.range_pop()  # Data loading
        nvtx.range_push("Batch " + str(batch_idx))

        nvtx.range_push("Copy to device")
        data, target = data.to(device), target.to(device)
        nvtx.range_pop() # Copy to device

        nvtx.range_push("Forward pass")
        optimizer.zero_grad()

        # Enables autocasting for the forward pass
        with torch.cuda.amp.autocast(enabled=True):
            output = model(data)
            loss = F.nll_loss(output, target)
        nvtx.range_pop() # Forward pass

        nvtx.range_push("Backward pass")
        loss.backward()
        optimizer.step()
        nvtx.range_pop() # Backward pass

        nvtx.range_pop() # Batch
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                break
        nvtx.range_push("Data loading");
    nvtx.range_pop(); # Data loading
Example 20
def main():
    # c10d_frontend = torch.classes.dist_c10d.frontend()

    dist.init_process_group(backend='nccl')
    d_pg = _get_default_group()
    # print(c10d_frontend.get_name_of_process_group(d_pg))

    pg2 = dist.new_group([0,1], backend='nccl')
    # print(c10d_frontend.get_name_of_process_group(pg2))
    if dist.get_rank() == 0:
        print(dir(d_pg))
        print(type(d_pg))
        print(type(pg2))
        print(dir(dist))
        print(_pg_names)

    local_size = torch.cuda.device_count()
    rank = dist.get_rank()
    torch.cuda.set_device(rank % local_size)
    torch.cuda.synchronize()
    comm_stream = torch.cuda.Stream(rank % local_size)
    device_id = rank % local_size
    # print(f'rank {rank}')
    warm_up = 5
    repeat = 10

    partition_sizes = [
        2457600,
        960,
        819200,
        320,
        320,
        320,
        3276800,
        1280,
        3276800,
        320,
        320,
        320
    ]

    local_params = []
    for psize in partition_sizes:
        r = torch.rand(psize, dtype=torch.half, device=f'cuda:{device_id}').view(-1)
        local_params.append(r)
        print(f'rank {rank}, psize {psize}, sum {torch.sum(r).item()}')

    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True) 
    ts = []
    for i in range(repeat + warm_up):
        with torch.cuda.stream(comm_stream):
            nvtx.range_push(f'exp-{i}')
            t1 = time.time()
            # start_event.record(stream=comm_stream)
            benchmark_all_gather(partition_sizes, local_params, comm_stream)
            # end_event.record(stream=comm_stream)
            # end_event.synchronize()

            t2 = time.time()
            nvtx.range_pop()

            if i >= warm_up:
                # ts.append(start_event.elapsed_time(end_event))
                ts.append((t2 - t1) * 1e3)
    
    if dist.get_rank() == 0:
        avg_t = np.mean(ts)
        bw = (dist.get_world_size() - 1) * np.sum(partition_sizes) * 2 / 1e9 / (avg_t / 1e3)
        print(f'avg time {avg_t} ms, bw {bw} GB/s')
Example 21
	def forward(self, x):
		identity = x

		nvtx.range_push("layer:{}".format(chr(self.id + 97))) # to print a,b,c,..

		nvtx.range_push("layer:Conv1")
		out = self.conv1(x)
		nvtx.range_pop()

		nvtx.range_push("layer:BN1")
		out = self.bn1(out)
		nvtx.range_pop()

		nvtx.range_push("layer:ReLU1")
		out = self.relu(out)
		nvtx.range_pop()

		nvtx.range_push("layer:Conv2")
		out = self.conv2(out)
		nvtx.range_pop()

		nvtx.range_push("layer:BN2")
		out = self.bn2(out)
		nvtx.range_pop()

		nvtx.range_push("layer:ReLU2")
		out = self.relu(out)
		nvtx.range_pop()

		nvtx.range_push("layer:Conv3")
		out = self.conv3(out)
		nvtx.range_pop()

		nvtx.range_push("layer:BN3")
		out = self.bn3(out)
		nvtx.range_pop()

		nvtx.range_push("layer:Residual")
		if self.downsample is not None:
			nvtx.range_push("layer:Projection")
			identity = self.downsample(x)
			nvtx.range_pop()

		out += identity
		nvtx.range_pop()

		nvtx.range_push("layer:ReLU3")
		out = self.relu(out)
		nvtx.range_pop()

		nvtx.range_pop()

		return out
Example 22
    if args.pipeline:
        train_set = SoftwarePipeline(train_set)

    gpu_id = dist.get_rank() % torch.cuda.device_count()
    torch.cuda.set_device(gpu_id)
    model = models.__dict__["resnet50"]()
    model.cuda(torch.cuda.current_device())
    model = DDP(model, [gpu_id])
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = optim.SGD(model.parameters(),
                          lr=init_learning_rate,
                          momentum=0.875,
                          weight_decay=3.0517578125e-05)

    for epoch in range(10):
        nvtx.range_push('epoch')

        nvtx.range_push('set_train')
        model.train()
        nvtx.range_pop()  # set train

        nvtx.range_push('set_epoch')
        train_sampler.set_epoch(epoch)
        nvtx.range_pop()  # set epoch

        nvtx.range_push('adjust_lr')
        adjust_learning_rate(optimizer, epoch, init_learning_rate)
        nvtx.range_pop()  # adjust lr

        time0 = pc()
Example 23
def benchmark_all_gather(partition_sizes, local_params, comm_stream):
    dtype = torch.half
    
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    device_id = rank % torch.cuda.device_count()

    t1 = time.time()
    with torch.cuda.stream(comm_stream):
        nvtx.range_push('allocate final params')
        # allocate memories
        allgather_params = []
        for psize in partition_sizes:
            tensor_size = psize * world_size
            tensor = torch.empty(tensor_size, dtype=dtype, device=f'cuda:{device_id}').view(-1)
            allgather_params.append(tensor)

        nvtx.range_pop()
    comm_stream.synchronize()
    t2 = time.time()
    # print_at_rank0(f'allocate cost {t2 - t1} s')

    with torch.cuda.stream(comm_stream):
        nvtx.range_push('construct all output list')
        # create allgather parameters
        all_gather_list_list = []
        for pidx, psize in enumerate(partition_sizes):
            flat_tensor = allgather_params[pidx]
            partitions = []
            for i in range(world_size):
                partitions.append(flat_tensor.narrow(0, psize * i, psize))

            all_gather_list_list.append(partitions)

        nvtx.range_pop()

    comm_stream.synchronize()
    print_at_rank0(f'construct params cost {time.time() - t2} s')

    with torch.cuda.stream(comm_stream):
        backend = get_backend()
        nvtx.range_push('launch dist all-gather')

        with _batch_p2p_manager(backend):
            handles = []    
            for pidx, psize in enumerate(partition_sizes):
                h = all_gather(all_gather_list_list[pidx], 
                                all_gather_list_list[pidx][rank], 
                                async_op=True)
                # h = dist.all_gather(all_gather_list_list[pidx], 
                #                 all_gather_list_list[pidx][rank], 
                #                 async_op=True)

                handles.append(h)
        
        # handles=[]
        # for pidx, psize in enumerate(partition_sizes):
        #     # h = all_gather(all_gather_list_list[pidx], 
        #     #                 all_gather_list_list[pidx][rank], 
        #     #                 async_op=True)
        #     h = dist.all_gather(all_gather_list_list[pidx], 
        #                     local_params[pidx], 
        #                     async_op=True)
        #     handles.append(h)
        #     # torch.cuda.synchronize()

        # handles[-1].wait() # event enqueued, but not guaranteed complete
        nvtx.range_pop()

    torch.cuda.synchronize()
    end_event = torch.cuda.Event()
    comm_stream.wait_event(end_event)
    return None
Example 24
    def learn(self, states, actions, returns, next_states, nonterminals,
              weights):

        tactions = actions.unsqueeze(-1).unsqueeze(-1)
        if self.categorical:
            tactions = tactions.expand(-1, -1, self.atoms)

        # Calculate current state probabilities (online network noise already sampled)
        nvtx.range_push('agent:online (state) probs')
        ps = self.online_net(
            states, log=True)  # Log probabilities log p(s_t, ·; θonline)
        ps_a = ps.gather(1, tactions)  # log p(s_t, a_t; θonline)
        nvtx.range_pop()

        with torch.no_grad():
            if isinstance(self.target_net, DQN):
                self.target_net.reset_noise()
            else:
                self.target_net.module.reset_noise(
                )  # Sample new target net noise

            nvtx.range_push('agent:target (next state) probs')
            tns = self.target_net(
                next_states)  # Probabilities p(s_t+n, ·; θtarget)
            nvtx.range_pop()

            if self.double_q:
                # Calculate nth next state probabilities
                nvtx.range_push('agent:online (next state) probs')
                pns = self.online_net(
                    next_states)  # Probabilities p(s_t+n, ·; θonline)
                nvtx.range_pop()
            else:
                pns = tns

            if self.categorical:
                pns = self.support.expand_as(
                    pns
                ) * pns  # Distribution d_t+n = (z, p(s_t+n, ·; θonline))

            # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))]
            argmax_indices_ns = pns.sum(-1).argmax(-1).unsqueeze(-1).unsqueeze(
                -1)
            if self.categorical:
                argmax_indices_ns = argmax_indices_ns.expand(
                    -1, -1, self.atoms)
            pns_a = tns.gather(
                1, argmax_indices_ns
            )  # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget)

            if self.categorical:
                # Compute Tz (Bellman operator T applied to z)
                # Tz = R^n + (γ^n)z (accounting for terminal states)
                Tz = returns.unsqueeze(-1) + nonterminals.float().unsqueeze(
                    -1) * (self.discount**self.n) * self.support.unsqueeze(0)
                Tz = Tz.clamp(min=self.v_min,
                              max=self.v_max)  # Clamp between supported values
                # Compute L2 projection of Tz onto fixed support z
                b = (Tz - self.v_min) / self.delta_z  # b = (Tz - Vmin) / Δz
                l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
                # Fix disappearing probability mass when l = b = u (b is int)
                l[(u > 0) * (l == u)] -= 1
                u[(l < (self.atoms - 1)) * (l == u)] += 1

                # Distribute probability of Tz
                batch_size = states.size(0)
                m = states.new_zeros(batch_size, self.atoms)
                offset = torch.linspace(0, ((batch_size - 1) * self.atoms),
                                        batch_size).unsqueeze(1).expand(
                                            batch_size, self.atoms).to(actions)
                m.view(-1).index_add_(
                    0, (l + offset).view(-1),
                    (pns_a.squeeze(1) * (u.float() - b)
                     ).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
                m.view(-1).index_add_(
                    0, (u + offset).view(-1),
                    (pns_a.squeeze(1) * (b - l.float())
                     ).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)
            else:
                Tz = returns + nonterminals.float() * (
                    self.discount**self.n) * pns_a.squeeze(-1).squeeze(-1)

        if self.categorical:
            loss = -torch.sum(
                m * ps_a.squeeze(1),
                1)  # Cross-entropy loss (minimises DKL(m||p(s_t, a_t)))
            weights = weights.unsqueeze(-1)
        else:
            loss = F.mse_loss(ps_a.squeeze(-1).squeeze(-1),
                              Tz,
                              reduction='none')

        nvtx.range_push('agent:loss + step')
        self.optimizer.zero_grad()
        weighted_loss = (weights * loss).mean()
        with amp.scale_loss(weighted_loss, self.optimizer) as scaled_loss:
            scaled_loss.backward()
        torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer),
                                       self.max_grad_norm)
        self.optimizer.step()
        nvtx.range_pop()

        return loss.detach()
Example 25
    def wrapper_func(*args, **kwargs):

        global wrappers_enabled
        traceMarker_str = ""
        input_callid_list = []

        if wrappers_enabled:

            if config.capture_input_ops:
                ## Stack for callids to work with nested monkey patch function calls
                dlprof.patch_list.append(dlprof.call_id)
                dlprof.capture_inputs(dlprof.call_id, input_callid_list, *args)

            # Push trace marker
            traceMarker_str = traceMarker(fn_name)
            nvtx.range_push(traceMarker_str)

            # Push module marker
            if s:
                m = modMarker(mod, fn_name, args)
                nvtx.range_push(m)

            # Create and push argument marker
            #
            # Disable wrappers while getting the argMarker in case it
            # ends up executing another wrapped function
            wrappers_enabled = False
            if config.capture_input_ops:
                saved_call_id = dlprof.call_id
                # Keeps call_id correct when there are nested
                # monkey patch functions
                if dlprof.call_id != dlprof.patch_list[0]:
                    saved_call_id = dlprof.patch_list[0]
                cadena = argMarker(mod, fn_name, args, kwargs, saved_call_id,
                                   input_callid_list)
            else:
                cadena = argMarker(mod, fn_name, args, kwargs)
            nvtx.range_push(cadena)
            wrappers_enabled = True

        # Call the original function
        result = func(*args, **kwargs)

        if wrappers_enabled:
            # Pop argument marker
            nvtx.range_pop()

            # Pop module marker
            if s:
                nvtx.range_pop()

            # Pop trace marker
            nvtx.range_pop()

            if config.capture_input_ops:
                # Keeps call_id correct when there are nested
                # monkey patch functions
                saved_call_id = dlprof.call_id
                if dlprof.call_id != dlprof.patch_list[0]:
                    saved_call_id = dlprof.patch_list[0]
                dlprof.capture_outputs(saved_call_id, result)
                # Store the callid -> op_name mapping
                if traceMarker_str != "":
                    traceMarker_str = traceMarker_str.replace("\'", "\"")
                    traceMarker_dict = json.loads(traceMarker_str)
                    dlprof.call_id_to_op_map[saved_call_id] = traceMarker_dict[
                        'funcStack']

                starting_call_id = dlprof.patch_list[0]
                last_call_id = dlprof.patch_list.pop()
                dlprof.call_id = dlprof.call_id + 1
        return result
Example 26
def worker(gpu, ngpus_per_node, callback, args):
    args.gpu = gpu

    if args.distributed:
        args.seed += args.gpu
        torch.cuda.set_device(args.gpu)

        args.rank = int(os.environ['RANK']) if 'RANK' in os.environ else 0
        if args.multiprocessing_distributed:
            args.rank = args.rank * ngpus_per_node + args.gpu

        torch.distributed.init_process_group(
            backend='nccl',
            init_method='tcp://127.0.0.1:8632',
            world_size=args.world_size,
            rank=args.rank)
    else:
        args.rank = 0

    if (args.num_ales % args.num_minibatches) != 0:
        raise ValueError(
            'Number of ales ({}) is not evenly divisible by the number of '
            'minibatches ({})'.format(args.num_ales, args.num_minibatches))

    if args.num_steps_per_update == -1:
        args.num_steps_per_update = args.num_steps

    minibatch_size = int(args.num_ales / args.num_minibatches)
    step0 = args.num_steps - args.num_steps_per_update
    n_minibatch = -1

    args.use_cuda_env = args.use_cuda_env and torch.cuda.is_available()
    args.no_cuda_train = (not args.no_cuda_train) and torch.cuda.is_available()
    args.verbose = args.verbose and (args.rank == 0)

    env_device = torch.device(
        'cuda', args.gpu) if args.use_cuda_env else torch.device('cpu')
    train_device = torch.device('cuda', args.gpu) if (
        args.no_cuda_train == False) else torch.device('cpu')

    np.random.seed(args.seed)
    torch.manual_seed(np.random.randint(1, 10000))
    if args.use_cuda_env or (args.no_cuda_train == False):
        torch.cuda.manual_seed(np.random.randint(1, 10000))

    if args.rank == 0:
        if args.output_filename:
            train_csv_file = open(args.output_filename, 'w', newline='')
            train_csv_file.write(json.dumps(vars(args)))
            train_csv_file.write('\n')
            train_csv_writer = csv.writer(train_csv_file, delimiter=',')
            train_csv_writer.writerow([
                'frames', 'fps', 'total_time', 'rmean', 'rmedian', 'rmin',
                'rmax', 'lmean', 'lmedian', 'lmin', 'lmax', 'entropy',
                'value_loss', 'policy_loss'
            ])

            eval_output_filename = '.'.join([
                ''.join(args.output_filename.split('.')[:-1] + ['_test']),
                'csv'
            ])
            eval_csv_file = open(eval_output_filename, 'w', newline='')
            eval_csv_file.write(json.dumps(vars(args)))
            eval_csv_file.write('\n')
            eval_csv_writer = csv.writer(eval_csv_file, delimiter=',')
            eval_csv_writer.writerow([
                'frames', 'total_time', 'rmean', 'rmedian', 'rmin', 'rmax',
                'rstd', 'lmean', 'lmedian', 'lmin', 'lmax', 'lstd'
            ])
        else:
            train_csv_file, train_csv_writer = None, None
            eval_csv_file, eval_csv_writer = None, None

        if args.plot:
            from tensorboardX import SummaryWriter
            current_time = datetime.now().strftime('%b%d_%H-%M-%S')
            log_dir = os.path.join(args.log_dir,
                                   current_time + '_' + socket.gethostname())
            writer = SummaryWriter(log_dir=log_dir)
            for k, v in vars(args).items():
                writer.add_text(k, str(v))

        print()
        print('PyTorch  : {}'.format(torch.__version__))
        print('CUDA     : {}'.format(torch.version.cuda))
        print('CUDNN    : {}'.format(torch.backends.cudnn.version()))
        print('APEX     : {}'.format('.'.join(
            [str(i) for i in apex.amp.__version__.VERSION])))
        print()

    if train_device.type == 'cuda':
        print(cuda_device_str(train_device.index), flush=True)

    if args.use_openai:
        train_env = create_vectorize_atari_env(
            args.env_name,
            args.seed,
            args.num_ales,
            episode_life=args.episodic_life,
            clip_rewards=False,
            max_frames=args.max_episode_length)
        observation = torch.from_numpy(train_env.reset()).squeeze(1)
    else:
        train_env = AtariEnv(args.env_name,
                             args.num_ales,
                             color_mode='gray',
                             repeat_prob=0.0,
                             device=env_device,
                             rescale=True,
                             episodic_life=args.episodic_life,
                             clip_rewards=False,
                             frameskip=4)
        train_env.train()
        observation = train_env.reset(initial_steps=args.ale_start_steps,
                                      verbose=args.verbose).squeeze(-1)

    if args.use_openai_test_env:
        test_env = create_vectorize_atari_env(args.env_name,
                                              args.seed,
                                              args.evaluation_episodes,
                                              episode_life=False,
                                              clip_rewards=False)
        test_env.reset()
    else:
        test_env = AtariEnv(args.env_name,
                            args.evaluation_episodes,
                            color_mode='gray',
                            repeat_prob=0.0,
                            device='cpu',
                            rescale=True,
                            episodic_life=False,
                            clip_rewards=False,
                            frameskip=4)

    model = ActorCritic(args.num_stack,
                        train_env.action_space,
                        normalize=args.normalize,
                        name=args.env_name)
    model = model.to(train_device).train()

    if args.rank == 0:
        print(model)
        args.model_name = model.name

    if args.use_adam:
        optimizer = optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)
    else:
        optimizer = optim.RMSprop(model.parameters(),
                                  lr=args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)

    # This is the number of frames GENERATED between two updates
    num_frames_per_iter = args.num_ales * args.num_steps_per_update
    total_steps = math.ceil(args.t_max /
                            (args.world_size * num_frames_per_iter))
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=args.opt_level,
                                      loss_scale=args.loss_scale)

    if args.distributed:
        model = DDP(model, delay_allreduce=True)

    shape = (args.num_steps + 1, args.num_ales, args.num_stack,
             *train_env.observation_space.shape[-2:])
    states = torch.zeros(shape, device=train_device, dtype=torch.float32)
    states[step0, :, -1] = observation.to(device=train_device,
                                          dtype=torch.float32)

    shape = (args.num_steps + 1, args.num_ales)
    values = torch.zeros(shape, device=train_device, dtype=torch.float32)
    logits = torch.zeros(
        (args.num_steps + 1, args.num_ales, train_env.action_space.n),
        device=train_device,
        dtype=torch.float32)
    returns = torch.zeros(shape, device=train_device, dtype=torch.float32)

    shape = (args.num_steps, args.num_ales)
    rewards = torch.zeros(shape, device=train_device, dtype=torch.float32)
    masks = torch.zeros(shape, device=train_device, dtype=torch.float32)
    actions = torch.zeros(shape, device=train_device, dtype=torch.long)

    mus = torch.ones(shape, device=train_device, dtype=torch.float32)
    # pis = torch.zeros(shape, device=train_device, dtype=torch.float32)
    rhos = torch.zeros((args.num_steps, minibatch_size),
                       device=train_device,
                       dtype=torch.float32)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(args.num_ales,
                                  device=train_device,
                                  dtype=torch.float32)
    final_rewards = torch.zeros(args.num_ales,
                                device=train_device,
                                dtype=torch.float32)
    episode_lengths = torch.zeros(args.num_ales,
                                  device=train_device,
                                  dtype=torch.float32)
    final_lengths = torch.zeros(args.num_ales,
                                device=train_device,
                                dtype=torch.float32)

    if args.use_gae:
        raise ValueError('GAE is not compatible with VTRACE')

    maybe_npy = lambda a: a.numpy() if args.use_openai else a

    torch.cuda.synchronize()

    iterator = range(total_steps)
    if args.rank == 0:
        iterator = tqdm(iterator)
        total_time = 0
        evaluation_offset = 0

    for update in iterator:

        T = args.world_size * update * num_frames_per_iter
        if (args.rank == 0) and (T >= evaluation_offset):
            evaluation_offset += args.evaluation_interval
            eval_lengths, eval_rewards = evaluate(args, T, total_time, model,
                                                  test_env, eval_csv_writer,
                                                  eval_csv_file)

            if args.plot:
                writer.add_scalar('eval/rewards_mean',
                                  eval_rewards.mean().item(),
                                  T,
                                  walltime=total_time)
                writer.add_scalar('eval/lengths_mean',
                                  eval_lengths.mean().item(),
                                  T,
                                  walltime=total_time)

        start_time = time.time()

        with torch.no_grad():

            for step in range(args.num_steps_per_update):
                nvtx.range_push('train:step')
                value, logit = model(states[step0 + step])

                # store values and logits
                values[step0 + step] = value.squeeze(-1)

                # convert actions to numpy and perform next step
                probs = torch.clamp(F.softmax(logit, dim=1),
                                    min=0.00001,
                                    max=0.99999)
                probs_action = probs.multinomial(1).to(env_device)
                # Check if the multinomial threw an exception
                # https://github.com/pytorch/pytorch/issues/7014
                torch.cuda.current_stream().synchronize()
                observation, reward, done, info = train_env.step(
                    maybe_npy(probs_action))

                if args.use_openai:
                    # convert back to pytorch tensors
                    observation = torch.from_numpy(observation)
                    reward = torch.from_numpy(reward)
                    done = torch.from_numpy(done.astype(np.uint8))
                else:
                    observation = observation.squeeze(-1).unsqueeze(1)

                # move back to training memory
                observation = observation.to(device=train_device)
                reward = reward.to(device=train_device, dtype=torch.float32)
                done = done.to(device=train_device)
                probs_action = probs_action.to(device=train_device,
                                               dtype=torch.long)

                not_done = 1.0 - done.float()

                # update rewards and actions
                actions[step0 + step].copy_(probs_action.view(-1))
                masks[step0 + step].copy_(not_done)
                rewards[step0 + step].copy_(reward.sign())

                #mus[step0 + step] = F.softmax(logit, dim=1).gather(1, actions[step0 + step].view(-1).unsqueeze(-1)).view(-1)
                mus[step0 + step] = torch.clamp(F.softmax(logit, dim=1).gather(
                    1, actions[step0 + step].view(-1).unsqueeze(-1)).view(-1),
                                                min=0.00001,
                                                max=0.99999)

                # update next observations
                states[step0 + step + 1, :, :-1].copy_(states[step0 + step, :,
                                                              1:])
                states[step0 + step + 1] *= not_done.view(
                    -1, *[1] * (observation.dim() - 1))
                states[step0 + step + 1, :,
                       -1].copy_(observation.view(-1,
                                                  *states.size()[-2:]))

                # update episodic reward counters
                episode_rewards += reward
                final_rewards[done] = episode_rewards[done]
                episode_rewards *= not_done

                episode_lengths += not_done
                final_lengths[done] = episode_lengths[done]
                episode_lengths *= not_done
                nvtx.range_pop()

        n_minibatch = (n_minibatch + 1) % args.num_minibatches
        min_ale_index = int(n_minibatch * minibatch_size)
        max_ale_index = min_ale_index + minibatch_size

        # compute v-trace using the recursive method (remark 1 in IMPALA paper)
        # value_next_step, logit = model(states[-1:, min_ale_index:max_ale_index, :, : ,:].contiguous().view(-1, *states.size()[-3:]))
        # returns[-1, min_ale_index:max_ale_index] = value_next_step.squeeze()
        # for step in reversed(range(args.num_steps)):
        #     value, logit = model(states[step, min_ale_index:max_ale_index, :, : ,:].contiguous().view(-1, *states.size()[-3:]))
        #     pis = F.softmax(logit, dim=1).gather(1, actions[step, min_ale_index:max_ale_index].view(-1).unsqueeze(-1)).view(-1)
        #     c = torch.clamp(pis / mus[step, min_ale_index:max_ale_index], max=c_)
        #     rhos[step, :] = torch.clamp(pis / mus[step, min_ale_index:max_ale_index], max=rho_)
        #     delta_value = rhos[step, :] * (rewards[step, min_ale_index:max_ale_index] + (args.gamma * value_next_step - value).squeeze())
        #     returns[step, min_ale_index:max_ale_index] = value.squeeze() + delta_value + args.gamma * c * \
        #             (returns[step + 1, min_ale_index:max_ale_index] - value_next_step.squeeze())
        #     value_next_step = value

        nvtx.range_push('train:compute_values')
        value, logit = model(
            states[:, min_ale_index:max_ale_index, :, :, :].contiguous().view(
                -1,
                *states.size()[-3:]))
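        # Re-evaluate the stored trajectory for this slice in one batched
        # forward pass; the extra (num_steps + 1)-th entries supply the
        # bootstrap values for the V-trace recursion below.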
        batch_value = value.detach().view((args.num_steps + 1, minibatch_size))
        batch_probs = F.softmax(logit.detach()[:(args.num_steps *
                                                 minibatch_size), :],
                                dim=1)
        batch_pis = batch_probs.gather(
            1, actions[:, min_ale_index:max_ale_index].contiguous().view(
                -1).unsqueeze(-1)).view((args.num_steps, minibatch_size))
        returns[-1, min_ale_index:max_ale_index] = batch_value[-1]

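        # V-trace targets (IMPALA, Remark 1): walk backwards through the
        # trajectory with the importance ratios pi/mu clipped at c_hat and
        # rho_hat, correcting the bootstrapped values with the clipped TD
        # term delta_value.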
        with torch.no_grad():
            for step in reversed(range(args.num_steps)):
                c = torch.clamp(batch_pis[step, :] /
                                mus[step, min_ale_index:max_ale_index],
                                max=args.c_hat)
                rhos[step, :] = torch.clamp(
                    batch_pis[step, :] /
                    mus[step, min_ale_index:max_ale_index],
                    max=args.rho_hat)
                delta_value = rhos[step, :] * (
                    rewards[step, min_ale_index:max_ale_index] +
                    (args.gamma * batch_value[step + 1] -
                     batch_value[step]).squeeze())
                returns[step, min_ale_index:max_ale_index] = \
                        batch_value[step, :].squeeze() + delta_value + args.gamma * c * \
                        (returns[step + 1, min_ale_index:max_ale_index] - batch_value[step + 1, :].squeeze())

        value = value[:args.num_steps * minibatch_size, :]
        logit = logit[:args.num_steps * minibatch_size, :]

        log_probs = F.log_softmax(logit, dim=1)
        probs = F.softmax(logit, dim=1)

        action_log_probs = log_probs.gather(
            1, actions[:, min_ale_index:max_ale_index].contiguous().view(
                -1).unsqueeze(-1))
        dist_entropy = -(log_probs * probs).sum(-1).mean()

        advantages = returns[:-1, min_ale_index:max_ale_index].contiguous(
        ).view(-1).unsqueeze(-1) - value

        value_loss = advantages.pow(2).mean()
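        # Policy gradient term: log-probabilities of the taken actions are
        # weighted by the clipped importance ratios rho and by the detached
        # one-step advantage (reward + gamma * next V-trace target - value).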
        policy_loss = -(action_log_probs * rhos.view(-1, 1).detach() * \
                (rewards[:, min_ale_index:max_ale_index].contiguous().view(-1, 1) + args.gamma * \
                returns[1:, min_ale_index:max_ale_index].contiguous().view(-1, 1) - value).detach()).mean()
        nvtx.range_pop()

        nvtx.range_push('train:backprop')
        loss = value_loss * args.value_loss_coef + policy_loss - dist_entropy * args.entropy_coef
        optimizer.zero_grad()
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                       args.max_grad_norm)
        optimizer.step()
        nvtx.range_pop()

        nvtx.range_push('train:next_states')
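        # Shift every rolling buffer left by num_steps_per_update steps so the
        # most recent transitions become the history for the next iteration.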
        for step in range(0, args.num_steps_per_update):
            states[:-1, :, :, :, :] = states[1:, :, :, :, :]
            rewards[:-1, :] = rewards[1:, :]
            actions[:-1, :] = actions[1:, :]
            masks[:-1, :] = masks[1:, :]
            mus[:-1, :] = mus[1:, :]
        nvtx.range_pop()

        torch.cuda.synchronize()

        if args.rank == 0:
            iter_time = time.time() - start_time
            total_time += iter_time

            if args.plot:
                writer.add_scalar('train/rewards_mean',
                                  final_rewards.mean().item(),
                                  T,
                                  walltime=total_time)
                writer.add_scalar('train/lengths_mean',
                                  final_lengths.mean().item(),
                                  T,
                                  walltime=total_time)
                writer.add_scalar('train/value_loss',
                                  value_loss,
                                  T,
                                  walltime=total_time)
                writer.add_scalar('train/policy_loss',
                                  policy_loss,
                                  T,
                                  walltime=total_time)
                writer.add_scalar('train/entropy',
                                  dist_entropy,
                                  T,
                                  walltime=total_time)

            progress_data = callback(args, model, T, iter_time, final_rewards,
                                     final_lengths, value_loss, policy_loss,
                                     dist_entropy, train_csv_writer,
                                     train_csv_file)
            iterator.set_postfix_str(progress_data)

    if args.plot and (args.rank == 0):
        writer.close()

    if args.use_openai:
        train_env.close()
    if args.use_openai_test_env:
        test_env.close()
Example n. 27
0
        # self.input = ops.FileReader(file_root = image_dir, file_list=image_dir+"/file_list.txt")
        # self.decode = ops.ImageDecoder(device="mixed", output_type=types.RGB)

    def define_graph(self):
        nvtx.range_push("Reading JPEG files into host memory")
        jpegs, labels = self.input()  # read in jpeg files
        nvtx.range_pop()
        nvtx.range_push("Start mixed decoding process")
        # images = self.decode(jpegs) # Do decoding process
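        # DALI operators are normally instantiated once in __init__ (see the
        # commented-out lines above) and only invoked here; creating the
        # decoder inline also works because the graph is defined only once,
        # at pipeline build time.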
        decode = ops.ImageDecoder(device="mixed", output_type=types.RGB)
        images = decode(jpegs)
        nvtx.range_pop()
        return (images, labels)


if __name__ == "__main__":
    pipe = SimplePipeline(batch_size, 1, 0)
    nvtx.range_push("Building pipeline")
    pipe.build()
    nvtx.range_pop()
    nvtx.range_push("Running pipeline")
    ticks = time.time()
    pipe_out = pipe.run()
    images, labels = pipe_out
    nvtx.range_pop()
    # images_cpu = images.as_cpu()
    elapsed = time.time() - ticks
    print("Time elapsed for getting decoded images: ", elapsed)
    # showImages(images)
    # printDirHierarchy(image_dir)
Example n. 28
0
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=100, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.test_batch_size}
    if use_cuda:
        cuda_kwargs = {'num_workers': multiprocessing.cpu_count(),
                       'pin_memory': True,
                       'shuffle': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)

    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
        ])
    scriptPath = os.path.dirname(os.path.realpath(__file__))
    dataDir = os.path.join(scriptPath, 'data')
    dataset1 = datasets.MNIST(dataDir, train=True, download=True,
                       transform=transform)
    dataset2 = datasets.MNIST(dataDir, train=False,
                       transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
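    # Wrap each epoch in nested NVTX ranges so the profiler can attribute time
    # to the Train and Test phases; cudaProfilerStart/Stop below restrict
    # capture to the second epoch (e.g. nsys --capture-range=cudaProfilerApi).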
    for epoch in range(1, args.epochs + 1):
        # Start profiling from 2nd epoch
        if epoch == 2:
            torch.cuda.cudart().cudaProfilerStart()

        nvtx.range_push("Epoch " + str(epoch))
        nvtx.range_push("Train")
        train(args, model, device, train_loader, optimizer, epoch)
        nvtx.range_pop() # Train

        nvtx.range_push("Test")
        test(model, device, test_loader)
        nvtx.range_pop() # Test

        scheduler.step()
        nvtx.range_pop() # Epoch
        # Stop profiling at the end of 2nd epoch
        if epoch == 2:
            torch.cuda.cudart().cudaProfilerStop()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
Example n. 29
0
def worker(gpu, ngpus_per_node, callback, args):
    args.gpu = gpu

    if (args.num_ales % args.world_size) != 0:
        raise ValueError(
            'The num_ales({}) should be evenly divisible by the world_size({})'
            .format(args.num_ales, args.world_size))
    args.num_ales = int(args.num_ales / args.world_size)

    if (args.batch_size % args.world_size) != 0:
        raise ValueError(
            'The batch_size({}) should be evenly divisible by the world_size({})'
            .format(args.batch_size, args.world_size))
    args.batch_size = int(args.batch_size / args.world_size)

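    # num_frames_per_iter counts the environment transitions collected per
    # update on this worker; total_steps converts the global frame budget
    # t_max into the number of update iterations per worker.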
    num_frames_per_iter = args.num_ales * args.num_steps
    args.num_minibatches = num_frames_per_iter / args.batch_size
    total_steps = math.ceil(args.t_max /
                            (args.world_size * num_frames_per_iter))

    if args.distributed:
        args.seed += args.gpu
        torch.cuda.set_device(args.gpu)

        args.rank = int(os.environ['RANK']) if 'RANK' in os.environ else 0
        if args.multiprocessing_distributed:
            args.rank = args.rank * ngpus_per_node + args.gpu

        torch.distributed.init_process_group(
            backend='nccl',
            init_method='tcp://127.0.0.1:8632',
            world_size=args.world_size,
            rank=args.rank)
    else:
        args.rank = 0

    if args.lr_scale:
        scaled_lr = args.lr * math.sqrt((args.num_ales * args.world_size) / 16)
        if args.rank == 0:
            print('Scaled learning rate from {:4.4f} to {:4.4f}'.format(
                args.lr, scaled_lr))
        args.lr = scaled_lr

    args.use_cuda_env = args.use_cuda_env and torch.cuda.is_available()
    # Resolve the negated CLI flag into a positive "train on CUDA" flag so the
    # device selection below is unambiguous.
    args.cuda_train = (not args.no_cuda_train) and torch.cuda.is_available()
    args.verbose = args.verbose and (args.rank == 0)

    env_device = torch.device(
        'cuda', args.gpu) if args.use_cuda_env else torch.device('cpu')
    train_device = torch.device(
        'cuda', args.gpu) if args.cuda_train else torch.device('cpu')

    np.random.seed(args.seed)
    torch.manual_seed(np.random.randint(1, 10000))
    if args.use_cuda_env or args.cuda_train:
        torch.cuda.manual_seed(np.random.randint(1, 10000))

    if args.rank == 0:
        if args.output_filename:
            train_csv_file = open(args.output_filename, 'w', newline='')
            train_csv_file.write(json.dumps(vars(args)))
            train_csv_file.write('\n')
            train_csv_writer = csv.writer(train_csv_file, delimiter=',')
            train_csv_writer.writerow([
                'frames', 'fps', 'total_time', 'rmean', 'rmedian', 'rmin',
                'rmax', 'lmean', 'lmedian', 'lmin', 'lmax', 'entropy',
                'value_loss', 'policy_loss'
            ])

            eval_output_filename = '.'.join([
                ''.join(args.output_filename.split('.')[:-1] + ['_test']),
                'csv'
            ])
            eval_csv_file = open(eval_output_filename, 'w', newline='')
            eval_csv_file.write(json.dumps(vars(args)))
            eval_csv_file.write('\n')
            eval_csv_writer = csv.writer(eval_csv_file, delimiter=',')
            eval_csv_writer.writerow([
                'frames', 'total_time', 'rmean', 'rmedian', 'rmin', 'rmax',
                'rstd', 'lmean', 'lmedian', 'lmin', 'lmax', 'lstd'
            ])
        else:
            train_csv_file, train_csv_writer = None, None
            eval_csv_file, eval_csv_writer = None, None

        if args.plot:
            from tensorboardX import SummaryWriter
            current_time = datetime.now().strftime('%b%d_%H-%M-%S')
            log_dir = os.path.join(args.log_dir,
                                   current_time + '_' + socket.gethostname())
            writer = SummaryWriter(log_dir=log_dir)
            for k, v in vars(args).items():
                writer.add_text(k, str(v))

        print()
        print('PyTorch  : {}'.format(torch.__version__))
        print('CUDA     : {}'.format(torch.version.cuda))
        print('CUDNN    : {}'.format(torch.backends.cudnn.version()))
        print('APEX     : {}'.format('.'.join(
            [str(i) for i in apex.amp.__version__.VERSION])))
        print()

    if train_device.type == 'cuda':
        print(cuda_device_str(train_device.index), flush=True)

    if args.use_openai:
        train_env = create_vectorize_atari_env(
            args.env_name,
            args.seed,
            args.num_ales,
            episode_life=args.episodic_life,
            clip_rewards=False,
            max_frames=args.max_episode_length)
        observation = torch.from_numpy(train_env.reset()).squeeze(1)

        test_env = create_vectorize_atari_env(args.env_name,
                                              args.seed,
                                              args.evaluation_episodes,
                                              episode_life=False,
                                              clip_rewards=False)
        test_env.reset()
    else:
        train_env = AtariEnv(args.env_name,
                             args.num_ales,
                             color_mode='gray',
                             repeat_prob=0.0,
                             device=env_device,
                             rescale=True,
                             episodic_life=args.episodic_life,
                             clip_rewards=False)
        train_env.train()
        observation = train_env.reset(initial_steps=args.ale_start_steps,
                                      verbose=args.verbose).squeeze(-1)

        test_env = AtariEnv(args.env_name,
                            args.evaluation_episodes,
                            color_mode='gray',
                            repeat_prob=0.0,
                            device='cpu',
                            rescale=True,
                            episodic_life=False,
                            clip_rewards=False,
                            frameskip=4)

    model = ActorCritic(args.num_stack,
                        train_env.action_space,
                        normalize=args.normalize,
                        name=args.env_name)
    model = model.to(train_device).train()

    if args.rank == 0:
        print(model)
        args.model_name = model.name

    if args.use_adam:
        optimizer = optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)
    else:
        optimizer = optim.RMSprop(model.parameters(),
                                  lr=args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)

    decay = 1.0 / total_steps
    scheduler = optim.lr_scheduler.StepLR(optimizer,
                                          step_size=args.ppo_epoch,
                                          gamma=1.0 - decay)

    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=args.opt_level,
                                      loss_scale=args.loss_scale)

    if args.distributed:
        model = DDP(model, delay_allreduce=True)

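    # Rollout storage: (num_steps + 1) stacked-frame observations per ALE,
    # plus per-step values, logits, returns, rewards, masks and actions.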
    shape = (args.num_steps + 1, args.num_ales, args.num_stack,
             *train_env.observation_space.shape[-2:])
    states = torch.zeros(shape, device=train_device, dtype=torch.float32)
    states[0, :, -1] = observation.to(device=train_device, dtype=torch.float32)

    shape = (args.num_steps + 1, args.num_ales)
    values = torch.zeros(shape, device=train_device, dtype=torch.float32)
    logits = torch.zeros(
        (args.num_steps + 1, args.num_ales, train_env.action_space.n),
        device=train_device,
        dtype=torch.float32)
    returns = torch.zeros(shape, device=train_device, dtype=torch.float32)

    shape = (args.num_steps, args.num_ales)
    rewards = torch.zeros(shape, device=train_device, dtype=torch.float32)
    masks = torch.zeros(shape, device=train_device, dtype=torch.float32)
    actions = torch.zeros(shape, device=train_device, dtype=torch.long)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(args.num_ales,
                                  device=train_device,
                                  dtype=torch.float32)
    final_rewards = torch.zeros(args.num_ales,
                                device=train_device,
                                dtype=torch.float32)
    episode_lengths = torch.zeros(args.num_ales,
                                  device=train_device,
                                  dtype=torch.float32)
    final_lengths = torch.zeros(args.num_ales,
                                device=train_device,
                                dtype=torch.float32)

    if args.use_gae:
        gae = torch.zeros(args.num_ales,
                          device=train_device,
                          dtype=torch.float32)

    maybe_npy = lambda a: a.numpy() if args.use_openai else a

    torch.cuda.synchronize()

    iterator = range(total_steps)
    if args.rank == 0:
        iterator = tqdm(iterator)
        total_time = 0
        evaluation_offset = 0

    train_stream = torch.cuda.Stream()

    for update in iterator:

        T = args.world_size * update * num_frames_per_iter
        if (args.rank == 0) and (T >= evaluation_offset):
            evaluation_offset += args.evaluation_interval
            eval_lengths, eval_rewards = evaluate(args, T, total_time, model,
                                                  test_env, eval_csv_writer,
                                                  eval_csv_file)

            if args.plot:
                writer.add_scalar('eval/rewards_mean',
                                  eval_rewards.mean().item(),
                                  T,
                                  walltime=total_time)
                writer.add_scalar('eval/lengths_mean',
                                  eval_lengths.mean().item(),
                                  T,
                                  walltime=total_time)

        start_time = time.time()

        with torch.no_grad():

            for step in range(args.num_steps):
                nvtx.range_push('train:step')
                value, logit = model(states[step])

                # store values and logits
                values[step], logits[step] = value.squeeze(-1), logit.squeeze(
                    -1)

                # convert actions to numpy and perform next step
                probs = torch.clamp(F.softmax(logit, dim=1),
                                    min=0.00001,
                                    max=0.99999)
                probs_action = probs.multinomial(1).to(env_device)
                observation, reward, done, info = train_env.step(
                    maybe_npy(probs_action))

                if args.use_openai:
                    # convert back to pytorch tensors
                    observation = torch.from_numpy(observation)
                    reward = torch.from_numpy(reward)
                    done = torch.from_numpy(done.astype(np.uint8))
                else:
                    observation = observation.squeeze(-1).unsqueeze(1)

                # move back to training memory
                observation = observation.to(device=train_device)
                reward = reward.to(device=train_device, dtype=torch.float32)
                done = done.to(device=train_device)
                probs_action = probs_action.to(device=train_device,
                                               dtype=torch.long)

                not_done = 1.0 - done.float()

                # update rewards and actions
                actions[step].copy_(probs_action.view(-1))
                masks[step].copy_(not_done)
                rewards[step].copy_(reward.sign())

                # update next observations
                states[step + 1, :, :-1].copy_(states[step, :, 1:])
                states[step + 1] *= not_done.view(
                    -1, *[1] * (observation.dim() - 1))
                states[step + 1, :,
                       -1].copy_(observation.view(-1,
                                                  *states.size()[-2:]))

                # update episodic reward counters
                episode_rewards += reward
                final_rewards[done] = episode_rewards[done]
                episode_rewards *= not_done

                episode_lengths += not_done
                final_lengths[done] = episode_lengths[done]
                episode_lengths *= not_done
                nvtx.range_pop()

            returns[-1] = values[-1] = model(states[-1])[0].data.squeeze(-1)

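            # Compute returns from the bootstrapped final value, either with
            # Generalized Advantage Estimation (GAE) or with plain discounted
            # returns, masking across episode boundaries.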
            if args.use_gae:
                gae.zero_()
                for step in reversed(range(args.num_steps)):
                    delta = rewards[step] + (args.gamma * values[step + 1] *
                                             masks[step]) - values[step]
                    gae = delta + (args.gamma * args.tau * masks[step] * gae)
                    returns[step] = gae + values[step]
            else:
                for step in reversed(range(args.num_steps)):
                    returns[step] = rewards[step] + (
                        args.gamma * returns[step + 1] * masks[step])

            log_probs = F.log_softmax(logits[:-1].view(
                -1, train_env.action_space.n),
                                      dim=1)
            action_log_probs = log_probs.gather(1,
                                                actions.view(-1).unsqueeze(-1))
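            # Advantages are the n-step returns minus the value estimates,
            # normalized to zero mean and unit variance for a better-conditioned
            # PPO update.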
            advantages = returns[:-1].view(-1).unsqueeze(
                -1) - values[:-1].view(-1).unsqueeze(-1)
            advantages = (advantages - advantages.mean()) / (
                advantages.std() + float(np.finfo(np.float32).eps))

        total_value_loss = 0.0
        total_policy_loss = 0.0
        total_dist_entropy = 0.0

        nvtx.range_push('train:loader')
        states_view = states[:-1].view(-1, *states.size()[-3:])
        actions_view = actions.view(-1)
        returns_view = returns[:-1].view(-1)
        train_dataset = torch.utils.data.TensorDataset(states_view,
                                                       actions_view,
                                                       action_log_probs,
                                                       returns_view,
                                                       advantages)

        train_sampler = None
        if args.distributed:
            train_sampler = torch.utils.data.distributed.DistributedSampler(
                train_dataset)

        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.batch_size,
            shuffle=(train_sampler is None),
            num_workers=0,
            pin_memory=False,
            sampler=train_sampler)
        nvtx.range_pop()

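        # PPO optimization: run ppo_epoch passes over the collected rollout on
        # a dedicated CUDA stream, maximizing the clipped surrogate objective
        # minibatch by minibatch.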
        with torch.cuda.stream(train_stream):
            for epoch in range(args.ppo_epoch):
                nvtx.range_push('train:epoch_step')

                if args.distributed:
                    train_sampler.set_epoch(epoch)

                prefetcher = data_prefetcher(train_loader)
                local_states, local_actions, local_action_log_probs, local_returns, local_advantages = prefetcher.next(
                )

                while local_states is not None:
                    batch_values, batch_logits = model(local_states)
                    batch_log_probs = F.log_softmax(batch_logits, dim=1)
                    batch_action_log_probs = batch_log_probs.gather(
                        1, local_actions.unsqueeze(-1))

                    batch_probs = F.softmax(batch_logits, dim=1)
                    batch_dist_entropy = -(batch_log_probs *
                                           batch_probs).sum(-1).mean()

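                    # Probability ratio between the current policy and the
                    # policy that collected the rollout; PPO clips it to
                    # [1 - clip_epsilon, 1 + clip_epsilon] to keep the update
                    # within the trust region.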
                    ratio = torch.exp(batch_action_log_probs -
                                      local_action_log_probs)
                    surrogate1 = ratio * local_advantages
                    surrogate2 = torch.clamp(
                        ratio, 1.0 - args.clip_epsilon,
                        1.0 + args.clip_epsilon) * local_advantages
                    batch_policy_loss = -torch.min(surrogate1,
                                                   surrogate2).mean()
                    batch_value_loss = F.mse_loss(local_returns.unsqueeze(-1),
                                                  batch_values) / 2.0

                    loss = batch_value_loss * args.value_loss_coef + batch_policy_loss - batch_dist_entropy * args.entropy_coef
                    optimizer.zero_grad()
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                    optimizer.step()

                    total_value_loss += batch_value_loss.item()
                    total_policy_loss += batch_policy_loss.item()
                    total_dist_entropy += batch_dist_entropy.item()

                    local_states, local_actions, local_action_log_probs, local_returns, local_advantages = prefetcher.next(
                    )
                scheduler.step()
                nvtx.range_pop()

        torch.cuda.synchronize()

        states[0].copy_(states[-1])

        if args.rank == 0:
            iter_time = time.time() - start_time
            total_time += iter_time

            value_loss = total_value_loss / (args.ppo_epoch *
                                             args.num_minibatches)
            policy_loss = total_policy_loss / (args.ppo_epoch *
                                               args.num_minibatches)
            dist_entropy = total_dist_entropy / (args.ppo_epoch *
                                                 args.num_minibatches)

            if args.plot:
                writer.add_scalar('train/rewards_mean',
                                  final_rewards.mean().item(),
                                  T,
                                  walltime=total_time)
                writer.add_scalar('train/lengths_mean',
                                  final_lengths.mean().item(),
                                  T,
                                  walltime=total_time)
                writer.add_scalar('train/learning_rate',
                                  scheduler.get_lr()[0],
                                  T,
                                  walltime=total_time)
                writer.add_scalar('train/value_loss',
                                  value_loss,
                                  T,
                                  walltime=total_time)
                writer.add_scalar('train/policy_loss',
                                  policy_loss,
                                  T,
                                  walltime=total_time)
                writer.add_scalar('train/entropy',
                                  dist_entropy,
                                  T,
                                  walltime=total_time)

            progress_data = callback(args, model, T, iter_time, final_rewards,
                                     final_lengths, value_loss, policy_loss,
                                     dist_entropy, train_csv_writer,
                                     train_csv_file)
            iterator.set_postfix_str(progress_data)

    if args.plot:
        writer.close()

    if args.use_openai:
        train_env.close()
        test_env.close()
Example n. 30
0
	def forward(self, x):

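		# Each stage of this ResNet-style network is wrapped in its own NVTX
		# range so per-block time appears as a separate annotation in the
		# profiler timeline.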
		nvtx.range_push("layer:block_1")
		x = self.conv1(x)
		x = self.bn1(x)
		x = self.relu(x)
		x = self.maxpool(x)
		nvtx.range_pop()

		nvtx.range_push("layer:block_2")
		x = self.layer1(x)
		nvtx.range_pop()

		nvtx.range_push("layer:block_3")
		x = self.layer2(x)
		nvtx.range_pop()

		nvtx.range_push("layer:block_4")
		x = self.layer3(x)
		nvtx.range_pop()

		nvtx.range_push("layer:block_5")
		x = self.layer4(x)
		nvtx.range_pop()

		x = self.avgpool(x)
		x = torch.flatten(x, 1)

		nvtx.range_push("layer:FC")
		x = self.fc(x)
		nvtx.range_pop()

		return x