Ejemplo n.º 1
0
def initialize_validation(args, device):
    val_mem = ReplayMemory(args,
                           args.evaluation_size,
                           device=device,
                           num_ales=1)

    val_env = AtariEnv(args.env_name,
                       1,
                       color_mode='gray',
                       device='cpu',
                       rescale=True,
                       episodic_life=True,
                       repeat_prob=0.0)
    val_env.train()

    observation = val_env.reset(initial_steps=100, verbose=False).clone().to(
        device=device, dtype=torch.float32).squeeze(-1).div_(255.0)
    val_mem.reset(observation)

    for _ in range(val_mem.capacity):
        observation, _, done, info = val_env.step(
            val_env.sample_random_actions())
        observation = observation.clone().to(
            device=device, dtype=torch.float32).squeeze(-1).div_(255.0)
        done = done.to(device=device)
        val_mem.append(observation, None, None, done)

    return val_mem
Ejemplo n.º 2
0
    epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

num_frames = 1400000
batch_size = 32
gamma = 0.99

losses = []
all_rewards = []
episode_reward = 0

state = env.reset()
for frame_idx in tqdm(range(1, num_frames + 1)):
    epsilon = epsilon_by_frame(frame_idx)
    if USE_OPENAI:
        action = model.act(state, epsilon)
        next_state, reward, done, _ = env.step(action)
        replay_buffer.push(state, action, reward, next_state, done)
        state = next_state
    else:
        observation, reward, done, info = env.step(
            torch.rand(256, device=env_device, dtype=torch.float32))

    episode_reward += reward

    if done == True and USE_OPENAI == True:
        state = env.reset()
        all_rewards.append(episode_reward)
        episode_reward = 0

    if frame_idx % 10000 == 0 and USE_OPENAI:
        plot(frame_idx, all_rewards, losses)
Ejemplo n.º 3
0
def worker(gpu, ngpus_per_node, callback, args):
    args.gpu = gpu

    if args.distributed:
        args.seed += args.gpu
        torch.cuda.set_device(args.gpu)

        args.rank = int(os.environ['RANK']) if 'RANK' in os.environ else 0
        if args.multiprocessing_distributed:
            args.rank = args.rank * ngpus_per_node + args.gpu

        torch.distributed.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:8632',
                                             world_size=args.world_size, rank=args.rank)
    else:
        args.rank = 0

    if args.lr_scale:
        scaled_lr = args.lr * math.sqrt((args.num_ales * args.world_size) / 16)
        if args.rank == 0:
            print('Scaled learning rate from {:4.4f} to {:4.4f}'.format(args.lr, scaled_lr))
        args.lr = scaled_lr

    args.use_cuda_env = args.use_cuda_env and torch.cuda.is_available()
    args.no_cuda_train = (not args.no_cuda_train) and torch.cuda.is_available()
    args.verbose = args.verbose and (args.rank == 0)

    np.random.seed(args.seed)
    torch.manual_seed(np.random.randint(1, 10000))
    if args.use_cuda_env or (args.no_cuda_train == False):
        torch.cuda.manual_seed(np.random.randint(1, 10000))

    env_device = torch.device('cuda', args.gpu) if args.use_cuda_env else torch.device('cpu')
    train_device = torch.device('cuda', args.gpu) if (args.no_cuda_train == False) else torch.device('cpu')

    if args.rank == 0:
        if args.output_filename:
            train_csv_file = open(args.output_filename, 'w', newline='')
            train_csv_writer = csv.writer(train_csv_file, delimiter=',')
            train_csv_writer.writerow(['frames','fps','total_time',
                                       'rmean','rmedian','rmin','rmax','rstd',
                                       'lmean','lmedian','lmin','lmax','lstd',
                                       'entropy','value_loss','policy_loss'])

            eval_output_filename = '.'.join([''.join(args.output_filename.split('.')[:-1] + ['_test']), 'csv'])
            eval_csv_file = open(eval_output_filename, 'w', newline='')
            eval_csv_file.write(json.dumps(vars(args)))
            eval_csv_file.write('\n')
            eval_csv_writer = csv.writer(eval_csv_file, delimiter=',')
            eval_csv_writer.writerow(['frames','total_time',
                                       'rmean','rmedian','rmin','rmax','rstd',
                                       'lmean','lmedian','lmin','lmax','lstd'])
        else:
            train_csv_file, train_csv_writer = None, None
            eval_csv_file, eval_csv_writer = None, None

        if args.plot:
            from tensorboardX import SummaryWriter
            current_time = datetime.now().strftime('%b%d_%H-%M-%S')
            log_dir = os.path.join(args.log_dir, current_time + '_' + socket.gethostname())
            writer = SummaryWriter(log_dir=log_dir)
            for k, v in vars(args).items():
                writer.add_text(k, str(v))

        print()
        print('PyTorch  : {}'.format(torch.__version__))
        print('CUDA     : {}'.format(torch.backends.cudnn.m.cuda))
        print('CUDNN    : {}'.format(torch.backends.cudnn.version()))
        print('APEX     : {}'.format('.'.join([str(i) for i in apex.amp.__version__.VERSION])))
        print()

    if train_device.type == 'cuda':
        print(cuda_device_str(train_device.index), flush=True)

    if args.use_openai:
        train_env = create_vectorize_atari_env(args.env_name, args.seed, args.num_ales,
                                               episode_life=args.episodic_life, clip_rewards=False,
                                               max_frames=args.max_episode_length)
        observation = torch.from_numpy(train_env.reset()).squeeze(1)
    else:
        train_env = AtariEnv(args.env_name, args.num_ales, color_mode='gray', repeat_prob=0.0,
                             device=env_device, rescale=True, episodic_life=args.episodic_life,
                             clip_rewards=False, frameskip=4)
        train_env.train()
        observation = train_env.reset(initial_steps=args.ale_start_steps, verbose=args.verbose).squeeze(-1)

    if args.use_openai_test_env:
        test_env = create_vectorize_atari_env(args.env_name, args.seed, args.evaluation_episodes,
                                              episode_life=False, clip_rewards=False)
        test_env.reset()
    else:
        test_env = AtariEnv(args.env_name, args.evaluation_episodes, color_mode='gray', repeat_prob=0.0,
                            device='cpu', rescale=True, episodic_life=False, clip_rewards=False, frameskip=4)

    model = ActorCritic(args.num_stack, train_env.action_space, normalize=args.normalize, name=args.env_name)
    model = model.to(train_device).train()

    if args.rank == 0:
        print(model)
        args.model_name = model.name()

    if args.use_adam:
        optimizer = optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)
    else:
        optimizer = optim.RMSprop(model.parameters(), lr=args.lr, eps=args.eps, alpha=args.alpha)

    model, optimizer = amp.initialize(model, optimizer,
                                      opt_level=args.opt_level,
                                      loss_scale=args.loss_scale
                                     )

    if args.distributed:
        model = DDP(model, delay_allreduce=True)

    num_frames_per_iter = args.num_ales * args.num_steps
    total_steps = math.ceil(args.t_max / (args.world_size * num_frames_per_iter))

    shape = (args.num_steps + 1, args.num_ales, args.num_stack, *train_env.observation_space.shape[-2:])
    states = torch.zeros(shape, device=train_device, dtype=torch.float32)
    states[0, :, -1] = observation.to(device=train_device, dtype=torch.float32)

    shape = (args.num_steps + 1, args.num_ales)
    values  = torch.zeros(shape, device=train_device, dtype=torch.float32)
    returns = torch.zeros(shape, device=train_device, dtype=torch.float32)

    shape = (args.num_steps, args.num_ales)
    rewards = torch.zeros(shape, device=train_device, dtype=torch.float32)
    masks = torch.zeros(shape, device=train_device, dtype=torch.float32)
    actions = torch.zeros(shape, device=train_device, dtype=torch.long)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    final_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    episode_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    final_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)

    if args.use_gae:
        gae = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)

    maybe_npy = lambda a: a.numpy() if args.use_openai else a

    torch.cuda.synchronize()

    iterator = range(total_steps)
    if args.rank == 0:
        iterator = tqdm(iterator)
        total_time = 0
        evaluation_offset = 0

    for update in iterator:

        T = args.world_size * update * num_frames_per_iter
        if (args.rank == 0) and (T >= evaluation_offset):
            evaluation_offset += args.evaluation_interval
            eval_lengths, eval_rewards = evaluate(args, T, total_time, model, test_env, eval_csv_writer, eval_csv_file)

            if args.plot:
                writer.add_scalar('eval/rewards_mean', eval_rewards.mean().item(), T, walltime=total_time)
                writer.add_scalar('eval/lengths_mean', eval_lengths.mean().item(), T, walltime=total_time)

        start_time = time.time()

        with torch.no_grad():

            for step in range(args.num_steps):
                value, logit = model(states[step])

                # store values
                values[step] = value.squeeze(-1)

                # convert actions to numpy and perform next step
                probs_action = F.softmax(logit, dim=1).multinomial(1).to(env_device)
                observation, reward, done, info = train_env.step(maybe_npy(probs_action))

                if args.use_openai:
                    # convert back to pytorch tensors
                    observation = torch.from_numpy(observation)
                    reward = torch.from_numpy(reward)
                    done = torch.from_numpy(done.astype(np.uint8))
                else:
                    observation = observation.squeeze(-1).unsqueeze(1)

                # move back to training memory
                observation = observation.to(device=train_device)
                reward = reward.to(device=train_device, dtype=torch.float32)
                done = done.to(device=train_device)
                probs_action = probs_action.to(device=train_device, dtype=torch.long)

                not_done = 1.0 - done.float()

                # update rewards and actions
                actions[step].copy_(probs_action.view(-1))
                masks[step].copy_(not_done)
                rewards[step].copy_(reward.sign())

                # update next observations
                states[step + 1, :, :-1].copy_(states[step, :, 1:].clone())
                states[step + 1] *= not_done.view(-1, *[1] * (observation.dim() - 1))
                states[step + 1, :, -1].copy_(observation.view(-1, *states.size()[-2:]))

                # update episodic reward counters
                episode_rewards += reward
                final_rewards[done] = episode_rewards[done]
                episode_rewards *= not_done

                episode_lengths += not_done
                final_lengths[done] = episode_lengths[done]
                episode_lengths *= not_done

            returns[-1] = values[-1] = model(states[-1])[0].data.squeeze(-1)

            if args.use_gae:
                gae.zero_()
                for step in reversed(range(args.num_steps)):
                    delta = rewards[step] + (args.gamma * values[step + 1] * masks[step]) - values[step]
                    gae = delta + (args.gamma * args.tau * masks[step] * gae)
                    returns[step] = gae + values[step]
            else:
                for step in reversed(range(args.num_steps)):
                    returns[step] = rewards[step] + (args.gamma * returns[step + 1] * masks[step])

        value, logit = model(states[:-1].view(-1, *states.size()[-3:]))

        log_probs = F.log_softmax(logit, dim=1)
        probs = F.softmax(logit, dim=1)

        action_log_probs = log_probs.gather(1, actions.view(-1).unsqueeze(-1))
        dist_entropy = -(log_probs * probs).sum(-1).mean()

        advantages = returns[:-1].view(-1).unsqueeze(-1) - value

        value_loss = advantages.pow(2).mean()
        policy_loss = -(advantages.clone().detach() * action_log_probs).mean()

        loss = value_loss * args.value_loss_coef + policy_loss - dist_entropy * args.entropy_coef
        optimizer.zero_grad()
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
        optimizer.step()

        states[0].copy_(states[-1])

        torch.cuda.synchronize()

        if args.rank == 0:
            iter_time = time.time() - start_time
            total_time += iter_time

            if args.plot:
                writer.add_scalar('train/rewards_mean', final_rewards.mean().item(), T, walltime=total_time)
                writer.add_scalar('train/lengths_mean', final_lengths.mean().item(), T, walltime=total_time)
                writer.add_scalar('train/learning_rate', scheduler.get_lr()[0], T, walltime=total_time)
                writer.add_scalar('train/value_loss', value_loss, T, walltime=total_time)
                writer.add_scalar('train/policy_loss', policy_loss, T, walltime=total_time)
                writer.add_scalar('train/entropy', dist_entropy, T, walltime=total_time)

            progress_data = callback(args, model, T, iter_time, final_rewards, final_lengths,
                                     value_loss.item(), policy_loss.item(), dist_entropy.item(),
                                     train_csv_writer, train_csv_file)
            iterator.set_postfix_str(progress_data)

    if args.plot:
        writer.close()

    if args.use_openai:
        train_env.close()
    if args.use_openai_test_env:
        test_env.close()
Ejemplo n.º 4
0
                                      optimizer,
                                      opt_level=opt_level,
                                      loss_scale=loss_scale)

for i in tqdm(range(total_steps)):

    with torch.no_grad():
        for step in range(num_steps):
            value, logit = model(states[step])
            # store values
            values[step] = value.squeeze(-1)

            # convert actions to numpy and perform next step
            probs_action = F.softmax(logit,
                                     dim=1).multinomial(1).to(env_device)
            observation, reward, done, info = train_env.step(probs_action)
            observation = observation.squeeze(-1).unsqueeze(1)

            # move back to training memory
            observation = observation.to(device=train_device)
            reward = reward.to(device=train_device, dtype=torch.float32)
            done = done.to(device=train_device, dtype=torch.bool)
            probs_action = probs_action.to(device=train_device,
                                           dtype=torch.long)

            not_done = 1.0 - done.float()

            # update rewards and actions
            actions[step].copy_(probs_action.view(-1))
            masks[step].copy_(not_done)
            rewards[step].copy_(reward.sign())
Ejemplo n.º 5
0
def worker(gpu, ngpus_per_node, callback, args):
    args.gpu = gpu

    if args.distributed:
        args.seed += args.gpu
        torch.cuda.set_device(args.gpu)

        args.rank = int(os.environ['RANK']) if 'RANK' in os.environ else 0
        if args.multiprocessing_distributed:
            args.rank = args.rank * ngpus_per_node + args.gpu

        torch.distributed.init_process_group(
            backend='nccl',
            init_method='tcp://127.0.0.1:8632',
            world_size=args.world_size,
            rank=args.rank)
    else:
        args.rank = 0

    if (args.num_ales % args.num_minibatches) != 0:
        raise ValueError(
            'Number of ales({}) size is not even divisible by the minibatch size({})'
            .format(args.num_ales, args.num_minibatches))

    if args.num_steps_per_update == -1:
        args.num_steps_per_update = args.num_steps

    minibatch_size = int(args.num_ales / args.num_minibatches)
    step0 = args.num_steps - args.num_steps_per_update
    n_minibatch = -1

    args.use_cuda_env = args.use_cuda_env and torch.cuda.is_available()
    args.no_cuda_train = (not args.no_cuda_train) and torch.cuda.is_available()
    args.verbose = args.verbose and (args.rank == 0)

    env_device = torch.device(
        'cuda', args.gpu) if args.use_cuda_env else torch.device('cpu')
    train_device = torch.device('cuda', args.gpu) if (
        args.no_cuda_train == False) else torch.device('cpu')

    np.random.seed(args.seed)
    torch.manual_seed(np.random.randint(1, 10000))
    if args.use_cuda_env or (args.no_cuda_train == False):
        torch.cuda.manual_seed(np.random.randint(1, 10000))

    if args.rank == 0:
        if args.output_filename:
            train_csv_file = open(args.output_filename, 'w', newline='')
            train_csv_file.write(json.dumps(vars(args)))
            train_csv_file.write('\n')
            train_csv_writer = csv.writer(train_csv_file, delimiter=',')
            train_csv_writer.writerow([
                'frames', 'fps', 'total_time', 'rmean', 'rmedian', 'rmin',
                'rmax', 'lmean', 'lmedian', 'lmin', 'lmax', 'entropy',
                'value_loss', 'policy_loss'
            ])

            eval_output_filename = '.'.join([
                ''.join(args.output_filename.split('.')[:-1] + ['_test']),
                'csv'
            ])
            eval_csv_file = open(eval_output_filename, 'w', newline='')
            eval_csv_file.write(json.dumps(vars(args)))
            eval_csv_file.write('\n')
            eval_csv_writer = csv.writer(eval_csv_file, delimiter=',')
            eval_csv_writer.writerow([
                'frames', 'total_time', 'rmean', 'rmedian', 'rmin', 'rmax',
                'rstd', 'lmean', 'lmedian', 'lmin', 'lmax', 'lstd'
            ])
        else:
            train_csv_file, train_csv_writer = None, None
            eval_csv_file, eval_csv_writer = None, None

        if args.plot:
            from tensorboardX import SummaryWriter
            current_time = datetime.now().strftime('%b%d_%H-%M-%S')
            log_dir = os.path.join(args.log_dir,
                                   current_time + '_' + socket.gethostname())
            writer = SummaryWriter(log_dir=log_dir)
            for k, v in vars(args).items():
                writer.add_text(k, str(v))

        print()
        print('PyTorch  : {}'.format(torch.__version__))
        print('CUDA     : {}'.format(torch.backends.cudnn.m.cuda))
        print('CUDNN    : {}'.format(torch.backends.cudnn.version()))
        print('APEX     : {}'.format('.'.join(
            [str(i) for i in apex.amp.__version__.VERSION])))
        print()

    if train_device.type == 'cuda':
        print(cuda_device_str(train_device.index), flush=True)

    if args.use_openai:
        train_env = create_vectorize_atari_env(
            args.env_name,
            args.seed,
            args.num_ales,
            episode_life=args.episodic_life,
            clip_rewards=False,
            max_frames=args.max_episode_length)
        observation = torch.from_numpy(train_env.reset()).squeeze(1)
    else:
        train_env = AtariEnv(args.env_name,
                             args.num_ales,
                             color_mode='gray',
                             repeat_prob=0.0,
                             device=env_device,
                             rescale=True,
                             episodic_life=args.episodic_life,
                             clip_rewards=False,
                             frameskip=4)
        train_env.train()
        observation = train_env.reset(initial_steps=args.ale_start_steps,
                                      verbose=args.verbose).squeeze(-1)

    if args.use_openai_test_env:
        test_env = create_vectorize_atari_env(args.env_name,
                                              args.seed,
                                              args.evaluation_episodes,
                                              episode_life=False,
                                              clip_rewards=False)
        test_env.reset()
    else:
        test_env = AtariEnv(args.env_name,
                            args.evaluation_episodes,
                            color_mode='gray',
                            repeat_prob=0.0,
                            device='cpu',
                            rescale=True,
                            episodic_life=False,
                            clip_rewards=False,
                            frameskip=4)

    model = ActorCritic(args.num_stack,
                        train_env.action_space,
                        normalize=args.normalize,
                        name=args.env_name)
    model = model.to(train_device).train()

    if args.rank == 0:
        print(model)
        args.model_name = model.name

    if args.use_adam:
        optimizer = optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)
    else:
        optimizer = optim.RMSprop(model.parameters(),
                                  lr=args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)

    # This is the number of frames GENERATED between two updates
    num_frames_per_iter = args.num_ales * args.num_steps_per_update
    total_steps = math.ceil(args.t_max /
                            (args.world_size * num_frames_per_iter))
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=args.opt_level,
                                      loss_scale=args.loss_scale)

    if args.distributed:
        model = DDP(model, delay_allreduce=True)

    shape = (args.num_steps + 1, args.num_ales, args.num_stack,
             *train_env.observation_space.shape[-2:])
    states = torch.zeros(shape, device=train_device, dtype=torch.float32)
    states[step0, :, -1] = observation.to(device=train_device,
                                          dtype=torch.float32)

    shape = (args.num_steps + 1, args.num_ales)
    values = torch.zeros(shape, device=train_device, dtype=torch.float32)
    logits = torch.zeros(
        (args.num_steps + 1, args.num_ales, train_env.action_space.n),
        device=train_device,
        dtype=torch.float32)
    returns = torch.zeros(shape, device=train_device, dtype=torch.float32)

    shape = (args.num_steps, args.num_ales)
    rewards = torch.zeros(shape, device=train_device, dtype=torch.float32)
    masks = torch.zeros(shape, device=train_device, dtype=torch.float32)
    actions = torch.zeros(shape, device=train_device, dtype=torch.long)

    mus = torch.ones(shape, device=train_device, dtype=torch.float32)
    # pis = torch.zeros(shape, device=train_device, dtype=torch.float32)
    rhos = torch.zeros((args.num_steps, minibatch_size),
                       device=train_device,
                       dtype=torch.float32)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(args.num_ales,
                                  device=train_device,
                                  dtype=torch.float32)
    final_rewards = torch.zeros(args.num_ales,
                                device=train_device,
                                dtype=torch.float32)
    episode_lengths = torch.zeros(args.num_ales,
                                  device=train_device,
                                  dtype=torch.float32)
    final_lengths = torch.zeros(args.num_ales,
                                device=train_device,
                                dtype=torch.float32)

    if args.use_gae:
        raise ValueError('GAE is not compatible with VTRACE')

    maybe_npy = lambda a: a.numpy() if args.use_openai else a

    torch.cuda.synchronize()

    iterator = range(total_steps)
    if args.rank == 0:
        iterator = tqdm(iterator)
        total_time = 0
        evaluation_offset = 0

    for update in iterator:

        T = args.world_size * update * num_frames_per_iter
        if (args.rank == 0) and (T >= evaluation_offset):
            evaluation_offset += args.evaluation_interval
            eval_lengths, eval_rewards = evaluate(args, T, total_time, model,
                                                  test_env, eval_csv_writer,
                                                  eval_csv_file)

            if args.plot:
                writer.add_scalar('eval/rewards_mean',
                                  eval_rewards.mean().item(),
                                  T,
                                  walltime=total_time)
                writer.add_scalar('eval/lengths_mean',
                                  eval_lengths.mean().item(),
                                  T,
                                  walltime=total_time)

        start_time = time.time()

        with torch.no_grad():

            for step in range(args.num_steps_per_update):
                nvtx.range_push('train:step')
                value, logit = model(states[step0 + step])

                # store values and logits
                values[step0 + step] = value.squeeze(-1)

                # convert actions to numpy and perform next step
                probs = torch.clamp(F.softmax(logit, dim=1),
                                    min=0.00001,
                                    max=0.99999)
                probs_action = probs.multinomial(1).to(env_device)
                # Check if the multinomial threw an exception
                # https://github.com/pytorch/pytorch/issues/7014
                torch.cuda.current_stream().synchronize()
                observation, reward, done, info = train_env.step(
                    maybe_npy(probs_action))

                if args.use_openai:
                    # convert back to pytorch tensors
                    observation = torch.from_numpy(observation)
                    reward = torch.from_numpy(reward)
                    done = torch.from_numpy(done.astype(np.uint8))
                else:
                    observation = observation.squeeze(-1).unsqueeze(1)

                # move back to training memory
                observation = observation.to(device=train_device)
                reward = reward.to(device=train_device, dtype=torch.float32)
                done = done.to(device=train_device)
                probs_action = probs_action.to(device=train_device,
                                               dtype=torch.long)

                not_done = 1.0 - done.float()

                # update rewards and actions
                actions[step0 + step].copy_(probs_action.view(-1))
                masks[step0 + step].copy_(not_done)
                rewards[step0 + step].copy_(reward.sign())

                #mus[step0 + step] = F.softmax(logit, dim=1).gather(1, actions[step0 + step].view(-1).unsqueeze(-1)).view(-1)
                mus[step0 + step] = torch.clamp(F.softmax(logit, dim=1).gather(
                    1, actions[step0 + step].view(-1).unsqueeze(-1)).view(-1),
                                                min=0.00001,
                                                max=0.99999)

                # update next observations
                states[step0 + step + 1, :, :-1].copy_(states[step0 + step, :,
                                                              1:])
                states[step0 + step + 1] *= not_done.view(
                    -1, *[1] * (observation.dim() - 1))
                states[step0 + step + 1, :,
                       -1].copy_(observation.view(-1,
                                                  *states.size()[-2:]))

                # update episodic reward counters
                episode_rewards += reward
                final_rewards[done] = episode_rewards[done]
                episode_rewards *= not_done

                episode_lengths += not_done
                final_lengths[done] = episode_lengths[done]
                episode_lengths *= not_done
                nvtx.range_pop()

        n_minibatch = (n_minibatch + 1) % args.num_minibatches
        min_ale_index = int(n_minibatch * minibatch_size)
        max_ale_index = min_ale_index + minibatch_size

        # compute v-trace using the recursive method (remark 1 in IMPALA paper)
        # value_next_step, logit = model(states[-1:, min_ale_index:max_ale_index, :, : ,:].contiguous().view(-1, *states.size()[-3:]))
        # returns[-1, min_ale_index:max_ale_index] = value_next_step.squeeze()
        # for step in reversed(range(args.num_steps)):
        #     value, logit = model(states[step, min_ale_index:max_ale_index, :, : ,:].contiguous().view(-1, *states.size()[-3:]))
        #     pis = F.softmax(logit, dim=1).gather(1, actions[step, min_ale_index:max_ale_index].view(-1).unsqueeze(-1)).view(-1)
        #     c = torch.clamp(pis / mus[step, min_ale_index:max_ale_index], max=c_)
        #     rhos[step, :] = torch.clamp(pis / mus[step, min_ale_index:max_ale_index], max=rho_)
        #     delta_value = rhos[step, :] * (rewards[step, min_ale_index:max_ale_index] + (args.gamma * value_next_step - value).squeeze())
        #     returns[step, min_ale_index:max_ale_index] = value.squeeze() + delta_value + args.gamma * c * \
        #             (returns[step + 1, min_ale_index:max_ale_index] - value_next_step.squeeze())
        #     value_next_step = value

        nvtx.range_push('train:compute_values')
        value, logit = model(
            states[:, min_ale_index:max_ale_index, :, :, :].contiguous().view(
                -1,
                *states.size()[-3:]))
        batch_value = value.detach().view((args.num_steps + 1, minibatch_size))
        batch_probs = F.softmax(logit.detach()[:(args.num_steps *
                                                 minibatch_size), :],
                                dim=1)
        batch_pis = batch_probs.gather(
            1, actions[:, min_ale_index:max_ale_index].contiguous().view(
                -1).unsqueeze(-1)).view((args.num_steps, minibatch_size))
        returns[-1, min_ale_index:max_ale_index] = batch_value[-1]

        with torch.no_grad():
            for step in reversed(range(args.num_steps)):
                c = torch.clamp(batch_pis[step, :] /
                                mus[step, min_ale_index:max_ale_index],
                                max=args.c_hat)
                rhos[step, :] = torch.clamp(
                    batch_pis[step, :] /
                    mus[step, min_ale_index:max_ale_index],
                    max=args.rho_hat)
                delta_value = rhos[step, :] * (
                    rewards[step, min_ale_index:max_ale_index] +
                    (args.gamma * batch_value[step + 1] -
                     batch_value[step]).squeeze())
                returns[step, min_ale_index:max_ale_index] = \
                        batch_value[step, :].squeeze() + delta_value + args.gamma * c * \
                        (returns[step + 1, min_ale_index:max_ale_index] - batch_value[step + 1, :].squeeze())

        value = value[:args.num_steps * minibatch_size, :]
        logit = logit[:args.num_steps * minibatch_size, :]

        log_probs = F.log_softmax(logit, dim=1)
        probs = F.softmax(logit, dim=1)

        action_log_probs = log_probs.gather(
            1, actions[:, min_ale_index:max_ale_index].contiguous().view(
                -1).unsqueeze(-1))
        dist_entropy = -(log_probs * probs).sum(-1).mean()

        advantages = returns[:-1, min_ale_index:max_ale_index].contiguous(
        ).view(-1).unsqueeze(-1) - value

        value_loss = advantages.pow(2).mean()
        policy_loss = -(action_log_probs * rhos.view(-1, 1).detach() * \
                (rewards[:, min_ale_index:max_ale_index].contiguous().view(-1, 1) + args.gamma * \
                returns[1:, min_ale_index:max_ale_index].contiguous().view(-1, 1) - value).detach()).mean()
        nvtx.range_pop()

        nvtx.range_push('train:backprop')
        loss = value_loss * args.value_loss_coef + policy_loss - dist_entropy * args.entropy_coef
        optimizer.zero_grad()
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                       args.max_grad_norm)
        optimizer.step()
        nvtx.range_pop()

        nvtx.range_push('train:next_states')
        for step in range(0, args.num_steps_per_update):
            states[:-1, :, :, :, :] = states[1:, :, :, :, :]
            rewards[:-1, :] = rewards[1:, :]
            actions[:-1, :] = actions[1:, :]
            masks[:-1, :] = masks[1:, :]
            mus[:-1, :] = mus[1:, :]
        nvtx.range_pop()

        torch.cuda.synchronize()

        if args.rank == 0:
            iter_time = time.time() - start_time
            total_time += iter_time

            if args.plot:
                writer.add_scalar('train/rewards_mean',
                                  final_rewards.mean().item(),
                                  T,
                                  walltime=total_time)
                writer.add_scalar('train/lengths_mean',
                                  final_lengths.mean().item(),
                                  T,
                                  walltime=total_time)
                writer.add_scalar('train/value_loss',
                                  value_loss,
                                  T,
                                  walltime=total_time)
                writer.add_scalar('train/policy_loss',
                                  policy_loss,
                                  T,
                                  walltime=total_time)
                writer.add_scalar('train/entropy',
                                  dist_entropy,
                                  T,
                                  walltime=total_time)

            progress_data = callback(args, model, T, iter_time, final_rewards,
                                     final_lengths, value_loss, policy_loss,
                                     dist_entropy, train_csv_writer,
                                     train_csv_file)
            iterator.set_postfix_str(progress_data)

    if args.plot and (args.rank == 0):
        writer.close()

    if args.use_openai:
        train_env.close()
    if args.use_openai_test_env:
        test_env.close()
Ejemplo n.º 6
0
def worker(gpu, ngpus_per_node, args):
    args.gpu = gpu

    if args.distributed:
        args.seed += args.gpu
        torch.cuda.set_device(args.gpu)

        args.rank = int(os.environ['RANK']) if 'RANK' in os.environ else 0
        if args.multiprocessing_distributed:
            args.rank = args.rank * ngpus_per_node + args.gpu

        torch.distributed.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:8632',
                                             world_size=args.world_size, rank=args.rank)
    else:
        args.rank = 0

    args.use_cuda_env = args.use_cuda_env and torch.cuda.is_available()
    args.no_cuda_train = not torch.cuda.is_available()
    args.verbose = args.verbose and (args.rank == 0)

    env_device = torch.device('cuda', args.gpu) if args.use_cuda_env else torch.device('cpu')
    train_device = torch.device('cuda', args.gpu) if (args.no_cuda_train == False) else torch.device('cpu')

    # Setup
    np.random.seed(args.seed)
    torch.manual_seed(np.random.randint(1, 10000))
    if args.use_cuda_env or (args.no_cuda_train == False):
        torch.cuda.manual_seed(random.randint(1, 10000))

    if train_device.type == 'cuda':
        print('Train:\n' + cuda_device_str(train_device.index), flush=True)

    if args.use_openai:
        test_env = create_vectorize_atari_env(args.env_name, args.seed, args.evaluation_episodes,
                                              episode_life=False, clip_rewards=False)
        test_env.reset()
    else:
        test_env = AtariEnv(args.env_name, args.evaluation_episodes, color_mode='gray',
                            device='cpu', rescale=True, clip_rewards=False,
                            episodic_life=False, repeat_prob=0.0, frameskip=4)

    # Agent
    dqn = Agent(args, test_env.action_space)

    # Construct validation memory
    if args.rank == 0:
        print(dqn)
        print('Initializing evaluation memory with {} entries...'.format(args.evaluation_size), end='', flush=True)
        start_time = time.time()

    val_mem = initialize_validation(args, train_device)

    if args.rank == 0:
        print('complete ({})'.format(format_time(time.time() - start_time)), flush=True)

    if args.evaluate:
        if args.rank == 0:
            eval_start_time = time.time()
            dqn.eval()  # Set DQN (online network) to evaluation mode
            rewards, lengths, avg_Q = test(args, 0, dqn, val_mem, test_env, train_device)
            dqn.train()  # Set DQN (online network) back to training mode
            eval_total_time = time.time() - eval_start_time

            rmean, rmedian, rstd, rmin, rmax = vec_stats(rewards)
            lmean, lmedian, lstd, lmin, lmax = vec_stats(lengths)

            print('reward: {:4.2f}, {:4.0f}, {:4.0f}, {:4.4f} | '
                  'length: {:4.2f}, {:4.0f}, {:4.0f}, {:4.4f} | '
                  'Avg. Q: {:4.4f} | {}'
                  .format(rmean, rmin, rmax, rstd, lmean, lmin, lmax,
                          lstd, avg_Q, format_time(eval_total_time)),
                  flush=True)
    else:
        if args.rank == 0:
            print('Entering main training loop', flush=True)

            if args.output_filename:
                csv_file = open(args.output_filename, 'w', newline='')
                csv_file.write(json.dumps(vars(args)))
                csv_file.write('\n')
                csv_writer = csv.writer(csv_file, delimiter=',')
                csv_writer.writerow(['frames', 'total_time',
                                     'rmean', 'rmedian', 'rstd', 'rmin', 'rmax',
                                     'lmean', 'lmedian', 'lstd', 'lmin', 'lmax'])
            else:
                csv_writer, csv_file = None, None

            if args.plot:
                from tensorboardX import SummaryWriter
                current_time = datetime.now().strftime('%b%d_%H-%M-%S')
                log_dir = os.path.join(args.log_dir, current_time + '_' + socket.gethostname())
                writer = SummaryWriter(log_dir=log_dir)
                for k, v in vars(args).items():
                    writer.add_text(k, str(v))

            # Environment
            print('Initializing environments...', end='', flush=True)
            start_time = time.time()

        if args.use_openai:
            train_env = create_vectorize_atari_env(args.env_name, args.seed, args.num_ales,
                                                   episode_life=True, clip_rewards=args.reward_clip,
                                                   max_frames=args.max_episode_length)
            observation = torch.from_numpy(train_env.reset()).squeeze(1)
        else:
            train_env = AtariEnv(args.env_name, args.num_ales, color_mode='gray',
                                 device=env_device, rescale=True,
                                 clip_rewards=args.reward_clip,
                                 episodic_life=True, repeat_prob=0.0)
            train_env.train()
            observation = train_env.reset(initial_steps=args.ale_start_steps, verbose=args.verbose).clone().squeeze(-1)

        if args.rank == 0:
            print('complete ({})'.format(format_time(time.time() - start_time)), flush=True)

        # These variables are used to compute average rewards for all processes.
        episode_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
        episode_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
        final_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
        final_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
        has_completed = torch.zeros(args.num_ales, device=train_device, dtype=torch.bool)

        mem = ReplayMemory(args, args.memory_capacity, train_device)
        mem.reset(observation)
        priority_weight_increase = (1 - args.priority_weight) / (args.t_max - args.learn_start)

        state = torch.zeros((args.num_ales, args.history_length, 84, 84), device=mem.device, dtype=torch.float32)
        state[:, -1] = observation.to(device=mem.device, dtype=torch.float32).div(255.0)

        num_frames_per_iter = args.num_ales
        total_steps = math.ceil(args.t_max / (args.world_size * num_frames_per_iter))
        epsilons = np.linspace(args.epsilon_start, args.epsilon_final, math.ceil(args.epsilon_frames / num_frames_per_iter))
        epsilon_offset = math.ceil(args.learn_start / num_frames_per_iter)

        prefetcher = data_prefetcher(args.batch_size, train_device, mem)

        avg_loss = 'N/A'
        eval_offset = 0
        target_update_offset = 0

        total_time = 0

        # main loop
        iterator = range(total_steps)
        if args.rank == 0:
            iterator = tqdm(iterator)

        env_stream = torch.cuda.Stream()
        train_stream = torch.cuda.Stream()

        for update in iterator:

            T = args.world_size * update * num_frames_per_iter
            epsilon = epsilons[min(update - epsilon_offset, len(epsilons) - 1)] if T >= args.learn_start else epsilons[0]
            start_time = time.time()

            if update % args.replay_frequency == 0:
                dqn.reset_noise()  # Draw a new set of noisy weights

            dqn.eval()
            nvtx.range_push('train:select action')
            if args.noisy_linear:
                action = dqn.act(state)  # Choose an action greedily (with noisy weights)
            else:
                action = dqn.act_e_greedy(state, epsilon=epsilon)
            nvtx.range_pop()
            dqn.train()

            if args.use_openai:
                action = action.cpu().numpy()

            torch.cuda.synchronize()

            with torch.cuda.stream(env_stream):
                nvtx.range_push('train:env step')
                observation, reward, done, info = train_env.step(action)  # Step

                if args.use_openai:
                    # convert back to pytorch tensors
                    observation = torch.from_numpy(observation).squeeze(1)
                    reward = torch.from_numpy(reward.astype(np.float32))
                    done = torch.from_numpy(done.astype(np.bool))
                    action = torch.from_numpy(action)
                else:
                    observation = observation.clone().squeeze(-1)
                nvtx.range_pop()

                observation = observation.to(device=train_device)
                reward = reward.to(device=train_device)
                done = done.to(device=train_device, dtype=torch.bool)
                action = action.to(device=train_device)

                observation = observation.float().div_(255.0)
                not_done = 1.0 - done.float()

                state[:, :-1].copy_(state[:, 1:].clone())
                state *= not_done.view(-1, 1, 1, 1)
                state[:, -1].copy_(observation)

                # update episodic reward counters
                has_completed |= done

                episode_rewards += reward.float()
                final_rewards[done] = episode_rewards[done]
                episode_rewards *= not_done

                episode_lengths += not_done
                final_lengths[done] = episode_lengths[done]
                episode_lengths *= not_done

            # Train and test
            if T >= args.learn_start:
                mem.priority_weight = min(mem.priority_weight + priority_weight_increase, 1)  # Anneal importance sampling weight β to 1
                prefetcher.preload()

                avg_loss = 0.0
                num_minibatches = min(int(args.num_ales / args.replay_frequency), 8)
                for _ in range(num_minibatches):
                    # Sample transitions
                    nvtx.range_push('train:sample states')
                    idxs, states, actions, returns, next_states, nonterminals, weights = prefetcher.next()
                    nvtx.range_pop()

                    nvtx.range_push('train:network update')
                    loss = dqn.learn(states, actions, returns, next_states, nonterminals, weights)
                    nvtx.range_pop()

                    nvtx.range_push('train:update priorities')
                    mem.update_priorities(idxs, loss)  # Update priorities of sampled transitions
                    nvtx.range_pop()

                    avg_loss += loss.mean().item()
                avg_loss /= num_minibatches

                # Update target network
                if T >= target_update_offset:
                    dqn.update_target_net()
                    target_update_offset += args.target_update

            torch.cuda.current_stream().wait_stream(env_stream)
            torch.cuda.current_stream().wait_stream(train_stream)

            nvtx.range_push('train:append memory')
            mem.append(observation, action, reward, done)  # Append transition to memory
            nvtx.range_pop()

            total_time += time.time() - start_time

            if args.rank == 0:
                if args.plot and ((update % args.replay_frequency) == 0):
                    writer.add_scalar('train/epsilon', epsilon, T)
                    writer.add_scalar('train/rewards', final_rewards.mean(), T)
                    writer.add_scalar('train/lengths', final_lengths.mean(), T)

                if T >= eval_offset:
                    eval_start_time = time.time()
                    dqn.eval()  # Set DQN (online network) to evaluation mode
                    rewards, lengths, avg_Q = test(args, T, dqn, val_mem, test_env, train_device)
                    dqn.train()  # Set DQN (online network) back to training mode
                    eval_total_time = time.time() - eval_start_time
                    eval_offset += args.evaluation_interval

                    rmean, rmedian, rstd, rmin, rmax = vec_stats(rewards)
                    lmean, lmedian, lstd, lmin, lmax = vec_stats(lengths)

                    print('reward: {:4.2f}, {:4.0f}, {:4.0f}, {:4.4f} | '
                          'length: {:4.2f}, {:4.0f}, {:4.0f}, {:4.4f} | '
                          'Avg. Q: {:4.4f} | {}'
                          .format(rmean, rmin, rmax, rstd, lmean, lmin, lmax,
                                  lstd, avg_Q, format_time(eval_total_time)),
                          flush=True)

                    if args.output_filename and csv_writer and csv_file:
                        csv_writer.writerow([T, total_time,
                                             rmean, rmedian, rstd, rmin, rmax,
                                             lmean, lmedian, lstd, lmin, lmax])
                        csv_file.flush()

                    if args.plot:
                        writer.add_scalar('eval/rewards', rmean, T)
                        writer.add_scalar('eval/lengths', lmean, T)
                        writer.add_scalar('eval/avg_Q', avg_Q, T)

                loss_str = '{:4.4f}'.format(avg_loss) if isinstance(avg_loss, float) else avg_loss
                progress_data = 'T = {:,} epsilon = {:4.2f} avg reward = {:4.2f} loss: {}' \
                                .format(T, epsilon, final_rewards.mean().item(), loss_str)
                iterator.set_postfix_str(progress_data)

    if args.plot and (args.rank == 0):
        writer.close()

    if args.use_openai:
        train_env.close()
        test_env.close()