예제 #1
0
def eval_policy(args, env):
    """Run two short epsilon-greedy episodes with a freshly built agent.

    A new ``Agent`` is constructed from ``args`` and ``env``; each episode
    takes at most two environment steps with exploration rate 0.1, printing
    every step reward and the episode total.

    Args:
        args: Run configuration; ``args.device`` and ``args.history_length``
            are read here, and the whole object is handed to ``Agent``.
        env: Environment exposing ``action_space.n``, ``reset()`` and a
            ``step(action)`` that returns a 4-tuple
            ``(observation, reward, done, info)``.

    Returns:
        None. Results are only printed.
    """
    action_space = env.action_space.n
    print("show action space", action_space)
    # Agent
    dqn = Agent(args, env)

    size = 84  # side length every stepped frame is resized to
    eps = 0.1  # exploration rate passed to act_e_greedy
    for episode in range(2):
        print("Episode ", episode)
        # Reset the tally per episode so the value printed below really is
        # this episode's reward and not a running total over all episodes.
        episode_reward = 0
        state = env.reset()
        # NOTE(review): the reset frame is not resized, unlike the frames
        # returned by step(); presumably reset() already yields size x size
        # frames -- confirm against the environment wrapper.
        state = torch.tensor(state, dtype=torch.float32,
                             device=args.device).div_(255)
        # Pad the history with blank frames so the stacked state always
        # holds exactly history_length frames, whatever that length is.
        blank = torch.zeros_like(state)
        state_buffer = deque([blank] * (args.history_length - 1),
                             maxlen=args.history_length)
        state_buffer.append(state)
        state = torch.stack(list(state_buffer), 0)
        for step in range(2):
            action = dqn.act_e_greedy(
                state, eps)  # Choose an action epsilon-greedily
            next_state, reward, done, _ = env.step(action)
            print(reward)
            episode_reward += reward
            if done:
                # Stepping a finished environment is undefined; end the
                # episode instead of continuing the rollout.
                break
            next_state = cv2.resize(next_state, (size, size),
                                    interpolation=cv2.INTER_LINEAR)
            next_state = torch.tensor(next_state,
                                      dtype=torch.float32,
                                      device=args.device).div_(255)
            # The deque drops the oldest frame automatically (maxlen).
            state_buffer.append(next_state)
            state = torch.stack(list(state_buffer), 0)
        print("Epiosde reward ", episode_reward)
            state = cv2.resize(state[:, :, 0], (84, 84),
                               interpolation=cv2.INTER_LINEAR)
            state = torch.tensor(state,
                                 dtype=torch.float32,
                                 device=args.device).div_(255)
            zeros = torch.zeros_like(state)
            state_buffer = deque([], maxlen=args.history_length)
            state_buffer.append(zeros)
            state_buffer.append(zeros)
            state_buffer.append(zeros)
            state_buffer.append(state)
            state = torch.stack(list(state_buffer), 0)
            print(state.shape)
        if step < 10:
            action = np.random.randint(0, action_space)
        else:
            action = dqn.act_e_greedy(state)  # Choose an action greedily
        print("action", action)
        state, reward, done, _ = env.step(action)  # Step
        state = cv2.resize(state[:, :, 0], (84, 84),
                           interpolation=cv2.INTER_LINEAR)
        state = torch.tensor(state, dtype=torch.float32,
                             device=args.device).div_(255)
        state_buffer.append(state)
        state = torch.stack(list(state_buffer), 0)
        reward_sum += reward
    print(" episode reward", reward_sum)
    T_rewards.append(reward_sum)

print(T_rewards)
예제 #3
0
  dqn.eval()  # Set DQN (online network) to evaluation mode
  avg_reward, avg_Q = test(args, 0, dqn, val_mem, evaluate=True)  # Test
  print('Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
else:
  # Training loop
  #print('Training')
  dqn.train()
  T, done = 0, True
  for T in tqdm(range(args.T_max)):
    if done:
      state, done = env.reset(), False
    #print('replay_frequency')
    if T % args.replay_frequency == 0:
      dqn.reset_noise()  # Draw a new set of noisy weights

    action = dqn.act_e_greedy(state)  # Choose an action greedily (with noisy weights)
    next_state, reward, done = env.step(action)  # Step
    #print('reward_clip')

    if args.reward_clip > 0:
      reward = max(min(reward, args.reward_clip), -args.reward_clip)  # Clip rewards
    mem.append(state, action, reward, done)  # Append transition to memory
    T += 1

    # Train and test
    if T >= args.learn_start:
      #print('learn_start')

      mem.priority_weight = min(mem.priority_weight + priority_weight_increase, 1)  # Anneal importance sampling weight β to 1
      #print('replay_frequency')
      if T % args.replay_frequency == 0:
예제 #4
0
파일: train.py 프로젝트: tevfikoguz/cule
def worker(gpu, ngpus_per_node, args):
    """Run one DQN worker process: evaluate once, or train with periodic evaluation.

    Args:
        gpu: Index of the CUDA device assigned to this process; stored on
            ``args.gpu``.
        ngpus_per_node: Number of GPUs per node, used to derive the global
            rank when ``args.multiprocessing_distributed`` is set.
        args: Run configuration. It is mutated in place: ``gpu``, ``seed``
            (distributed only), ``rank``, ``use_cuda_env``, ``no_cuda_train``
            and ``verbose`` are overwritten.

    Returns:
        None. Rank 0 prints progress and, when configured, writes evaluation
        statistics to ``args.output_filename`` and TensorBoard scalars under
        ``args.log_dir``.
    """
    args.gpu = gpu

    # Resources that only exist on some code paths. Initialising them here
    # lets the cleanup at the end run safely in every mode (e.g. --evaluate,
    # where no training environment or summary writer is ever created).
    writer = None
    csv_writer, csv_file = None, None
    train_env = None

    if args.distributed:
        args.seed += args.gpu
        torch.cuda.set_device(args.gpu)

        args.rank = int(os.environ['RANK']) if 'RANK' in os.environ else 0
        if args.multiprocessing_distributed:
            args.rank = args.rank * ngpus_per_node + args.gpu

        torch.distributed.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:8632',
                                             world_size=args.world_size, rank=args.rank)
    else:
        args.rank = 0

    args.use_cuda_env = args.use_cuda_env and torch.cuda.is_available()
    args.no_cuda_train = not torch.cuda.is_available()
    args.verbose = args.verbose and (args.rank == 0)

    env_device = torch.device('cuda', args.gpu) if args.use_cuda_env else torch.device('cpu')
    train_device = torch.device('cuda', args.gpu) if (args.no_cuda_train == False) else torch.device('cpu')

    # Setup
    np.random.seed(args.seed)
    torch.manual_seed(np.random.randint(1, 10000))
    if args.use_cuda_env or (args.no_cuda_train == False):
        # Draw the CUDA seed from the NumPy generator seeded above; the
        # `random` module is never seeded in this function, so using it here
        # would make the CUDA seed differ between runs with the same seed.
        torch.cuda.manual_seed(np.random.randint(1, 10000))

    if train_device.type == 'cuda':
        print('Train:\n' + cuda_device_str(train_device.index), flush=True)

    if args.use_openai:
        test_env = create_vectorize_atari_env(args.env_name, args.seed, args.evaluation_episodes,
                                              episode_life=False, clip_rewards=False)
        test_env.reset()
    else:
        test_env = AtariEnv(args.env_name, args.evaluation_episodes, color_mode='gray',
                            device='cpu', rescale=True, clip_rewards=False,
                            episodic_life=False, repeat_prob=0.0, frameskip=4)

    # Agent
    dqn = Agent(args, test_env.action_space)

    # Construct validation memory
    if args.rank == 0:
        print(dqn)
        print('Initializing evaluation memory with {} entries...'.format(args.evaluation_size), end='', flush=True)
        start_time = time.time()

    val_mem = initialize_validation(args, train_device)

    if args.rank == 0:
        print('complete ({})'.format(format_time(time.time() - start_time)), flush=True)

    if args.evaluate:
        if args.rank == 0:
            eval_start_time = time.time()
            dqn.eval()  # Set DQN (online network) to evaluation mode
            rewards, lengths, avg_Q = test(args, 0, dqn, val_mem, test_env, train_device)
            dqn.train()  # Set DQN (online network) back to training mode
            eval_total_time = time.time() - eval_start_time

            rmean, rmedian, rstd, rmin, rmax = vec_stats(rewards)
            lmean, lmedian, lstd, lmin, lmax = vec_stats(lengths)

            print('reward: {:4.2f}, {:4.0f}, {:4.0f}, {:4.4f} | '
                  'length: {:4.2f}, {:4.0f}, {:4.0f}, {:4.4f} | '
                  'Avg. Q: {:4.4f} | {}'
                  .format(rmean, rmin, rmax, rstd, lmean, lmin, lmax,
                          lstd, avg_Q, format_time(eval_total_time)),
                  flush=True)
    else:
        if args.rank == 0:
            print('Entering main training loop', flush=True)

            if args.output_filename:
                # First line of the file is the run configuration as JSON;
                # the CSV header and rows follow.
                csv_file = open(args.output_filename, 'w', newline='')
                csv_file.write(json.dumps(vars(args)))
                csv_file.write('\n')
                csv_writer = csv.writer(csv_file, delimiter=',')
                csv_writer.writerow(['frames', 'total_time',
                                     'rmean', 'rmedian', 'rstd', 'rmin', 'rmax',
                                     'lmean', 'lmedian', 'lstd', 'lmin', 'lmax'])

            if args.plot:
                from tensorboardX import SummaryWriter
                current_time = datetime.now().strftime('%b%d_%H-%M-%S')
                log_dir = os.path.join(args.log_dir, current_time + '_' + socket.gethostname())
                writer = SummaryWriter(log_dir=log_dir)
                for k, v in vars(args).items():
                    writer.add_text(k, str(v))

            # Environment
            print('Initializing environments...', end='', flush=True)
            start_time = time.time()

        if args.use_openai:
            train_env = create_vectorize_atari_env(args.env_name, args.seed, args.num_ales,
                                                   episode_life=True, clip_rewards=args.reward_clip,
                                                   max_frames=args.max_episode_length)
            observation = torch.from_numpy(train_env.reset()).squeeze(1)
        else:
            train_env = AtariEnv(args.env_name, args.num_ales, color_mode='gray',
                                 device=env_device, rescale=True,
                                 clip_rewards=args.reward_clip,
                                 episodic_life=True, repeat_prob=0.0)
            train_env.train()
            observation = train_env.reset(initial_steps=args.ale_start_steps, verbose=args.verbose).clone().squeeze(-1)

        if args.rank == 0:
            print('complete ({})'.format(format_time(time.time() - start_time)), flush=True)

        # These variables are used to compute average rewards for all processes.
        episode_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
        episode_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
        final_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
        final_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
        has_completed = torch.zeros(args.num_ales, device=train_device, dtype=torch.bool)

        mem = ReplayMemory(args, args.memory_capacity, train_device)
        mem.reset(observation)
        priority_weight_increase = (1 - args.priority_weight) / (args.t_max - args.learn_start)

        # Per-environment frame stack; only the newest slot starts non-zero.
        state = torch.zeros((args.num_ales, args.history_length, 84, 84), device=mem.device, dtype=torch.float32)
        state[:, -1] = observation.to(device=mem.device, dtype=torch.float32).div(255.0)

        num_frames_per_iter = args.num_ales
        total_steps = math.ceil(args.t_max / (args.world_size * num_frames_per_iter))
        epsilons = np.linspace(args.epsilon_start, args.epsilon_final, math.ceil(args.epsilon_frames / num_frames_per_iter))
        epsilon_offset = math.ceil(args.learn_start / num_frames_per_iter)

        prefetcher = data_prefetcher(args.batch_size, train_device, mem)

        avg_loss = 'N/A'  # stays a string until the first learning update
        eval_offset = 0
        target_update_offset = 0

        total_time = 0

        # main loop
        iterator = range(total_steps)
        if args.rank == 0:
            iterator = tqdm(iterator)

        env_stream = torch.cuda.Stream()
        train_stream = torch.cuda.Stream()

        for update in iterator:

            # Frame count across all processes at the start of this update.
            T = args.world_size * update * num_frames_per_iter
            epsilon = epsilons[min(update - epsilon_offset, len(epsilons) - 1)] if T >= args.learn_start else epsilons[0]
            start_time = time.time()

            if update % args.replay_frequency == 0:
                dqn.reset_noise()  # Draw a new set of noisy weights

            dqn.eval()
            nvtx.range_push('train:select action')
            if args.noisy_linear:
                action = dqn.act(state)  # Choose an action greedily (with noisy weights)
            else:
                action = dqn.act_e_greedy(state, epsilon=epsilon)
            nvtx.range_pop()
            dqn.train()

            if args.use_openai:
                action = action.cpu().numpy()

            torch.cuda.synchronize()

            with torch.cuda.stream(env_stream):
                nvtx.range_push('train:env step')
                observation, reward, done, info = train_env.step(action)  # Step

                if args.use_openai:
                    # convert back to pytorch tensors
                    observation = torch.from_numpy(observation).squeeze(1)
                    reward = torch.from_numpy(reward.astype(np.float32))
                    # np.bool_ is the NumPy boolean scalar type; the bare
                    # np.bool alias no longer exists in NumPy >= 1.24.
                    done = torch.from_numpy(done.astype(np.bool_))
                    action = torch.from_numpy(action)
                else:
                    observation = observation.clone().squeeze(-1)
                nvtx.range_pop()

                observation = observation.to(device=train_device)
                reward = reward.to(device=train_device)
                done = done.to(device=train_device, dtype=torch.bool)
                action = action.to(device=train_device)

                observation = observation.float().div_(255.0)
                not_done = 1.0 - done.float()

                # Shift the frame stack left by one, blank the history of
                # environments that just finished, then write the new frame.
                state[:, :-1].copy_(state[:, 1:].clone())
                state *= not_done.view(-1, 1, 1, 1)
                state[:, -1].copy_(observation)

                # update episodic reward counters
                has_completed |= done

                episode_rewards += reward.float()
                final_rewards[done] = episode_rewards[done]
                episode_rewards *= not_done

                episode_lengths += not_done
                final_lengths[done] = episode_lengths[done]
                episode_lengths *= not_done

            # Train and test
            if T >= args.learn_start:
                mem.priority_weight = min(mem.priority_weight + priority_weight_increase, 1)  # Anneal importance sampling weight β to 1
                prefetcher.preload()

                avg_loss = 0.0
                num_minibatches = min(int(args.num_ales / args.replay_frequency), 8)
                for _ in range(num_minibatches):
                    # Sample transitions
                    nvtx.range_push('train:sample states')
                    idxs, states, actions, returns, next_states, nonterminals, weights = prefetcher.next()
                    nvtx.range_pop()

                    nvtx.range_push('train:network update')
                    loss = dqn.learn(states, actions, returns, next_states, nonterminals, weights)
                    nvtx.range_pop()

                    nvtx.range_push('train:update priorities')
                    mem.update_priorities(idxs, loss)  # Update priorities of sampled transitions
                    nvtx.range_pop()

                    avg_loss += loss.mean().item()
                avg_loss /= num_minibatches

                # Update target network
                if T >= target_update_offset:
                    dqn.update_target_net()
                    target_update_offset += args.target_update

            # Make the default stream wait for the environment step before
            # the new transition is stored.
            torch.cuda.current_stream().wait_stream(env_stream)
            torch.cuda.current_stream().wait_stream(train_stream)

            nvtx.range_push('train:append memory')
            mem.append(observation, action, reward, done)  # Append transition to memory
            nvtx.range_pop()

            total_time += time.time() - start_time

            if args.rank == 0:
                if args.plot and ((update % args.replay_frequency) == 0):
                    writer.add_scalar('train/epsilon', epsilon, T)
                    writer.add_scalar('train/rewards', final_rewards.mean(), T)
                    writer.add_scalar('train/lengths', final_lengths.mean(), T)

                if T >= eval_offset:
                    eval_start_time = time.time()
                    dqn.eval()  # Set DQN (online network) to evaluation mode
                    rewards, lengths, avg_Q = test(args, T, dqn, val_mem, test_env, train_device)
                    dqn.train()  # Set DQN (online network) back to training mode
                    eval_total_time = time.time() - eval_start_time
                    eval_offset += args.evaluation_interval

                    rmean, rmedian, rstd, rmin, rmax = vec_stats(rewards)
                    lmean, lmedian, lstd, lmin, lmax = vec_stats(lengths)

                    print('reward: {:4.2f}, {:4.0f}, {:4.0f}, {:4.4f} | '
                          'length: {:4.2f}, {:4.0f}, {:4.0f}, {:4.4f} | '
                          'Avg. Q: {:4.4f} | {}'
                          .format(rmean, rmin, rmax, rstd, lmean, lmin, lmax,
                                  lstd, avg_Q, format_time(eval_total_time)),
                          flush=True)

                    if args.output_filename and csv_writer and csv_file:
                        csv_writer.writerow([T, total_time,
                                             rmean, rmedian, rstd, rmin, rmax,
                                             lmean, lmedian, lstd, lmin, lmax])
                        csv_file.flush()

                    if args.plot:
                        writer.add_scalar('eval/rewards', rmean, T)
                        writer.add_scalar('eval/lengths', lmean, T)
                        writer.add_scalar('eval/avg_Q', avg_Q, T)

                loss_str = '{:4.4f}'.format(avg_loss) if isinstance(avg_loss, float) else avg_loss
                progress_data = 'T = {:,} epsilon = {:4.2f} avg reward = {:4.2f} loss: {}' \
                                .format(T, epsilon, final_rewards.mean().item(), loss_str)
                iterator.set_postfix_str(progress_data)

    # Cleanup: close only what this process actually created.
    if writer is not None:
        writer.close()

    if csv_file is not None:
        csv_file.close()

    if args.use_openai:
        if train_env is not None:
            train_env.close()
        test_env.close()
예제 #5
0
파일: main.py 프로젝트: xssstory/Rainbow
            last_done_T = T

        if T % args.replay_frequency == 0:
            dqn.reset_noise()  # Draw a new set of noisy weights

        if args.explore_eps is None:
            action = dqn.act(
                state)  # Choose an action greedily (with noisy weights)
        else:
            init_eps = 1
            decay_step = args.explore_eps[0] if args.explore_eps[
                0] > 1 else args.explore_eps[0] * args.T_max
            final_eps = args.explore_eps[1]
            eps = max(init_eps - (init_eps - final_eps) / decay_step * T,
                      final_eps)
            action = dqn.act_e_greedy(state, eps)
        next_state, reward, done, _ = env.step(action)  # Step
        episode_reward += reward
        episode_length += 1
        if args.count_base_bonus > 0:
            if args.deploy_policy == "info-matrix":
                info_index = (T - last_done_T) // args.info_matrix_interval
                reward = reward + args.count_base_bonus / math.sqrt(
                    hash_table.step(
                        state, action, T > args.learn_start
                        and not visited_deploy_flag, True, info_index))
            else:
                reward = reward + args.count_base_bonus / math.sqrt(
                    hash_table.step(
                        state, action, T > args.learn_start
                        and not visited_deploy_flag))