Example #1
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        help='environment ID',
                        default='gvgai-testgame1-lvl0-v0')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--checkpoint-freq', type=int, default=10000)
    parser.add_argument('--model_dir', type=str, default=None)

    args = parser.parse_args()
    set_global_seeds(args.seed)
    env, does_need_action_direction, game_name = create_gvgai_environment(
        args.env)

    model_dir = "models/{}/".format(game_name)
    os.makedirs(model_dir, exist_ok=True)
    player_processes, player_connections = create_players(
        args.env, model_dir, 0.1, args.num_timesteps, 0.01, False, 8)

    import models
    from simple import learn

    if does_need_action_direction:
        model = models.cnn_to_mlp_with_action_direction(
            convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
            hiddens=[256],
            dueling=bool(args.dueling),
        )
    else:
        model = models.cnn_to_mlp(
            convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
            hiddens=[256],
            dueling=bool(args.dueling),
        )
    env.close()
    if args.model_dir is not None:
        model_dir = args.model_dir

    learn(args.env,
          q_func=model,
          lr=1e-4,
          max_timesteps=args.num_timesteps,
          buffer_size=1000,
          exploration_fraction=0.1,
          exploration_final_eps=0.01,
          train_freq=1,
          learning_starts=500,
          target_network_update_freq=100,
          gamma=0.99,
          prioritized_replay=bool(args.prioritized),
          prioritized_replay_alpha=args.prioritized_replay_alpha,
          checkpoint_freq=args.checkpoint_freq,
          model_dir=model_dir,
          player_processes=player_processes,
          player_connections=player_connections)
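The dueling=True flag used above selects a dueling head: the network predicts a state value V(s) and per-action advantages A(s, a), and combines them as Q(s, a) = V(s) + A(s, a) - mean_a A(s, a). A minimal NumPy sketch of that aggregation step (illustrative only, not the baselines implementation):

import numpy as np

def dueling_aggregate(state_value, advantages):
    # Combine V(s) and A(s, a) the way a dueling head does:
    # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)
    advantages = np.asarray(advantages, dtype=np.float64)
    return state_value + advantages - advantages.mean()

# Example: one state value and four action advantages.
print(dueling_aggregate(1.5, [0.2, -0.1, 0.4, 0.0]))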
Example #2
    def __init__(self, config, env_creator):
        self.config = config
        self.local_timestep = 0
        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]

        if "cartpole" in self.config["env_config"]:
            self.env = env_creator(self.config["env_config"])
        else:
            self.env = wrap_deepmind(
                env_creator(self.config["env_config"]),
                clip_rewards=False, frame_stack=True, scale=True)
        self.obs = self.env.reset()

        self.sess = U.make_session()
        self.sess.__enter__()

        # capture the shape outside the closure so that the env object is not serialized
        # by cloudpickle when serializing make_obs_ph
        observation_space_shape = self.env.observation_space.shape
        def make_obs_ph(name):
            return BatchInput(observation_space_shape, name=name)

        if "cartpole" in self.config["env_config"]:
            q_func = models.mlp([64])
        else:
            q_func = models.cnn_to_mlp(
                convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
                hiddens=[256],
                dueling=True,
            )

        act, self.train, self.update_target, debug = build_train(
            make_obs_ph=make_obs_ph,
            q_func=q_func,
            num_actions=self.env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=self.config["lr"]),
            gamma=self.config["gamma"],
            grad_norm_clipping=10,
            param_noise=False
        )

        act_params = {
            'make_obs_ph': make_obs_ph,
            'q_func': q_func,
            'num_actions': self.env.action_space.n,
        }

        self.act = ActWrapper(act, act_params)

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(self.config["exploration_fraction"] * self.config["schedule_max_timesteps"]),
            initial_p=1.0,
            final_p=self.config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        self.update_target()
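LinearSchedule, used above to anneal exploration, interpolates linearly from initial_p to final_p over schedule_timesteps and then holds final_p. A stand-alone sketch of that behavior (assuming the baselines semantics, not the class itself):

def linear_schedule(t, schedule_timesteps, initial_p=1.0, final_p=0.02):
    # Linear interpolation from initial_p to final_p, clamped at final_p.
    fraction = min(float(t) / schedule_timesteps, 1.0)
    return initial_p + fraction * (final_p - initial_p)

# Epsilon at a few points of a 10000-step schedule.
print([round(linear_schedule(t, 10000), 3) for t in (0, 5000, 10000, 20000)])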
Example #3
def main(env, dueling=True, **kwargs):
    env = FireResetEnv(MaxAndSkipEnv(NoopResetEnv(gym.make(env))))
    # Or equivalent using gym_tensorflow
    #env = gym_tensorflow.make(env, 1)
    model = models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[512],
        dueling=bool(dueling),
    )
    act = demo(env, q_func=model, dueling=True, **kwargs)
Example #4
def main(env, num_timesteps=int(10e6), dueling=True, **kwargs):
    env_f = lambda batch_size: gym_tensorflow.make(env, batch_size=batch_size)
    model = models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[512],
        dueling=bool(dueling),
    )
    act = learn(env_f,
                q_func=model,
                max_timesteps=int(num_timesteps),
                dueling=True,
                **kwargs)
Example #5
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='Breakout')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('experiment_id')
    args = parser.parse_args()
    logging_directory = Path('./experiments/{}--{}'.format(args.experiment_id, args.env))
    if not logging_directory.exists():
        logging_directory.mkdir(parents=True)
    logger.configure(str(logging_directory), ['stdout', 'tensorboard', 'json'])
    model_directory = logging_directory / 'models'
    if not model_directory.exists():
        model_directory.mkdir(parents=True)
    set_global_seeds(args.seed)
    env_name = args.env + "NoFrameskip-v4"
    env = make_atari(env_name)
    env = bench.Monitor(env, logger.get_dir())
    env = deepq.wrap_atari_dqn(env)
    model = models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
    )
    exploration_schedule = PiecewiseSchedule(
        endpoints=[(0, 1), (1e6, 0.1), (5 * 1e6, 0.01)], outside_value=0.01)

    act = learn(
        env,
        q_func=model,
        beta1=0.9,
        beta2=0.99,
        epsilon=1e-4,
        max_timesteps=args.num_timesteps,
        buffer_size=1000000,
        exploration_schedule=exploration_schedule,
        start_lr=1e-4,
        end_lr=5 * 1e-5,
        start_step=1e6,
        end_step=5 * 1e6,
        train_freq=4,
        print_freq=10,
        batch_size=32,
        learning_starts=50000,
        target_network_update_freq=10000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
        model_directory=model_directory
    )
    act.save(str(model_directory / "act_model.pkl"))
    env.close()
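The PiecewiseSchedule above interpolates linearly between its (timestep, value) endpoints and returns outside_value past the last endpoint. A rough sketch of that interpolation, assuming sorted endpoints:

def piecewise_value(t, endpoints, outside_value):
    # Linear interpolation between consecutive (time, value) endpoints;
    # outside the covered range, fall back to outside_value.
    for (l_t, l_v), (r_t, r_v) in zip(endpoints[:-1], endpoints[1:]):
        if l_t <= t < r_t:
            alpha = float(t - l_t) / (r_t - l_t)
            return l_v + alpha * (r_v - l_v)
    return outside_value

endpoints = [(0, 1.0), (1e6, 0.1), (5 * 1e6, 0.01)]
print([piecewise_value(t, endpoints, 0.01) for t in (0, 5e5, 1e6, 1e7)])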
Example #6
def test():
    from baselines0.deepq.utils import BatchInput
    import json
    learning_prop = json.load(
        open(os.path.join(args.log_dir, 'learning_prop.json'), 'r'))

    env = make_atari(args.env)
    env = models.wrap_atari_dqn(env)
    observation_space_shape = env.observation_space.shape

    def make_obs_ph(name):
        return BatchInput(observation_space_shape, name=name)

    model = models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[learning_prop['num_units']] * learning_prop['num_layers'],
        dueling=bool(args.dueling),
        init_mean=args.init_mean,
        init_sd=args.init_sd,
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': model,
        'scope': learning_prop['scope'],
        'eps': args.test_eps
    }
    act = simple.load(os.path.join(args.log_dir, args.log_fname), act_params)
    if args.record:
        env = Monitor(env, directory=args.log_dir)
    episode_rew = 0
    t = 0
    while True:
        obs, done = env.reset(), False
        while (not done):
            if args.render:
                env.render()
                time.sleep(0.05)
            obs, rew, done, info = env.step(act(obs[None])[0])
            # Reset only the environment but not the recorder
            if args.record and done:
                obs, done = env.env.reset(), False
            episode_rew += rew
            t += 1
        if info['ale.lives'] == 0:
            print("Episode reward %.2f after %d steps" % (episode_rew, t))
            episode_rew = 0
            t = 0
Example #7
def main(max_timesteps):
    
    np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x})

    # Dictionary-based value function
    q_func_tabular = {}

    # cols of vectorKey must be boolean less than 64 bits long
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey,1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:,i])
        return obsKeys
    
    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
        return np.array([q_func_tabular[x] if x in q_func_tabular else 10*np.ones(num_states) for x in keys])
    
#    def trainTabular(vectorKey,qCurrTargets,weights):
    def trainTabular(vectorKey,qCurrTargets,weights):
        keys = getTabularKeys(vectorKey)
        alpha=0.2
        for i in range(len(keys)):
            if keys[i] in q_func_tabular:
#                q_func_tabular[keys[i]] = (1-alpha)*q_func_tabular[keys[i]] + alpha*qCurrTargets[i]
                q_func_tabular[keys[i]] = q_func_tabular[keys[i]] + alpha*weights[i]*(qCurrTargets[i] - q_func_tabular[keys[i]]) # (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i]
            else:
                q_func_tabular[keys[i]] = qCurrTargets[i]


    env = envstandalone.BlockArrange()

    # Standard q-learning parameters
#    max_timesteps=8000
#    exploration_fraction=0.3
    exploration_fraction=1
    exploration_final_eps=0.1
    gamma=.90
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts=10
    buffer_size=10000
    batch_size=10
    target_network_update_freq=1
    train_freq=1
    print_freq=1
    lr=0.0003

    # first two elts of actionShape must be odd
    actionShape = (3,3,2)
    num_states = 2 # either holding or not
    num_patches = env.maxSide**2
    num_actions = 2*num_patches
    num_actions_discrete = 2
#    valueFunctionType = "TABULAR"
    valueFunctionType = "DQN"
    actionSelectionStrategy = "UNIFORM_RANDOM" # actions are selected randomly from collection of all actions
#    actionSelectionStrategy = "RANDOM_UNIQUE" # each unique action descriptor has equal chance of being selected

    episode_rewards = [0.0]
    
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    prioritized_replay=True
#    prioritized_replay=False
#    prioritized_replay_alpha=1.0
    prioritized_replay_alpha=0.6
    prioritized_replay_beta0=0.4
    prioritized_replay_beta_iters=None
#    prioritized_replay_beta_iters=20000
    prioritized_replay_eps=1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1

    q_func = models.cnn_to_mlp(
#    q_func = models.cnn_to_mlp_2pathways(
#        convs=[(16,3,1), (32,3,1)],
#        hiddens=[48],
        convs=[(32,3,1)],
        hiddens=[48],
#        convs=[(48,3,1)],
#        hiddens=[48],
        dueling=True
    )

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        return U.BatchInput(actionShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([1], name=name)

    def make_weight_ph(name):
        return U.BatchInput([1], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph,deicticShape=actionShape)
    
    if valueFunctionType == 'DQN':
        getqNotHolding = build_getq(
                make_actionDeic_ph=make_actionDeic_ph,
                q_func=q_func,
                num_states=num_states,
                num_cascade=5,
                scope="deepq",
                qscope="q_func_notholding"
                )
        getqHolding = build_getq(
                make_actionDeic_ph=make_actionDeic_ph,
                q_func=q_func,
                num_states=num_states,
                num_cascade=5,
                scope="deepq",
                qscope="q_func_holding"
                )
    
        targetTrainNotHolding = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq", 
            qscope="q_func_notholding",
            grad_norm_clipping=1.
        )

        targetTrainHolding = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq", 
            qscope="q_func_holding",
            grad_norm_clipping=1.
        )
        
    sess = U.make_session(num_cpu)
    sess.__enter__()

    obs = env.reset()

    episode_rewards = [0.0]
    td_errors = [0.0]
    timerStart = time.time()
    U.initialize()
    for t in range(max_timesteps):
        
        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptors = getMoveActionDescriptors([obs[0]])
        moveDescriptors = moveDescriptors*2-1
        actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3)
        actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)),moveDescriptors],axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors,actionsPlaceDescriptors]

        # Get qCurr values
        if valueFunctionType == "TABULAR":
            actionDescriptorsFlat = np.reshape(actionDescriptors,[-1,actionShape[0]*actionShape[1]*actionShape[2]]) == 1
            qCurr = getTabular(actionDescriptorsFlat)
        else:
            qCurrNotHolding = getqNotHolding(actionDescriptors)
            qCurrHolding = getqHolding(actionDescriptors)
            qCurr = np.concatenate([qCurrNotHolding,qCurrHolding],axis=1)

        # select action at random
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        if actionSelectionStrategy == "UNIFORM_RANDOM":
            action = np.argmax(qCurrNoise[:,obs[1]])
            if np.random.rand() < exploration.value(t):
                action = np.random.randint(num_actions)
        elif actionSelectionStrategy == "RANDOM_UNIQUE":
            _,idx,inv = np.unique(actionDescriptors,axis=0,return_index=True,return_inverse=True)
            actionIdx = np.argmax(qCurrNoise[idx,obs[1]])
            if np.random.rand() < exploration.value(t):
                actionIdx = np.random.randint(len(idx))
            actionsSelected = np.nonzero(inv==actionIdx)[0]
            action = actionsSelected[np.random.randint(len(actionsSelected))]
        else:
            print("Error...")

        # take action
        new_obs, rew, done, _ = env.step(action)
        
        replay_buffer.add(obs[1], actionDescriptors[action,:], rew, np.copy(new_obs), float(done))

        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                beta=beta_schedule.value(t)
                states_t, actionPatches, rewards, images_tp1, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(batch_size, beta)
            else:
                states_t, actionPatches, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            moveDescriptorsNext = getMoveActionDescriptors(images_tp1)
            moveDescriptorsNext = moveDescriptorsNext*2-1

            actionsPickDescriptorsNext = np.stack([moveDescriptorsNext, np.zeros(np.shape(moveDescriptorsNext))],axis=3)
            actionsPlaceDescriptorsNext = np.stack([np.zeros(np.shape(moveDescriptorsNext)), moveDescriptorsNext],axis=3)
            actionDescriptorsNext = np.stack([actionsPickDescriptorsNext, actionsPlaceDescriptorsNext], axis=1) # I sometimes get this axis parameter wrong... pay attention!
            actionDescriptorsNext = np.reshape(actionDescriptorsNext,[batch_size*num_patches*num_actions_discrete,actionShape[0],actionShape[1],actionShape[2]])

            if valueFunctionType == "TABULAR":
                actionDescriptorsNextFlat = np.reshape(actionDescriptorsNext,[batch_size*num_patches*num_actions_discrete,-1]) == 1
                qNextFlat = getTabular(actionDescriptorsNextFlat)
            else:
                qNextNotHolding = getqNotHolding(actionDescriptorsNext)
                qNextHolding = getqHolding(actionDescriptorsNext)
                qNextFlat = np.concatenate([qNextNotHolding,qNextHolding],axis=1)

            qNext = np.reshape(qNextFlat,[batch_size,num_patches,num_actions_discrete,num_states])
            qNextmax = np.max(np.max(qNext[range(batch_size),:,:,states_tp1],2),1)
            targets = rewards + (1-dones) * gamma * qNextmax
            
            if valueFunctionType == "TABULAR":
                actionsFlat = np.reshape(actionPatches,[batch_size,-1]) == 1
                qCurrTarget = getTabular(actionsFlat)
            else:
                qCurrTargetNotHolding = getqNotHolding(actionPatches)
                qCurrTargetHolding = getqHolding(actionPatches)
                qCurrTarget = np.concatenate([qCurrTargetNotHolding,qCurrTargetHolding],axis=1)

            td_error = qCurrTarget[range(batch_size),states_t] - targets
            qCurrTarget[range(batch_size),states_t] = targets

            if valueFunctionType == "TABULAR":
                trainTabular(actionsFlat, qCurrTarget, np.tile(np.reshape(weights,[batch_size,1]),[1,2]))
            else:
                targetTrainNotHolding(actionPatches, np.reshape(qCurrTarget[:,0],[batch_size,1]), np.reshape(weights,[batch_size,1]))
                targetTrainHolding(actionPatches, np.reshape(qCurrTarget[:,1],[batch_size,1]), np.reshape(weights,[batch_size,1]))

            if prioritized_replay:
                new_priorities = np.abs(td_error) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

            td_errors[-1] += td_error


        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
            td_errors.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1)
        mean_100ep_tderror = round(np.mean(td_errors[-51:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart) + ", tderror: " + str(mean_100ep_tderror))
            timerStart = timerFinal
        
        obs = np.copy(new_obs)

    # save learning curve
    filename = 'BAR2_deictic_rewards_' +str(num_patches) + "_" + str(max_timesteps) + '.dat'
    np.savetxt(filename,episode_rewards)

    # display value function
    obs = env.reset()
    moveDescriptors = getMoveActionDescriptors([obs[0]])
    moveDescriptors = moveDescriptors*2-1

    actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3)
    actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors],axis=3)
    
    print(str(obs[0][:,:,0]))
    
    if valueFunctionType == "TABULAR":
        qPick = getTabular(np.reshape(actionsPickDescriptors,[num_patches,-1])==1)
    else:
        qPickNotHolding = getqNotHolding(actionsPickDescriptors)
        qPickHolding = getqHolding(actionsPickDescriptors)
        qPick = np.concatenate([qPickNotHolding,qPickHolding],axis=1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:,0],[8,8])))
    print("Value function for pick action in hold-1 state:")
    print(str(np.reshape(qPick[:,1],[8,8])))

    if valueFunctionType == "TABULAR":
        qPlace = getTabular(np.reshape(actionsPlaceDescriptors,[num_patches,-1])==1)
    else:
        qPlaceNotHolding = getqNotHolding(actionsPlaceDescriptors)
        qPlaceHolding = getqHolding(actionsPlaceDescriptors)
        qPlace = np.concatenate([qPlaceNotHolding,qPlaceHolding],axis=1)    
    print("Value function for place action in hold-nothing state:")
    print(str(np.reshape(qPlace[:,0],[8,8])))
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:,1],[8,8])))
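getTabularKeys above hashes each boolean action descriptor into a dictionary key by packing its bits into bytes (np.packbits) and folding the bytes into a single unsigned 64-bit integer as base-256 digits; as the in-code comment warns, descriptors wider than 64 bits can collide once the accumulator wraps. A small self-contained sketch of the same idea:

import numpy as np

def boolean_rows_to_keys(bool_rows):
    # Pack each boolean row into bytes, then fold the bytes into one
    # uint64 key per row (base-256 digits). Rows wider than 64 bits can
    # collide once the accumulator wraps around.
    packed = np.packbits(bool_rows, axis=1)  # shape: (n_rows, n_bytes)
    keys = np.zeros(packed.shape[0], dtype=np.uint64)
    for i in range(packed.shape[1]):
        keys = keys + np.uint64(256) ** np.uint64(i) * packed[:, i].astype(np.uint64)
    return keys

rows = np.array([[1, 0, 1, 1, 0, 0, 0, 1],
                 [0, 1, 1, 0, 1, 0, 1, 0]], dtype=bool)
print(boolean_rows_to_keys(rows))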
Example #8
def main():

    np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x})

    # Dictionary-based value function
    q_func_tabular = {}

    # cols of vectorKey must be boolean less than 64 bits long
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey, 1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i])
        return obsKeys

    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
        return np.array([
            q_func_tabular[x] if x in q_func_tabular else 10 *
            np.ones(num_states) for x in keys
        ])

#    def trainTabular(vectorKey,qCurrTargets,weights):

    def trainTabular(vectorKey, qCurrTargets, weights):
        keys = getTabularKeys(vectorKey)
        alpha = 0.2
        for i in range(len(keys)):
            if keys[i] in q_func_tabular:
                #                q_func_tabular[keys[i]] = (1-alpha)*q_func_tabular[keys[i]] + alpha*qCurrTargets[i]
                q_func_tabular[
                    keys[i]] = q_func_tabular[keys[i]] + alpha * weights[i] * (
                        qCurrTargets[i] - q_func_tabular[keys[i]]
                    )  # (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i]
            else:
                q_func_tabular[keys[i]] = qCurrTargets[i]

    # Return a list of actions in adjacent patches to <action>
    def getAdjacentActions(action):
        side = len(env.moveCenters)
        mat = np.reshape(range(side**2), [side, side])
        move = action
        if action >= side**2:
            move = action - side**2
        coords = np.squeeze(np.nonzero(mat == move))
        adjacent = []

        # this cell
        adjacent.append(coords)

        # 8-neighborhood
        adjacent.append(coords - [0, 1])
        adjacent.append(coords + [0, 1])
        adjacent.append(coords - [1, 0])
        adjacent.append(coords + [1, 0])
        adjacent.append(coords + [-1, -1])
        adjacent.append(coords + [1, -1])
        adjacent.append(coords + [-1, 1])
        adjacent.append(coords + [1, 1])

        # 16-neighborhood
        adjacent.append(coords + [-2, 2])
        adjacent.append(coords + [-1, 2])
        adjacent.append(coords + [0, 2])
        adjacent.append(coords + [1, 2])
        adjacent.append(coords + [2, 2])
        adjacent.append(coords + [2, 1])
        adjacent.append(coords + [2, 0])
        adjacent.append(coords + [2, -1])
        adjacent.append(coords + [2, -2])
        adjacent.append(coords + [1, -2])
        adjacent.append(coords + [0, -2])
        adjacent.append(coords + [-1, -2])
        adjacent.append(coords + [-2, -2])
        adjacent.append(coords + [-2, -1])
        adjacent.append(coords + [-2, 0])
        adjacent.append(coords + [-2, 1])

        adjacentValid = [x for x in adjacent if all(x < side) and all(x >= 0)]
        if action >= side**2:
            return [side**2 + x[0] * side + x[1] for x in adjacentValid]
        else:
            return [x[0] * side + x[1] for x in adjacentValid]

    env = envstandalone.NumbersArrange()

    # Standard q-learning parameters
    max_timesteps = 2000
    exploration_fraction = 0.3
    exploration_final_eps = 0.1
    gamma = .90
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts = 10
    buffer_size = 1000
    batch_size = 10
    target_network_update_freq = 1
    train_freq = 1
    print_freq = 1
    lr = 0.0003

    # first two elts of descriptorShape must be odd
    descriptorShape = (env.blockSize * 3, env.blockSize * 3, 2)
    #    descriptorShapeSmall = (10,10,2)
    #    descriptorShapeSmall = (14,14,2)
    descriptorShapeSmall = (20, 20, 2)
    num_states = 2  # either holding or not
    num_patches = len(env.moveCenters)**2
    num_actions = 2 * num_patches
    num_actions_discrete = 2
    num_patches_side = len(env.moveCenters)
    #    valueFunctionType = "TABULAR"
    valueFunctionType = "DQN"
    #    actionSelectionStrategy = "UNIFORM_RANDOM" # actions are selected randomly from collection of all actions
    actionSelectionStrategy = "RANDOM_UNIQUE"  # each unique action descriptor has equal chance of being selected

    episode_rewards = [0.0]

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    #    prioritized_replay=True
    prioritized_replay = False
    #    prioritized_replay_alpha=1.0
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    #    prioritized_replay_beta_iters=20000
    prioritized_replay_eps = 1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1

    q_func = models.cnn_to_mlp(
        #    q_func = models.cnn_to_mlp_2pathways(
        #        convs=[(16,3,1), (32,3,1)],
        #        hiddens=[48],
        convs=[(16, 3, 1)],
        hiddens=[32],
        #        convs=[(32,3,1)],
        #        hiddens=[48],
        #        convs=[(48,3,1)],
        #        hiddens=[48],
        dueling=True)

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        return U.BatchInput(descriptorShapeSmall, name=name)

    def make_target_ph(name):
        return U.BatchInput([1], name=name)

    def make_weight_ph(name):
        return U.BatchInput([1], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(
        make_obs_ph=make_obs_ph,
        actionShape=descriptorShape,
        actionShapeSmall=descriptorShapeSmall,
        stride=env.stride)

    if valueFunctionType == 'DQN':
        getqNotHolding = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                                    q_func=q_func,
                                    num_states=num_states,
                                    num_cascade=5,
                                    scope="deepq",
                                    qscope="q_func_notholding")
        getqHolding = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                                 q_func=q_func,
                                 num_states=num_states,
                                 num_cascade=5,
                                 scope="deepq",
                                 qscope="q_func_holding")
        targetTrainNotHolding = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq",
            qscope="q_func_notholding",
            grad_norm_clipping=1.)
        targetTrainHolding = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq",
            qscope="q_func_holding",
            grad_norm_clipping=1.)

        getqNotHoldingCoarse = build_getq(
            make_actionDeic_ph=make_actionDeic_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            scope="deepq",
            qscope="q_func_notholding_coarse")
        getqHoldingCoarse = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                                       q_func=q_func,
                                       num_states=num_states,
                                       num_cascade=5,
                                       scope="deepq",
                                       qscope="q_func_holding_coarse")
        targetTrainNotHoldingCoarse = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            #            optimizer=tf.train.AdamOptimizer(learning_rate=lr*20),
            optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
            scope="deepq",
            qscope="q_func_notholding_coarse",
            grad_norm_clipping=None)
        targetTrainHoldingCoarse = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            #            optimizer=tf.train.AdamOptimizer(learning_rate=lr*20),
            optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
            scope="deepq",
            qscope="q_func_holding_coarse",
            grad_norm_clipping=None)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    obs = env.reset()

    episode_rewards = [0.0]
    newEpisode = 0
    td_errors = [0.0]
    timerStart = time.time()
    U.initialize()
    for t in range(max_timesteps):

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptors = getMoveActionDescriptors([obs[0]])
        moveDescriptors = moveDescriptors * 2 - 1
        actionsPickDescriptors = np.stack(
            [moveDescriptors,
             np.zeros(np.shape(moveDescriptors))], axis=3)
        actionsPlaceDescriptors = np.stack(
            [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors,
                                  actionsPlaceDescriptors]

        qCurrNotHolding = getqNotHolding(actionDescriptors)
        qCurrHolding = getqHolding(actionDescriptors)
        qCurr = np.concatenate([qCurrNotHolding, qCurrHolding], axis=1)

        # select action at random
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        if actionSelectionStrategy == "UNIFORM_RANDOM":
            action = np.argmax(qCurrNoise[:, obs[1]])
            if np.random.rand() < exploration.value(t):
                action = np.random.randint(num_actions)
        elif actionSelectionStrategy == "RANDOM_UNIQUE":
            _, idx, inv = np.unique(actionDescriptors,
                                    axis=0,
                                    return_index=True,
                                    return_inverse=True)
            actionIdx = np.argmax(qCurrNoise[idx, obs[1]])
            if np.random.rand() < exploration.value(t):
                actionIdx = np.random.randint(len(idx))
            actionsSelected = np.nonzero(inv == actionIdx)[0]
            action = actionsSelected[np.random.randint(len(actionsSelected))]
        else:
            print("Error...")

        adjacentActions = getAdjacentActions(action)

        # take action
        new_obs, rew, done, _ = env.step(action)

        #        replay_buffer.add(obs[1], actionDescriptors[action,:], rew, np.copy(new_obs), float(done))
        replay_buffer.add(obs[1], actionDescriptors[action, :],
                          actionDescriptors[adjacentActions, :], np.copy(rew),
                          np.copy(new_obs), np.copy(float(done)))

        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                beta = beta_schedule.value(t)
                states_t, actionPatches, actionPatchesAdjacent, rewards, images_tp1, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(
                    batch_size, beta)
            else:
                states_t, actionPatches, actionPatchesAdjacent, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(
                    batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            moveDescriptorsNext = getMoveActionDescriptors(images_tp1)
            moveDescriptorsNext = moveDescriptorsNext * 2 - 1
            actionsPickDescriptorsNext = np.stack(
                [moveDescriptorsNext,
                 np.zeros(np.shape(moveDescriptorsNext))],
                axis=3)
            actionsPlaceDescriptorsNext = np.stack(
                [np.zeros(np.shape(moveDescriptorsNext)), moveDescriptorsNext],
                axis=3)
            actionDescriptorsNext = np.stack(
                [actionsPickDescriptorsNext, actionsPlaceDescriptorsNext],
                axis=1
            )  # I sometimes get this axis parameter wrong... pay attention!

            # flat estimate of qNextmax
            actionDescriptorsNext = np.reshape(actionDescriptorsNext, [
                batch_size * num_patches * num_actions_discrete,
                descriptorShapeSmall[0], descriptorShapeSmall[1],
                descriptorShapeSmall[2]
            ])
            qNextNotHolding = getqNotHolding(actionDescriptorsNext)
            qNextHolding = getqHolding(actionDescriptorsNext)
            qNextFlat = np.concatenate([qNextNotHolding, qNextHolding], axis=1)
            qNext = np.reshape(
                qNextFlat,
                [batch_size, num_patches, num_actions_discrete, num_states])
            qNextmax = np.max(
                np.max(qNext[range(batch_size), :, :, states_tp1], 2), 1)

            #            # coarse/fine estimate of qNextmax
            #            actionDescriptorsNext = np.reshape(actionDescriptorsNext,[batch_size,num_patches_side,num_patches_side,num_actions_discrete,descriptorShapeSmall[0],descriptorShapeSmall[1],descriptorShapeSmall[2]])
            #            aa = actionDescriptorsNext[:,range(0,num_patches_side,2),:,:,:,:,:]
            #            bb = aa[:,:,range(0,num_patches_side,2),:,:,:,:]
            #            cc = np.reshape(bb,[-1,descriptorShapeSmall[0],descriptorShapeSmall[1],descriptorShapeSmall[2]])
            #            qNextNotHolding = getqNotHoldingCoarse(cc)
            #            qNextHolding = getqHoldingCoarse(cc)
            #            qNextFlat = np.concatenate([qNextNotHolding,qNextHolding],axis=1)
            #            qNext = np.reshape(qNextFlat,[batch_size,-1,num_actions_discrete,num_states])
            #            qNextmax = np.max(np.max(qNext[range(batch_size),:,:,states_tp1],2),1)

            targets = rewards + (1 - dones) * gamma * qNextmax

            # train action Patches
            qCurrTargetNotHolding = getqNotHolding(actionPatches)
            qCurrTargetHolding = getqHolding(actionPatches)
            qCurrTarget = np.concatenate(
                [qCurrTargetNotHolding, qCurrTargetHolding], axis=1)
            td_error = qCurrTarget[range(batch_size), states_t] - targets
            qCurrTarget[range(batch_size), states_t] = targets
            targetTrainNotHolding(
                actionPatches, np.reshape(qCurrTarget[:, 0], [batch_size, 1]),
                np.reshape(weights, [batch_size, 1]))
            targetTrainHolding(actionPatches,
                               np.reshape(qCurrTarget[:, 1], [batch_size, 1]),
                               np.reshape(weights, [batch_size, 1]))

            if prioritized_replay:
                new_priorities = np.abs(td_error) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

            td_errors[-1] += td_error

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        newEpisode = 0
        if done:
            newEpisode = 1
            new_obs = env.reset()
            episode_rewards.append(0.0)
            td_errors.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1)
        mean_100ep_tderror = round(np.mean(td_errors[-51:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart) + ", tderror: " +
                  str(mean_100ep_tderror))
            timerStart = timerFinal

        obs = np.copy(new_obs)

        # Train coarse grid
        if newEpisode:
            moveDescriptors = getMoveActionDescriptors([obs[0]])
            moveDescriptors = moveDescriptors * 2 - 1
            actionsPickDescriptors = np.stack(
                [moveDescriptors,
                 np.zeros(np.shape(moveDescriptors))], axis=3)
            actionsPlaceDescriptors = np.stack(
                [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
            actionDescriptors = np.r_[actionsPickDescriptors,
                                      actionsPlaceDescriptors]
            #            actionDescriptors, inverseIdx = np.unique(actionDescriptors,axis=0,return_inverse=True) # reduce to just unique descriptors
            qCurrNotHolding = getqNotHolding(actionDescriptors)
            qCurrHolding = getqHolding(actionDescriptors)
            qTargetNotHolding = np.zeros(np.shape(qCurrNotHolding))
            qTargetHolding = np.zeros(np.shape(qCurrHolding))
            for jj in range(num_actions):
                adj = getAdjacentActions(jj)
                qTargetNotHolding[jj] = np.max(qCurrNotHolding[adj])
                qTargetHolding[jj] = np.max(qCurrHolding[adj])
            for iter in range(10):
                targetTrainNotHoldingCoarse(
                    actionDescriptors, np.reshape(qTargetNotHolding, [-1, 1]),
                    np.ones([num_actions, 1]))
                targetTrainHoldingCoarse(actionDescriptors,
                                         np.reshape(qTargetHolding, [-1, 1]),
                                         np.ones([num_actions, 1]))


#    # Train coarse grid
#    for iter in range(500):
#        print(str(iter))
#        obs = env.reset()
#        moveDescriptors = getMoveActionDescriptors([obs[0]])
#        moveDescriptors = moveDescriptors*2-1
#        actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3)
#        actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors],axis=3)
#        actionDescriptors = np.r_[actionsPickDescriptors,actionsPlaceDescriptors]
#        qCurrNotHolding = getqNotHolding(actionDescriptors)
#        qCurrHolding = getqHolding(actionDescriptors)
#        qTargetNotHolding = np.zeros(np.shape(qCurrNotHolding))
#        qTargetHolding = np.zeros(np.shape(qCurrHolding))
#        for jj in range(num_actions):
#            adj = getAdjacentActions(jj)
#            qTargetNotHolding[jj] = np.max(qCurrNotHolding[adj])
#            qTargetHolding[jj] = np.max(qCurrHolding[adj])
#        targetTrainNotHoldingCoarse(actionDescriptors, np.reshape(qTargetNotHolding,[-1,1]), np.ones([num_actions,1]))
#        targetTrainHoldingCoarse(actionDescriptors, np.reshape(qTargetHolding,[-1,1]), np.ones([num_actions,1]))

    # display value function
    obs = env.reset()
    moveDescriptors = getMoveActionDescriptors([obs[0]])
    moveDescriptors = moveDescriptors * 2 - 1
    gridSize = np.int32(np.sqrt(np.shape(moveDescriptors)[0]))

    actionsPickDescriptors = np.stack(
        [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
    actionsPlaceDescriptors = np.stack(
        [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)

    print(str(obs[0][:, :, 0]))

    qPickNotHolding = getqNotHolding(actionsPickDescriptors)
    qPickHolding = getqHolding(actionsPickDescriptors)
    qPick = np.concatenate([qPickNotHolding, qPickHolding], axis=1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:, 0], [gridSize, gridSize])))
    print("Value function for pick action in hold-1 state:")
    print(str(np.reshape(qPick[:, 1], [gridSize, gridSize])))

    qPlaceNotHolding = getqNotHolding(actionsPlaceDescriptors)
    qPlaceHolding = getqHolding(actionsPlaceDescriptors)
    qPlace = np.concatenate([qPlaceNotHolding, qPlaceHolding], axis=1)
    print("Value function for place action in hold-nothing state:")
    print(str(np.reshape(qPlace[:, 0], [gridSize, gridSize])))
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:, 1], [gridSize, gridSize])))

    qPickNotHolding = getqNotHoldingCoarse(actionsPickDescriptors)
    qPickHolding = getqHoldingCoarse(actionsPickDescriptors)
    qPickCoarse = np.concatenate([qPickNotHolding, qPickHolding], axis=1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPickCoarse[:, 0], [gridSize, gridSize])))
    print("Value function for pick action in hold-1 state:")
    print(str(np.reshape(qPickCoarse[:, 1], [gridSize, gridSize])))

    qPlaceNotHolding = getqNotHoldingCoarse(actionsPlaceDescriptors)
    qPlaceHolding = getqHoldingCoarse(actionsPlaceDescriptors)
    qPlaceCoarse = np.concatenate([qPlaceNotHolding, qPlaceHolding], axis=1)
    print("Value function for place action in hold-nothing state:")
    print(str(np.reshape(qPlaceCoarse[:, 0], [gridSize, gridSize])))
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlaceCoarse[:, 1], [gridSize, gridSize])))

    plt.subplot(2, 3, 1)
    plt.imshow(np.tile(env.state[0], [1, 1, 3]))
    plt.subplot(2, 3, 2)
    plt.imshow(np.reshape(qPick[:, 0], [gridSize, gridSize]))
    plt.subplot(2, 3, 3)
    plt.imshow(np.reshape(qPlace[:, 1], [gridSize, gridSize]))
    plt.subplot(2, 3, 5)
    plt.imshow(np.reshape(qPickCoarse[:, 0], [gridSize, gridSize]))
    plt.subplot(2, 3, 6)
    plt.imshow(np.reshape(qPlaceCoarse[:, 1], [gridSize, gridSize]))

    plt.show()
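Several of these examples use prioritized experience replay: transitions are sampled with probability proportional to priority**alpha, priorities are refreshed to |TD error| + prioritized_replay_eps after each update, and the importance-sampling weights are annealed via the beta schedule. A toy NumPy sketch of proportional prioritization under those assumptions (not the baselines PrioritizedReplayBuffer):

import numpy as np

def sample_prioritized(td_errors, batch_size, alpha=0.6, beta=0.4, eps=1e-6):
    # Sample indices with probability proportional to (|td_error| + eps)**alpha
    # and return matching importance-sampling weights, normalized to max 1.
    priorities = (np.abs(td_errors) + eps) ** alpha
    probs = priorities / priorities.sum()
    idx = np.random.choice(len(td_errors), size=batch_size, p=probs)
    weights = (len(td_errors) * probs[idx]) ** (-beta)
    return idx, weights / weights.max()

idx, w = sample_prioritized(np.array([0.5, 0.1, 2.0, 0.0]), batch_size=3)
print(idx, w)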
Example #9
def main(initEnvStride, envStride, fileIn, fileOut, inputmaxtimesteps,
         vispolicy, objType, numOrientations, useRotHierarchy,
         useHandCodeHierarchy):

    np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x})

    # Create environment and set two stride parameters (stride-x and stride-y)
    # for this problem instance. Most of the time, the two stride parameters will be equal.
    env = envstandalone.PuckArrange()
    env.initStride = initEnvStride  # stride for initial puck placement
    env.stride = envStride  # stride for action specification
    env.blockType = objType
    env.num_orientations = numOrientations
    env.reset()

    # Standard q-learning parameters
    reuseModels = None
    max_timesteps = inputmaxtimesteps
    exploration_fraction = 0.75
    exploration_final_eps = 0.1
    gamma = .90
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts = 60
    buffer_size = 10000
    batch_size = 10
    target_network_update_freq = 1
    train_freq = 1
    print_freq = 1

    # SGD learning rate
    lr = 0.0003

    # Set parameters related to shape of the patch and the number of patches
    descriptorShape = (
        env.blockSize * 3, env.blockSize * 3, 2
    )  # size of patch descriptor relative to number of "blocks" on board (each block is a 28x28 region)
    descriptorShapeSmall = (
        25, 25, 2
    )  # size to which each patch gets resized. Code runs faster w/ smaller sizes, but could miss detail needed to solve the problem.
    num_discrete_states = 2  # number of discrete states: either holding or not
    num_patches = len(
        env.moveCenters
    )**2  # env.moveCenters is num of patches along one side of image
    num_actions = num_discrete_states * num_patches * env.num_orientations  # total actions = num discrete states X num non-rotated descriptor patches X num of orientations per patch location

    # e-greedy exploration schedule. I find that starting at e=50% helps curriculum learning "remember" what was learned in the prior run.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=0.5,
                                 final_p=exploration_final_eps)

    # Set parameters for prioritized replay. You can turn this off just by
    # setting the line below to False
    prioritized_replay = True
    #    prioritized_replay=False
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1

    # Create neural network
    q_func = models.cnn_to_mlp(convs=[(16, 3, 1), (32, 3, 1)],
                               hiddens=[64],
                               dueling=True)

    # Build tensorflow functions
    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        return U.BatchInput(descriptorShapeSmall, name=name)

    def make_target_ph(name):
        return U.BatchInput([1], name=name)

    def make_weight_ph(name):
        return U.BatchInput([1], name=name)

    getMoveActionDescriptorsNoRot = build_getMoveActionDescriptors(
        make_obs_ph=make_obs_ph,
        actionShape=descriptorShape,
        actionShapeSmall=descriptorShapeSmall,
        stride=env.stride)
    getMoveActionDescriptorsRot = build_getMoveActionDescriptorsRot(
        make_obs_ph=make_obs_ph,
        actionShape=descriptorShape,
        actionShapeSmall=descriptorShapeSmall,
        stride=env.stride,
        numOrientations=numOrientations)

    getqNotHoldingRot = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                                   q_func=q_func,
                                   num_discrete_states=num_discrete_states,
                                   num_cascade=5,
                                   scope="deepq",
                                   qscope="q_func_notholding_rot",
                                   reuse=reuseModels)
    getqHoldingRot = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                                q_func=q_func,
                                num_discrete_states=num_discrete_states,
                                num_cascade=5,
                                scope="deepq",
                                qscope="q_func_holding_rot",
                                reuse=reuseModels)

    targetTrainNotHoldingRot = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_discrete_states=num_discrete_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(
            learning_rate=lr / 2.),  # rotation learns slower than norot
        #        optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr/2.), # rotation learns slower than norot
        scope="deepq",
        qscope="q_func_notholding_rot",
        #        grad_norm_clipping=1.,
        reuse=reuseModels)

    targetTrainHoldingRot = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_discrete_states=num_discrete_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(
            learning_rate=lr / 2.),  # rotation learns slower than norot
        #        optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr/2.), # rotation learns slower than norot
        scope="deepq",
        qscope="q_func_holding_rot",
        #        grad_norm_clipping=1.,
        reuse=reuseModels)

    getqNotHoldingNoRot = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                                     q_func=q_func,
                                     num_discrete_states=num_discrete_states,
                                     num_cascade=5,
                                     scope="deepq",
                                     qscope="q_func_notholding_norot",
                                     reuse=reuseModels)
    getqHoldingNoRot = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                                  q_func=q_func,
                                  num_discrete_states=num_discrete_states,
                                  num_cascade=5,
                                  scope="deepq",
                                  qscope="q_func_holding_norot",
                                  reuse=reuseModels)

    targetTrainNotHoldingNoRot = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_discrete_states=num_discrete_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        #        optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func_notholding_norot",
        #        grad_norm_clipping=1.,
        reuse=reuseModels)

    targetTrainHoldingNoRot = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_discrete_states=num_discrete_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        #        optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func_holding_norot",
        #        grad_norm_clipping=1.,
        reuse=reuseModels)

    # Initialize tabular state-value function. There are only two states (holding, not holding), so this is very easy.
    lrState = 0.1
    V = np.zeros([
        2,
    ])

    # Start tensorflow session
    sess = U.make_session(num_cpu)
    sess.__enter__()

    # Initialize things
    obs = env.reset()
    episode_rewards = [0.0]
    timerStart = time.time()
    U.initialize()

    # Load neural network model if one was specified.
    if fileIn != "None":
        saver = tf.train.Saver()
        saver.restore(sess, fileIn)
        fileInV = fileIn + 'V.npy'
        V = np.load(fileInV)

    # Iterate over time steps
    for t in range(max_timesteps):

        # Get NoRot descriptors. Each x-y position gets one descriptor patch in
        # a single orientation. Encode pick/place using a stack of two image channels.
        # Pick actions are denoted by the patch in channel 0 and zeros in channel 1.
        # Place actions have zeros in channel 0 and the patch in channel 1.
        # Each elt of actionDescriptorsNoRot is a pick/place action to a specific
        # position with orientation left unspecified.
        moveDescriptorsNoRot = getMoveActionDescriptorsNoRot([obs[0]])
        moveDescriptorsNoRot = moveDescriptorsNoRot * 2 - 1
        actionsPickDescriptorsNoRot = np.stack(
            [moveDescriptorsNoRot,
             np.zeros(np.shape(moveDescriptorsNoRot))],
            axis=3)
        actionsPlaceDescriptorsNoRot = np.stack(
            [np.zeros(np.shape(moveDescriptorsNoRot)), moveDescriptorsNoRot],
            axis=3)
        actionDescriptorsNoRot = np.r_[actionsPickDescriptorsNoRot,
                                       actionsPlaceDescriptorsNoRot]

        # If useHandCodeHierarchy == 1, we exclude patches that are completely zero
        if useHandCodeHierarchy == 1:
            nonZeroMoves = np.sum(np.sum(moveDescriptorsNoRot > 0, -1), -1) > 0
            movesCandidates = np.nonzero(nonZeroMoves)[0]
            actionsCandidates = []
            for jj in range(0, num_discrete_states):
                for ii in range(0, env.num_orientations):
                    actionsCandidates = np.r_[actionsCandidates,
                                              movesCandidates +
                                              ii * env.num_moves +
                                              jj * env.num_orientations *
                                              env.num_moves]
            actionsCandidatesHandCodeHierarchy = np.int32(actionsCandidates)
            movesCandidatesHandCodeHierarchy = np.int32(movesCandidates)

        else:
            actionsCandidatesHandCodeHierarchy = range(
                num_discrete_states * env.num_moves * env.num_orientations)
            movesCandidatesHandCodeHierarchy = range(env.num_moves)

        # If useRotHierarchy == 1, we evaluate the Q function using a two-level hierarchy.
        # The first level (getq<Not>HoldingNoRot) is position but no rotation.
        # The second level (getq<Not>HoldingRot) is both position and orientation.
        # Specifically, we evaluate getq<Not>HoldingRot only for the top 20% of positions
        # found using getq<Not>HoldingNoRot.
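        # For example, with k = 0.2 only about 20% of the pick/place positions,
        # each expanded over env.num_orientations rotations, are scored by the
        # Rot network instead of all num_discrete_states * env.num_moves *
        # env.num_orientations actions.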
        if useRotHierarchy == 1:

            # Get NoRot values
            if obs[1] == 0:
                qCurrPick = getqNotHoldingNoRot(actionsPickDescriptorsNoRot[
                    movesCandidatesHandCodeHierarchy])
                qCurrPlace = getqNotHoldingNoRot(actionsPlaceDescriptorsNoRot[
                    movesCandidatesHandCodeHierarchy])
            elif obs[1] == 1:
                qCurrPick = getqHoldingNoRot(actionsPickDescriptorsNoRot[
                    movesCandidatesHandCodeHierarchy])
                qCurrPlace = getqHoldingNoRot(actionsPlaceDescriptorsNoRot[
                    movesCandidatesHandCodeHierarchy])
            else:
                print("error: state out of bounds")
            qCurrNoRot = np.squeeze(np.r_[qCurrPick, qCurrPlace])
            qCurrNoRotIdx = np.r_[movesCandidatesHandCodeHierarchy,
                                  env.num_moves +
                                  movesCandidatesHandCodeHierarchy]

            # Get Rot actions corresponding to top k% NoRot actions
            k = 0.2  # top k% of NoRot actions
            #            k=0.1 # DEBUG: TRYING TO VISUALIZE AND RAN OUT OF MEM ON LAPTOP...
            valsNoRot = qCurrNoRot
            topKactionsNoRot = np.argsort(
                valsNoRot)[-np.int32(np.shape(valsNoRot)[0] * k):]
            topKpositionsNoRot = qCurrNoRotIdx[topKactionsNoRot] % env.num_moves
            topKpickplaceNoRot = qCurrNoRotIdx[topKactionsNoRot] // env.num_moves  # 0 = pick, 1 = place (integer division)
            actionsCandidates = []
            for ii in range(2):
                eltsPos = topKpositionsNoRot[topKpickplaceNoRot == ii]
                for jj in range(env.num_orientations):
                    actionsCandidates = np.r_[
                        actionsCandidates, eltsPos + jj * env.num_moves + ii *
                        (env.num_moves * env.num_orientations)]
            actionsCandidatesRotHierarchy = np.int32(actionsCandidates)

        # No rot hierarchy
        else:
            actionsCandidatesRotHierarchy = range(
                num_discrete_states * env.num_moves * env.num_orientations)

        # Intersect two types of hierarchy and get final list of actions to consider
        actionsCandidates = np.intersect1d(actionsCandidatesRotHierarchy,
                                           actionsCandidatesHandCodeHierarchy)

        # Get all patch descriptors (position + rotation)
        moveDescriptorsRot = getMoveActionDescriptorsRot([obs[0]])
        moveDescriptorsRot = moveDescriptorsRot * 2 - 1
        actionsPickDescriptorsRot = np.stack(
            [moveDescriptorsRot,
             np.zeros(np.shape(moveDescriptorsRot))],
            axis=3)
        actionsPlaceDescriptorsRot = np.stack(
            [np.zeros(np.shape(moveDescriptorsRot)), moveDescriptorsRot],
            axis=3)
        actionDescriptorsRot = np.r_[actionsPickDescriptorsRot,
                                     actionsPlaceDescriptorsRot]

        # Get qCurr for selected actions, i.e. actions contained in actionCandidates
        actionDescriptorsRotReduced = actionDescriptorsRot[actionsCandidates]
        if obs[1] == 0:
            qCurrReduced = np.squeeze(
                getqNotHoldingRot(actionDescriptorsRotReduced))
        elif obs[1] == 1:
            qCurrReduced = np.squeeze(
                getqHoldingRot(actionDescriptorsRotReduced))
        else:
            print("error: state out of bounds")
        qCurr = -100 * np.ones(np.shape(actionDescriptorsRot)[0])
        qCurr[actionsCandidates] = np.copy(qCurrReduced)

        # Update tabular state-value function using V(s) = max_a Q(s,a)
        thisStateValues = np.max(qCurr)
        V[obs[1]] = (1 - lrState) * V[obs[1]] + lrState * thisStateValues
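        # Equivalently, V[obs[1]] += lrState * (max_a Q(s, a) - V[obs[1]]): an
        # exponential moving average of the greedy state value, used later as
        # the bootstrap target V[states_tp1].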

        #        # Select e-greedy action to execute
        #        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        #        action = np.argmax(qCurrNoise)
        #        if (np.random.rand() < exploration.value(t)) and not vispolicy:
        #            action = np.random.randint(num_actions)

        # e-greedy + softmax action selection
        qCurrExp = np.exp(qCurr / 0.1)
        probs = qCurrExp / np.sum(qCurrExp)
        action = np.random.choice(range(np.size(probs)), p=probs)
        if (np.random.rand() < exploration.value(t)) and not vispolicy:
            action = np.random.randint(num_actions)
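        # The 0.1 divisor is a softmax temperature: smaller values concentrate
        # the Boltzmann probabilities on the highest-Q actions, while the
        # epsilon-greedy check above still forces occasional uniform exploration.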

        # factor action into position, orientation, pick-or-place
        position = action % env.num_moves
        # integer division so the resulting indices stay integral under Python 3
        pickplace = action // (env.num_moves * env.num_orientations)
        orientation = (action - pickplace * env.num_moves *
                       env.num_orientations) // env.num_moves
        actionNoRot = position + pickplace * env.num_moves

        if vispolicy:
            print("action: " + str(action))
            print("position: " + str(position))
            print("pickplace: " + str(pickplace))
            print("orientation: " + str(orientation))
            plt.subplot(1, 2, 1)
            plt.imshow(env.state[0][:, :, 0])
            sp.misc.imsave('temp1.png', env.state[0][:, :, 0])

        # Execute action
        new_obs, rew, done, _ = env.step(action)

        # Add to buffer
        replay_buffer.add(cp.copy(obs[1]),
                          np.copy(actionDescriptorsNoRot[actionNoRot, :]),
                          np.copy(actionDescriptorsRot[action, :]),
                          cp.copy(rew), cp.copy(new_obs[1]),
                          cp.copy(float(done)))

        # If vispolicy==True, then visualize policy
        if vispolicy:
            print("rew: " + str(rew))
            print("done: " + str(done))
            plt.subplot(1, 2, 2)
            plt.imshow(env.state[0][:, :, 0])
            plt.show()
            sp.misc.imsave('temp2.png', env.state[0][:, :, 0])

        if t > learning_starts and t % train_freq == 0:

            # Get batch
            if prioritized_replay:
                beta = beta_schedule.value(t)
                states_t, actionPatchesNoRot, actionPatchesRot, rewards, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(
                    batch_size, beta)
            else:
                states_t, actionPatchesNoRot, actionPatchesRot, rewards, states_tp1, dones = replay_buffer.sample(
                    batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            # Calculate target
            targets = rewards + (1 - dones) * gamma * V[states_tp1]

            # Get current q-values and calculate td error and q-value targets
            qCurrTargetNotHolding = getqNotHoldingRot(actionPatchesRot)
            qCurrTargetHolding = getqHoldingRot(actionPatchesRot)
            qCurrTarget = np.concatenate(
                [qCurrTargetNotHolding, qCurrTargetHolding], axis=1)
            td_error = qCurrTarget[range(batch_size), states_t] - targets
            qCurrTarget[range(batch_size), states_t] = targets

            # Train
            targetTrainNotHoldingRot(
                actionPatchesRot, np.reshape(qCurrTarget[:, 0],
                                             [batch_size, 1]),
                np.reshape(weights, [batch_size, 1]))
            targetTrainHoldingRot(
                actionPatchesRot, np.reshape(qCurrTarget[:, 1],
                                             [batch_size, 1]),
                np.reshape(weights, [batch_size, 1]))

            targetTrainNotHoldingNoRot(
                actionPatchesNoRot,
                np.reshape(qCurrTarget[:, 0], [batch_size, 1]),
                np.reshape(weights, [batch_size, 1]))
            targetTrainHoldingNoRot(
                actionPatchesNoRot,
                np.reshape(qCurrTarget[:, 1], [batch_size, 1]),
                np.reshape(weights, [batch_size, 1]))

            # Update replay priorities using td_error
            if prioritized_replay:
                new_priorities = np.abs(td_error) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = np.copy(new_obs)

    # save learning curve
    filename = 'PA18_deictic_rewards.dat'
    np.savetxt(filename, episode_rewards)

    # save what we learned
    if fileOut != "None":
        saver = tf.train.Saver()
        saver.save(sess, fileOut)
        fileOutV = fileOut + 'V'
        print("fileOutV: " + fileOutV)
        np.save(fileOutV, V)

    # Display value function from this run
    obs = env.reset()

    moveDescriptorsNoRot = getMoveActionDescriptorsNoRot([obs[0]])
    moveDescriptorsNoRot = moveDescriptorsNoRot * 2 - 1
    actionsPickDescriptors = np.stack(
        [moveDescriptorsNoRot,
         np.zeros(np.shape(moveDescriptorsNoRot))],
        axis=3)
    actionsPlaceDescriptors = np.stack(
        [np.zeros(np.shape(moveDescriptorsNoRot)), moveDescriptorsNoRot],
        axis=3)
    qPickNotHoldingNoRot = getqNotHoldingNoRot(actionsPickDescriptors)
    qPickHoldingNoRot = getqHoldingNoRot(actionsPickDescriptors)
    qPickNoRot = np.concatenate([qPickNotHoldingNoRot, qPickHoldingNoRot],
                                axis=1)
    qPlaceNotHoldingNoRot = getqNotHoldingNoRot(actionsPlaceDescriptors)
    qPlaceHoldingNoRot = getqHoldingNoRot(actionsPlaceDescriptors)
    qPlaceNoRot = np.concatenate([qPlaceNotHoldingNoRot, qPlaceHoldingNoRot],
                                 axis=1)

    moveDescriptors = getMoveActionDescriptorsRot([obs[0]])
    moveDescriptors = moveDescriptors * 2 - 1
    actionsPickDescriptors = np.stack(
        [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
    actionsPlaceDescriptors = np.stack(
        [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
    qPickNotHolding = getqNotHoldingRot(actionsPickDescriptors)
    qPickHolding = getqHoldingRot(actionsPickDescriptors)
    qPick = np.concatenate([qPickNotHolding, qPickHolding], axis=1)
    qPlaceNotHolding = getqNotHoldingRot(actionsPlaceDescriptors)
    qPlaceHolding = getqHoldingRot(actionsPlaceDescriptors)
    qPlace = np.concatenate([qPlaceNotHolding, qPlaceHolding], axis=1)

    gridSize = len(env.moveCenters)
    print("Value function for pick action in hold-0 state:")
    print(str(np.reshape(qPickNoRot[:gridSize**2, 0], [gridSize, gridSize])))
    for ii in range(env.num_orientations):
        print("Value function for pick action for rot" + str(ii) +
              " in hold-0 state:")
        print(
            str(
                np.reshape(
                    qPick[ii * (gridSize**2):(ii + 1) * (gridSize**2), 0],
                    [gridSize, gridSize])))

    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlaceNoRot[:gridSize**2, 1], [gridSize, gridSize])))
    for ii in range(env.num_orientations):
        print("Value function for place action for rot" + str(ii) +
              " in hold-1 state:")
        print(
            str(
                np.reshape(
                    qPlace[ii * (gridSize**2):(ii + 1) * (gridSize**2), 1],
                    [gridSize, gridSize])))
def main():
    
    np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x})

    # Dictionary-based value function
    q_func_tabular = {}

    # columns of vectorKey must be boolean, and the whole key must fit in fewer than 64 bits
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey,1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:,i])
        return obsKeys
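    # For example, a 64-bit boolean key packs into 8 bytes, which the loop above
    # combines as base-256 digits of a single integer; with more than 8 bytes the
    # key no longer fits in 64 bits and collisions become possible, as warned above.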
    
    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
#        return np.array([q_func_tabular[x] if x in q_func_tabular else 10*np.ones(num_states) for x in keys])
        return np.array([q_func_tabular[x] if x in q_func_tabular else 10*np.ones(num_actions) for x in keys])
    
#    def trainTabular(vectorKey,qCurrTargets,weights):
    def trainTabular(vectorKey,qCurrTargets,weights):
        keys = getTabularKeys(vectorKey)
        alpha=0.2
        for i in range(len(keys)):
            if keys[i] in q_func_tabular:
#                q_func_tabular[keys[i]] = (1-alpha)*q_func_tabular[keys[i]] + alpha*qCurrTargets[i]
                q_func_tabular[keys[i]] = q_func_tabular[keys[i]] + alpha*weights[i]*(qCurrTargets[i] - q_func_tabular[keys[i]]) # (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i]
            else:
                q_func_tabular[keys[i]] = qCurrTargets[i]


    env = envstandalone.BlockArrange()

    # Standard q-learning parameters
    max_timesteps=30000
    exploration_fraction=0.3
    exploration_final_eps=0.1
    gamma=.90
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts=10
    buffer_size=1
    batch_size=1
    target_network_update_freq=1
    train_freq=1
    print_freq=1
    lr=0.0003

    # first two elts of deicticShape must be odd
    num_patches = env.maxSide**2
    num_actions = 2*num_patches
#    valueFunctionType = "TABULAR"
    valueFunctionType = "DQN"
    
    fullImageSize = (env.maxSide,env.maxSide,1)

    episode_rewards = [0.0]
    
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    prioritized_replay=False
    prioritized_replay_alpha=0.6
    prioritized_replay_beta0=0.4
    prioritized_replay_beta_iters=None
    prioritized_replay_eps=1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1

    q_func = models.cnn_to_mlp(
        convs=[(16,3,1), (32,3,1)],
        hiddens=[48],
        dueling=True
    )

    def make_fullImage_ph(name):
        return U.BatchInput(fullImageSize, name=name)
    def make_target_fullstate_ph(name):
        return U.BatchInput([num_actions], name=name)
    def make_weight_fullstate_ph(name):
        return U.BatchInput([num_actions], name=name)

    if valueFunctionType == 'DQN':
        
        getqFullStateNotHolding = build_getq_fullstate(
            make_fullImage_ph=make_fullImage_ph,
            q_func=q_func,
            num_actions=num_actions,
            num_cascade=1,
            scope="deepq",
            qscope="q_func_fullstate_notholding",
            reuse=None
        )
        getqFullStateHolding = build_getq_fullstate(
            make_fullImage_ph=make_fullImage_ph,
            q_func=q_func,
            num_actions=num_actions,
            num_cascade=1,
            scope="deepq",
            qscope="q_func_fullstate_holding",
            reuse=None
        )
        
        targetTrainFullStateNotHolding = build_targetTrain_fullstate(
            make_fullImage_ph=make_fullImage_ph,
            make_target_ph=make_target_fullstate_ph,
            make_weight_ph=make_weight_fullstate_ph,
            q_func=q_func,
            num_actions=num_actions,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq", 
            qscope="q_func_fullstate_notholding",
            grad_norm_clipping=None,
            reuse=None
        )
        targetTrainFullStateHolding = build_targetTrain_fullstate(
            make_fullImage_ph=make_fullImage_ph,
            make_target_ph=make_target_fullstate_ph,
            make_weight_ph=make_weight_fullstate_ph,
            q_func=q_func,
            num_actions=num_actions,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq", 
            qscope="q_func_fullstate_holding",
            grad_norm_clipping=None,
            reuse=None
        )


    sess = U.make_session(num_cpu)
    sess.__enter__()

    obs = env.reset()
    episode_rewards = [0.0]
    timerStart = time.time()
    U.initialize()
    
    for t in range(max_timesteps):

        # Get qCurr values
        if valueFunctionType == "TABULAR":
            stateDescriptorsFlat = np.reshape(obs[0],[-1,env.maxSide**2]) == 1
            stateDescriptorsFlat = np.array([np.concatenate([[obs[1]==1],stateDescriptorsFlat[0]])])
            qCurr = getTabular(stateDescriptorsFlat)[0]
        else:
            if obs[1]:
                qCurr = getqFullStateHolding([obs[0]])
            else:
                qCurr = getqFullStateNotHolding([obs[0]])
                
        # select action at random
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise)
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(num_actions)

        # take action
        new_obs, rew, done, _ = env.step(action)

        # stateImage_t, stateDiscrete_t, actionDiscrete_t, reward, stateImage_tp1, stateDiscrete_tp1, done
        replay_buffer.add(np.copy(obs[0]), np.copy(obs[1]), np.copy(action), np.copy(rew), np.copy(new_obs[0]), np.copy(new_obs[1]), np.copy(float(done)))

        if t > learning_starts and t % train_freq == 0:

            states_images_t, states_discrete_t, actions, rewards, states_images_tp1, states_discrete_tp1, dones = replay_buffer.sample(batch_size)
            weights, batch_idxes = np.ones_like(rewards), None
            
            if valueFunctionType == "TABULAR":
                stateDescriptorsNextFlat = np.reshape(states_images_tp1,[-1,env.maxSide**2]) == 1
                qNextNotHolding = getTabular(np.c_[np.tile(False,[batch_size,1]),stateDescriptorsNextFlat])
                qNextHolding = getTabular(np.c_[np.tile(True,[batch_size,1]),stateDescriptorsNextFlat])
            else:
                qNextNotHolding = getqFullStateNotHolding(states_images_tp1)
                qNextHolding = getqFullStateHolding(states_images_tp1)
            
            qNext = np.stack([qNextNotHolding,qNextHolding],axis=2)
            qNextmax = np.max(qNext[range(batch_size),:,states_discrete_tp1],axis=1)
            targets = rewards + (1-dones) * gamma * qNextmax

            if valueFunctionType == "TABULAR":
                stateDescriptorsFlatBatch = np.reshape(states_images_t,[-1,env.maxSide**2]) == 1
                stateDescriptorsNotHoldingFlat = np.c_[np.tile(False,[batch_size,1]),stateDescriptorsFlatBatch]
                stateDescriptorsHoldingFlat = np.c_[np.tile(True,[batch_size,1]),stateDescriptorsFlatBatch]
                qCurrNotHoldingBatch = getTabular(stateDescriptorsNotHoldingFlat)
                qCurrHoldingBatch = getTabular(stateDescriptorsHoldingFlat)
            else:
                qCurrNotHoldingBatch = getqFullStateNotHolding(states_images_t)
                qCurrHoldingBatch = getqFullStateHolding(states_images_t)

            qCurrTargetBatch = np.stack([qCurrNotHoldingBatch,qCurrHoldingBatch],axis=2)
            qCurrTargetBatch[range(batch_size),actions,states_discrete_t] = targets

            if valueFunctionType == "TABULAR":
                trainTabular(stateDescriptorsNotHoldingFlat,qCurrTargetBatch[:,:,0],np.tile(np.reshape(weights,[batch_size,1]),[1,num_actions]))
                trainTabular(stateDescriptorsHoldingFlat,qCurrTargetBatch[:,:,1],np.tile(np.reshape(weights,[batch_size,1]),[1,num_actions]))
            else:
                targetTrainFullStateNotHolding(states_images_t, qCurrTargetBatch[:,:,0], np.tile(np.reshape(weights,[batch_size,1]),[1,num_actions]))
                targetTrainFullStateHolding(states_images_t, qCurrTargetBatch[:,:,1], np.tile(np.reshape(weights,[batch_size,1]),[1,num_actions]))


        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1)
#        mean_100ep_tderror = round(np.mean(td_errors[-51:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal
        
        obs = copy.deepcopy(new_obs) # without this deepcopy, RL totally fails...


    # display value function
    obs = env.reset()
    moveDescriptors = getMoveActionDescriptors([obs[0]])
    moveDescriptors = moveDescriptors*2-1

    actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3)
    actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors],axis=3)
    
    print(str(obs[0][:,:,0]))
    
    if valueFunctionType == "TABULAR":
        qPick = getTabular(np.reshape(actionsPickDescriptors,[num_patches,-1])==1)
    else:
        qPickNotHolding = getqNotHolding(actionsPickDescriptors)
        qPickHolding = getqHolding(actionsPickDescriptors)
        qPick = np.concatenate([qPickNotHolding,qPickHolding],axis=1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:,0],[8,8])))
    print("Value function for pick action in hold-1 state:")
    print(str(np.reshape(qPick[:,1],[8,8])))

    if valueFunctionType == "TABULAR":
        qPlace = getTabular(np.reshape(actionsPlaceDescriptors,[num_patches,-1])==1)
    else:
        qPlaceNotHolding = getqNotHolding(actionsPlaceDescriptors)
        qPlaceHolding = getqHolding(actionsPlaceDescriptors)
        qPlace = np.concatenate([qPlaceNotHolding,qPlaceHolding],axis=1)    
    print("Value function for place action in hold-nothing state:")
    print(str(np.reshape(qPlace[:,0],[8,8])))
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:,1],[8,8])))
Exemple #11
0
def main(initEnvStride, envStride, fileIn, fileOut, inputmaxtimesteps):

    np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x})

    # Create environment and set stride parameters for this problem instance.
    # Most of the time, these two stride parameters will be equal. However,
    # one might use a smaller stride for initial placement and a larger stride
    # for action specification in order to speed things up. Unfortunately, this
    # could cause the problem to be infeasible: no grasp might work for a given
    # initial setup.
    env = envstandalone.PuckArrange()
    env.initStride = initEnvStride  # stride for initial puck placement
    env.stride = envStride  # stride for action specification

    # Standard q-learning parameters
    reuseModels = None
    max_timesteps = inputmaxtimesteps
    exploration_fraction = 0.5
    exploration_final_eps = 0.1
    gamma = .90
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts = 60
    buffer_size = 1000
    batch_size = 32
    target_network_update_freq = 1
    train_freq = 1
    print_freq = 1
    lr = 0.0003

    # Set parameters related to shape of the patch and the number of patches
    descriptorShape = (env.blockSize * 3, env.blockSize * 3, 2)
    #    descriptorShapeSmall = (10,10,2)
    #    descriptorShapeSmall = (15,15,2)
    descriptorShapeSmall = (20, 20, 2)
    num_states = 2  # either holding or not
    num_patches = len(env.moveCenters)**2
    num_actions = 2 * num_patches

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
    #    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
    #                                 initial_p=exploration_final_eps,
    #                                 final_p=exploration_final_eps)

    # Set parameters for prioritized replay. You  can turn this off just by
    # setting the line below to False
    prioritized_replay = True
    #    prioritized_replay=False
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1

    # Create neural network
    q_func = models.cnn_to_mlp(convs=[(16, 3, 1)], hiddens=[32], dueling=True)

    # Build tensorflow functions
    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        return U.BatchInput(descriptorShapeSmall, name=name)

    def make_target_ph(name):
        return U.BatchInput([1], name=name)

    def make_weight_ph(name):
        return U.BatchInput([1], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(
        make_obs_ph=make_obs_ph,
        actionShape=descriptorShape,
        actionShapeSmall=descriptorShapeSmall,
        stride=env.stride)

    getqNotHolding = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                                q_func=q_func,
                                num_states=num_states,
                                num_cascade=5,
                                scope="deepq",
                                qscope="q_func_notholding",
                                reuse=reuseModels)
    getqHolding = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                             q_func=q_func,
                             num_states=num_states,
                             num_cascade=5,
                             scope="deepq",
                             qscope="q_func_holding",
                             reuse=reuseModels)

    targetTrainNotHolding = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_states=num_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func_notholding",
        grad_norm_clipping=1.,
        reuse=reuseModels)

    targetTrainHolding = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_states=num_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func_holding",
        grad_norm_clipping=1.,
        reuse=reuseModels)

    # Initialize tabular state-value function. There are only two states (holding, not holding), so this is very easy.
    lrState = 0.1
    V = np.zeros([
        2,
    ])

    # Start tensorflow session
    sess = U.make_session(num_cpu)
    sess.__enter__()

    # Initialize things
    obs = env.reset()
    episode_rewards = [0.0]
    timerStart = time.time()
    U.initialize()

    # Load neural network model if one was specified.
    if fileIn != "None":
        saver = tf.train.Saver()
        saver.restore(sess, fileIn)
        fileInV = fileIn + 'V.npy'
        V = np.load(fileInV)

    # Iterate over time steps
    for t in range(max_timesteps):

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptors = getMoveActionDescriptors([obs[0]])
        moveDescriptors = moveDescriptors * 2 - 1
        actionsPickDescriptors = np.stack(
            [moveDescriptors,
             np.zeros(np.shape(moveDescriptors))], axis=3)
        actionsPlaceDescriptors = np.stack(
            [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors,
                                  actionsPlaceDescriptors]
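        # With this ordering, action index a < num_patches is a pick at patch a
        # and a >= num_patches is a place at patch a - num_patches.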

        # Get qCurr. I split up pick and place in order to accommodate larger batches
        qCurrNotHoldingPick = getqNotHolding(actionsPickDescriptors)
        qCurrHoldingPick = getqHolding(actionsPickDescriptors)
        qCurrNotHoldingPlace = getqNotHolding(actionsPlaceDescriptors)
        qCurrHoldingPlace = getqHolding(actionsPlaceDescriptors)
        qCurr = np.concatenate([
            np.r_[qCurrNotHoldingPick, qCurrNotHoldingPlace],
            np.r_[qCurrHoldingPick, qCurrHoldingPlace]
        ],
                               axis=1)
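        # qCurr has shape [2 * num_patches, 2]: column 0 holds not-holding
        # values and column 1 holds holding values, so qCurr[:, obs[1]] below
        # selects the column for the current discrete state.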

        # Update tabular state-value function using V(s) = max_a Q(s,a)
        thisStateValues = np.max(qCurr[:, obs[1]])
        V[obs[1]] = (1 - lrState) * V[obs[1]] + lrState * thisStateValues

        # Select e-greedy action to execute
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise[:, obs[1]])
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(num_actions)

        # Execute action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(cp.copy(obs[1]),
                          np.copy(actionDescriptors[action, :]), cp.copy(rew),
                          cp.copy(new_obs[1]), cp.copy(float(done)))

        if t > learning_starts and t % train_freq == 0:

            # Get batch
            if prioritized_replay:
                beta = beta_schedule.value(t)
                states_t, actionPatches, rewards, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(
                    batch_size, beta)
            else:
                states_t, actionPatches, rewards, states_tp1, dones = replay_buffer.sample(
                    batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            # Calculate target
            targets = rewards + (1 - dones) * gamma * V[states_tp1]

            # Get current q-values and calculate td error and q-value targets
            qCurrTargetNotHolding = getqNotHolding(actionPatches)
            qCurrTargetHolding = getqHolding(actionPatches)
            qCurrTarget = np.concatenate(
                [qCurrTargetNotHolding, qCurrTargetHolding], axis=1)
            td_error = qCurrTarget[range(batch_size), states_t] - targets
            qCurrTarget[range(batch_size), states_t] = targets

            # Train
            targetTrainNotHolding(
                actionPatches, np.reshape(qCurrTarget[:, 0], [batch_size, 1]),
                np.reshape(weights, [batch_size, 1]))
            targetTrainHolding(actionPatches,
                               np.reshape(qCurrTarget[:, 1], [batch_size, 1]),
                               np.reshape(weights, [batch_size, 1]))

            # Update replay priorities using td_error
            if prioritized_replay:
                new_priorities = np.abs(td_error) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            #            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart) + ", tderror: " + str(mean_100ep_tderror))
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            #            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))))
            #            print("time to do training: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = np.copy(new_obs)

    # save what we learned
    if fileOut != "None":
        saver = tf.train.Saver()
        saver.save(sess, fileOut)
        fileOutV = fileOut + 'V'
        print("fileOutV: " + fileOutV)
        np.save(fileOutV, V)

    # display value function
    obs = env.reset()
    moveDescriptors = getMoveActionDescriptors([obs[0]])
    moveDescriptors = moveDescriptors * 2 - 1
    gridSize = np.int32(np.sqrt(np.shape(moveDescriptors)[0]))

    actionsPickDescriptors = np.stack(
        [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
    actionsPlaceDescriptors = np.stack(
        [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)

    print(str(obs[0][:, :, 0]))

    qPickNotHolding = getqNotHolding(actionsPickDescriptors)
    qPickHolding = getqHolding(actionsPickDescriptors)
    qPick = np.concatenate([qPickNotHolding, qPickHolding], axis=1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:, 0], [gridSize, gridSize])))

    qPlaceNotHolding = getqNotHolding(actionsPlaceDescriptors)
    qPlaceHolding = getqHolding(actionsPlaceDescriptors)
    qPlace = np.concatenate([qPlaceNotHolding, qPlaceHolding], axis=1)
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:, 1], [gridSize, gridSize])))

    plt.subplot(1, 3, 1)
    plt.imshow(np.tile(env.state[0], [1, 1, 3]), vmin=5, vmax=12)
    plt.subplot(1, 3, 2)
    plt.imshow(np.reshape(qPick[:, 0], [gridSize, gridSize]), vmin=5, vmax=12)
    plt.subplot(1, 3, 3)
    plt.imshow(np.reshape(qPlace[:, 1], [gridSize, gridSize]), vmin=5, vmax=12)
    plt.show()
Exemple #12
0
def main():

    env = envstandalone.TestRob3Env()

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
    #    buffer_size=1
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 1

    obsShape = (8, 8, 1)
    #    deicticShape = (3,3,1)
    deicticShape = (3, 3, 2)
    num_deictic_patches = 36

    num_actions = 4
    episode_rewards = [0.0]
    num_cpu = 16
    num_cascade = 5

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # same as getDeictic except this one just calculates for the observation
    # input: n x n x channels
    # output: dn x dn x channels
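    # For example, an 8x8x1 observation with a 3x3 window yields
    # (8 - 3 + 1)**2 = 36 patches, which is why num_deictic_patches is 36 above.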
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        deicticObs = []
        for i in range(np.shape(obs)[0] - windowLen + 1):
            for j in range(np.shape(obs)[1] - windowLen + 1):

                #                # one-channel output
                #                deicticObsThis = obs[i:i+windowLen,j:j+windowLen,:]

                # two channel output
                deicticObsThis = np.zeros(deicticShape)
                deicticObsThis[:, :, 0] = obs[i:i + windowLen, j:j + windowLen,
                                              0] == 10
                deicticObsThis[:, :, 1] = obs[i:i + windowLen, j:j + windowLen,
                                              0] == 20

                deicticObs.append(deicticObsThis)

        return np.array(deicticObs)

    # Same as getDeicticObs, but it operates on a batch rather than a single obs
    # input: obs -> batches x glances x 3 x 3 x 4
    def getDeicticObsBatch(obs):
        obsShape = np.shape(obs)
        deicticObsBatch = []
        for batch in range(obsShape[0]):
            deicticObsBatch.append(getDeicticObs(obs[batch]))
        shape = np.shape(deicticObsBatch)
        return (np.reshape(
            np.array(deicticObsBatch),
            [shape[0] * shape[1], shape[2], shape[3], shape[4]]))

    # CNN version
    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(convs=[(16, 3, 1)], hiddens=[16], dueling=True)

    #    # MLP version
    #    model = models.mlp([16, 32])

    q_func = model
    lr = 0.001

    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)

    def make_obsDeic_ph(name):

        # CNN version
        return U.BatchInput(deicticShape, name=name)

#        # MLP version
#        return U.BatchInput([9], name=name)

    def make_target_ph(name):
        #        return U.BatchInput([num_actions], name=name)
        return U.BatchInput([num_cascade, num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq = build_getq(make_obsDeic_ph=make_obsDeic_ph,
                      q_func=q_func,
                      num_actions=num_actions,
                      num_cascade=num_cascade)

    targetTrain = build_targetTrain(
        make_obsDeic_ph=make_obsDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr))

    getDeic = build_getDeic(make_obs_ph=make_obs_ph, deicticShape=deicticShape)

    # Initialize the parameters and copy them to the target network.
    U.initialize()

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        #        obsDeictic = getDeicticObs(obs)
        obsDeictic = getDeic([obs])

        # CNN version
        qCurr = getq(np.array(obsDeictic))

        #        # MLP version
        #        qCurr = getq(np.reshape(obsDeictic,[-1,9]))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise[:, -1, :], 0))
        selPatch = np.argmax(np.max(qCurrNoise[:, -1, :], 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeic(obses_t)
            obses_tp1_deic = getDeic(obses_tp1)

            # Reshape everything to (batch_size * num_deictic_patches,) = (32 * 36,) = (1152,) form
            donesTiled = np.repeat(dones, num_deictic_patches)
            rewardsTiled = np.repeat(rewards, num_deictic_patches)
            actionsTiled = np.repeat(actions, num_deictic_patches)

            # Get curr, next values: CNN version
            qNext = getq(obses_tp1_deic)
            qCurr = getq(obses_t_deic)

            #            # Get curr, next values: MLP version
            #            qNext = getq(np.reshape(obses_tp1_deic,[-1,9]))
            #            qCurr = getq(np.reshape(obses_t_deic,[-1,9]))

            # This version pairs a glimpse with the same glimpse on the next time step
            qNextmax = np.max(qNext[:, -1, :], 1)

            #            # This version takes the max over all glimpses
            #            qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions])
            #            qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches)

            # Compute Bellman estimate
            targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax

            #            targetsTiled = np.tile(np.reshape(targets,[-1,1]),[1,num_cascade])

            qCurrTargets = np.copy(qCurr)

            #            # Copy into cascade without pruning
            #            for i in range(num_cascade):
            #                qCurrTargets[range(batch_size*num_deictic_patches),i,actionsTiled] = targets

            # Copy into cascade with pruning.
            qCurrTargets[range(batch_size * num_deictic_patches), 0,
                         actionsTiled] = targets
            for i in range(num_cascade - 1):
                mask = targets < qCurrTargets[range(batch_size *
                                                    num_deictic_patches), i,
                                              actionsTiled]
                qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled] = \
                    mask*targets + \
                    (1-mask)*qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled]
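            # Each cascade level i+1 is moved toward the target only when the
            # target is below level i's current estimate, so deeper levels track
            # progressively lower estimates and the last level approximates a
            # minimum over the targets seen for that patch/action.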

            # CNN version
            td_error_out, obses_deic_out, targets_out = targetTrain(
                obses_t_deic, qCurrTargets)



#            # MLP version
#            td_error_out, obses_deic_out, targets_out = targetTrain(
#                    np.reshape(obses_t_deic,[-1,9]),
#                    qCurrTargets
#                    )

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
Exemple #13
0
def main():

#    env = gym.make("CartPoleRob-v0")
#    env = gym.make("CartPole-v0")
#    env = gym.make("CartPole-v1")
#    env = gym.make("Acrobot-v1")
#    env = gym.make("MountainCarRob-v0")
#    env = gym.make("FrozenLake-v0")
#    env = gym.make("FrozenLake8x8-v0")
#    env = gym.make("FrozenLake8x8rob-v0")
#    env = gym.make("FrozenLake16x16rob-v0")
    env = gym.make("TestRob3-v0")
    
    
    
    # same as getDeictic except this one just calculates for the observation
    # input: n x n x channels
    # output: dn x dn x channels
    def getDeicticObs(obses_t, windowLen):
        deicticObses_t = []
        for i in range(np.shape(obses_t)[0] - windowLen):
            for j in range(np.shape(obses_t)[1] - windowLen):
                deicticObses_t.append(obses_t[i:i+windowLen,j:j+windowLen,:])
        return np.array(deicticObses_t)

    # get set of deictic alternatives
    # input: batch x n x n x channels
    # output: (batch x deictic) x dn x dn x channels
    def getDeictic(obses_t, actions, obses_tp1, weights, windowLen):
        deicticObses_t = []
        deicticActions = []
        deicticObses_tp1 = []
        deicticWeights = []
        for i in range(np.shape(obses_t)[0]):
            for j in range(np.shape(obses_t)[1] - windowLen):
                for k in range(np.shape(obses_t)[2] - windowLen):
                    deicticObses_t.append(obses_t[i,j:j+windowLen,k:k+windowLen,:])
                    deicticActions.append(actions[i])
                    deicticObses_tp1.append(obses_tp1[i,j:j+windowLen,k:k+windowLen,:])
                    deicticWeights.append(weights[i])
        return np.array(deicticObses_t), np.array(deicticActions), np.array(deicticObses_tp1), np.array(deicticWeights)

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
#        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], # used in pong
#        hiddens=[256],  # used in pong
#        convs=[(8,4,1)], # used for non-deictic TestRob3-v0
        convs=[(4,3,1)], # used for deictic TestRob3-v0
        hiddens=[16],
        dueling=True
    )

    # parameters
    q_func=model
    lr=1e-3
#    max_timesteps=100000
#    max_timesteps=50000
    max_timesteps=20000
    buffer_size=50000
    exploration_fraction=0.1
#    exploration_fraction=0.3
    exploration_final_eps=0.02
#    exploration_final_eps=0.1
    train_freq=1
    batch_size=32
    print_freq=10
    checkpoint_freq=10000
    learning_starts=1000
    gamma=1.
    target_network_update_freq=500
    prioritized_replay=False
#    prioritized_replay=True
    prioritized_replay_alpha=0.6
    prioritized_replay_beta0=0.4
    prioritized_replay_beta_iters=None
    prioritized_replay_eps=1e-6
    num_cpu=16
    
    deicticShape = (3,3,1)
    def make_obs_ph(name):
#        return U.BatchInput(env.observation_space.shape, name=name)
        return U.BatchInput(deicticShape, name=name)

    matchShape = (batch_size*25,)
    def make_match_ph(name):
        return U.BatchInput(matchShape, name=name)

    
    sess = U.make_session(num_cpu)
    sess.__enter__()

#    act, train, update_target, debug = build_graph.build_train(
#    getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic(
#    getq, train, trainWOUpdate, debug = build_graph.build_train_deictic(
#    getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic(
    getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic_min(
        make_obs_ph=make_obs_ph,
        make_match_ph=make_match_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()


    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    
#    with tempfile.TemporaryDirectory() as td:
    model_saved = False
#        model_file = os.path.join(td, "model")
    for t in range(max_timesteps):
        
        # get action to take
#        action = act(np.array(obs)[None], update_eps=exploration.value(t))[0]
#        qvalues = getq(np.array(obs)[None])
#        action = np.argmax(qvalues)
#        if np.random.rand() < exploration.value(t):
#            action = np.random.randint(env.action_space.n)
        
        deicticObs = getDeicticObs(obs,3)
        qvalues = getq(np.array(deicticObs))
        action = np.argmax(np.max(qvalues,0))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)
        
#        # temporarily take uniformly random actions all the time
#        action = np.random.randint(env.action_space.n)
        
        new_obs, rew, done, _ = env.step(action)
        
        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs
        
        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            episode_rewards.append(0.0)

        if t > learning_starts and t % train_freq == 0:
            
            # Get batch
            if prioritized_replay:
                experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None
            
            # Convert batch to deictic format
            obses_t_deic, actions_deic, obses_tp1_deic, weights_deic = getDeictic(obses_t, actions, obses_tp1, weights, 3)
            
            obses_t_deic_fingerprints = [np.reshape(obses_t_deic[i],[9]) for i in range(np.shape(obses_t_deic)[0])]
            _, _, fingerprintMatch = np.unique(obses_t_deic_fingerprints,axis=0,return_index=True,return_inverse=True)
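            # fingerprintMatch gives each patch the index of its unique flattened
            # fingerprint, so identical 3x3 patches in the batch share a group id;
            # this is passed to train below, presumably so that matching patches
            # can be aggregated (a minimum, per the build_train_deictic_min name).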
#            matchTemplates = [fingerprintMatch == i for i in range(np.max(fingerprintMatch)+1)]
            
#            td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
#            td_errors = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, dones, weights_deic)
#            debug1, debug2, debug3 = trainWOUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, dones, weights_deic)
#            debug1, debug2, debug3, debug4 = trainWOUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic)
            td_errors = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic)

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:
            
            # Update target network periodically.
            update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))))
            

    num2avg = 20
    rListAvg = np.convolve(episode_rewards,np.ones(num2avg))/num2avg
    plt.plot(rListAvg)
#    plt.plot(episode_rewards)
    plt.show()

def main():

    # ******* Deictic parameters ********

    # deicticShape is the shape of the patch that is used. For example, a 3,3,2 patch
    # is a 2-channel 3x3 patch. num_deictic_patches must be set to the number of deicticShape
    # patches in an entire image.
    # For example, there are 36 3x3 patches that are contained in an 8x8 observation space
    # (assuming no zero padding). You must set this number to correspond to deicticShape.
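    # In general, an n x n observation with no padding yields (n - d + 1)**2
    # patches of size d x d; e.g. (8 - 4 + 1)**2 = 25 for the 4x4 patch used
    # below, which is why num_deictic_patches is set to 25 (assuming an 8x8
    # observation).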

    #    deicticShape = (3,3,2)
    #    deicticShape = (3,3,4)
    deicticShape = (4, 4, 2)
    #    deicticShape = (4,4,4)

    #    num_deictic_patches = 36
    num_deictic_patches = 25

    # Desired network type. So far, I've done better w/ CNN
    WHICH_Q = "CNN"
    #    WHICH_Q = "MLP"

    # Method used to evaluate value of next state. So far, I've found that PAIRED_NEXT works
    # much better than MAX_NEXT. MAX_NEXT only works if you also set MIN_OVER_BATCH to True.
    # Otherwise, it doesn't converge.
    # PAIRED_NEXT -> use value of corresponding patch on the next step
    # MAX_NEXT -> use max value over all next-step patches
    NEXT_PATCH = "PAIRED_NEXT"
    #    NEXT_PATCH = "MAX_NEXT"

    # If MIN_OVER_BATCH is true, then we find the min value over all targets that have
    # the same corresponding patch. In principle, this should always help. The larger
    # the batch size, the more it should help. However, in practice, I find that
    # it seems to cap the maximum achievable performance. On the other hand, it can
    # help convergence when using NEXT_PATCH = "MAX_NEXT".
    #    MIN_OVER_BATCH = True
    MIN_OVER_BATCH = False

    # If MIN_OR_AVG_Q is "MIN", then we use the minimum Q value as calculated via the cascade.
    # Otherwise (if "AVG"), we use the standard expected Q value. "MIN" should work best here;
    # "AVG" is equivalent to the standard DQN backup applied to the patches.
    MIN_OR_AVG_Q = "MIN"
    #    MIN_OR_AVG_Q = "AVG"

    # If true, ROTATION_AUGMENTATION augments the agent's experience with
    # rotated versions of the patches. I typically turn this off.
    #    ROTATION_AUGMENTATION = True
    ROTATION_AUGMENTATION = False

    # ******* Load the environment ********

    env = envstandalone.StandaloneEnv()
    obsShape = env.observation_space.shape
    num_actions = env.action_space.n

    # ******* Standard DQN parameters ********

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
    exploration_fraction = 0.4
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 1
    lr = 0.001
    batch_size = 32
    train_freq = 1
    num_cascade = 5  # number of Q-functions in the cascade used to estimate a minimum value for each s,a pair
    num_cpu = 16
    replay_buffer = ReplayBuffer(buffer_size)
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    if MIN_OR_AVG_Q == "MIN":
        minoravg = -1
    elif MIN_OR_AVG_Q == "AVG":
        minoravg = 0
    else:
        print("error")

    # ******* Create neural network model ********

    if WHICH_Q == "CNN":
        # conv model parameters: (num_outputs, kernel_size, stride)
        model = models.cnn_to_mlp(convs=[(32, 3, 1)],
                                  hiddens=[32],
                                  dueling=True)
        networkShapeOfObservation = [
            -1, deicticShape[0], deicticShape[1], deicticShape[2]
        ]
    elif WHICH_Q == "MLP":
        # MLP version
        #        model = models.mlp([8, 16])
        model = models.mlp([16, 32])
        #        model = models.mlp([32])
        #        model = models.mlp([])
        networkShapeOfObservation = [
            -1, deicticShape[0] * deicticShape[1] * deicticShape[2]
        ]
    else:
        raise ValueError("WHICH_Q must be 'CNN' or 'MLP'")
    q_func = model

    # ******* Build tensorflow functions ********

    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)

    def make_obsDeic_ph(name):

        if WHICH_Q == "CNN":
            return U.BatchInput(deicticShape, name=name)
        elif WHICH_Q == "MLP":
            return U.BatchInput(
                [deicticShape[0] * deicticShape[1] * deicticShape[2]],
                name=name)
        else:
            raise ValueError("WHICH_Q must be 'CNN' or 'MLP'")

    def make_target_ph(name):
        #        return U.BatchInput([num_actions], name=name)
        return U.BatchInput([num_cascade, num_actions], name=name)

    getq = build_getq(make_obsDeic_ph=make_obsDeic_ph,
                      q_func=q_func,
                      num_actions=num_actions,
                      num_cascade=num_cascade,
                      scope="deepq",
                      qscope="q_func")

    targetTrain = build_targetTrain(
        make_obsDeic_ph=make_obsDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        #        optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func",
        grad_norm_clipping=1.)

    getDeic = build_getDeic_Foc(make_obs_ph=make_obs_ph,
                                deicticShape=deicticShape)
    #    getDeic = build_getDeic_FocCoarse(make_obs_ph=make_obs_ph,deicticShape=deicticShape)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    obs = env.reset()
    U.initialize()
    episode_rewards = [0.0]
    timerStart = time.time()
    for t in range(max_timesteps):

        # get q-values for current deictic patches
        obsDeictic = getDeic([obs])
        qCurr = getq(np.reshape(obsDeictic, networkShapeOfObservation))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise[:, minoravg, :],
                                  0))  # USE CASCADE
        #        action = np.argmax(np.max(qCurrNoise[:,0,:],0)) # DO NOT USE CASCADE
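        # (qCurr has shape [num_patches, num_cascade, num_actions]; minoravg
        # selects the cascade level: -1, the most pruned level, for "MIN", or 0
        # for "AVG". The max is taken over patches for each action, and argmax
        # picks the action.)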
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeic(obses_t)
            obses_tp1_deic = getDeic(obses_tp1)

            # Tile dones, rewards, and actions so that each deictic patch row
            # carries its sample's values (patch and batch interleaved along the same axis)
            donesTiled = np.repeat(dones, num_deictic_patches)
            rewardsTiled = np.repeat(rewards, num_deictic_patches)
            actionsTiled = np.repeat(actions, num_deictic_patches)

            #            # Get curr, next values: NO ROTATION-AUGMENTATION
            qNext = getq(np.reshape(obses_tp1_deic, networkShapeOfObservation))
            qCurr = getq(np.reshape(obses_t_deic, networkShapeOfObservation))

            #            # ROTATION-AUGMENTATION: AUGMENT EXPERIENCES WITH FOUR ROTATIONS
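            # Note on the remapping below: each patch is rotated by 90/180/270
            # degrees and the stored action index is shifted by 1/2/3 modulo 4,
            # which assumes actions 0-3 are the four rotationally-related moves.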
            if ROTATION_AUGMENTATION:
                obses_t_deicRot1 = np.rot90(obses_t_deic, k=3, axes=(1, 2))
                obses_t_deicRot2 = np.rot90(obses_t_deic, k=2, axes=(1, 2))
                obses_t_deicRot3 = np.rot90(obses_t_deic, k=1, axes=(1, 2))
                obses_t_deic = np.r_[obses_t_deic, obses_t_deicRot1,
                                     obses_t_deicRot2, obses_t_deicRot3]
                obses_tp1_deicRot1 = np.rot90(obses_tp1_deic, k=3, axes=(1, 2))
                obses_tp1_deicRot2 = np.rot90(obses_tp1_deic, k=2, axes=(1, 2))
                obses_tp1_deicRot3 = np.rot90(obses_tp1_deic, k=1, axes=(1, 2))
                obses_tp1_deic = np.r_[obses_tp1_deic, obses_tp1_deicRot1,
                                       obses_tp1_deicRot2, obses_tp1_deicRot3]
                qCurr = getq(np.array(obses_t_deic))
                qNext = getq(np.array(obses_tp1_deic))
                actionsTiled = np.r_[actionsTiled, actionsTiled + 1,
                                     actionsTiled + 2, actionsTiled + 3]
                actionsTiled = actionsTiled - 4 * (actionsTiled > 3)
                rewardsTiled = np.r_[rewardsTiled, rewardsTiled, rewardsTiled,
                                     rewardsTiled]
                donesTiled = np.r_[donesTiled, donesTiled, donesTiled,
                                   donesTiled]

            # Get value of next state
            if NEXT_PATCH == "PAIRED_NEXT":
                qNextmax = np.max(qNext[:, minoravg, :], 1)  # standard
            elif NEXT_PATCH == "MAX_NEXT":
                qNextTiled = np.reshape(qNext[:, minoravg, :],
                                        [-1, num_deictic_patches, num_actions])
                qNextmax = np.repeat(np.max(np.max(qNextTiled, 2), 1),
                                     num_deictic_patches)
            else:
                raise ValueError("NEXT_PATCH must be 'PAIRED_NEXT' or 'MAX_NEXT'")

            # Compute Bellman estimate
            targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax

            # Take min over targets in same group
            if MIN_OVER_BATCH:
                obses_t_deic_reshape = np.reshape(
                    obses_t_deic,
                    [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]])
                unique_deic, uniqueIdx, uniqueCounts = np.unique(
                    obses_t_deic_reshape,
                    return_inverse=True,
                    return_counts=True,
                    axis=0)
                for i in range(np.shape(uniqueCounts)[0]):
                    targets[uniqueIdx == i] = np.min(targets[uniqueIdx == i])

            # Copy into cascade with pruning.
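            # (Level 0 of the cascade receives the Bellman target directly; each
            # deeper level i+1 accepts the target only where it is smaller than
            # level i's current estimate for that action, so deeper levels track
            # progressively lower, approximately minimum, target values.)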
            qCurrTargets = np.copy(qCurr)
            expLen = np.shape(qCurr)[0]
            qCurrTargets[range(expLen), 0, actionsTiled] = targets
            for i in range(num_cascade - 1):
                mask = targets < qCurrTargets[range(expLen), i, actionsTiled]
                qCurrTargets[range(expLen),i+1,actionsTiled] = \
                    mask*targets + \
                    (1-mask)*qCurrTargets[range(expLen),i+1,actionsTiled]

            td_error_out, obses_deic_out, targets_out = targetTrain(
                np.reshape(obses_t_deic, networkShapeOfObservation),
                qCurrTargets)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
Exemple #15
0
def main():

    #    env = gym.make("CartPole-v0")
    #    env = gym.make("CartPole-v1")
    #    env = gym.make("Acrobot-v1")
    #    env = gym.make("MountainCar-v0")
    #    env = gym.make("FrozenLake-v0")
    #    env = gym.make("FrozenLake8x8-v0")
    env = gym.make("PongNoFrameskip-v4")
    env = ScaledFloatFrame(wrap_dqn(env))

    #    robShape = (2,)
    #    robShape = (3,)
    #    robShape = (200,)
    #    robShape = (16,)
    #    robShape = (64,)
    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)
#        return U.BatchInput(robShape, name=name)

#    # these params are specific to mountaincar
#    def getOneHotObs(obs):
#        obsFraction = (obs[0] + 1.2) / 1.8
#        idx1 = np.int32(np.trunc(obsFraction*100))
#        obsFraction = (obs[1] + 0.07) / 0.14
#        idx2 = np.int32(np.trunc(obsFraction*100))
#        ident = np.identity(100)
#        return np.r_[ident[idx1,:],ident[idx2,:]]

# these params are specific to frozenlake

    def getOneHotObs(obs):
        #        ident = np.identity(16)
        ident = np.identity(64)
        return ident[obs, :]
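        # e.g. getOneHotObs(3) returns a length-64 vector that is all zeros
        # except for a 1 at index 3 (a one-hot encoding of the discrete state).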

#    model = models.mlp([32])
#    model = models.mlp([64])
#    model = models.mlp([64], layer_norm=True)
#    model = models.mlp([16, 16])

    model = models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
                              hiddens=[256],
                              dueling=True)

    # parameters
    q_func = model
    #    lr=1e-3
    lr = 1e-4
    max_timesteps = 2000000
    #    max_timesteps=100000
    #    max_timesteps=50000
    #    buffer_size=50000
    buffer_size = 100000
    exploration_fraction = 0.1
    #    exploration_fraction=0.3
    exploration_final_eps = 0.01
    #    exploration_final_eps=0.02
    #    exploration_final_eps=0.1
    #    train_freq=1
    train_freq = 4
    batch_size = 32
    print_freq = 10
    checkpoint_freq = 10000
    #    learning_starts=1000
    learning_starts = 10000
    #    gamma=1.0
    gamma = 0.99
    #    target_network_update_freq=500
    target_network_update_freq = 1000
    #    prioritized_replay=False
    prioritized_replay = True
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    num_cpu = 16

    #    # try mountaincar w/ different input dimensions
    #    inputDims = [50,2]

    sess = U.make_session(num_cpu)
    sess.__enter__()

    act, train, update_target, debug = build_graph.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    #    obs = np.r_[env.reset(),0]
    #    obs = getOneHotObs(obs)

    #    with tempfile.TemporaryDirectory() as td:
    model_saved = False
    #        model_file = os.path.join(td, "model")
    for t in range(max_timesteps):

        # Take action and update exploration to the newest value
        action = act(np.array(obs)[None], update_eps=exploration.value(t))[0]
        new_obs, rew, done, _ = env.step(action)
        #        new_obs = getOneHotObs(new_obs)
        #        new_obs = np.r_[new_obs,0]

        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs

        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            #            obs = getOneHotObs(obs)
            #            obs = np.r_[obs,0]
            episode_rewards.append(0.0)

        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                experience = replay_buffer.sample(batch_size,
                                                  beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights,
                 batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                    batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                              weights)

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:

            # Update target network periodically.
            update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)

        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            #        if done:
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))))


#            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
#                logger.record_tabular("steps", t)
#                logger.record_tabular("episodes", num_episodes)
#                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
#                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
#                logger.dump_tabular()
#        sess

    plt.plot(episode_rewards)
    plt.show()

    sess
Exemple #16
0
def main():

    #    env = gym.make("CartPoleRob-v0")
    #    env = gym.make("CartPole-v0")
    #    env = gym.make("CartPole-v1")
    #    env = gym.make("Acrobot-v1")
    #    env = gym.make("MountainCarRob-v0")
    #    env = gym.make("FrozenLake-v0")
    #    env = gym.make("FrozenLake8x8-v0")
    #    env = gym.make("FrozenLake8x8rob-v0")
    #    env = gym.make("FrozenLake16x16rob-v0")
    env = gym.make("TestRob3-v0")

    # same as getDeictic except this one just calculates patches for a single observation
    # input: n x n x channels
    # output: numPatches x dn x dn x channels
    def getDeicticObs(obses_t, windowLen):
        deicticObses_t = []
        for i in range(np.shape(obses_t)[0] - windowLen + 1):
            for j in range(np.shape(obses_t)[1] - windowLen + 1):
                deicticObses_t.append(obses_t[i:i + windowLen,
                                              j:j + windowLen, :])
        return np.array(deicticObses_t)

    # get set of deictic alternatives
    # input: batch x n x n x channels
    # output: (batch x deictic) x dn x dn x channels
    def getDeictic(obses_t, actions, obses_tp1, weights, windowLen):
        deicticObses_t = []
        deicticActions = []
        deicticObses_tp1 = []
        deicticWeights = []
        for i in range(np.shape(obses_t)[0]):
            for j in range(np.shape(obses_t)[1] - windowLen + 1):
                for k in range(np.shape(obses_t)[2] - windowLen + 1):
                    deicticObses_t.append(obses_t[i, j:j + windowLen,
                                                  k:k + windowLen, :])
                    deicticActions.append(actions[i])
                    deicticObses_tp1.append(obses_tp1[i, j:j + windowLen,
                                                      k:k + windowLen, :])
                    deicticWeights.append(weights[i])

        return np.array(deicticObses_t), np.array(deicticActions), np.array(
            deicticObses_tp1), np.array(deicticWeights)
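    # Shape example (a sketch, assuming an 8x8x1 observation and windowLen=3):
    # getDeicticObs returns a (36, 3, 3, 1) array of patches, and getDeictic
    # returns (batch*36, 3, 3, 1) patches with actions and weights repeated to
    # match.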

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        #        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], # used in pong
        #        hiddens=[256],  # used in pong
        #        convs=[(8,4,1)], # used for non-deictic TestRob3-v0
        #        convs=[(8,3,1)], # used for deictic TestRob3-v0
        convs=[(16, 3, 1)],  # used for deictic TestRob3-v0
        #        convs=[(4,3,1)], # used for deictic TestRob3-v0
        #        convs=[(16,3,1)], # used for deictic TestRob3-v0
        #        convs=[(8,2,1)], # used for deictic TestRob3-v0
        hiddens=[16],
        dueling=True)

    #    model = models.mlp([6])

    # parameters
    q_func = model
    lr = 1e-3
    #    lr=1e-4
    #    max_timesteps=100000
    #    max_timesteps=50000
    max_timesteps = 20000
    buffer_size = 50000
    #    exploration_fraction=0.1
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    #    exploration_final_eps=0.005
    #    exploration_final_eps=0.1
    print_freq = 10
    checkpoint_freq = 10000
    learning_starts = 1000
    gamma = .98
    target_network_update_freq = 500
    prioritized_replay = False
    #    prioritized_replay=True
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    num_cpu = 16

    #    batch_size=32
    #    train_freq=1
    #    batch_size=64
    #    train_freq=2
    #    batch_size=128
    #    train_freq=4
    #    batch_size=256
    #    train_freq=4
    batch_size = 512
    train_freq = 8

    # deicticShape must be square.
    # These two parameters need to be consistent w/ each other.
    #    deicticShape = (2,2,1)
    #    num_deictic_patches=36
    deicticShape = (3, 3, 1)
    num_deictic_patches = 36

    #    deicticShape = (4,4,1)
    #    num_deictic_patches=25
    #    deicticShape = (5,5,1)
    #    num_deictic_patches=16
    #    deicticShape = (6,6,1)
    #    num_deictic_patches=9
    #    deicticShape = (7,7,1)
    #    num_deictic_patches=4
    #    deicticShape = (8,8,1)
    #    num_deictic_patches=1
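    # The pattern above (assuming the 8x8 observation this environment appears
    # to use): num_deictic_patches = (8 - deicticShape[0] + 1)**2, which is why
    # (3,3,1) pairs with 36, (4,4,1) with 25, (5,5,1) with 16, and so on.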

    def make_obs_ph(name):
        #        return U.BatchInput(env.observation_space.shape, name=name)
        return U.BatchInput(deicticShape, name=name)

    matchShape = (batch_size * 25, )

    def make_match_ph(name):
        return U.BatchInput(matchShape, name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    #    act, train, update_target, debug = build_graph.build_train(
    #    getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic(
    #    getq, train, trainWOUpdate, debug = build_graph.build_train_deictic(
    #    getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic(
    getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic_min(
        make_obs_ph=make_obs_ph,
        make_match_ph=make_match_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        batch_size=batch_size,
        num_deictic_patches=num_deictic_patches,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        double_q=False)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()

    #    with tempfile.TemporaryDirectory() as td:
    model_saved = False
    #        model_file = os.path.join(td, "model")
    for t in range(max_timesteps):

        # get action to take
        #        action = act(np.array(obs)[None], update_eps=exploration.value(t))[0]
        #        qvalues = getq(np.array(obs)[None])
        #        action = np.argmax(qvalues)
        #        if np.random.rand() < exploration.value(t):
        #            action = np.random.randint(env.action_space.n)

        deicticObs = getDeicticObs(obs, deicticShape[0])
        qvalues = getq(np.array(deicticObs))
        action = np.argmax(np.max(qvalues, 0))
        selPatch = np.argmax(np.max(qvalues, 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

#        # temporarily take uniformly random actions all the time
#        action = np.random.randint(env.action_space.n)
#        env.render()

        new_obs, rew, done, _ = env.step(action)

        # display state, action, nextstate
        if t > 20000:
            toDisplay = np.reshape(new_obs, (8, 8))
            toDisplay[
                np.
                int32(np.floor_divide(selPatch, np.sqrt(num_deictic_patches))),
                np.int32(np.remainder(selPatch, np.sqrt(num_deictic_patches))
                         )] = 50
            print(
                "Current/next state. 50 denotes the upper left corner of the deictic patch."
            )
            print(str(toDisplay))


#        env.render()

        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs

        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            episode_rewards.append(0.0)
            if t > 20000:
                print("q-values:")
                print(str(qvalues))
                print("*** Episode over! ***\n\n")

        if t > learning_starts and t % train_freq == 0:

            # Get batch
            if prioritized_replay:
                experience = replay_buffer.sample(batch_size,
                                                  beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights,
                 batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                    batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            # Convert batch to deictic format
            obses_t_deic, actions_deic, obses_tp1_deic, weights_deic = getDeictic(
                obses_t, actions, obses_tp1, weights, deicticShape[0])

            obses_t_deic_fingerprints = [
                np.reshape(obses_t_deic[i],
                           [deicticShape[0] * deicticShape[1]])
                for i in range(np.shape(obses_t_deic)[0])
            ]
            _, _, fingerprintMatch = np.unique(obses_t_deic_fingerprints,
                                               axis=0,
                                               return_index=True,
                                               return_inverse=True)
            #            matchTemplates = [fingerprintMatch == i for i in range(np.max(fingerprintMatch)+1)]

            #            td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
            #            td_errors = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, dones, weights_deic)
            #            debug1, debug2, debug3 = trainWOUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, dones, weights_deic)
            #            debug1, debug2, debug3, debug4 = trainWOUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic)
            #            td_errors = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic)
            #            td_errors2, min_values_of_groups2, match_onehot2 = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic)

            td_errors, min_values_of_groups, match_onehot = train(
                obses_t_deic, actions_deic, rewards, obses_tp1_deic,
                fingerprintMatch, dones, weights_deic)

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:

            # Update target network periodically.
            update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)

        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))))

            if t > learning_starts and t % train_freq == 0:
                group_counts = np.sum(match_onehot, 1)
                print(str(min_values_of_groups[min_values_of_groups < 1000]))
                #                print(str(min_values_of_groups2[min_values_of_groups2 < 1000]))
                print(str(group_counts[group_counts > 0]))

                # display one of most valuable deictic patches
                min_values_of_groups_trunc = min_values_of_groups[
                    min_values_of_groups < 1000]
                most_valuable_patches_idx = np.argmax(
                    min_values_of_groups_trunc)
                most_valuable_patches = obses_t_deic[fingerprintMatch ==
                                                     most_valuable_patches_idx]
                print(
                    str(np.reshape(most_valuable_patches[0],
                                   deicticShape[0:2])))
                print(
                    "value of most valuable patch: " +
                    str(min_values_of_groups_trunc[most_valuable_patches_idx]))
                print("sum group counts: " + str(np.sum(group_counts)))

    num2avg = 20
    rListAvg = np.convolve(episode_rewards, np.ones(num2avg)) / num2avg
    plt.plot(rListAvg)
    #    plt.plot(episode_rewards)
    plt.show()

    sess
Exemple #17
0
def main(initEnvStride, envStride, fileIn, fileOut, inputmaxtimesteps):

    np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x})

#    env = envstandalone.BlockArrange() # DEBUG

    env = envstandalone.PuckArrange()
    env.initStride = initEnvStride # stride for initial puck placement
    env.stride = envStride # stride for action specification
    
    # Standard q-learning parameters
    max_timesteps=inputmaxtimesteps
    exploration_fraction=1.0
#    exploration_fraction=0.5
    exploration_final_eps=0.1
    gamma=.90
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts=10
#    buffer_size=1000
    buffer_size=10000 # increasing buffer size from 1k to 10k was important when I tried to go to the 25-action (5x5 grid) version
    batch_size=10
    target_network_update_freq=1
    train_freq=1
    print_freq=1
    lr=0.0003

    num_patches = len(env.moveCenters)**2 # DEBUG
#    num_patches = env.maxSide**2 # DEBUG
    num_actions = 2*num_patches
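    # (Assumption based on the related variants of this script: the
    # 2*num_patches actions correspond to one pick and one place action per
    # grid cell.)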
#    valueFunctionType = "TABULAR"
    valueFunctionType = "DQN"

#    fullImageSize = [60,60,1]
#    fullImageSize = [20,20,1]
    fullImageSize = [15,15,1]
#    fullImageSize = [12,12,1]
#    fullImageSize = [9,9,1]
#    fullImageSize = [3,3,1] # DEBUG

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    prioritized_replay=False
    prioritized_replay_alpha=0.6
    prioritized_replay_beta0=0.4
    prioritized_replay_beta_iters=None
    prioritized_replay_eps=1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1

    q_func = models.cnn_to_mlp(
        convs=[(16,3,1), (32,3,1)],
        hiddens=[48],
        dueling=True
    )

    def make_fullImage_ph(name):
        return U.BatchInput(fullImageSize, name=name)
    def make_target_fullstate_ph(name):
        return U.BatchInput([num_actions], name=name)
    def make_weight_fullstate_ph(name):
        return U.BatchInput([num_actions], name=name)

    if valueFunctionType == 'DQN':
        
        getqFullStateNotHolding = build_getq_fullstate(
            make_fullImage_ph=make_fullImage_ph,
            q_func=q_func,
            num_actions=num_actions,
            num_cascade=1,
            scope="deepq",
            qscope="q_func_fullstate_notholding",
            reuse=None
        )
        getqFullStateHolding = build_getq_fullstate(
            make_fullImage_ph=make_fullImage_ph,
            q_func=q_func,
            num_actions=num_actions,
            num_cascade=1,
            scope="deepq",
            qscope="q_func_fullstate_holding",
            reuse=None
        )
        
        targetTrainFullStateNotHolding = build_targetTrain_fullstate(
            make_fullImage_ph=make_fullImage_ph,
            make_target_ph=make_target_fullstate_ph,
            make_weight_ph=make_weight_fullstate_ph,
            q_func=q_func,
            num_actions=num_actions,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq", 
            qscope="q_func_fullstate_notholding",
            grad_norm_clipping=None,
            reuse=None
        )
        targetTrainFullStateHolding = build_targetTrain_fullstate(
            make_fullImage_ph=make_fullImage_ph,
            make_target_ph=make_target_fullstate_ph,
            make_weight_ph=make_weight_fullstate_ph,
            q_func=q_func,
            num_actions=num_actions,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq", 
            qscope="q_func_fullstate_holding",
            grad_norm_clipping=None,
            reuse=None
        )

    sess = U.make_session(num_cpu)
    sess.__enter__()

    obs = env.reset()
    episode_rewards = [0.0]
    timerStart = time.time()
    U.initialize()
    
    for t in range(max_timesteps):

        # Get qCurr values
        imCurr = np.int32(np.reshape(spm.imresize(obs[0][:,:,0],fullImageSize),fullImageSize) > 1)
#        imCurr = obs[0] # DEBUG
        if obs[1]:
            qCurr = getqFullStateHolding([imCurr])
        else:
            qCurr = getqFullStateNotHolding([imCurr])

        # select action at random
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise)
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(num_actions)

        # Execute action
        new_obs, rew, done, _ = env.step(action)
        imNext = np.int32(np.reshape(spm.imresize(new_obs[0][:,:,0],fullImageSize),fullImageSize) > 1)
#        imNext = new_obs[0] # DEBUG
        
        # stateImage_t, stateDiscrete_t, actionDiscrete_t, reward, stateImage_tp1, stateDiscrete_tp1, done
        replay_buffer.add(np.copy(imCurr), np.copy(obs[1]), np.copy(action), np.copy(rew), np.copy(imNext), np.copy(new_obs[1]), np.copy(float(done)))

        if t > learning_starts and t % train_freq == 0:

            states_images_t, states_discrete_t, actions, rewards, states_images_tp1, states_discrete_tp1, dones = replay_buffer.sample(batch_size)
            weights, batch_idxes = np.ones_like(rewards), None

            qNextNotHolding = getqFullStateNotHolding(states_images_tp1)
            qNextHolding = getqFullStateHolding(states_images_tp1)
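            # The next lines stack the two heads along a new last axis, select
            # the head matching each sample's next gripper state (index 0 is
            # not-holding, 1 is holding, matching the stack order), and max over
            # actions to form the Bellman target.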
            
            qNext = np.stack([qNextNotHolding,qNextHolding],axis=2)
            qNextmax = np.max(qNext[range(batch_size),:,states_discrete_tp1],axis=1)
            targets = rewards + (1-dones) * gamma * qNextmax

            qCurrNotHoldingBatch = getqFullStateNotHolding(states_images_t)
            qCurrHoldingBatch = getqFullStateHolding(states_images_t)

            qCurrTargetBatch = np.stack([qCurrNotHoldingBatch,qCurrHoldingBatch],axis=2)
            qCurrTargetBatch[range(batch_size),actions,states_discrete_t] = targets

            targetTrainFullStateNotHolding(states_images_t, qCurrTargetBatch[:,:,0], np.tile(np.reshape(weights,[batch_size,1]),[1,num_actions]))
            targetTrainFullStateHolding(states_images_t, qCurrTargetBatch[:,:,1], np.tile(np.reshape(weights,[batch_size,1]),[1,num_actions]))



        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
#            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart) + ", tderror: " + str(mean_100ep_tderror))
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart))
#            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))))
#            print("time to do training: " + str(timerFinal - timerStart))
            timerStart = timerFinal
        
        obs = copy.deepcopy(new_obs) # without this deepcopy, RL totally fails...
        
        
    # save learning curve
    filename = 'PA2_rewards_' +str(num_patches) + "_" + str(max_timesteps) + '.dat'
    np.savetxt(filename,episode_rewards)

    # save what we learned
    if fileOut != "None":
        saver = tf.train.Saver()
        saver.save(sess, fileOut)
        # (unlike the variant below, this version keeps no tabular state-value
        # function V, so only the tensorflow model is saved)
def main(initEnvStride, envStride, fileIn, fileOut, inputmaxtimesteps):

    np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x})

    # Create environment and set stride parameters for this problem instance.
    # Most of the time, these two stride parameters will be equal. However,
    # one might use a smaller stride for initial placement and a larger stride
    # for action specification in order to speed things up. Unfortunately, this
    # could cause the problem to be infeasible: no grasp might work for a given
    # initial setup.
    env = envstandalone.PuckArrange()
    env.initStride = initEnvStride # stride for initial puck placement
    env.stride = envStride # stride for action specification
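    # For example (hypothetical values, purely illustrative): initEnvStride=14
    # with envStride=28 would place pucks on a grid twice as fine as the grid
    # of pick/place actions.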
    
    # Standard q-learning parameters
    reuseModels = None
    max_timesteps=inputmaxtimesteps
#    exploration_fraction=1
    exploration_fraction=0.3
    exploration_final_eps=0.1
    gamma=.90
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts=60
    buffer_size=1000
#    buffer_size=1
    batch_size=10
#    batch_size=1
    target_network_update_freq=1
    train_freq=1
    print_freq=1
#    lr=0.0003
    lr=0.00005
    lrV=0.001

    # Set parameters related to shape of the patch and the number of patches
    descriptorShape = (env.blockSize*3,env.blockSize*3,2)
#    descriptorShape = (env.blockSize*3,env.blockSize*3,3) # three channels includes memory
#    descriptorShapeSmall = (20,20,2)
    descriptorShapeSmall = (20,20,3) # three channels includes memory
    stateDescriptorShapeSmall = (20,20,1) # first two dimensions must be the same as descriptorShapeSmall
    num_states = 2 # either holding or not
    num_patches = len(env.moveCenters)**2
    num_actions = 2*num_patches

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
#    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
#                                 initial_p=exploration_final_eps,
#                                 final_p=exploration_final_eps)

    # Set parameters for prioritized replay. You can turn prioritized replay
    # on or off with the flag below.
#    prioritized_replay=True
    prioritized_replay=False
    prioritized_replay_alpha=0.6
    prioritized_replay_beta0=0.4
    prioritized_replay_beta_iters=None
    prioritized_replay_eps=1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1

    # Create neural network
    q_func = models.cnn_to_mlp(
        convs=[(16,3,1),(32,3,1)],
        hiddens=[48],
        dueling=True
    )

    # Build tensorflow functions
    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)
    def make_actionDeic_ph(name):
        return U.BatchInput(descriptorShapeSmall, name=name)
    def make_stateDeic_ph(name):
        return U.BatchInput(stateDescriptorShapeSmall, name=name)
    def make_target_ph(name):
        return U.BatchInput([1], name=name)
    def make_weight_ph(name):
        return U.BatchInput([1], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph,actionShape=descriptorShape,actionShapeSmall=descriptorShapeSmall,stride=env.stride)
    
    getqNotHolding = build_getq(
            make_actionDeic_ph=make_actionDeic_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            scope="deepq",
            qscope="q_func_notholding",
            reuse=reuseModels
            )
    getqHolding = build_getq(
            make_actionDeic_ph=make_actionDeic_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            scope="deepq",
            qscope="q_func_holding",
            reuse=reuseModels
            )
    getVNotHolding = build_getq(
            make_actionDeic_ph=make_stateDeic_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            scope="deepq",
            qscope="V_func_notholding",
            reuse=reuseModels
            )
    getVHolding = build_getq(
            make_actionDeic_ph=make_stateDeic_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            scope="deepq",
            qscope="V_func_holding",
            reuse=reuseModels
            )

    targetTrainNotHolding = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_states=num_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq", 
        qscope="q_func_notholding",
        grad_norm_clipping=1.,
        reuse=reuseModels
    )
    targetTrainHolding = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_states=num_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq", 
        qscope="q_func_holding",
        grad_norm_clipping=1.,
        reuse=reuseModels
    )
    targetTrainVNotHolding = build_targetTrain(
        make_actionDeic_ph=make_stateDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_states=num_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(learning_rate=lrV),
        scope="deepq", 
        qscope="V_func_notholding",
        grad_norm_clipping=1.,
        reuse=reuseModels
    )
    targetTrainVHolding = build_targetTrain(
        make_actionDeic_ph=make_stateDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_states=num_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(learning_rate=lrV),
        scope="deepq", 
        qscope="V_func_holding",
        grad_norm_clipping=1.,
        reuse=reuseModels
    )
    
    # Initialize tabular state-value function. There are only two states (holding, not holding), so this is very easy.
    lrState = 0.1
    V = np.zeros([2,])
    
#    placeMemory = np.zeros([1, descriptorShapeSmall[0], descriptorShapeSmall[1], 1])
    placeMemory = np.zeros([descriptorShapeSmall[0], descriptorShapeSmall[1], 1])
    
    # Start tensorflow session
    sess = U.make_session(num_cpu)
    sess.__enter__()

    # Initialize things
    obs = env.reset()
    episode_rewards = [0.0]
    timerStart = time.time()
    U.initialize()
    
    # Load neural network model if one was specified.
    if fileIn != "None":
        saver = tf.train.Saver()
        saver.restore(sess, fileIn)
        fileInV = fileIn + 'V.npy'
        V = np.load(fileInV)

    # Iterate over time steps
    for t in range(max_timesteps):
        
        # Get qCurr
        moveDescriptors = getMoveActionDescriptors([obs[0]])
        moveDescriptors = moveDescriptors*2-1
        placeMemoryTiled = np.repeat([placeMemory],num_patches,axis=0)
        actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors)), placeMemoryTiled[:,:,:,0]],axis=3)
        actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)),moveDescriptors, placeMemoryTiled[:,:,:,0]],axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors,actionsPlaceDescriptors]
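        # Each action descriptor is a 3-channel patch: channel 0 holds the local
        # view for pick actions, channel 1 the local view for place actions, and
        # channel 2 the remembered patch from the last place. Concatenating the
        # pick and place sets gives 2*num_patches candidate actions.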
        qCurrNotHolding = getqNotHolding(actionDescriptors)
        qCurrHolding = getqHolding(actionDescriptors)
        qCurr = np.concatenate([qCurrNotHolding,qCurrHolding],axis=1)

        # Select e-greedy action to execute
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise[:,obs[1]])
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(num_actions)

        # Execute action
        new_obs, rew, done, _ = env.step(action)
        
        # if a block has just been placed, then update placeMemory
        if (obs[1] > 0) and (new_obs[1] == 0):
            placeMemory = np.reshape(actionDescriptors[action][:,:,1],[descriptorShapeSmall[0],descriptorShapeSmall[1],1])
        if done:
            placeMemory = np.zeros([descriptorShapeSmall[0], descriptorShapeSmall[1], 1])

        # Calculate target (placeMemory state)
        moveDescriptorsNext = getMoveActionDescriptors([new_obs[0]])
        moveDescriptorsNext = moveDescriptorsNext*2-1
        placeMemoryTiled = np.repeat([placeMemory],num_patches,axis=0)
        actionsPickDescriptorsNext = np.stack([moveDescriptorsNext, np.zeros(np.shape(moveDescriptorsNext)), placeMemoryTiled[:,:,:,0]],axis=3)
        actionsPlaceDescriptorsNext = np.stack([np.zeros(np.shape(moveDescriptorsNext)),moveDescriptorsNext, placeMemoryTiled[:,:,:,0]],axis=3)
        actionDescriptorsNext = np.r_[actionsPickDescriptorsNext,actionsPlaceDescriptorsNext]
        qCurrNotHoldingNext = getqNotHolding(actionDescriptorsNext)
        qCurrHoldingNext = getqHolding(actionDescriptorsNext)
        qNext = np.concatenate([qCurrNotHoldingNext,qCurrHoldingNext],axis=1)

        targets = rew + (1-done) * gamma * np.max(qNext[:,new_obs[1]])
            
        # Get current q-values and calculate td error and q-value targets
        qCurrTarget = np.copy(qCurr)
        td_error = qCurrTarget[action,obs[1]] - targets
        qCurrTarget[action,obs[1]] = targets

        # Train
        targetTrainNotHolding(actionDescriptors, np.reshape(qCurrTarget[:,0],[num_actions,1]), np.ones([num_actions,1]))
        targetTrainHolding(actionDescriptors, np.reshape(qCurrTarget[:,1],[num_actions,1]), np.ones([num_actions,1]))



        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
#            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart) + ", tderror: " + str(mean_100ep_tderror))
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart))
#            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))))
#            print("time to do training: " + str(timerFinal - timerStart))
            timerStart = timerFinal
        
        obs = cp.deepcopy(new_obs)

    # save learning curve
    filename = 'PA2_deictic_rewards_' +str(num_patches) + "_" + str(max_timesteps) + '.dat'
    np.savetxt(filename,episode_rewards)

    # save what we learned
    if fileOut != "None":
        saver = tf.train.Saver()
        saver.save(sess, fileOut)
        fileOutV = fileOut + 'V'
        print("fileOutV: " + fileOutV)
        np.save(fileOutV,V)
Exemple #19
0
def train():

    logger.configure()
    set_global_seeds(args.seed)

    directory = os.path.join(
        args.log_dir,
        '_'.join([args.env,
                  datetime.datetime.now().strftime("%m%d%H%M")]))
    if not os.path.exists(directory):
        os.makedirs(directory)
    else:
        raise ValueError("The directory already exists: " + directory)
    json.dump(vars(args),
              open(os.path.join(directory, 'learning_prop.json'), 'w'))

    env = make_atari(args.env)
    env = bench.Monitor(env, logger.get_dir())
    env = models.wrap_atari_dqn(env)

    nb_test_steps = args.nb_test_steps if args.nb_test_steps > 0 else None
    reload_path = args.reload_path if args.reload_path else None
    if args.record:
        env = Monitor(env, directory=directory)

    with tf.device(args.device):
        model = models.cnn_to_mlp(
            convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
            hiddens=[args.num_units] * args.num_layers,
            dueling=bool(args.dueling),
            init_mean=args.init_mean,
            init_sd=args.init_sd,
        )

        act, records = simple.learn(
            env,
            q_func=model,
            lr=args.learning_rate,
            lr_decay_factor=args.lr_decay_factor,
            lr_growth_factor=args.lr_growth_factor,
            max_timesteps=args.nb_train_steps,
            buffer_size=args.buffer_size,
            exploration_fraction=args.eps_fraction,
            exploration_final_eps=args.eps_min,
            train_freq=4,
            print_freq=1000,
            checkpoint_freq=int(args.nb_train_steps / 10),
            learning_starts=args.nb_warmup_steps,
            target_network_update_freq=args.target_update_freq,
            gamma=0.99,
            prioritized_replay=bool(args.prioritized),
            prioritized_replay_alpha=args.prioritized_replay_alpha,
            epoch_steps=args.nb_epoch_steps,
            alg=args.alg,
            noise=args.noise,
            gpu_memory=args.gpu_memory,
            varTH=args.varth,
            act_policy=args.act_policy,
            save_dir=directory,
            nb_test_steps=nb_test_steps,
            scope=args.scope,
            test_eps=args.test_eps,
            checkpoint_path=reload_path,
            init_t=args.init_t,
        )
        print("Saving model to model.pkl")
        act.save(os.path.join(directory, "model.pkl"))
    plot(records, directory)
    env.close()
def main(envStride, fileIn, fileOut, inputmaxtimesteps):

    reuseModels = None
    
    np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x})

    env = envstandalone.PuckArrange()
    env.stride = envStride # stride input to this problem
    env.reset() # need to do the reset here in order to populate parameters
    
    # Standard q-learning parameters
#    max_timesteps=2000
    max_timesteps=inputmaxtimesteps
    exploration_fraction=0.3
    exploration_final_eps=0.1
    gamma=.90
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts=60
    buffer_size=1000
    batch_size=10
    target_network_update_freq=1
    train_freq=1
    print_freq=1
    lr=0.0003

    # first two elements of descriptorShape must be odd
    descriptorShape = (env.blockSize*3,env.blockSize*3,2)
#    descriptorShapeSmall = (10,10,2)
#    descriptorShapeSmall = (15,15,2)
    descriptorShapeSmall = (20,20,2)
    num_states = 2 # either holding or not
    num_patches = len(env.moveCenters)**2
    num_actions = 2*num_patches
    num_actions_discrete = 2
#    valueFunctionType = "TABULAR"
    valueFunctionType = "DQN"
#    actionSelectionStrategy = "UNIFORM_RANDOM" # actions are selected randomly from collection of all actions
    actionSelectionStrategy = "RANDOM_UNIQUE" # each unique action descriptor has equal chance of being selected

    episode_rewards = [0.0]
    
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

#    prioritized_replay=True
    prioritized_replay=False
#    prioritized_replay_alpha=1.0
    prioritized_replay_alpha=0.6
    prioritized_replay_beta0=0.4
    prioritized_replay_beta_iters=None
#    prioritized_replay_beta_iters=20000
    prioritized_replay_eps=1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1

    q_func = models.cnn_to_mlp(
#    q_func = models.cnn_to_mlp_2pathways(
#        convs=[(16,3,1), (32,3,1)],
#        hiddens=[48],
        convs=[(16,3,1)],
        hiddens=[32],
#        convs=[(32,3,1)],
#        hiddens=[48],
#        convs=[(48,3,1)],
#        hiddens=[48],
        dueling=True
    )

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        return U.BatchInput(descriptorShapeSmall, name=name)

    def make_target_ph(name):
        return U.BatchInput([1], name=name)

    def make_weight_ph(name):
        return U.BatchInput([1], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph,actionShape=descriptorShape,actionShapeSmall=descriptorShapeSmall,stride=env.stride)
    
    if valueFunctionType == 'DQN':
        getqNotHolding = build_getq(
                make_actionDeic_ph=make_actionDeic_ph,
                q_func=q_func,
                num_states=num_states,
                num_cascade=5,
                scope="deepq",
                qscope="q_func_notholding",
                reuse=reuseModels
                )
        getqHolding = build_getq(
                make_actionDeic_ph=make_actionDeic_ph,
                q_func=q_func,
                num_states=num_states,
                num_cascade=5,
                scope="deepq",
                qscope="q_func_holding",
                reuse=reuseModels
                )
    
        targetTrainNotHolding = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq", 
            qscope="q_func_notholding",
            grad_norm_clipping=1.,
            reuse=reuseModels
        )

        targetTrainHolding = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq", 
            qscope="q_func_holding",
            grad_norm_clipping=1.,
            reuse=reuseModels
        )
        
    sess = U.make_session(num_cpu)
    sess.__enter__()

    obs = env.reset()

    episode_rewards = [0.0]
    td_errors = [0.0]
    timerStart = time.time()
    U.initialize()
    
    # load prior model
    if fileIn != "None":
        saver = tf.train.Saver()
        saver.restore(sess, fileIn)

    for t in range(max_timesteps):
        
        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptors = getMoveActionDescriptors([obs[0]])
        moveDescriptors = moveDescriptors*2-1
        actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3)
        actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)),moveDescriptors],axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors,actionsPlaceDescriptors]

        qCurrNotHolding = getqNotHolding(actionDescriptors)
        qCurrHolding = getqHolding(actionDescriptors)
        qCurr = np.concatenate([qCurrNotHolding,qCurrHolding],axis=1)

        # select action: epsilon-greedy over the current q-values
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        if actionSelectionStrategy == "UNIFORM_RANDOM":
            action = np.argmax(qCurrNoise[:,obs[1]])
            if np.random.rand() < exploration.value(t):
                action = np.random.randint(num_actions)
        elif actionSelectionStrategy == "RANDOM_UNIQUE":
            _,idx,inv = np.unique(actionDescriptors,axis=0,return_index=True,return_inverse=True)
            actionIdx = np.argmax(qCurrNoise[idx,obs[1]])
            if np.random.rand() < exploration.value(t):
                actionIdx = np.random.randint(len(idx))
            actionsSelected = np.nonzero(inv==actionIdx)[0]
            action = actionsSelected[np.random.randint(len(actionsSelected))]
        else:
            print("error: unrecognized actionSelectionStrategy: " + str(actionSelectionStrategy))

        # take action
        new_obs, rew, done, _ = env.step(action)
        
        replay_buffer.add(obs[1], actionDescriptors[action,:], rew, np.copy(new_obs), float(done))

        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                beta=beta_schedule.value(t)
                states_t, actionPatches, rewards, images_tp1, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(batch_size, beta)
            else:
                states_t, actionPatches, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            moveDescriptorsNext = getMoveActionDescriptors(images_tp1)
            moveDescriptorsNext = moveDescriptorsNext*2-1

            actionsPickDescriptorsNext = np.stack([moveDescriptorsNext, np.zeros(np.shape(moveDescriptorsNext))],axis=3)
            actionsPlaceDescriptorsNext = np.stack([np.zeros(np.shape(moveDescriptorsNext)), moveDescriptorsNext],axis=3)
            actionDescriptorsNext = np.stack([actionsPickDescriptorsNext, actionsPlaceDescriptorsNext], axis=1) # I sometimes get this axis parameter wrong... pay attention!
            actionDescriptorsNext = np.reshape(actionDescriptorsNext,[-1,descriptorShapeSmall[0],descriptorShapeSmall[1],descriptorShapeSmall[2]])

            qNextNotHolding = getqNotHolding(actionDescriptorsNext)
            qNextHolding = getqHolding(actionDescriptorsNext)
            qNextFlat = np.concatenate([qNextNotHolding,qNextHolding],axis=1)

            qNext = np.reshape(qNextFlat,[batch_size,num_patches,num_actions_discrete,num_states])
            qNextmax = np.max(np.max(qNext[range(batch_size),:,:,states_tp1],2),1)
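            # qNext is reshaped to (batch, num_patches, num_actions_discrete, num_states);
            # indexing with states_tp1 selects each sample's next hold state, and the two
            # np.max calls take the best value over discrete action types and patches for
            # that state.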
            
            targets = rewards + (1-dones) * gamma * qNextmax
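            # Worked example of the one-step Bellman target: with gamma = 0.9, rew = 1.0,
            # done = 0 and max_a Q(s', a) = 5.0 the target is 1.0 + 0.9 * 5.0 = 5.5;
            # when done = 1 the bootstrap term drops out and the target is just the reward.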
            
            qCurrTargetNotHolding = getqNotHolding(actionPatches)
            qCurrTargetHolding = getqHolding(actionPatches)
            qCurrTarget = np.concatenate([qCurrTargetNotHolding,qCurrTargetHolding],axis=1)

            td_error = qCurrTarget[range(batch_size),states_t] - targets
            qCurrTarget[range(batch_size),states_t] = targets

            targetTrainNotHolding(actionPatches, np.reshape(qCurrTarget[:,0],[batch_size,1]), np.reshape(weights,[batch_size,1]))
            targetTrainHolding(actionPatches, np.reshape(qCurrTarget[:,1],[batch_size,1]), np.reshape(weights,[batch_size,1]))

            if prioritized_replay:
                new_priorities = np.abs(td_error) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)
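                # Proportional prioritization: the new priority is |TD error| plus a
                # small prioritized_replay_eps so transitions with near-zero error can
                # still be resampled occasionally.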

            td_errors[-1] += np.sum(td_error)  # accumulate a scalar per episode rather than a batch-sized array


        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
            td_errors.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1)
#        mean_100ep_tderror = round(np.mean(td_errors[-51:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
#            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart) + ", tderror: " + str(mean_100ep_tderror))
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal
        
        obs = np.copy(new_obs)

    # save what we learned
    if fileOut != "None":
        saver = tf.train.Saver()
        saver.save(sess, fileOut)

    # display value function
    obs = env.reset()
    moveDescriptors = getMoveActionDescriptors([obs[0]])
    moveDescriptors = moveDescriptors*2-1
    gridSize = np.int32(np.sqrt(np.shape(moveDescriptors)[0]))

    actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3)
    actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors],axis=3)
    
    print(str(obs[0][:,:,0]))
    
    qPickNotHolding = getqNotHolding(actionsPickDescriptors)
    qPickHolding = getqHolding(actionsPickDescriptors)
    qPick = np.concatenate([qPickNotHolding,qPickHolding],axis=1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:,0],[gridSize,gridSize])))
    print("Value function for pick action in hold-1 state:")
    print(str(np.reshape(qPick[:,1],[gridSize,gridSize])))

    qPlaceNotHolding = getqNotHolding(actionsPlaceDescriptors)
    qPlaceHolding = getqHolding(actionsPlaceDescriptors)
    qPlace = np.concatenate([qPlaceNotHolding,qPlaceHolding],axis=1)
    print("Value function for place action in hold-nothing state:")
    print(str(np.reshape(qPlace[:,0],[gridSize,gridSize])))
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:,1],[gridSize,gridSize])))
    
    plt.subplot(1,3,1)
    plt.imshow(np.tile(env.state[0],[1,1,3]))
    plt.subplot(1,3,2)
    plt.imshow(np.reshape(qPick[:,0],[gridSize,gridSize]))
    plt.subplot(1,3,3)
    plt.imshow(np.reshape(qPlace[:,1],[gridSize,gridSize]))
    plt.show()
Exemple #21
0
def main(max_timesteps):

    np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x})

    env = envstandalone.BlockArrange()

    # Standard q-learning parameters
    #    max_timesteps=30000
    #    exploration_fraction=0.3
    exploration_fraction = 1
    exploration_final_eps = 0.1
    gamma = .90
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts = 10
    buffer_size = 10000
    batch_size = 10
    target_network_update_freq = 1
    train_freq = 1
    print_freq = 1
    lr = 0.0003

    # first two elts of deicticShape must be odd
    num_patches = env.maxSide**2
    num_actions = 2 * num_patches
    #    valueFunctionType = "TABULAR"
    valueFunctionType = "DQN"

    fullImageSize = (env.maxSide, env.maxSide, 1)

    episode_rewards = [0.0]

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
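    # LinearSchedule interpolates from initial_p to final_p over schedule_timesteps and
    # then holds final_p, so exploration.value(t) is the epsilon used at step t.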

    prioritized_replay = False
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1

    q_func = models.cnn_to_mlp(convs=[(16, 3, 1), (32, 3, 1)],
                               hiddens=[48],
                               dueling=True)

    def make_fullImage_ph(name):
        return U.BatchInput(fullImageSize, name=name)

    def make_target_fullstate_ph(name):
        return U.BatchInput([num_actions], name=name)

    def make_weight_fullstate_ph(name):
        return U.BatchInput([num_actions], name=name)

    if valueFunctionType == 'DQN':

        getqFullStateNotHolding = build_getq_fullstate(
            make_fullImage_ph=make_fullImage_ph,
            q_func=q_func,
            num_actions=num_actions,
            num_cascade=1,
            scope="deepq",
            qscope="q_func_fullstate_notholding",
            reuse=None)
        getqFullStateHolding = build_getq_fullstate(
            make_fullImage_ph=make_fullImage_ph,
            q_func=q_func,
            num_actions=num_actions,
            num_cascade=1,
            scope="deepq",
            qscope="q_func_fullstate_holding",
            reuse=None)

        targetTrainFullStateNotHolding = build_targetTrain_fullstate(
            make_fullImage_ph=make_fullImage_ph,
            make_target_ph=make_target_fullstate_ph,
            make_weight_ph=make_weight_fullstate_ph,
            q_func=q_func,
            num_actions=num_actions,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq",
            qscope="q_func_fullstate_notholding",
            grad_norm_clipping=None,
            reuse=None)
        targetTrainFullStateHolding = build_targetTrain_fullstate(
            make_fullImage_ph=make_fullImage_ph,
            make_target_ph=make_target_fullstate_ph,
            make_weight_ph=make_weight_fullstate_ph,
            q_func=q_func,
            num_actions=num_actions,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq",
            qscope="q_func_fullstate_holding",
            grad_norm_clipping=None,
            reuse=None)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    obs = env.reset()
    episode_rewards = [0.0]
    timerStart = time.time()
    U.initialize()

    for t in range(max_timesteps):

        # Get qCurr values
        if obs[1]:
            qCurr = getqFullStateHolding([obs[0]])
        else:
            qCurr = getqFullStateNotHolding([obs[0]])

        # select action: epsilon-greedy over the current q-values
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise)
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(num_actions)

        # take action
        new_obs, rew, done, _ = env.step(action)

        # stateImage_t, stateDiscrete_t, actionDiscrete_t, reward, stateImage_tp1, stateDiscrete_tp1, done
        replay_buffer.add(np.copy(obs[0]), np.copy(obs[1]), np.copy(action),
                          np.copy(rew), np.copy(new_obs[0]),
                          np.copy(new_obs[1]), np.copy(float(done)))

        if t > learning_starts and t % train_freq == 0:

            states_images_t, states_discrete_t, actions, rewards, states_images_tp1, states_discrete_tp1, dones = replay_buffer.sample(
                batch_size)
            weights, batch_idxes = np.ones_like(rewards), None

            qNextNotHolding = getqFullStateNotHolding(states_images_tp1)
            qNextHolding = getqFullStateHolding(states_images_tp1)

            qNext = np.stack([qNextNotHolding, qNextHolding], axis=2)
            qNextmax = np.max(qNext[range(batch_size), :, states_discrete_tp1],
                              axis=1)
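            # Assuming each q function returns (batch, num_actions), the stack gives
            # qNext of shape (batch, num_actions, 2); indexing with states_discrete_tp1
            # picks the not-holding/holding column per sample, and the max over axis 1
            # is max_a Q(s', a) for that hold state.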
            targets = rewards + (1 - dones) * gamma * qNextmax

            qCurrNotHoldingBatch = getqFullStateNotHolding(states_images_t)
            qCurrHoldingBatch = getqFullStateHolding(states_images_t)

            qCurrTargetBatch = np.stack(
                [qCurrNotHoldingBatch, qCurrHoldingBatch], axis=2)
            qCurrTargetBatch[range(batch_size), actions,
                             states_discrete_t] = targets

            targetTrainFullStateNotHolding(
                states_images_t, qCurrTargetBatch[:, :, 0],
                np.tile(np.reshape(weights, [batch_size, 1]),
                        [1, num_actions]))
            targetTrainFullStateHolding(
                states_images_t, qCurrTargetBatch[:, :, 1],
                np.tile(np.reshape(weights, [batch_size, 1]),
                        [1, num_actions]))
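            # The importance weights are tiled to (batch_size, num_actions) because the
            # full-state trainers take one target and one weight per action output;
            # every entry except the executed action was set to the network's own
            # current prediction above, so those entries contribute little gradient.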

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1)
        #        mean_100ep_tderror = round(np.mean(td_errors[-51:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = copy.deepcopy(
            new_obs)  # without this deepcopy, RL totally fails...

    # save learning curve
    filename = 'BAR2_rewards_' + str(num_patches) + "_" + str(
        max_timesteps) + '.dat'
    np.savetxt(filename, episode_rewards)
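# Usage sketch (an assumption, not part of the original source): main takes only the
# number of timesteps, so a typical entry point would be
#     if __name__ == '__main__':
#         main(max_timesteps=30000)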
Exemple #22
0
def main():

    #    env = envstandalone.BallCatch()
    env = envstandalone.MultiGhostEvade()
    #    env = envstandalone.GhostEvade()

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
    #    buffer_size=1000
    #    exploration_fraction=0.2
    exploration_fraction = 0.4
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 1

    obsShape = (8, 8, 1)
    #    deicticShape = (3,3,4)
    #    num_deictic_patches=36

    num_actions = env.action_space.n
    episode_rewards = [0.0]
    num_cpu = 16

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        #        convs=[(16,3,1)],
        convs=[(16, 2, 1)],
        #        convs=[(32,3,1)],
        hiddens=[16],
        #        hiddens=[64],
        #        dueling=True
        dueling=False)

    q_func = model
    #    lr=1e-3
    lr = 0.001

    def make_obs_ph(name):
        #        return U.BatchInput(deicticShape, name=name)
        return U.BatchInput(obsShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq, targetTrain = build_graph.build_train_nodouble(
        make_obs_ph=make_obs_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        grad_norm_clipping=10,
        double_q=False)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    #    update_target()

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        # Get current q-values: neural network version
        qCurr = getq(np.array([obs]))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise, 1)
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        #        # debug
        #        if t > 5000:
        #            print("obs:\n" + str(np.squeeze(obs)))
        #            print("qCurr:\n" + str(qCurr))
        #            print("action: " + str(action) + ", patch: " + str(selPatch))
        #            print("close:\n" + str(obsDeictic[selPatch,:,:,0] + obsDeictic[selPatch,:,:,1]))
        #            print("far:\n" + str(obsDeictic[selPatch,:,:,2] + obsDeictic[selPatch,:,:,3]))
        #            action

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)
            actions = np.int32(np.reshape(actions, [
                batch_size,
            ]))

            # Get curr, next values: neural network version
            qNext = getq(obses_tp1)
            qCurr = getq(obses_t)

            # Get targets
            qNextmax = np.max(qNext, 1)
            targets = rewards + (1 - dones) * gamma * qNextmax

            qCurrTargets = np.zeros(np.shape(qCurr))
            for i in range(num_actions):
                myActions = actions == i
                qCurrTargets[:, i] = myActions * targets + (
                    1 - myActions) * qCurr[:, i]
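            # Per-action regression targets: the action actually taken gets the Bellman
            # target and every other action keeps the network's current estimate, so
            # only Q(s, a_taken) is pushed toward the target.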

            # Update values: neural network version
            td_error_out, obses_out, targets_out = targetTrain(
                obses_t, qCurrTargets)

            td_error_pre = qCurr[range(batch_size), actions] - targets

            #            print("td error pre-update: " + str(np.linalg.norm(td_error_pre)))

            # neural network version
            qCurr = getq(obses_t)

            td_error_post = qCurr[range(batch_size), actions] - targets


            #            print("td error post-update: " + str(np.linalg.norm(td_error_post)))

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            #            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", max q at curr state: " + str(np.max(qCurr)))
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
Exemple #23
0
def main(initEnvStride, envStride, fileIn, fileOut, inputmaxtimesteps,
         vispolicy):

    np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x})

    # Create environment and set stride parameters for this problem instance.
    # Most of the time, these two stride parameters will be equal. However,
    # one might use a smaller stride for initial placement and a larger stride
    # for action specification in order to speed things up. Unfortunately, this
    # could cause the problem to be infeasible: no grasp might work for a given
    # initial setup.
    env = envstandalone.PuckArrange()
    env.initStride = initEnvStride  # stride for initial puck placement
    env.stride = envStride  # stride for action specification

    # Standard q-learning parameters
    reuseModels = None
    max_timesteps = inputmaxtimesteps
    exploration_fraction = 0.5
    exploration_final_eps = 0.1
    gamma = .90
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts = 60
    buffer_size = 1000
    #    batch_size=32
    batch_size = 10
    target_network_update_freq = 1
    train_freq = 1
    print_freq = 1
    lr = 0.0003

    #    useHierarchy = False
    useHierarchy = True

    # Set parameters related to shape of the patch and the number of patches
    descriptorShape = (env.blockSize * 3, env.blockSize * 3, 2)
    #    descriptorShapeSmall = (10,10,2)
    #    descriptorShapeSmall = (15,15,2)
    descriptorShapeSmall = (20, 20, 2)
    num_states = 2  # either holding or not
    num_patches = len(env.moveCenters)**2
    num_actions = 2 * num_patches * env.num_orientations
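    # Action index layout assumed by the decoding further below:
    #     action = pickplace * num_moves * num_orientations + orientation * num_moves + position
    # i.e. actions are grouped by pick (0) / place (1), then orientation, then grid cell.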

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Set parameters for prioritized replay. You  can turn this off just by
    # setting the line below to False
    #    prioritized_replay=True
    prioritized_replay = False
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1

    # Create neural network
    q_func = models.cnn_to_mlp(convs=[(16, 3, 1)], hiddens=[32], dueling=True)

    # Build tensorflow functions
    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        return U.BatchInput(descriptorShapeSmall, name=name)

    def make_target_ph(name):
        return U.BatchInput([1], name=name)

    def make_weight_ph(name):
        return U.BatchInput([1], name=name)

    getMoveActionDescriptorsNoRot = build_getMoveActionDescriptors(
        make_obs_ph=make_obs_ph,
        actionShape=descriptorShape,
        actionShapeSmall=descriptorShapeSmall,
        stride=env.stride)
    getMoveActionDescriptorsRot = build_getMoveActionDescriptorsRot(
        make_obs_ph=make_obs_ph,
        actionShape=descriptorShape,
        actionShapeSmall=descriptorShapeSmall,
        stride=env.stride)

    getqNotHoldingRot = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                                   q_func=q_func,
                                   num_states=num_states,
                                   num_cascade=5,
                                   scope="deepq",
                                   qscope="q_func_notholding_rot",
                                   reuse=reuseModels)
    getqHoldingRot = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                                q_func=q_func,
                                num_states=num_states,
                                num_cascade=5,
                                scope="deepq",
                                qscope="q_func_holding_rot",
                                reuse=reuseModels)

    targetTrainNotHoldingRot = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_states=num_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func_notholding_rot",
        grad_norm_clipping=1.,
        reuse=reuseModels)

    targetTrainHoldingRot = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_states=num_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func_holding_rot",
        grad_norm_clipping=1.,
        reuse=reuseModels)

    getqNotHoldingNoRot = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                                     q_func=q_func,
                                     num_states=num_states,
                                     num_cascade=5,
                                     scope="deepq",
                                     qscope="q_func_notholding_norot",
                                     reuse=reuseModels)
    getqHoldingNoRot = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                                  q_func=q_func,
                                  num_states=num_states,
                                  num_cascade=5,
                                  scope="deepq",
                                  qscope="q_func_holding_norot",
                                  reuse=reuseModels)

    targetTrainNotHoldingNoRot = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_states=num_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func_notholding_norot",
        grad_norm_clipping=1.,
        reuse=reuseModels)

    targetTrainHoldingNoRot = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_states=num_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func_holding_norot",
        grad_norm_clipping=1.,
        reuse=reuseModels)

    # Initialize tabular state-value function. There are only two states (holding, not holding), so this is very easy.
    lrState = 0.1
    V = np.zeros([
        2,
    ])

    # Start tensorflow session
    sess = U.make_session(num_cpu)
    sess.__enter__()

    # Initialize things
    obs = env.reset()
    episode_rewards = [0.0]
    timerStart = time.time()
    U.initialize()

    # Load neural network model if one was specified.
    if fileIn != "None":
        saver = tf.train.Saver()
        saver.restore(sess, fileIn)
        fileInV = fileIn + 'V.npy'
        V = np.load(fileInV)

    # Iterate over time steps
    for t in range(max_timesteps):

        # Use hierarchy to get candidate actions
        if useHierarchy:

            # Get NoRot descriptors
            moveDescriptorsNoRot = getMoveActionDescriptorsNoRot([obs[0]])
            moveDescriptorsNoRot = moveDescriptorsNoRot * 2 - 1
            actionsPickDescriptorsNoRot = np.stack([
                moveDescriptorsNoRot,
                np.zeros(np.shape(moveDescriptorsNoRot))
            ],
                                                   axis=3)
            actionsPlaceDescriptorsNoRot = np.stack([
                np.zeros(np.shape(moveDescriptorsNoRot)), moveDescriptorsNoRot
            ],
                                                    axis=3)
            actionDescriptorsNoRot = np.r_[actionsPickDescriptorsNoRot,
                                           actionsPlaceDescriptorsNoRot]

            # Get NoRot values
            if obs[1] == 0:
                qCurrPick = getqNotHoldingNoRot(actionsPickDescriptorsNoRot)
                qCurrPlace = getqNotHoldingNoRot(actionsPlaceDescriptorsNoRot)
            elif obs[1] == 1:
                qCurrPick = getqHoldingNoRot(actionsPickDescriptorsNoRot)
                qCurrPlace = getqHoldingNoRot(actionsPlaceDescriptorsNoRot)
            else:
                print("error: state out of bounds")
            qCurrNoRot = np.squeeze(np.r_[qCurrPick, qCurrPlace])

            # Get Rot actions corresponding to top k% NoRot actions
            k = 0.2  # top k% of NoRot actions
            valsNoRot = qCurrNoRot
            topKactionsNoRot = np.argsort(
                valsNoRot)[-np.int32(np.shape(valsNoRot)[0] * k):]
            topKpositionsNoRot = topKactionsNoRot % env.num_moves
            topKpickplaceNoRot = topKactionsNoRot // env.num_moves  # integer division: 0 = pick, 1 = place
            actionsCandidates = []
            for ii in range(2):
                eltsPos = topKpositionsNoRot[topKpickplaceNoRot == ii]
                for jj in range(env.num_orientations):
                    actionsCandidates = np.r_[
                        actionsCandidates, eltsPos + jj * env.num_moves + ii *
                        (env.num_moves * env.num_orientations)]
            actionsCandidates = np.int32(actionsCandidates)
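            # The candidate set is the top k = 20% of position/pick-place actions under
            # the rotation-free value function, expanded to every orientation of those
            # positions; only these candidates are scored by the rotation-aware heads
            # below, which is what makes the hierarchy cheaper than exhaustive scoring.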

        # No hierarchy
        else:
            actionsCandidates = range(2 * env.num_moves * env.num_orientations)

        # Get Rot descriptors
        moveDescriptorsRot = getMoveActionDescriptorsRot([obs[0]])
        moveDescriptorsRot = moveDescriptorsRot * 2 - 1
        actionsPickDescriptorsRot = np.stack(
            [moveDescriptorsRot,
             np.zeros(np.shape(moveDescriptorsRot))],
            axis=3)
        actionsPlaceDescriptorsRot = np.stack(
            [np.zeros(np.shape(moveDescriptorsRot)), moveDescriptorsRot],
            axis=3)
        actionDescriptorsRot = np.r_[actionsPickDescriptorsRot,
                                     actionsPlaceDescriptorsRot]

        # Get qCurr using actionCandidates
        actionDescriptorsRotReduced = actionDescriptorsRot[actionsCandidates]
        if obs[1] == 0:
            qCurrReduced = np.squeeze(
                getqNotHoldingRot(actionDescriptorsRotReduced))
        elif obs[1] == 1:
            qCurrReduced = np.squeeze(
                getqHoldingRot(actionDescriptorsRotReduced))
        else:
            print("error: state out of bounds")
        qCurr = -100 * np.ones(np.shape(actionDescriptorsRot)[0])
        qCurr[actionsCandidates] = np.copy(qCurrReduced)

        #        # Get qCurr. I split up pick and place in order to accomodate larger batches
        #        if obs[1] == 0:
        #            qCurrPick = getqNotHoldingRot(actionsPickDescriptorsRot)
        #            qCurrPlace = getqNotHoldingRot(actionsPlaceDescriptorsRot)
        #        elif obs[1] == 1:
        #            qCurrPick = getqHoldingRot(actionsPickDescriptorsRot)
        #            qCurrPlace = getqHoldingRot(actionsPlaceDescriptorsRot)
        #        else:
        #            print("error: state out of bounds")
        #        qCurr = np.squeeze(np.r_[qCurrPick,qCurrPlace])

        # Update tabular state-value function using V(s) = max_a Q(s,a)
        thisStateValues = np.max(qCurr)
        V[obs[1]] = (1 - lrState) * V[obs[1]] + lrState * thisStateValues
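        # Tabular TD-style update toward the greedy value estimate:
        #     V(s) <- (1 - lrState) * V(s) + lrState * max_a Q(s, a)
        # with lrState = 0.1 and s the discrete hold state obs[1].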

        #        # Select e-greedy action to execute
        #        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        #        action = np.argmax(qCurrNoise)
        #        if (np.random.rand() < exploration.value(t)) and not vispolicy:
        #            action = np.random.randint(num_actions)

        # e-greedy + softmax
        #        qCurrExp = np.exp(qCurr/0.3)
        qCurrExp = np.exp(qCurr / 0.2)
        #        qCurrExp = np.exp(qCurr/0.1)
        probs = qCurrExp / np.sum(qCurrExp)
        action = np.random.choice(range(np.size(probs)), p=probs)
        if (np.random.rand() < exploration.value(t)) and not vispolicy:
            action = np.random.randint(num_actions)
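        # Boltzmann (softmax) sampling at temperature 0.2, P(a) proportional to
        # exp(Q(a) / 0.2), followed by an epsilon-greedy override; non-candidate actions
        # keep the -100 placeholder value, so their sampling probability is effectively zero.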

        position = action % env.num_moves
        pickplace = action // (env.num_moves * env.num_orientations)  # integer division: 0 = pick, 1 = place
#        orientation = action / env.num_moves
        orientation = (action - pickplace * env.num_moves *
                       env.num_orientations) // env.num_moves
        actionNoRot = position + pickplace * env.num_moves

        if vispolicy:
            print("action: " + str(action))
            print("position: " + str(position))
            print("pickplace: " + str(pickplace))
            print("orientation: " + str(orientation))
            vposition = env.moveCenters[position // len(env.moveCenters)]
            hposition = env.moveCenters[position % len(env.moveCenters)]
            plt.subplot(1, 2, 1)
            im = env.state[0][:, :, 0]
            im[vposition, hposition] = 0.5
            plt.imshow(env.state[0][:, :, 0])
            #            plt.show()

        # Execute action
        new_obs, rew, done, _ = env.step(action)

        if useHierarchy:
            # store both NoRot and Rot descriptors
            replay_buffer.add(cp.copy(obs[1]),
                              np.copy(actionDescriptorsNoRot[actionNoRot, :]),
                              np.copy(actionDescriptorsRot[action, :]),
                              cp.copy(rew), cp.copy(new_obs[1]),
                              cp.copy(float(done)))
        else:
            # store only Rot descriptor
            replay_buffer.add(cp.copy(obs[1]),
                              np.copy(actionDescriptorsRot[action, :]),
                              np.copy(actionDescriptorsRot[action, :]),
                              cp.copy(rew), cp.copy(new_obs[1]),
                              cp.copy(float(done)))

        if vispolicy:
            print("rew: " + str(rew))
            print("done: " + str(done))
            plt.subplot(1, 2, 2)
            plt.imshow(env.state[0][:, :, 0])
            plt.show()

        if t > learning_starts and t % train_freq == 0:

            # Get batch
            if prioritized_replay:
                beta = beta_schedule.value(t)
                states_t, actionPatchesNoRot, actionPatchesRot, rewards, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(
                    batch_size, beta)
            else:
                states_t, actionPatchesNoRot, actionPatchesRot, rewards, states_tp1, dones = replay_buffer.sample(
                    batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            # Calculate target
            targets = rewards + (1 - dones) * gamma * V[states_tp1]
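            # Unlike the earlier examples, the bootstrap term here is the tabular state
            # value V(s') rather than max_a Q(s', a); with only two hold states this
            # keeps the target cheap to compute.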

            # Get current q-values and calculate td error and q-value targets
            qCurrTargetNotHolding = getqNotHoldingRot(actionPatchesRot)
            qCurrTargetHolding = getqHoldingRot(actionPatchesRot)
            qCurrTarget = np.concatenate(
                [qCurrTargetNotHolding, qCurrTargetHolding], axis=1)
            td_error = qCurrTarget[range(batch_size), states_t] - targets
            qCurrTarget[range(batch_size), states_t] = targets

            # Train
            targetTrainNotHoldingRot(
                actionPatchesRot, np.reshape(qCurrTarget[:, 0],
                                             [batch_size, 1]),
                np.reshape(weights, [batch_size, 1]))
            targetTrainHoldingRot(
                actionPatchesRot, np.reshape(qCurrTarget[:, 1],
                                             [batch_size, 1]),
                np.reshape(weights, [batch_size, 1]))

            # Only train NoRot if we're doing the hierarchy
            if useHierarchy:

                #                qCurrTargetNotHoldingNoRot = getqNotHoldingNoRot(actionPatchesNoRot)
                #                qCurrTargetHoldingNoRot = getqHoldingNoRot(actionPatchesNoRot)
                #                qCurrTargetNoRot = np.concatenate([qCurrTargetNotHoldingNoRot,qCurrTargetHoldingNoRot],axis=1)
                #                idx = np.nonzero(np.int32(qCurrTargetNoRot[range(batch_size),states_t] > targets))
                #                targets[idx] = qCurrTargetNoRot[idx,states_t[idx]]

                targetTrainNotHoldingNoRot(
                    actionPatchesNoRot,
                    np.reshape(qCurrTarget[:, 0], [batch_size, 1]),
                    np.reshape(weights, [batch_size, 1]))
                targetTrainHoldingNoRot(
                    actionPatchesNoRot,
                    np.reshape(qCurrTarget[:, 1], [batch_size, 1]),
                    np.reshape(weights, [batch_size, 1]))
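                # The rotation-free heads are trained on the same targets as the
                # rotation-aware heads; they are only used above to prune candidate
                # actions, never to select the executed action directly.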

            # Update replay priorities using td_error
            if prioritized_replay:
                new_priorities = np.abs(td_error) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            #            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % exploration factor: " + str(int(100*explorationGaussianFactor.value(t))) + ", time elapsed: " + str(timerFinal - timerStart))

            timerStart = timerFinal

        obs = np.copy(new_obs)

    # save what we learned
    if fileOut != "None":
        saver = tf.train.Saver()
        saver.save(sess, fileOut)
        fileOutV = fileOut + 'V'
        print("fileOutV: " + fileOutV)
        np.save(fileOutV, V)

    # display value function
    obs = env.reset()

    moveDescriptorsNoRot = getMoveActionDescriptorsNoRot([obs[0]])
    moveDescriptorsNoRot = moveDescriptorsNoRot * 2 - 1
    actionsPickDescriptors = np.stack(
        [moveDescriptorsNoRot,
         np.zeros(np.shape(moveDescriptorsNoRot))],
        axis=3)
    actionsPlaceDescriptors = np.stack(
        [np.zeros(np.shape(moveDescriptorsNoRot)), moveDescriptorsNoRot],
        axis=3)
    qPickNotHoldingNoRot = getqNotHoldingNoRot(actionsPickDescriptors)
    qPickHoldingNoRot = getqHoldingNoRot(actionsPickDescriptors)
    qPickNoRot = np.concatenate([qPickNotHoldingNoRot, qPickHoldingNoRot],
                                axis=1)
    qPlaceNotHoldingNoRot = getqNotHoldingNoRot(actionsPlaceDescriptors)
    qPlaceHoldingNoRot = getqHoldingNoRot(actionsPlaceDescriptors)
    qPlaceNoRot = np.concatenate([qPlaceNotHoldingNoRot, qPlaceHoldingNoRot],
                                 axis=1)

    moveDescriptors = getMoveActionDescriptorsRot([obs[0]])
    moveDescriptors = moveDescriptors * 2 - 1
    actionsPickDescriptors = np.stack(
        [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
    actionsPlaceDescriptors = np.stack(
        [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
    qPickNotHolding = getqNotHoldingRot(actionsPickDescriptors)
    qPickHolding = getqHoldingRot(actionsPickDescriptors)
    qPick = np.concatenate([qPickNotHolding, qPickHolding], axis=1)
    qPlaceNotHolding = getqNotHoldingRot(actionsPlaceDescriptors)
    qPlaceHolding = getqHoldingRot(actionsPlaceDescriptors)
    qPlace = np.concatenate([qPlaceNotHolding, qPlaceHolding], axis=1)

    gridSize = len(env.moveCenters)
    print("Value function for pick action in hold-0 state:")
    print(str(np.reshape(qPickNoRot[:gridSize**2, 0], [gridSize, gridSize])))
    print("Value function for pick action for rot0 in hold-0 state:")
    print(str(np.reshape(qPick[:gridSize**2, 0], [gridSize, gridSize])))
    print("Value function for pick action for rot1 in hold-0 state:")
    print(
        str(
            np.reshape(qPick[gridSize**2:2 * gridSize**2, 0],
                       [gridSize, gridSize])))
    print("Value function for pick action for rot2 in hold-0 state:")
    print(
        str(
            np.reshape(qPick[2 * gridSize**2:3 * gridSize**2, 0],
                       [gridSize, gridSize])))
    print("Value function for pick action for rot3 in hold-0 state:")
    print(
        str(
            np.reshape(qPick[3 * gridSize**2:4 * gridSize**2, 0],
                       [gridSize, gridSize])))

    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlaceNoRot[:gridSize**2, 1], [gridSize, gridSize])))
    print("Value function for place action for rot0 in hold-1 state:")
    print(str(np.reshape(qPlace[:gridSize**2, 1], [gridSize, gridSize])))
    print("Value function for place action for rot1 in hold-1 state:")
    print(
        str(
            np.reshape(qPlace[gridSize**2:2 * gridSize**2, 1],
                       [gridSize, gridSize])))
    print("Value function for place action for rot2 in hold-1 state:")
    print(
        str(
            np.reshape(qPlace[2 * gridSize**2:3 * gridSize**2, 1],
                       [gridSize, gridSize])))
    print("Value function for place action for rot3 in hold-1 state:")
    print(
        str(
            np.reshape(qPlace[3 * gridSize**2:4 * gridSize**2, 1],
                       [gridSize, gridSize])))

    plt.subplot(2, 10, 1)
    plt.imshow(np.tile(env.state[0], [1, 1, 3]), interpolation=None)
    plt.subplot(2, 10, 2)
    plt.imshow(np.reshape(qPick[:gridSize**2, 0], [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 3)
    plt.imshow(np.reshape(qPick[gridSize**2:2 * gridSize**2, 0],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 4)
    plt.imshow(np.reshape(qPick[2 * gridSize**2:3 * gridSize**2, 0],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 5)
    plt.imshow(np.reshape(qPick[3 * gridSize**2:4 * gridSize**2, 0],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 6)
    plt.imshow(np.reshape(qPick[4 * gridSize**2:5 * gridSize**2, 0],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 7)
    plt.imshow(np.reshape(qPick[5 * gridSize**2:6 * gridSize**2, 0],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 8)
    plt.imshow(np.reshape(qPick[6 * gridSize**2:7 * gridSize**2, 0],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 9)
    plt.imshow(np.reshape(qPick[7 * gridSize**2:8 * gridSize**2, 0],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 10)
    plt.imshow(np.reshape(qPickNoRot[:gridSize**2, 0], [gridSize, gridSize]),
               vmin=5,
               vmax=12)

    plt.subplot(2, 10, 12)
    plt.imshow(np.reshape(qPlace[:gridSize**2, 1], [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 13)
    plt.imshow(np.reshape(qPlace[gridSize**2:2 * gridSize**2, 1],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 14)
    plt.imshow(np.reshape(qPlace[2 * gridSize**2:3 * gridSize**2, 1],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 15)
    plt.imshow(np.reshape(qPlace[3 * gridSize**2:4 * gridSize**2, 1],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 16)
    plt.imshow(np.reshape(qPlace[4 * gridSize**2:5 * gridSize**2, 1],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 17)
    plt.imshow(np.reshape(qPlace[5 * gridSize**2:6 * gridSize**2, 1],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 18)
    plt.imshow(np.reshape(qPlace[6 * gridSize**2:7 * gridSize**2, 1],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 19)
    plt.imshow(np.reshape(qPlace[7 * gridSize**2:8 * gridSize**2, 1],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 20)
    plt.imshow(np.reshape(qPlaceNoRot[:gridSize**2, 1], [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.show()
def main():

    env = envstandalone.TestRob3Env()
    
    max_timesteps=40000
    learning_starts=1000
    buffer_size=50000
#    buffer_size=1
    exploration_fraction=0.2
    exploration_final_eps=0.02
    print_freq=10
    gamma=.98
#    target_network_update_freq=500
#    target_network_update_freq=100
#    target_network_update_freq=10
    target_network_update_freq=1
    learning_alpha = 0.2
    
    batch_size=32
    train_freq=1

    obsShape = (8,8,1)
#    deicticShape = (3,3,1)
    deicticShape = (3,3,2)
    num_deictic_patches = 36

    num_actions = 4
    episode_rewards = [0.0]
    num_cpu=16
    num_cascade = 5
    
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)


    # CNN version
    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        convs=[(16,3,1)],
#        convs=[(16,2,1)],
        hiddens=[16],
        dueling=True
    )
    
    # MLP version
#    model = models.mlp([8, 16])
#    model = models.mlp([16, 16])
#    model = models.mlp([16, 32])
#    model = models.mlp([16, 16])
#    model = models.mlp([32, 32])

    q_func=model
    lr=0.001
    
    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)
    
    def make_obsDeic_ph(name):

        # CNN version
        return U.BatchInput(deicticShape, name=name)
        
#        # MLP version
#        return U.BatchInput([deicticShape[0]*deicticShape[1]*deicticShape[2]], name=name)

    def make_target_ph(name):
#        return U.BatchInput([num_actions], name=name)
        return U.BatchInput([num_cascade,num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq = build_getq(
            make_obsDeic_ph=make_obsDeic_ph,
            q_func=q_func,
            num_actions=num_actions,
            num_cascade=num_cascade,
            scope="deepq",
            qscope="q_func"
            )
    
    getqTarget = build_getq(
            make_obsDeic_ph=make_obsDeic_ph,
            q_func=q_func,
            num_actions=num_actions,
            num_cascade=num_cascade,
            scope="deepq",
            qscope="q_func_target"
            )

    update_target = build_update_target(scope="deepq", 
                                        qscope="q_func",
                                        qscopeTarget="q_func_target")
                      
    targetTrain = build_targetTrain(
        make_obsDeic_ph=make_obsDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq", 
        qscope="q_func"
    )
    
    getDeic = build_getDeic(make_obs_ph=make_obs_ph,deicticShape=deicticShape)
    
    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()
    
    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        obsDeictic = getDeic([obs])
        
        # CNN version
        qCurr = getq(np.array(obsDeictic))
        
#        # MLP version
#        qCurr = getq(np.reshape(obsDeictic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise[:,-1,:],0)) # USE CASCADE
#        action = np.argmax(np.max(qCurrNoise[:,0,:],0)) # DO NOT USE CASCADE
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeic(obses_t)
            obses_tp1_deic = getDeic(obses_tp1)
            
            # Reshape everything to (1152,) form
            donesTiled = np.repeat(dones,num_deictic_patches)
            rewardsTiled = np.repeat(rewards,num_deictic_patches)
            actionsTiled = np.repeat(actions,num_deictic_patches)
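            # Each transition contributes num_deictic_patches rows (one per glimpse), so
            # rewards, dones and actions are repeated to line up with the flattened
            # (batch_size * num_deictic_patches) patch dimension.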
            
            # Get curr, next values: CNN version
            qNextTarget = getqTarget(obses_tp1_deic)
            qNext = getq(obses_tp1_deic)
            qCurr = getq(obses_t_deic)

#            # Get curr, next values: MLP version
#            qNext = getq(np.reshape(obses_tp1_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))
#            qCurr = getq(np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))

            # This version pairs a glimpse with the same glimpse on the next time step
            qNextmax = np.max(qNext[:,-1,:],1) # standard
#            actionsNext = np.argmax(qNextTarget[:,-1,:],1) # double-q
#            qNextmax = qNext[range(num_deictic_patches*batch_size),-1,actionsNext]
            
#            # This version takes the max over all glimpses
#            qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions])
#            qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches)

            # Compute Bellman estimate
            targets = rewardsTiled + (1-donesTiled) * gamma * qNextmax

#            # Take min over targets in same group
#            obses_t_deic_reshape = np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])
#            unique_deic, uniqueIdx, uniqueCounts= np.unique(obses_t_deic_reshape,return_inverse=True,return_counts=True,axis=0)
#            for i in range(np.shape(uniqueCounts)[0]):
#                targets[uniqueIdx==i] = np.min(targets[uniqueIdx==i])
            
            
            qCurrTargets = np.copy(qCurr)
            
            # Copy into cascade with pruning.
            qCurrTargets[range(batch_size*num_deictic_patches),0,actionsTiled] = targets
            for i in range(num_cascade-1):
                mask = targets < qCurrTargets[range(batch_size*num_deictic_patches),i,actionsTiled]
                qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled] = \
                    mask*targets + \
                    (1-mask)*qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled]
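            # Cascade with pruning: level 0 is set directly to the Bellman target, and
            # each deeper level keeps its old value unless the target is below the
            # (already updated) value at the preceding level, so later levels tend to
            # hold more conservative estimates; the action selection above reads the
            # deepest level via index -1.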
            
            # CNN version
            td_error_out, obses_deic_out, targets_out = targetTrain(
                    obses_t_deic,
                    qCurrTargets
                    )
            
#            # MLP version
#            td_error_out, obses_deic_out, targets_out = targetTrain(
#                    np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]),
#                    qCurrTargets
#                    )
                
        # Update target network periodically.
        if t > learning_starts and t % target_network_update_freq == 0:
            update_target()

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal
        
        obs = new_obs
def main():

    env = envstandalone.BallCatch()

    max_timesteps = 20000
    learning_starts = 1000
    buffer_size = 50000
    #    buffer_size=1000
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 4

    deicticShape = (3, 3, 4)
    num_deictic_patches = 36

    num_actions = 3
    episode_rewards = [0.0]
    num_cpu = 16
    num_cascade = 5

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Extract deictic patches for an input obs. Each deictic patch has a low level
    # and a foveated view.
    # input: n x n x 1
    # output: dn x dn x 4
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        obsShape = np.shape(obs)
        obsPadded = np.zeros(
            (obsShape[0] + 2 * windowLen, obsShape[1] + 2 * windowLen))
        obsPadded[windowLen:windowLen + obsShape[0],
                  windowLen:windowLen + obsShape[1]] = obs[:, :, 0]
        deicticObsThis = np.zeros(
            (windowLen, windowLen, 4)
        )  # channel1: zoomin window; channel2: agent in zoomout window; channel3: ball in zoomout window
        deicticObs = []
        for i in range(obsShape[0] - windowLen + 1):
            for j in range(obsShape[1] - windowLen + 1):
                deicticObsThis[:, :, 0] = obs[i:i + windowLen, j:j + windowLen,
                                              0] == 1  # agent zoomin
                deicticObsThis[:, :, 1] = obs[i:i + windowLen, j:j + windowLen,
                                              0] == 2  # ball zoomin
                patch = obsPadded[i:i + 3 * windowLen, j:j + 3 * windowLen]
                for k in range(1, 3):
                    # THE VERSION BELOW USES A FIXED VIEW
                    #                    deicticObsThis[:,:,k+1] = [[(k in obs[0:3,0:3,0]), (k in obs[0:3,3:5]), (k in obs[0:3,5:8,0])],
                    #                                 [(k in obs[3:5,0:3,0]), (k in obs[3:5,3:5,0]), (k in obs[3:5,5:8,0])],
                    #                                 [(k in obs[5:8,0:3,0]), (k in obs[5:8,3:5,0]), (k in obs[5:8,5:8,0])]]
                    # THE VERSION BELOW USES A WIDE VIEW W/ 2 UNITS IN EACH CELL
                    #                    deicticObsThis[:,:,k+1] = [[(k in patch[1:3,1:3]), (k in patch[1:3,3:5]), (k in patch[1:3,5:7])],
                    #                                 [(k in patch[3:5,1:3]), (k in patch[3:5,3:5]), (k in patch[3:5,5:7])],
                    #                                 [(k in patch[5:7,1:3]), (k in patch[5:7,3:5]), (k in patch[5:7,5:7])]]
                    # THE VERSION BELOW USES A WIDE VIEW W/ 3 UNITS IN EACH CELL
                    deicticObsThis[:, :, k + 1] = [[(k in patch[0:3, 0:3]),
                                                    (k in patch[0:3, 3:6]),
                                                    (k in patch[0:3, 6:9])],
                                                   [(k in patch[3:6, 0:3]),
                                                    (k in patch[3:6, 3:6]),
                                                    (k in patch[3:6, 6:9])],
                                                   [(k in patch[6:9, 0:3]),
                                                    (k in patch[6:9, 3:6]),
                                                    (k in patch[6:9, 6:9])]]
                deicticObs.append(
                    deicticObsThis.copy()
                )  # CAREFUL WITH APPENDING REFERENCES VS APPENDING COPIES!!! THIS WAS A BUG BEFORE I CORRECTED IT...

        return np.array(deicticObs)
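
    # Shape sketch, assuming an 8x8x1 observation (consistent with windowLen = 3 and
    # num_deictic_patches = 36 above): getDeicticObs returns (8-3+1)**2 = 36 patches,
    # each windowLen x windowLen x 4, e.g.
    #    patches = getDeicticObs(obs)
    #    assert np.shape(patches) == (36, 3, 3, 4)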

    # input: num_patches x windowLen x windowLen x 4 array of deictic observations
    # output: 4 x num_patches matrix of numeric position encodings
    def convertState(observations):

        # Reshape to batch x flatimage x channel.
        # Channel1 = zoomin agent, channel2 = zoomin ball
        # Channel3 = zoomout agent, channel4 = zoomout ball
        obs = np.zeros((36, 9, 4))
        for i in range(4):
            obs[:, :, i] = np.reshape(observations[:, :, :, i], [36, 9])

        # state_numeric: 4 x batch.
        # row0: pos of agent in zoomin, row1: pos of ball in zoomin
        # row2: pos of agent in zoomout, row3: pos of ball in zoomout
        shape = np.shape(obs)
        state_numeric = 9 * np.ones(
            (4, shape[0])
        )  # 9 indicates agent/ball does not appear at this zoom in this glance
        pos = np.nonzero(obs == 1)
        for i in range(4):
            idx = np.nonzero(pos[2] == i)[0]
            state_numeric[i, pos[0][idx]] = pos[1][idx]
#            state_numeric[i,pos[0][pos[2] == i]] = pos[1][pos[2] == i]

        return np.int32(state_numeric)
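
    # Worked example (sketch): if the agent sits at the center cell of patch p's
    # zoom-in window, then state_numeric[0, p] == 4 (the flattened index into the
    # 3x3 window); 9 still means "not visible at this zoom in this glance".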

    def convertStateBatch(observations):
        shape = np.shape(observations)
        state_numeric_batch = []
        for batch in range(shape[0]):
            state_numeric_batch.append(convertState(observations[batch]))
        return (np.array(state_numeric_batch))

    # Same as getDeicticObs, but it operates on a batch rather than a single obs
    # input: obs -> batches x n x n x 1
    # output: batches x glances x 3 x 3 x 4
    def getDeicticObsBatch(obs):
        obsShape = np.shape(obs)
        deicticObsBatch = []
        for batch in range(obsShape[0]):
            deicticObsBatch.append(getDeicticObs(obs[batch]))
        return (np.array(deicticObsBatch))

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        #        convs=[(16,3,1)],
        convs=[(16, 2, 1)],
        #        convs=[(32,3,1)],
        hiddens=[16],
        #        hiddens=[64],
        #        dueling=True
        dueling=False)

    q_func = model
    lr = 1e-3

    def make_obs_ph(name):
        return U.BatchInput(deicticShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_cascade, num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq, targetTrain = build_graph.build_train_cascaded(
        make_obs_ph=make_obs_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_cascade=num_cascade,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        grad_norm_clipping=10,
        double_q=False)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    #    update_target()

    dimSize = deicticShape[0] * deicticShape[1] + 1
    tabularQ = 1 * np.ones(
        (dimSize, dimSize, dimSize, dimSize, num_cascade, num_actions))
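
    # tabularQ is indexed by the four convertState rows (each in 0..9, hence
    # dimSize = 3*3 + 1), then cascade level and action; in this example it is only
    # used by the commented-out tabular updates below.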

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        # get current q-values
        obsDeictic = getDeicticObs(obs)

        #        # Get current q-values: tabular version
        #        stateCurr = convertState(obsDeictic)
        #        qCurr = tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],-1,:]

        # Get current q-values: neural network version
        qCurr = getq(np.array(obsDeictic))[:, -1, :]

        # select action
        qCurrNoise = qCurr + np.random.random(
            np.shape(qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise, 0))
        selPatch = np.argmax(np.max(qCurrNoise, 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        #        # debug
        #        if t > 5000:
        #            print("obs:\n" + str(np.squeeze(obs)))
        #            print("qCurr:\n" + str(qCurr))
        #            print("action: " + str(action) + ", patch: " + str(selPatch))
        #            print("close:\n" + str(obsDeictic[selPatch,:,:,0] + obsDeictic[selPatch,:,:,1]))
        #            print("far:\n" + str(obsDeictic[selPatch,:,:,2] + obsDeictic[selPatch,:,:,3]))
        #            action

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            obs_resize_to_network = [
                batch_size * num_deictic_patches, deicticShape[0],
                deicticShape[1], deicticShape[2]
            ]
            q_resize_from_network = [
                batch_size, num_deictic_patches, num_cascade, num_actions
            ]
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)
            obses_t_deic = getDeicticObsBatch(obses_t)
            obses_tp1_deic = getDeicticObsBatch(obses_tp1)

            #            # Get curr, next values: tabular version
            #            stateNext = convertStateBatch(obses_tp1_deic)
            #            qNext = tabularQ[stateNext[:,0,:], stateNext[:,1,:], stateNext[:,2,:], stateNext[:,3,:],-1,:]
            #            stateCurr = convertStateBatch(obses_t_deic)
            #            qCurr = tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],:,:]

            # Get curr, next values: neural network version
            qNext = np.reshape(
                getq(np.reshape(obses_tp1_deic, obs_resize_to_network)),
                q_resize_from_network)[:, :, -1, :]
            qCurr = np.reshape(
                getq(np.reshape(obses_t_deic, obs_resize_to_network)),
                q_resize_from_network)

            # Get "raw" targets (no masking for cascade levels)
            qNextmax = np.max(np.max(qNext, 2), 1)
            targetsRaw = rewards + (1 - dones) * gamma * qNextmax
            targetsTiled = np.tile(np.reshape(targetsRaw, [batch_size, 1, 1]),
                                   [1, num_deictic_patches, num_cascade])

            # Get qCurrActionSelect
            actionsTiled = np.tile(np.reshape(actions, [batch_size, 1, 1]),
                                   [1, num_deictic_patches, num_cascade])
            qCurrActionSelect = np.zeros(
                (batch_size, num_deictic_patches, num_cascade))
            for i in range(num_actions):
                qCurrActionSelect += (actionsTiled == i) * qCurr[:, :, :, i]

            # Get targets masked for cascade level
            targetMask = targetsTiled < qCurrActionSelect
            targets = np.zeros((batch_size, num_deictic_patches, num_cascade))
            targets[:, :, 0] = targetsTiled[:, :, 0]
            targets[:, :, 1] = targetMask[:, :, 0] * targetsTiled[:, :, 0] + (
                1 - targetMask[:, :, 0]) * qCurrActionSelect[:, :, 1]
            targets[:, :, 2] = targetMask[:, :, 1] * targetsTiled[:, :, 0] + (
                1 - targetMask[:, :, 1]) * qCurrActionSelect[:, :, 2]
            targets[:, :, 3] = targetMask[:, :, 2] * targetsTiled[:, :, 0] + (
                1 - targetMask[:, :, 2]) * qCurrActionSelect[:, :, 3]
            targets[:, :, 4] = targetMask[:, :, 3] * targetsTiled[:, :, 0] + (
                1 - targetMask[:, :, 3]) * qCurrActionSelect[:, :, 4]
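
            # Cascade sketch: level 0 regresses on the raw Bellman target; each deeper
            # level k only accepts the target where it fell below level k-1's current Q
            # for the selected action (targetMask), otherwise it keeps its own estimate,
            # so successive levels form an increasingly conservative cascade.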

            qCurrTargets = np.zeros(np.shape(qCurr))
            for i in range(num_actions):
                myActions = actionsTiled == i
                qCurrTargets[:, :, :, i] = myActions * targets + (
                    1 - myActions) * qCurr[:, :, :, i]


            #            # Update values: tabular version
            #            tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],:,actionsTiled[:,:,0]] = \
            #                (1 - learning_alpha) * tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],:,actionsTiled[:,:,0]] \
            #                + learning_alpha * targets

            # Update values: neural network version
            targets_resize_to_network = [
                batch_size * num_deictic_patches, num_cascade, num_actions
            ]
            td_error_out, obses_out, targets_out = targetTrain(
                np.reshape(obses_t_deic, obs_resize_to_network),
                np.reshape(qCurrTargets, targets_resize_to_network))

            td_error_pre = qCurrActionSelect - targets
            #            print("td error pre-update: " + str(np.linalg.norm(td_error_pre)))

            #            # tabular version
            #            qCurr = tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],:,:]

            # neural network version
            qCurr = np.reshape(
                getq(np.reshape(obses_t_deic, obs_resize_to_network)),
                q_resize_from_network)

            qCurrActionSelect_post = np.zeros(
                (batch_size, num_deictic_patches, num_cascade))
            for i in range(num_actions):
                qCurrActionSelect_post += (actionsTiled == i) * qCurr[:, :, :,
                                                                      i]

            td_error_post = qCurrActionSelect_post - targets
            #            print("td error post-update: " + str(np.linalg.norm(td_error_post)))

            #            # debug: no-op hook for breaking on batches that contain a negative reward
            #            if -1 in rewards:
            #                dones

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            #            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", max q at curr state: " + str(np.max(qCurr)))
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
def main():

    np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x})

    env = envstandalone.BlockArrange()

    # Standard q-learning parameters
    max_timesteps = 16000
    exploration_fraction = 0.3
    exploration_final_eps = 0.1
    gamma = .90
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts = 100
    buffer_size = 1000
    batch_size = 10
    target_network_update_freq = 1
    train_freq = 1
    print_freq = 1
    lr = 0.0003

    # first two elts of each shape below must be odd
    actionShape = (3, 3, 3)
    memoryShape = (3, 3, 3)
    stateActionShape = (3, 3, 6)  # includes place memory
    num_states = 2  # either holding or not
    num_patches = env.maxSide**2
    num_actions_discrete = 3  # pick/place/look
    num_actions = num_actions_discrete * num_patches
    #    valueFunctionType = "TABULAR"
    valueFunctionType = "DQN"
    #    actionSelectionStrategy = "UNIFORM_RANDOM" # actions are selected randomly from collection of all actions
    actionSelectionStrategy = "RANDOM_UNIQUE"  # each unique action descriptor has equal chance of being selected

    episode_rewards = [0.0]

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    #    prioritized_replay=True
    prioritized_replay = False
    #    prioritized_replay_alpha=1.0
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    #    prioritized_replay_beta_iters=20000
    prioritized_replay_eps = 1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1

    q_func = models.cnn_to_mlp(
        #    q_func = models.cnn_to_mlp_2pathways(
        #        convs=[(16,3,1), (32,3,1)],
        #        hiddens=[48],
        convs=[(32, 3, 1)],
        hiddens=[48],
        #        convs=[(48,3,1)],
        #        hiddens=[48],
        dueling=True)

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_deic_ph(name):
        return U.BatchInput(stateActionShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([1], name=name)

    def make_weight_ph(name):
        return U.BatchInput([1], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(
        make_obs_ph=make_obs_ph, deicticShape=actionShape)

    if valueFunctionType == 'DQN':
        getqNotHolding = build_getq(make_deic_ph=make_deic_ph,
                                    q_func=q_func,
                                    num_states=num_states,
                                    num_cascade=5,
                                    scope="deepq",
                                    qscope="q_func_notholding")
        getqHolding = build_getq(make_deic_ph=make_deic_ph,
                                 q_func=q_func,
                                 num_states=num_states,
                                 num_cascade=5,
                                 scope="deepq",
                                 qscope="q_func_holding")

        targetTrainNotHolding = build_targetTrain(
            make_deic_ph=make_deic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq",
            qscope="q_func_notholding",
            grad_norm_clipping=1.)

        targetTrainHolding = build_targetTrain(
            make_deic_ph=make_deic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq",
            qscope="q_func_holding",
            grad_norm_clipping=1.)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    obs = copy.deepcopy(env.reset())
    grid_t = obs[0]
    #    grid_t = np.int32(obs[0]>0)
    stateHolding_t = np.int32(obs[1] > 0)
    memory_t = np.zeros(
        [1, memoryShape[0], memoryShape[1],
         memoryShape[2]])  # first col is pick, second is place, third is look
    #    memory_t[0,:,:,2] = (env.pickBlockGoal + 2) * np.ones([memoryShape[1], memoryShape[2]]) # DEBUG

    episode_rewards = [0.0]
    timerStart = time.time()
    U.initialize()
    for t in range(max_timesteps):

        # Get state/action descriptors
        moveDescriptors = getMoveActionDescriptors([grid_t])
        moveDescriptors[moveDescriptors == 0] = -1
        actionsPickDescriptors = np.stack([
            moveDescriptors,
            np.zeros(np.shape(moveDescriptors)),
            np.zeros(np.shape(moveDescriptors))
        ],
                                          axis=3)
        actionsPlaceDescriptors = np.stack([
            np.zeros(np.shape(moveDescriptors)), moveDescriptors,
            np.zeros(np.shape(moveDescriptors))
        ],
                                           axis=3)
        actionsLookDescriptors = np.stack([
            np.zeros(np.shape(moveDescriptors)),
            np.zeros(np.shape(moveDescriptors)), moveDescriptors
        ],
                                          axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors,
                                  actionsPlaceDescriptors,
                                  actionsLookDescriptors]
        memoryTiled = np.repeat(memory_t,
                                num_patches * num_actions_discrete,
                                axis=0)
        stateActionDescriptors = np.concatenate(
            [actionDescriptors, memoryTiled], axis=3)
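
        # Descriptor layout (sketch): each candidate action is one 3x3 patch written
        # into the pick, place, or look channel of actionShape = (3, 3, 3); concatenating
        # the 3-channel place memory yields the stateActionShape = (3, 3, 6) input
        # consumed by the q-networks below.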

        # Get current values
        qCurrNotHolding = getqNotHolding(stateActionDescriptors)
        qCurrHolding = getqHolding(stateActionDescriptors)
        qCurr = np.concatenate([qCurrNotHolding, qCurrHolding], axis=1)

        # Select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        if actionSelectionStrategy == "UNIFORM_RANDOM":
            action = np.argmax(qCurrNoise[:, stateHolding_t])
            if np.random.rand() < exploration.value(t):
                action = np.random.randint(num_actions)
        elif actionSelectionStrategy == "RANDOM_UNIQUE":
            _, idx, inv = np.unique(actionDescriptors,
                                    axis=0,
                                    return_index=True,
                                    return_inverse=True)
            actionIdx = np.argmax(qCurrNoise[idx, stateHolding_t])
            if np.random.rand() < exploration.value(t):
                actionIdx = np.random.randint(len(idx))
            actionsSelected = np.nonzero(inv == actionIdx)[0]
            action = actionsSelected[np.random.randint(len(actionsSelected))]
        else:
            print("Error: unknown actionSelectionStrategy")

        # Take action
        new_obs, rew, done, _ = env.step(action)

        # Update state and memory
        grid_tp1 = new_obs[0]
        #        grid_tp1 = np.int32(new_obs[0]>0)
        stateHolding_tp1 = np.int32(new_obs[1] > 0)
        memory_tp1 = np.copy(memory_t)
        if (action < num_patches) and (stateHolding_tp1 !=
                                       0):  # if a block has been picked
            memory_tp1[:, :, :, 0] = np.reshape(
                stateActionDescriptors[action][:, :, 0],
                [1, stateActionShape[0], stateActionShape[1]])
        if (stateHolding_t > 0) and (stateHolding_tp1
                                     == 0):  # if a block has just been placed
            memory_tp1[:, :, :, 1] = np.reshape(
                stateActionDescriptors[action][:, :, 1],
                [1, stateActionShape[0], stateActionShape[1]])
        if action >= num_patches * 2:  # if this is a look action
            #            memory_tp1[:,:,:,2] = np.reshape(stateActionDescriptors[action][:,:,2],[1,stateActionShape[0],stateActionShape[1]])
            #            memory_tp1[0,:,:,2] = (env.pickBlockGoal + 2) * np.ones([memoryShape[1], memoryShape[2]]) # DEBUG
            if (env.pickBlockGoal + 2) in stateActionDescriptors[action][:, :,
                                                                         2]:
                memory_tp1[0, :, :, 2] = (env.pickBlockGoal + 2) * np.ones(
                    [memoryShape[1], memoryShape[2]])

        #        memory_tp1[0,:,:,2] = (env.pickBlockGoal + 2) * np.ones([memoryShape[1], memoryShape[2]]) # DEBUG

        # Add to replay buffer
        replay_buffer.add(stateHolding_t, stateActionDescriptors[action, :],
                          rew, stateHolding_tp1, grid_tp1, memory_tp1[0], done)

        # Set tp1 equal to t
        stateHolding_t = stateHolding_tp1
        grid_t = grid_tp1
        memory_t = memory_tp1
        if done:
            memory_t = np.zeros(
                [1, memoryShape[0], memoryShape[1], memoryShape[2]])


            #            memory_t[0,:,:,2] = (env.pickBlockGoal + 2) * np.ones([memoryShape[1], memoryShape[2]]) # DEBUG

        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                beta = beta_schedule.value(t)
                states_t, actionPatches, rewards, images_tp1, states_tp1, placeMemory_tp1, dones, weights, batch_idxes = replay_buffer.sample(
                    batch_size, beta)
            else:
                statesDiscrete_t, stateActionsImage_t, rewards, statesDiscrete_tp1, grids_tp1, memories_tp1, dones = replay_buffer.sample(
                    batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            moveDescriptorsNext = getMoveActionDescriptors(grids_tp1)
            moveDescriptorsNext[moveDescriptorsNext == 0] = -1

            actionsPickDescriptorsNext = np.stack([
                moveDescriptorsNext,
                np.zeros(np.shape(moveDescriptorsNext)),
                np.zeros(np.shape(moveDescriptorsNext))
            ],
                                                  axis=3)
            actionsPlaceDescriptorsNext = np.stack([
                np.zeros(np.shape(moveDescriptorsNext)), moveDescriptorsNext,
                np.zeros(np.shape(moveDescriptorsNext))
            ],
                                                   axis=3)
            actionsLookDescriptorsNext = np.stack([
                np.zeros(np.shape(moveDescriptorsNext)),
                np.zeros(np.shape(moveDescriptorsNext)), moveDescriptorsNext
            ],
                                                  axis=3)
            actionDescriptorsNext = np.stack(
                [
                    actionsPickDescriptorsNext, actionsPlaceDescriptorsNext,
                    actionsLookDescriptorsNext
                ],
                axis=1
            )  # I sometimes get this axis parameter wrong... pay attention!
            actionDescriptorsNext = np.reshape(actionDescriptorsNext, [
                batch_size * num_patches * num_actions_discrete,
                actionShape[0], actionShape[1], actionShape[2]
            ])
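
            # Note: stacking with axis=1 above keeps the (batch, patch) ordering outermost,
            # so this flat [batch*num_patches*num_actions_discrete, ...] layout lines up with
            # the reshape back to [batch_size, num_patches, num_actions_discrete, num_states]
            # further below.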

            # Augment with state, i.e. place memory
            placeMemory_tp1_expanded = np.repeat(memories_tp1,
                                                 num_patches *
                                                 num_actions_discrete,
                                                 axis=0)
            actionDescriptorsNext = np.concatenate(
                [actionDescriptorsNext, placeMemory_tp1_expanded], axis=3)

            qNextNotHolding = getqNotHolding(actionDescriptorsNext)
            qNextHolding = getqHolding(actionDescriptorsNext)
            qNextFlat = np.concatenate([qNextNotHolding, qNextHolding], axis=1)

            qNext = np.reshape(
                qNextFlat,
                [batch_size, num_patches, num_actions_discrete, num_states])
            qNextmax = np.max(
                np.max(qNext[range(batch_size), :, :, statesDiscrete_tp1], 2),
                1)
            targets = rewards + (1 - dones) * gamma * qNextmax

            qCurrTargetNotHolding = getqNotHolding(stateActionsImage_t)
            qCurrTargetHolding = getqHolding(stateActionsImage_t)
            qCurrTarget = np.concatenate(
                [qCurrTargetNotHolding, qCurrTargetHolding], axis=1)

            td_error = qCurrTarget[range(batch_size),
                                   statesDiscrete_t] - targets
            qCurrTarget[range(batch_size), statesDiscrete_t] = targets

            targetTrainNotHolding(
                stateActionsImage_t,
                np.reshape(qCurrTarget[:, 0], [batch_size, 1]),
                np.reshape(weights, [batch_size, 1]))
            targetTrainHolding(stateActionsImage_t,
                               np.reshape(qCurrTarget[:, 1], [batch_size, 1]),
                               np.reshape(weights, [batch_size, 1]))

            if prioritized_replay:
                new_priorities = np.abs(td_error) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs

    # display value function
    obs = env.reset()
    moveDescriptors = getMoveActionDescriptors([obs[0]])
    moveDescriptors[moveDescriptors == 0] = -1
    actionsPickDescriptorsOrig = np.stack([
        moveDescriptors,
        np.zeros(np.shape(moveDescriptors)),
        np.zeros(np.shape(moveDescriptors))
    ],
                                          axis=3)
    actionsLookDescriptorsOrig = np.stack([
        np.zeros(np.shape(moveDescriptors)),
        np.zeros(np.shape(moveDescriptors)), moveDescriptors
    ],
                                          axis=3)

    memoryZeros = np.zeros([1, memoryShape[0], memoryShape[1], memoryShape[2]])
    memoryLooked3 = np.zeros(
        [1, memoryShape[0], memoryShape[1], memoryShape[2]])
    memoryLooked3[0, :, :,
                  2] = 3 * np.ones([stateActionShape[0], stateActionShape[1]])
    memoryLooked4 = np.zeros(
        [1, memoryShape[0], memoryShape[1], memoryShape[2]])
    memoryLooked4[0, :, :,
                  2] = 4 * np.ones([stateActionShape[0], stateActionShape[1]])

    print("\nGrid configuration:")
    print(str(obs[0][:, :, 0]))

    for i in range(3):

        if i == 0:
            placeMemory = memoryZeros
            print("\nMemory has zeros:")
        elif i == 1:
            placeMemory = memoryLooked3
            print("\nMemory encodes look=3:")
        else:
            placeMemory = memoryLooked4
            print("\nMemory encodes look=4:")

        placeMemoryTiled = np.repeat(placeMemory, num_patches, axis=0)
        actionsPickDescriptors = np.concatenate(
            [actionsPickDescriptorsOrig, placeMemoryTiled], axis=3)
        actionsLookDescriptors = np.concatenate(
            [actionsLookDescriptorsOrig, placeMemoryTiled], axis=3)

        qPickNotHolding = getqNotHolding(actionsPickDescriptors)
        qLookNotHolding = getqNotHolding(actionsLookDescriptors)

        print("\nValue function for pick action in hold-nothing state:")
        print(str(np.reshape(qPickNotHolding[:, 0], [8, 8])))

        print("\nValue function for look action in hold-nothing state:")
        print(str(np.reshape(qLookNotHolding[:, 0], [8, 8])))
Exemple #27
0
def main():
    
    np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x})
    
    # Define environment
    env = envstandalone.BlockArrange()

    # Dictionary-based value function
    q_func_tabular = {}

    # each row of vectorKey must be a boolean vector at most 64 bits long
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey,1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:,i])
        return obsKeys
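
    # Key sketch: np.packbits(vectorKey, 1) turns each boolean row into bytes (an 18-bit
    # row packs into 3 bytes b0, b1, b2) and the loop above combines them into a single
    # integer key b0 + 256*b1 + 256**2*b2, cast to uint64 so rows up to 64 bits fit.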
    
    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
#        return np.array([q_func[x] if x in q_func else 0*np.ones(num_states) for x in keys])
        return np.array([q_func_tabular[x] if x in q_func_tabular else 10*np.ones(num_states) for x in keys])
    
    def trainTabular(vectorKey,qCurrTargets,weights):
        keys = getTabularKeys(vectorKey)
        alpha=0.2
        for i in range(len(keys)):
            if keys[i] in q_func_tabular:
#                q_func[keys[i]] = (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i]
                q_func_tabular[keys[i]] = q_func_tabular[keys[i]] + alpha*weights[i,:]*(qCurrTargets[i] - q_func_tabular[keys[i]]) # (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i]
            else:
                q_func_tabular[keys[i]] = qCurrTargets[i]
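
    # Update sketch: for keys already in the table this is the tabular TD step
    # Q <- Q + alpha * w * (target - Q) with alpha = 0.2; unseen keys are written
    # directly, replacing the optimistic default of 10 returned by getTabular.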


    # Standard DQN parameters
#    max_timesteps=20000
    max_timesteps=30000
#    max_timesteps=2000
    learning_starts=1000
#    learning_starts=10
#    buffer_size=50000
    buffer_size=10000
#    buffer_size=1000
#    buffer_size=320
#    buffer_size=32
#    buffer_size=8
#    buffer_size=1
#    exploration_fraction=0.2
    exploration_fraction=0.3
#    exploration_final_eps=0.02
    exploration_final_eps=0.1
    print_freq=1
#    gamma=.98
    gamma=.9
    target_network_update_freq=1
    batch_size=32
#    batch_size=1
    train_freq=1
#    train_freq=2
    num_cpu = 16
#    lr=0.001
    lr=0.0003
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    prioritized_replay=True
#    prioritized_replay=False
#    prioritized_replay_alpha=1.0
    prioritized_replay_alpha=0.6
    prioritized_replay_beta0=0.4
    prioritized_replay_beta_iters=None
#    prioritized_replay_beta_iters=20000
    prioritized_replay_eps=1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1
    
    # Deictic state/action parameters
    deicticShape = (3,3,2) # IMPORTANT: first two elts of deicticShape must be odd
    deicticActionShape = (3,3,2)
    num_cascade = 5
#    num_states = env.num_blocks + 1 # one more state than blocks to account for not holding anything
    num_states = 2 # either holding or not
    num_patches = env.maxSide**2
    num_actions = 2*num_patches
    num_actions_discrete = 2
#    valueFunctionType = "TABULAR"
    valueFunctionType = "DQN"
#    actionSelectionStrategy = "UNIFORM_RANDOM" # actions are selected randomly from collection of all actions
    actionSelectionStrategy = "RANDOM_UNIQUE" # each unique action descriptor has equal chance of being selected
    
    # ******* Build tensorflow functions ********

    q_func = models.cnn_to_mlp(
#    q_func = models.cnn_to_mlp_2pathways(
        convs=[(16,3,1), (32,3,1)],
        hiddens=[48],
#        convs=[(32,3,1)],
#        hiddens=[32],
#        convs=[(48,3,1)],
#        hiddens=[48],
        dueling=True
    )

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        return U.BatchInput(deicticActionShape, name=name)

    def make_target_ph(name):
#        return U.BatchInput([num_actions], name=name)
#        return U.BatchInput([num_cascade,num_states], name=name)
        return U.BatchInput([num_states], name=name)

    def make_weight_ph(name):
        return U.BatchInput([num_states], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph,deicticShape=deicticShape)

    if valueFunctionType == 'DQN':
        getq = build_getq(
                make_actionDeic_ph=make_actionDeic_ph,
                q_func=q_func,
                num_states=num_states,
                num_cascade=num_cascade,
                scope="deepq",
                qscope="q_func"
                )
    
        targetTrain = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=num_cascade,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
    #        optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
            scope="deepq", 
            qscope="q_func",
            grad_norm_clipping=1.
    #        grad_norm_clipping=0.1
        )

    # Start tensorflow session
    sess = U.make_session(num_cpu)
    sess.__enter__()

    episode_rewards = [0.0]
    timerStart = time.time()
    U.initialize()
    obs = env.reset()
    for t in range(max_timesteps):

        # Get state: in range(0,env.num_blocks)
        stateDeictic = np.int32(obs[1]>0) # holding

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptorsRaw = getMoveActionDescriptors([obs[0]])
        moveDescriptors = np.int32(moveDescriptorsRaw>0)
        moveDescriptors = moveDescriptors*2-1

        actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3)
        actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)),moveDescriptors],axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors,actionsPlaceDescriptors]

        if valueFunctionType == "TABULAR":
            actionDescriptorsFlat = np.reshape(actionDescriptors,[-1,deicticActionShape[0]*deicticActionShape[1]*deicticActionShape[2]]) == 1
            qCurr = getTabular(actionDescriptorsFlat)
        else:
            qCurr = getq(actionDescriptors)
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly

        # select action at random
        if actionSelectionStrategy == "UNIFORM_RANDOM":
            action = np.argmax(qCurrNoise[:,stateDeictic])
            if np.random.rand() < exploration.value(t):
                action = np.random.randint(num_actions)
        elif actionSelectionStrategy == "RANDOM_UNIQUE":
            _,idx,inv = np.unique(actionDescriptors,axis=0,return_index=True,return_inverse=True)
            actionIdx = np.argmax(qCurrNoise[idx,stateDeictic])
            if np.random.rand() < exploration.value(t):
                actionIdx = np.random.randint(len(idx))
            actionsSelected = np.nonzero(inv==actionIdx)[0]
            action = actionsSelected[np.random.randint(len(actionsSelected))]
        else:
            print("Error: unknown actionSelectionStrategy")

        # display state at the end
        if t > max_timesteps-200:
            print(str(obs[0][:,:,0]))
            print(str(obs[1]))
            print("action: " + str(action))

        # take action
        new_obs, rew, done, _ = env.step(action)
        
        # display state at the end
        if (t > max_timesteps-200) and done:
            print("done *********************** done")
            
        replay_buffer.add(stateDeictic, actionDescriptors[action,:], rew, new_obs, float(done))

        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                beta=beta_schedule.value(t)
                states_t, actions, rewards, images_tp1, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(batch_size, beta)
            else:
                states_t, actions, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            states_tp1 = np.int32(states_tp1>0)
            
            moveDescriptorsNext1 = getMoveActionDescriptors(images_tp1)
            moveDescriptorsNext1 = np.int32(moveDescriptorsNext1>0)
            moveDescriptorsNext1 = moveDescriptorsNext1*2-1

            actionsPickDescriptorsNext1 = np.stack([moveDescriptorsNext1, np.zeros(np.shape(moveDescriptorsNext1))],axis=3)
            actionsPlaceDescriptorsNext1 = np.stack([np.zeros(np.shape(moveDescriptorsNext1)), moveDescriptorsNext1],axis=3)
            actionDescriptorsNext1 = np.stack([actionsPickDescriptorsNext1, actionsPlaceDescriptorsNext1], axis=1) # axis=1 keeps the batch/patch index outermost so the reshape to [batch_size,num_patches,num_actions_discrete,num_states] below lines up
            actionDescriptorsNext1 = np.reshape(actionDescriptorsNext1,[batch_size*num_patches*num_actions_discrete,deicticActionShape[0],deicticActionShape[1],deicticActionShape[2]])
            
            if valueFunctionType == "TABULAR":
                actionDescriptorsNextFlat1 = np.reshape(actionDescriptorsNext1,[batch_size*num_patches*num_actions_discrete,-1]) == 1
                qNextFlat1 = getTabular(actionDescriptorsNextFlat1)
            else:
                qNextFlat1 = getq(actionDescriptorsNext1)
            
            qNext1 = np.reshape(qNextFlat1,[batch_size,num_patches,num_actions_discrete,num_states])
            qNextmax1 = np.max(np.max(qNext1[range(batch_size),:,:,states_tp1],2),1)
            targets1 = rewards + (1-dones) * gamma * qNextmax1

            if valueFunctionType == "TABULAR":
                actionsFlat = np.reshape(actions,[batch_size,-1]) == 1
                qCurrTarget1 = getTabular(actionsFlat)
            else:
                qCurrTarget1 = getq(actions)

            td_errors = qCurrTarget1[range(batch_size),states_t] - targets1
            qCurrTarget1[range(batch_size),states_t] = targets1

            if valueFunctionType == "TABULAR":
                trainTabular(actionsFlat, qCurrTarget1, np.transpose(np.tile(weights,[num_states,1]))) # (TABULAR)
            else:
                targetTrain(actions, qCurrTarget1, np.transpose(np.tile(weights,[num_states,1]))) # (DQN)

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)


        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", beta: " + str(beta) + ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal
        
        obs = new_obs
        
    # display value function
    obs = env.reset()
    moveDescriptorsRaw = getMoveActionDescriptors([obs[0]])
    moveDescriptors = np.int32(moveDescriptorsRaw>0)
    moveDescriptors = moveDescriptors*2-1

    actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3)
    actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors],axis=3)
    
    print(str(obs[0][:,:,0]))
    
    qPick = getq(actionsPickDescriptors)
#    qPick = getTabular(np.reshape(actionsPickDescriptors,[num_patches,-1])==1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:,0],[8,8])))
    print("Value function for pick action in hold-1 state:")
    print(str(np.reshape(qPick[:,1],[8,8])))

    qPlace = getq(actionsPlaceDescriptors)
#    qPlace = getTabular(np.reshape(actionsPlaceDescriptors,[num_patches,-1])==1)
    print("Value function for place action in hold-nothing state:")
    print(str(np.reshape(qPlace[:,0],[8,8])))
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:,1],[8,8])))
Exemple #28
0
def main():

    #    env = envstandalone.BallCatch()
    env = envstandalone.TestRob3Env()

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
    #    buffer_size=1
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
    #    batch_size=1
    train_freq = 1

    obsShape = (8, 8, 1)
    deicticShape = (3, 3, 1)
    num_deictic_patches = 36

    num_actions = 4
    episode_rewards = [0.0]
    num_cpu = 16

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # same as getDeictic except this one just calculates for the observation
    # input: n x n x channels
    # output: num_patches x windowLen x windowLen x channels
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        deicticObs = []
        for i in range(np.shape(obs)[0] - windowLen + 1):
            for j in range(np.shape(obs)[1] - windowLen + 1):
                deicticObs.append(obs[i:i + windowLen, j:j + windowLen, :])
        return np.array(deicticObs)

    # Same as getDeicticObs, but it operates on a batch rather than a single obs
    # input: obs -> batches x n x n x channels
    # output: batches x glances x windowLen x windowLen x channels
    def getDeicticObsBatch(obs):
        obsShape = np.shape(obs)
        deicticObsBatch = []
        for batch in range(obsShape[0]):
            deicticObsBatch.append(getDeicticObs(obs[batch]))
        return (np.array(deicticObsBatch))

    # input: num_patches x windowLen x windowLen x 1 array of deictic observations
    # output: 4 x num_patches matrix of agent/ghost positions (3 = not in this patch)
    def convertState(observations):
        shape = np.shape(observations)
        observations_small = np.squeeze(observations)
        agent_pos = np.nonzero(observations_small == 10)
        ghost_pos = np.nonzero(observations_small == 20)
        state_numeric = 3 * np.ones((4, shape[0]))
        state_numeric[0, agent_pos[0]] = agent_pos[1]
        state_numeric[1, agent_pos[0]] = agent_pos[2]
        state_numeric[2, ghost_pos[0]] = ghost_pos[1]
        state_numeric[3, ghost_pos[0]] = ghost_pos[2]
        return np.int32(state_numeric)
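
    # Encoding sketch: values 10 and 20 in the observation mark the agent and the ghost;
    # rows 0-1 hold the agent's (row, col) inside each 3x3 patch and rows 2-3 the ghost's,
    # with 3 meaning "not present in this patch".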

    def convertStateBatch(observations):
        shape = np.shape(observations)
        state_numeric_batch = []
        for batch in range(shape[0]):
            state_numeric_batch.append(convertState(observations[batch]))
        return (np.array(state_numeric_batch))

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        convs=[(16, 3, 1)],
        #        convs=[(16,2,1)],
        #        convs=[(32,3,1)],
        hiddens=[16],
        #        hiddens=[64],
        #        dueling=True
        dueling=False)

    q_func = model
    #    lr=1e-3
    lr = 0.001

    def make_obs_ph(name):
        return U.BatchInput(deicticShape, name=name)
#        return U.BatchInput(obsShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq, targetTrain = build_graph.build_train_nodouble(
        make_obs_ph=make_obs_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        grad_norm_clipping=10,
        double_q=False)

    # Initialize the parameters and copy them to the target network.
    U.initialize()

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    #    tabularQ = 100*np.ones([deicticShape[0]+1,deicticShape[1]+1,deicticShape[0]+1,deicticShape[1]+1, num_actions])
    tabularQ = 0 * np.ones([
        deicticShape[0] + 1, deicticShape[1] + 1, deicticShape[0] + 1,
        deicticShape[1] + 1, num_actions
    ])

    timerStart = time.time()
    for t in range(max_timesteps):

        obsDeictic = getDeicticObs(obs)

        # get q: neural network
        qCurr = getq(np.array(obsDeictic))

        #        # get q: tabular
        #        stateCurr = convertState(obsDeictic)
        #        qCurr = tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise, 0))
        selPatch = np.argmax(np.max(qCurrNoise, 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:
            #        if t > max_timesteps:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeicticObsBatch(obses_t)
            obses_tp1_deic = getDeicticObsBatch(obses_tp1)

            # Flatten the batch and patch dimensions together
            # (batch_size * num_deictic_patches = 32 * 36 = 1152 rows)
            obs_resize_to_network = [
                batch_size * num_deictic_patches, deicticShape[0],
                deicticShape[1], deicticShape[2]
            ]
            obses_t_deic = np.reshape(obses_t_deic, obs_resize_to_network)
            obses_tp1_deic = np.reshape(obses_tp1_deic, obs_resize_to_network)
            donesTiled = np.repeat(dones, num_deictic_patches)
            rewardsTiled = np.repeat(rewards, num_deictic_patches)
            actionsTiled = np.repeat(actions, num_deictic_patches)

            # Get curr, next values: neural network version
            qNext = getq(obses_tp1_deic)
            qCurr = getq(obses_t_deic)

            #            # Get curr, next values: tabular version
            #            q_resize_from_network = [batch_size*num_deictic_patches,num_actions]
            #            stateNext = convertStateBatch(obses_tp1_deic)
            #            qNext = tabularQ[stateNext[:,0,:], stateNext[:,1,:], stateNext[:,2,:], stateNext[:,3,:],:]
            #            qNext = np.reshape(qNext,q_resize_from_network)
            #            stateCurr = convertStateBatch(obses_t_deic)
            #            qCurr = tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],:]
            #            qCurr = np.reshape(qCurr,q_resize_from_network)

            # Get "raw" targets (no masking for cascade levels)
            qNextmax = np.max(qNext, 1)
            targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax

            # Update values: neural network version
            qCurrTargets = np.copy(qCurr)
            qCurrTargets[range(batch_size * num_deictic_patches),
                         actionsTiled] = targets

            td_error_out, obses_deic_out, targets_out = targetTrain(
                obses_t_deic, qCurrTargets)


            #            # Update values: tabular version
            #            stateCurrTiled = np.reshape(np.rollaxis(stateCurr,1),[num_actions,batch_size*num_deictic_patches])
            #            tabularQ[stateCurrTiled[0,:], stateCurrTiled[1,:], stateCurrTiled[2,:], stateCurrTiled[3,:],actionsTiled] = \
            #                (1 - learning_alpha) * tabularQ[stateCurrTiled[0,:], stateCurrTiled[1,:], stateCurrTiled[2,:], stateCurrTiled[3,:],actionsTiled] \
            #                + learning_alpha * targets

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
Exemple #29
0
def main():
    
    np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x})

    env = envstandalone.BlockArrange()

    # Standard q-learning parameters
    max_timesteps=50000
    exploration_fraction=0.3
    exploration_final_eps=0.1
    gamma=.90
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts=10
    buffer_size=1
    batch_size=1
    target_network_update_freq=1
    train_freq=1
    print_freq=1
    lr=0.0003

    # first two elts of each deictic shape below must be odd
#    actionShape = (3,3,2)
    patchShape = (3,3,1)
    lookstackShape = (3,3,2)
    lookShape = (3,3,3)
    ppShape = (3,3,2)
#    num_states = 2 # either holding or not
    num_patches = env.maxSide**2
    num_actions_discrete = 2
    num_actions = num_patches + num_actions_discrete
    valueFunctionType = "DQN"
    actionSelectionStrategy = "UNIFORM_RANDOM" # actions are selected randomly from collection of all actions
#    actionSelectionStrategy = "RANDOM_UNIQUE" # each unique action descriptor has equal chance of being selected

    episode_rewards = [0.0]
    
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

#    prioritized_replay=True
    prioritized_replay=False
#    prioritized_replay_alpha=1.0
    prioritized_replay_alpha=0.6
    prioritized_replay_beta0=0.4
    prioritized_replay_beta_iters=None
#    prioritized_replay_beta_iters=20000
    prioritized_replay_eps=1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1

    q_func = models.cnn_to_mlp(
#    q_func = models.cnn_to_mlp_2pathways(
#        convs=[(16,3,1), (32,3,1)],
#        hiddens=[48],
        convs=[(32,3,1)],
        hiddens=[48],
#        convs=[(48,3,1)],
#        hiddens=[48],
        dueling=True
    )

    def displayLookStack(lookStack):
        np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x})
        lookStack1 = str(lookStack[:,:,0])
        lookStack1 = np.core.defchararray.replace(lookStack1,".00","")
        lookStack1 = np.core.defchararray.replace(lookStack1,".","")
        lookStack1 = np.core.defchararray.replace(lookStack1,"0",".")
        lookStack2 = str(lookStack[:,:,1])
        lookStack2 = np.core.defchararray.replace(lookStack2,".00","")
        lookStack2 = np.core.defchararray.replace(lookStack2,".","")
        lookStack2 = np.core.defchararray.replace(lookStack2,"0",".")
        print("lookStack:")
        print(lookStack1)
        print(lookStack2)

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_lookDeic_ph(name):
        return U.BatchInput(lookShape, name=name)

    def make_ppDeic_ph(name):
        return U.BatchInput(ppShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([1], name=name)

    def make_weight_ph(name):
        return U.BatchInput([1], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph,deicticShape=lookShape)
    
    getqLookNotHolding = build_getq(
            make_deic_ph=make_lookDeic_ph,
            q_func=q_func,
            scope="deepq",
            qscope="q_func_LookNotHolding"
            )
    getqLookHolding = build_getq(
            make_deic_ph=make_lookDeic_ph,
            q_func=q_func,
            scope="deepq",
            qscope="q_func_LookHolding"
            )
    getqPPNotHolding = build_getq(
            make_deic_ph=make_ppDeic_ph,
            q_func=q_func,
            scope="deepq",
            qscope="q_func_PPNotHolding"
            )
    getqPPHolding = build_getq(
            make_deic_ph=make_ppDeic_ph,
            q_func=q_func,
            scope="deepq",
            qscope="q_func_PPHolding"
            )
    
    targetTrainLookNotHolding = build_targetTrain(
        make_deic_ph=make_lookDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq", 
        qscope="q_func_LookNotHolding",
        grad_norm_clipping=1.
    )
    targetTrainLookHolding = build_targetTrain(
        make_deic_ph=make_lookDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq", 
        qscope="q_func_LookHolding",
        grad_norm_clipping=1.
    )
    targetTrainPPNotHolding = build_targetTrain(
        make_deic_ph=make_ppDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq", 
        qscope="q_func_PPNotHolding",
        grad_norm_clipping=1.
    )
    targetTrainPPHolding = build_targetTrain(
        make_deic_ph=make_ppDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq", 
        qscope="q_func_PPHolding",
        grad_norm_clipping=1.
    )
        
    sess = U.make_session(num_cpu)
    sess.__enter__()

    obs = env.reset()
    lookStack = np.zeros(lookstackShape)
    lookStackNext = np.zeros(lookstackShape)
    
    episode_rewards = [0.0]
    td_errors = [0.0]
    timerStart = time.time()
    U.initialize()
    for t in range(max_timesteps):
        
        # Get action set: <num_patches> look actions followed by <num_actions_discrete> pick/place actions
        moveDescriptors = getMoveActionDescriptors([obs[0]])
        moveDescriptors = moveDescriptors*2-1
        moveDescriptors = np.reshape(moveDescriptors,[num_patches,patchShape[0],patchShape[1],patchShape[2]])
        looksStackTiled = np.tile(lookStack,[num_patches,1,1,1])
        lookDescriptors = np.concatenate([moveDescriptors,looksStackTiled],axis=3)
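
        # Descriptor sketch: each look descriptor is the candidate 3x3x1 patch concatenated
        # with the 3x3x2 look stack, giving the 3x3x3 lookShape input expected by
        # getqLookNotHolding / getqLookHolding.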
        
        if obs[1] == 0: # not holding
            qCurrLook = getqLookNotHolding(lookDescriptors)
            qCurrPP = np.r_[getqPPNotHolding([lookStack]),[[0]]]
        else: # holding
            qCurrLook = getqLookHolding(lookDescriptors)
            qCurrPP = np.r_[[[0]],getqPPHolding([lookStack])]
        qCurr = np.concatenate([qCurrLook,qCurrPP],axis=0)
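
        # qCurr layout (sketch): entries 0..num_patches-1 are the look values, followed by
        # the two discrete pick/place entries; the one that is infeasible in the current
        # holding state was padded with 0 above.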

        # select action: greedy on noise-perturbed Q-values, with epsilon-random exploration
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        if actionSelectionStrategy == "UNIFORM_RANDOM":
            action = np.argmax(qCurrNoise)
            if np.random.rand() < exploration.value(t):
                # explore: choose a random look action 1/3 of the time, a random pick/place action otherwise
                actionClass = np.random.randint(3)
                if actionClass == 0:
                    action = np.random.randint(num_patches)
                else:
                    action = np.random.randint(num_patches,num_patches+2)
#                action = np.random.randint(num_actions)
        elif actionSelectionStrategy == "RANDOM_UNIQUE":
            _,idx,inv = np.unique(lookDescriptors,axis=0,return_index=True,return_inverse=True)
            idx = np.r_[idx,num_patches,num_patches+1]
            actionIdx = np.argmax(qCurrNoise[idx])
            if np.random.rand() < exploration.value(t):
                actionIdx = np.random.randint(len(idx))
            if actionIdx < len(idx)-2:
                actionsSelected = np.nonzero(inv==actionIdx)[0]
                action = actionsSelected[np.random.randint(len(actionsSelected))]
            else:
                action = idx[actionIdx]
        else:
            raise ValueError("unknown actionSelectionStrategy: " + str(actionSelectionStrategy))


        # take action
        new_obs, rew, done, _ = env.step(action)
        
        # If look action, then update look stack
        if action < num_patches:
            lookStackNext[:,:,1] = np.copy(lookStack[:,:,0])
            lookStackNext[:,:,0] = np.copy(moveDescriptors[action][:,:,0])
            lookAction = moveDescriptors[action]
            discreteAction = 0
        else:
            lookAction = np.zeros(patchShape)
            discreteAction = action - num_patches
        
        print("action: " + str(action))
        env.render()
        print("Reward: " + str(rew) + ", done: " + str(done))
        displayLookStack(lookStackNext)
        
        # discrete state, look state, discrete action, look action, reward, discrete next state, look next state, done
        replay_buffer.add(obs[1], lookStack, discreteAction, lookAction, rew, new_obs[1], lookStackNext, new_obs[0], float(done))
        
        lookStack = np.copy(lookStackNext)
        
        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                beta = beta_schedule.value(t)
                # assumes the prioritized buffer returns the same transition tuple as the
                # uniform buffer, plus importance weights and the sampled batch indices
                statesHolding_t, statesLookStack_t, actionsDiscrete, lookActions, rewards, statesHolding_tp1, statesLookStack_tp1, observations_tp1, dones, weights, batch_idxes = replay_buffer.sample(batch_size, beta)
            else:
                statesHolding_t, statesLookStack_t, actionsDiscrete, lookActions, rewards, statesHolding_tp1, statesLookStack_tp1, observations_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            moveDescriptorsNext = getMoveActionDescriptors(observations_tp1)
            moveDescriptorsNext = moveDescriptorsNext*2-1
            moveDescriptorsNext = np.reshape(moveDescriptorsNext,[-1,patchShape[0],patchShape[1],patchShape[2]])
            looksStackNextTiled = np.repeat(statesLookStack_tp1,num_patches,axis=0)
            lookDescriptorsNext = np.concatenate([moveDescriptorsNext,looksStackNextTiled],axis=3)

            # calculate qNext
            qNextLookNotHolding = np.max(np.reshape(getqLookNotHolding(lookDescriptorsNext),[batch_size,num_patches,1]),axis=1)
            qNextLookHolding = np.max(np.reshape(getqLookHolding(lookDescriptorsNext),[batch_size,num_patches,1]),axis=1)
            qNextPPNotHolding = getqPPNotHolding(statesLookStack_tp1)
            qNextPPHolding = getqPPHolding(statesLookStack_tp1)
            qNextNotHolding = np.max(np.c_[qNextLookNotHolding,qNextPPNotHolding],axis=1)
            qNextHolding = np.max(np.c_[qNextLookHolding,qNextPPHolding],axis=1)
            qNext = np.stack([qNextNotHolding,qNextHolding],axis=1)
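            # qNext columns are [value if not holding, value if holding]; the Bellman target
            # below picks the column matching the holding state after the transition.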

            targets = rewards + (1-dones) * gamma * qNext[range(batch_size),statesHolding_tp1]
            
            # Calculate qCurrTarget
            lookDescriptors = np.concatenate([lookActions,statesLookStack_t],axis=3)
            qCurrLookNotHoldingT = getqLookNotHolding(lookDescriptors)
            qCurrLookHoldingT = getqLookHolding(lookDescriptors)
            qCurrPPNotHoldingT = getqPPNotHolding(statesLookStack_t)
            qCurrPPHoldingT = getqPPHolding(statesLookStack_t)
            qCurrT = np.c_[qCurrLookNotHoldingT,qCurrPPNotHoldingT,qCurrLookHoldingT,qCurrPPHoldingT]
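            # qCurrT columns: [LookNotHolding, PPNotHolding, LookHolding, PPHolding],
            # one row per sampled transition.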
            
            td_error = qCurrT[range(batch_size),np.int32(actionsDiscrete > 0) + (2*statesHolding_t)] - targets
            qCurrT[range(batch_size),np.int32(actionsDiscrete > 0) + (2*statesHolding_t)] = targets

            targetTrainLookNotHolding(lookDescriptors,  np.reshape(qCurrT[:,0],[batch_size,1]), np.reshape(weights,[batch_size,1]))
            targetTrainPPNotHolding(statesLookStack_t, np.reshape(qCurrT[:,1],[batch_size,1]), np.reshape(weights,[batch_size,1]))
            targetTrainLookHolding(lookDescriptors, np.reshape(qCurrT[:,2],[batch_size,1]), np.reshape(weights,[batch_size,1]))
            targetTrainPPHolding(statesLookStack_t, np.reshape(qCurrT[:,3],[batch_size,1]), np.reshape(weights,[batch_size,1]))

            if prioritized_replay:
                new_priorities = np.abs(td_error) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

            # accumulate the mean absolute TD error for this episode so td_errors stays a list of scalars
            td_errors[-1] += np.mean(np.abs(td_error))


        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
            td_errors.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        mean_100ep_tderror = round(np.mean(td_errors[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart) + ", tderror: " + str(mean_100ep_tderror))
            timerStart = timerFinal
        
        obs = np.copy(new_obs)


    # display value function
    obs = env.reset()
    moveDescriptors = getMoveActionDescriptors([obs[0]])
    moveDescriptors = moveDescriptors*2-1

    actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3)
    actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors],axis=3)
    
    print(str(obs[0][:,:,0]))
    
    if valueFunctionType == "TABULAR":
        qPick = getTabular(np.reshape(actionsPickDescriptors,[num_patches,-1])==1)
    else:
        qPickNotHolding = getqNotHolding(actionsPickDescriptors)
        qPickHolding = getqHolding(actionsPickDescriptors)
        qPick = np.concatenate([qPickNotHolding,qPickHolding],axis=1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:,0],[8,8])))
    print("Value function for pick action in hold-1 state:")
    print(str(np.reshape(qPick[:,1],[8,8])))

    if valueFunctionType == "TABULAR":
        qPlace = getTabular(np.reshape(actionsPlaceDescriptors,[num_patches,-1])==1)
    else:
        qPlaceNotHolding = getqNotHolding(actionsPlaceDescriptors)
        qPlaceHolding = getqHolding(actionsPlaceDescriptors)
        qPlace = np.concatenate([qPlaceNotHolding,qPlaceHolding],axis=1)    
    print("Value function for place action in hold-nothing state:")
    print(str(np.reshape(qPlace[:,0],[8,8])))
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:,1],[8,8])))
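
# Illustrative sketch (added for exposition; not part of the original example): the training
# update above uses a standard one-step Bellman backup, selecting the next-state value column
# by the post-transition holding state. The toy numbers below are made up purely to show the
# indexing, and gamma here is an assumed value.
import numpy as np

_gamma = 0.98
_rewards = np.array([0.0, 1.0])
_dones = np.array([0.0, 1.0])
_holding_tp1 = np.array([1, 0])                  # holding state after each transition
_qNext = np.array([[0.5, 0.2],                   # columns: [not holding, holding]
                   [0.3, 0.9]])
_targets = _rewards + (1 - _dones) * _gamma * _qNext[range(2), _holding_tp1]
assert np.allclose(_targets, [0.98 * 0.2, 1.0])  # the terminal transition keeps only its reward
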
def main():

    env = envstandalone.MultiGhostEvade()
    #    env = envstandalone.GhostEvade()
    #    env = envstandalone.BallCatch()

    max_timesteps = 40000
    #    max_timesteps=80000
    learning_starts = 1000
    buffer_size = 50000
    #    exploration_fraction=0.2
    exploration_fraction = 0.4
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    #    target_network_update_freq=500
    #    target_network_update_freq=100
    #    target_network_update_freq=10
    target_network_update_freq = 1
    learning_alpha = 0.2

    batch_size = 32
    #    batch_size=64
    #    batch_size=1024
    train_freq = 1

    obsShape = (8, 8, 1)
    deicticShape = (3, 3, 2)
    #    deicticShape = (3,3,4)
    #    deicticShape = (4,4,2)
    #    deicticShape = (4,4,4)
    #    deicticShape = (5,5,2)
    #    deicticShape = (6,6,2)
    #    deicticShape = (8,8,2)
    num_deictic_patches = 36
    #    num_deictic_patches = 25
    #    num_deictic_patches = 16
    #    num_deictic_patches = 9
    #    num_deictic_patches = 1

    #    num_actions = 4
    #    num_actions = 3
    num_actions = env.action_space.n

    episode_rewards = [0.0]
    num_cpu = 16
    num_cascade = 5

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
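
    # Added sanity check (sketch, not in the original example): assuming the usual baselines
    # LinearSchedule, epsilon decays linearly from 1.0 to exploration_final_eps over the first
    # exploration_fraction * max_timesteps = 16000 steps and then stays constant.
    assert abs(exploration.value(8000) - 0.51) < 1e-6
    assert abs(exploration.value(16000) - exploration_final_eps) < 1e-6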

    # CNN version
    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        ##    model = models.cnn_to_mlp_2pathways(
        ##        convs=[(16,3,1)],
        convs=[(32, 3, 1)],
        ##        convs=[(32,4,1)],
        ##        convs=[(16,4,1)],
        #        hiddens=[16],
        hiddens=[32],
        dueling=True)

    # MLP version
    #    model = models.mlp([8, 16])
    #    model = models.mlp([16, 16])
    #    model = models.mlp([16, 32])
    #    model = models.mlp([16, 16])
    #    model = models.mlp([32, 32])
    #    model = models.mlp([32])
    #    model = models.mlp([])

    #    q_func=model
    q_func = {}

    #    lr=0.01
    lr = 0.001

    #    lr=0.0005

    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)

    def make_obsDeic_ph(name):

        # CNN version
        return U.BatchInput(deicticShape, name=name)

        # MLP version
        # return U.BatchInput([deicticShape[0]*deicticShape[1]*deicticShape[2]], name=name)

    def make_target_ph(name):
        #        return U.BatchInput([num_actions], name=name)
        return U.BatchInput([num_cascade, num_actions], name=name)

    def getTabularKeys(obsDeicticTiled):
        obsBits = np.packbits(obsDeicticTiled, 1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            obsKeys = obsKeys + (256**i) * np.int32(obsBits[:, i])
        return obsKeys
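
    # getTabularKeys hashes a (presumably binary) flattened deictic patch to a single integer:
    # packbits packs the 18 bits of a 3x3x2 patch into 3 bytes, and the loop in getTabularKeys
    # combines those bytes base-256, so identical patches always map to the same dictionary key.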

    def getTabular(obsDeicticTiled):
        keys = getTabularKeys(obsDeicticTiled)
        return np.array([
            q_func[x] if x in q_func else np.zeros([num_cascade, num_actions])
            for x in keys
        ])

    def trainTabular(obsDeicticTiled, qCurrTargets):
        keys = getTabularKeys(obsDeicticTiled)
        alpha = 0.75
        for i in range(len(keys)):
            if keys[i] in q_func:
                q_func[keys[i]] = (
                    1 - alpha) * q_func[keys[i]] + alpha * qCurrTargets[i]
            else:
                q_func[keys[i]] = qCurrTargets[i]
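
    # trainTabular blends each stored value toward its new target with a fixed step size
    # (alpha = 0.75); keys not seen before are initialized directly to their first target.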

    sess = U.make_session(num_cpu)
    sess.__enter__()

    #    getq = build_getq(
    #            make_obsDeic_ph=make_obsDeic_ph,
    #            q_func=q_func,
    #            num_actions=num_actions,
    #            num_cascade=num_cascade,
    #            scope="deepq",
    #            qscope="q_func"
    #            )
    #
    #    getqTarget = build_getq(
    #            make_obsDeic_ph=make_obsDeic_ph,
    #            q_func=q_func,
    #            num_actions=num_actions,
    #            num_cascade=num_cascade,
    #            scope="deepq",
    #            qscope="q_func_target"
    #            )

    #    update_target = build_update_target(scope="deepq",
    #                                        qscope="q_func",
    #                                        qscopeTarget="q_func_target")
    #
    #    targetTrain = build_targetTrain(
    #        make_obsDeic_ph=make_obsDeic_ph,
    #        make_target_ph=make_target_ph,
    #        q_func=q_func,
    #        num_actions=env.action_space.n,
    #        num_cascade=num_cascade,
    #        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
    ##        optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
    #        scope="deepq",
    #        qscope="q_func",
    #        grad_norm_clipping=1.
    ##        grad_norm_clipping=0.1
    #    )

    getDeic = build_getDeic_Foc(make_obs_ph=make_obs_ph,
                                deicticShape=deicticShape)
    #    getDeic = build_getDeic_FocCoarse(make_obs_ph=make_obs_ph,deicticShape=deicticShape)
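    # getDeic extracts the focused 3x3 deictic patches from the 8x8 observation
    # (presumably all 6 * 6 = 36 windows, matching num_deictic_patches).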

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    #    update_target()

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        obsDeictic = getDeic([obs])

        #       TABULAR version
        qCurr = getTabular(
            np.reshape(
                obsDeictic,
                [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]))

        ##       CNN version
        #        qCurr = getq(np.array(obsDeictic))

        #        # MLP version
        #        qCurr = getq(np.reshape(obsDeictic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        #        action = np.argmax(np.max(qCurrNoise[:,-1,:],0)) # USE CASCADE
        action = np.argmax(np.max(qCurrNoise[:, 0, :],
                                  0))  # DO NOT USE CASCADE
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeic(obses_t)
            obses_tp1_deic = getDeic(obses_tp1)
            #            obses_t_deic = getDeic(obses_t)[:,:,:,0:2]
            #            obses_tp1_deic = getDeic(obses_tp1)[:,:,:,0:2]

            # Tile per-transition quantities across the deictic patches
            # (batch_size * num_deictic_patches = 32 * 36 = 1152 rows)
            donesTiled = np.repeat(dones, num_deictic_patches)
            rewardsTiled = np.repeat(rewards, num_deictic_patches)
            actionsTiled = np.repeat(actions, num_deictic_patches)

            # Get curr, next values: TABULAR version
            qNext = getTabular(
                np.reshape(
                    obses_tp1_deic,
                    [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]))
            qCurr = getTabular(
                np.reshape(
                    obses_t_deic,
                    [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]))

            #            # Get curr, next values: CNN version: NO ROTATION-AUGMENTATION
            #            qNextTarget = getqTarget(obses_tp1_deic)
            #            qNext = getq(obses_tp1_deic)
            #            qCurr = getq(obses_t_deic)

            #            # Get curr, next values: MLP version
            #            qNext = getq(np.reshape(obses_tp1_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))
            #            qCurr = getq(np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))

            #            # ROTATION-AUGMENTATION: AUGMENT EXPERIENCES WITH FOUR ROTATIONS
            #            obses_t_deicRot1 = np.rot90(obses_t_deic,k=3,axes=(1,2))
            #            obses_t_deicRot2 = np.rot90(obses_t_deic,k=2,axes=(1,2))
            #            obses_t_deicRot3 = np.rot90(obses_t_deic,k=1,axes=(1,2))
            #            obses_t_deic = np.r_[obses_t_deic, obses_t_deicRot1, obses_t_deicRot2, obses_t_deicRot3]
            #            obses_tp1_deicRot1 = np.rot90(obses_tp1_deic,k=3,axes=(1,2))
            #            obses_tp1_deicRot2 = np.rot90(obses_tp1_deic,k=2,axes=(1,2))
            #            obses_tp1_deicRot3 = np.rot90(obses_tp1_deic,k=1,axes=(1,2))
            #            obses_tp1_deic = np.r_[obses_tp1_deic, obses_tp1_deicRot1, obses_tp1_deicRot2, obses_tp1_deicRot3]
            #            qCurr = getq(np.array(obses_t_deic))
            #            qNext = getq(np.array(obses_tp1_deic))
            #            actionsTiled = np.r_[actionsTiled, actionsTiled+1, actionsTiled+2, actionsTiled+3]
            #            actionsTiled = actionsTiled - 4 * (actionsTiled>3)
            #            rewardsTiled = np.r_[rewardsTiled,rewardsTiled,rewardsTiled,rewardsTiled]
            #            donesTiled = np.r_[donesTiled,donesTiled,donesTiled,donesTiled]

            # This version pairs a glimpse with the same glimpse on the next time step
            #            qNextmax = np.max(qNext[:,-1,:],1) # last elt in cascade
            qNextmax = np.max(qNext[:, 0, :], 1)  # first elt in cascade
            #            actionsNext = np.argmax(qNextTarget[:,-1,:],1) # double-q
            #            qNextmax = qNext[range(num_deictic_patches*batch_size),-1,actionsNext]

            #            # This version takes the max over all glimpses
            #            qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions])
            #            qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches)

            # Compute Bellman estimate
            targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax

            #            # Take min over targets in same group
            #            obses_t_deic_reshape = np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])
            #            unique_deic, uniqueIdx, uniqueCounts= np.unique(obses_t_deic_reshape,return_inverse=True,return_counts=True,axis=0)
            #            for i in range(np.shape(uniqueCounts)[0]):
            #                targets[uniqueIdx==i] = np.min(targets[uniqueIdx==i])

            qCurrTargets = np.copy(qCurr)

            # Write targets into every level of the cascade, taking the elementwise
            # min with the current estimate (no pruning between levels)
            expLen = np.shape(qCurr)[0]
            for i in range(num_cascade):
                #                qCurrTargets[range(expLen),i,actionsTiled] = targets
                qCurrTargets[range(expLen), i, actionsTiled] = np.minimum(
                    qCurrTargets[range(expLen), i, actionsTiled], targets)

            # Copy into cascade with pruning.
            # expLen = np.shape(qCurr)[0]
            # qCurrTargets[range(expLen),0,actionsTiled] = targets
            # for i in range(num_cascade-1):
            #     mask = targets < qCurr[range(expLen),i,actionsTiled]
            #     qCurrTargets[range(expLen),i+1,actionsTiled] = \
            #         mask*targets + \
            #         (1-mask)*qCurr[range(expLen),i+1,actionsTiled]

            # TABULAR version
            trainTabular(
                np.reshape(
                    obses_t_deic,
                    [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]),
                qCurrTargets)


            # CNN version
            # td_error_out, obses_deic_out, targets_out = targetTrain(
            #         obses_t_deic,
            #         qCurrTargets
            #         )

            # MLP version
            # td_error_out, obses_deic_out, targets_out = targetTrain(
            #         np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]),
            #         qCurrTargets
            #         )

        # Update target network periodically.
        # if t > learning_starts and t % target_network_update_freq == 0:
        #     update_target()

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs