def main():

    # Define environment
    env = envstandalone.BlockArrange()

    # Dictionary-based value function
    q_func = {}

    # cols of vectorKey must be boolean less than 64 bits long
    def getTabularKeys(vectorKey):
        """Pack each boolean row of ``vectorKey`` into one unsigned integer key.

        Bits are packed along axis 1 into bytes; byte ``j`` of every row is
        weighted by ``256**j`` and the weighted bytes are summed.  Rows must
        encode to at most 64 bits, otherwise distinct rows can collide.
        """
        packedBytes = np.packbits(vectorKey, 1)
        numBytes = packedBytes.shape[1]
        keys = 0
        for byteIdx in range(numBytes):
            # Widen to uint64 before scaling so the weighted byte is not truncated.
            column = np.uint64(packedBytes[:, byteIdx])
            keys = keys + (256**byteIdx) * column
        return keys
    def getTabular(vectorKey):
        """Return the stored value vector for every row of ``vectorKey``.

        Rows whose key has no entry in ``q_func`` yield a zero vector of
        length ``num_states``.
        """
        values = []
        for key in getTabularKeys(vectorKey):
            if key in q_func:
                values.append(q_func[key])
            else:
                values.append(0*np.ones(num_states))
        return np.array(values)
    def trainTabular(vectorKey,qCurrTargets):
        """Move the table entry of each row of ``vectorKey`` toward its target.

        A key already present in ``q_func`` is blended with its target using
        step size ``alpha``; a key seen for the first time is initialised
        directly to its target.

        NOTE(review): ``alpha`` is read from the enclosing scope and is not
        assigned anywhere in the visible part of ``main`` - confirm it is
        defined before this helper is called.
        """
        keys = getTabularKeys(vectorKey)
        for key, target in zip(keys, qCurrTargets):
            if key in q_func:
                q_func[key] = (1-alpha)*q_func[key] + alpha*target
            else:
                # First visit: without this branch the table would stay empty.
                q_func[key] = target

    # Standard DQN parameters
#    learning_starts=10
#    buffer_size=50000
#    buffer_size=10000
#    buffer_size=100
#    buffer_size=2
#    batch_size=8
    num_cpu = 16
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
    replay_buffer = ReplayBuffer(buffer_size)

    # Deictic state/action parameters
    deicticShape = (3,3,2) # IMPORTANT: first two elts of deicticShape must be odd
    num_cascade = 5
    num_states = env.num_blocks + 1 # one more state than blocks to account for not holding anything
    num_patches = env.maxSide**2
    num_actions = 2*num_patches
    def make_obs_ph(name):
        # Calls U.BatchInput with the shape of the first space in
        # env.observation_space and the given name.
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)
    getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph,deicticShape=deicticShape)

    # Start tensorflow session
    sess = U.make_session(num_cpu)

    episode_rewards = [0.0]
    timerStart = time.time()
    obs = env.reset()
    for t in range(max_timesteps):
        # Get state: in range(0,env.num_blocks)
        stateDeictic = obs[1] # obj in hand

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptors = np.reshape(getMoveActionDescriptors([obs[0]]),[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])
        actionDescriptors = np.r_[np.c_[np.zeros([num_patches,1])==1,moveDescriptors],np.c_[np.ones([num_patches,1])==1,moveDescriptors]]

        # Get q-values
        qCurr = getTabular(actionDescriptors)
        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise[:,stateDeictic])
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(num_actions)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(stateDeictic, actionDescriptors[action,:], rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            states_t, actions, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size)
            moveDescriptorsNext1Tiled = np.reshape(getMoveActionDescriptors(images_tp1),[batch_size,num_patches,deicticShape[0]*deicticShape[1]*deicticShape[2]])
            actionDescriptorsNext1Tiled = np.stack(
            actionDescriptorsNext = np.reshape(actionDescriptorsNext1Tiled,[batch_size*2*num_patches,-1])
            qNext1 = getTabular(actionDescriptorsNext)
            states_tp1Full = np.repeat(states_tp1,2*num_patches)
            qNextTiled = np.reshape(qNext1[range(2*batch_size*num_patches),states_tp1Full],[batch_size,2,num_patches,-1])
            qNextmax = np.max(np.max(np.max(qNextTiled,3),2),1)
            targets = rewards + (1-dones) * gamma * qNextmax

            qCurrTarget = getTabular(actions)
            qCurrTarget[range(batch_size),states_tp1] = np.minimum(qCurrTarget[range(batch_size),states_tp1], targets)

            # ********************************************
#            # Sample from replay buffer
#            states_t, actions, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size)
#            # Get action set: <num_patches> pick actions followed by <num_patches> place actions
#            moveDescriptorsNext = np.reshape(getMoveActionDescriptors(images_tp1),[batch_size,num_patches,deicticShape[0]*deicticShape[1]*deicticShape[2]])
#            actionDescriptorsNext = np.stack([np.c_[np.zeros([batch_size,num_patches,1])==1,moveDescriptorsNext], 
#                            np.c_[np.ones([batch_size,num_patches,1])==1,moveDescriptorsNext]],
#                            axis=1)
#            actionDescriptorsNext = np.reshape(actionDescriptorsNext,[batch_size*2*num_patches,-1])
#            # Get targets
#            qNext = getTabular(actionDescriptorsNext)
#            np.repeat(states_tp1,2*num_patches)
#            qNextAtState = qNext[range(batch_size*2*num_patches),np.repeat(states_tp1,2*num_patches)]
#            qNextTiled = np.reshape(qNextAtState,[batch_size,2*num_patches])
#            qNextmax = np.max(qNextTiled,1)
#            targets = rewards + (1-dones) * gamma * qNextmax
#            qCurrTarget = getTabular(actions)
#            qCurrTarget[range(batch_size),states_t] = targets
#            trainTabular(actions,qCurrTarget)
            # ********************************************

#        # Get state: in range(0,env.num_blocks)
#        stateDeicticNext = new_obs[1] # holding
#        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
#        moveDescriptorsNext = np.reshape(getMoveActionDescriptors([new_obs[0]]),[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])
#        actionDescriptorsNext = np.r_[np.c_[np.zeros([num_patches,1])==1,moveDescriptorsNext],np.c_[np.ones([num_patches,1])==1,moveDescriptorsNext]]
#        # Calculate TD target
#        qNext = getTabular(actionDescriptorsNext)
#        qNextmax = np.max(qNext[:,stateDeicticNext])
#        target = rew + (1-done) * gamma * qNextmax
#        # Update dictionary value function
#        qCurrTarget = qCurr[action,:]
#        qCurrTarget[stateDeictic] = np.minimum(qCurrTarget[stateDeictic], target)
#        trainTabular([actionDescriptors[action,:]],[qCurrTarget])

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal
        obs = new_obs
# Example #2
class Agent():
    def __init__(self, state_size, action_size, config):
        """Create the tables, hyper-parameters, environment, buffer and loggers.

        Args:
            state_size: number of rows of every table.
            action_size: number of columns of every table.
            config: mapping read for the keys "gamma", "lr", "lr_iql_q",
                "lr_iql_r", "min_epsilon", "decay", "env_name", "buffer_size",
                "device", "lr_q_sh", "freq_q", "locexp" and
                "expert_buffer_size".
        """
        self.action_size = action_size
        self.state_size = state_size
        table_shape = [state_size, action_size]
        self.Q = np.zeros(table_shape)
        self.Q_inverse = np.zeros(table_shape)
        self.debug_Q = np.zeros(table_shape)
        self.Q_shift = np.zeros(table_shape)
        self.r = np.zeros(table_shape)
        self.counter = np.zeros(table_shape)
        self.gamma = config["gamma"]
        self.epsilon = 1
        # Step size used by the tabular Q updates in optimize() and learn().
        self.lr = config["lr"]
        self.lr_iql_q = config["lr_iql_q"]
        self.lr_iql_r = config["lr_iql_r"]
        self.min_epsilon = config["min_epsilon"]
        self.max_epsilon = 1
        self.episode = 15000
        self.decay = config["decay"]
        self.total_reward = 0
        self.eval_frq = 50
        self.render_env = False
        self.env = gym.make(config["env_name"])
        self.memory = ReplayBuffer((1,),(1,),config["buffer_size"], config["device"])
        self.gamma_iql = 0.99
        self.lr_sh = config["lr_q_sh"]
        self.ratio = 1. / action_size
        self.eval_q_inverse = 50000
        self.episodes_qinverse = int(5e6)
        self.update_freq = config['freq_q']
        self.steps = 0
        # One TensorBoard writer per run flavour, all under <locexp>/runs/.
        pathname = "lr_inv_q {} lr_inv_r {} freq {}".format(self.lr_iql_q, self.lr_iql_r, self.update_freq)
        runs_dir = str(config["locexp"]) + '/runs/'
        self.writer = SummaryWriter(runs_dir + pathname)
        self.writer_inverse = SummaryWriter(runs_dir + "inverse")
        self.writer_expert = SummaryWriter(runs_dir + "expert")
        self.last_100_reward_errors = deque(maxlen=100)
        self.average_same_action = deque(maxlen=100)
        self.expert_buffer_size = config["expert_buffer_size"]
    def act(self, state, epsilon, eval_pi=False, use_debug=False):

        if np.random.random() > epsilon or eval_pi:
            action = np.argmax(self.Q[state])
            if use_debug:
                action = np.argmax(self.debug_Q[state])
            action = self.env.action_space.sample() 
        return action
    def act_inverse_q(self, state):
        action = np.argmax(self.Q_inverse[state])
        return action
    def optimize(self, state, action, reward, next_state, debug=False):
        if debug:
            max_next_state = np.max(self.debug_Q[next_state])
            td_error =  max_next_state - self.debug_Q[state, action]
            self.debug_Q[(state,action)] = self.debug_Q[(state,action)] + * (reward + self.gamma *td_error)

        max_next_state = np.max(self.Q[next_state])
        td_error =  max_next_state - self.Q[state, action]
        self.Q[(state,action)] = self.Q[(state,action)] + * (reward + self.gamma *td_error)
    def learn(self):
        states, actions, rewards, next_states, done =  self.memory.sample(self.batch_size)
        # update Q function
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, done):
            max_next_state = np.max(self.Q[next_state])
            td_error = self.Q[state, action] - max_next_state
            self.Q[(state,action)] = self.Q[(state,action)] + * (reward + self.gamma*  td_error)
    def compute_reward_loss(self, episode=10):
        use the env to create the real reward and compare it to the predicted
        reward of the model
        reward_loss = 0
        reward_list = []
        for epi in range(episode):
            state = self.env.reset()
            done = False
            while not done:
                action = np.argmax(self.trained_Q[state])
                next_state, reward, done, _ = self.env.step(action)
                predict_reward = self.r[state, action]
                reward_list.append((reward, predict_reward))
                if done: 
        reward_loss =([abs(r[0] - r[1]) for r in reward_list]  )
        reward_loss_length = len(reward_loss)
        reward_loss = sum(reward_loss) / reward_loss_length
        average_loss = np.mean(self.last_100_reward_errors)
        print("average mean loss ", average_loss)
        self.writer.add_scalar('Reward_loss', reward_loss, self.steps)
        self.writer.add_scalar('Average_Reward_loss', average_loss, self.steps)

    def invers_q(self, continue_train=False):
        """Run the inverse Q-learning loop over single sampled transitions.

        Unless ``continue_train`` is set, ``self.Q`` is reset to zeros first.
        Each iteration samples one transition from ``self.memory``, updates
        the visit counter and ``Q_shift``, then updates the reward table and
        the inverse Q table.  Every ``eval_q_inverse`` iterations the inverse
        and expert policies are evaluated for five episodes each.
        """
        if not continue_train:
            print("clean policy")
            self.Q = np.zeros([self.state_size, self.action_size])
        mkdir("", "inverse_policy")
        for epi in range(1, self.episodes_qinverse + 1):
            self.steps += 1
            if epi % self.eval_q_inverse == 0:
                self.render_env = False
                self.eval_policy(use_inverse=True, episode=5)
                self.eval_policy(use_expert=True, episode=5)
                self.render_env = False
            states, actions, _rewards, next_states, _dones = self.memory.sample(1)
            action = actions[0][0]
            state = states[0][0]
            next_state = next_states[0][0]
            # Empirical action distribution of the sampled state.
            self.counter[state, action] += 1
            visits = np.sum(self.counter[state, :])
            action_prob = self.counter[state] / visits
            # update Q shift
            bootstrap = self.lr_sh * (self.gamma_iql * np.max(self.Q_inverse[next_state]))
            self.Q_shift[state, action] = ((1 - self.lr_sh) * self.Q_shift[state, action]) + bootstrap
            # compute n a (log-probability minus shifted Q, zero guarded by eps)
            n_a = self.compute_n_a(state, action, action_prob)
            # update reward function
            self.update_r(state, action, n_a, action_prob)
            # update Q function
            self.update_q(state, action, next_state)
            # self.policy_diff(state, action)

    def update_q(self, state, action, next_state):
        q_old = (1 - self.lr_iql_q) * self.Q_inverse[state, action]
        q_new = self.lr_iql_q *(self.r[state, action] + (self.gamma_iql * np.max(self.Q_inverse[next_state])))
        #print("q old ", q_old)
        #print("q_new", q_new)
        #print("q invers ", q_old + q_new)
        self.Q_inverse[state, action] = q_old + q_new
    def update_r(self, state, action, n_a, action_prob):
        r_old = (1 - self.lr_iql_r) * self.r[state, action]
        part1 = n_a
        #print("part1", n_a)
        part2 = self.ratio * self.sum_over_action(state, action, action_prob)
        r_new = self.lr_iql_r * (part1 + part2)
        #print("r old ", r_old)
        #print("r_new", r_new)
        self.r[state, action] = r_old + r_new       
    def sum_over_action(self, state, a, action_prob):
        res = 0
        for b in range(self.action_size):
            if b == a:
            res = res + (self.r[state, b] - self.compute_n_a(state, b, action_prob))
        return res

    def compute_n_a(self, state, action, action_prob):
        if action_prob[action] == 0:
            action_prob[action] = np.finfo(float).eps
        return np.log(action_prob[action]) - self.Q_shift[state, action]

    def start_reward(self):
        self.env.seed = 1
        state = self.env.reset()
        ns, r, d, _ = self.env.step(0)
        print(" expert q {}".format(self.trained_Q[state])) 
        print("inverse q {}".format(self.Q_inverse[state]))

    def eval_policy(self, random_agent=False, use_expert=False, use_debug=False, use_inverse=False,episode=10):
        if use_expert:
        total_steps = 0
        total_reward = 0
        total_penetlies = 0
        for i_episode in range(1, episode + 1):
            score = 0
            steps = 0
            state = self.env.reset()
            done  = False
            penelty = 0
            while not done:
                steps += 1
                if use_expert:
                    action = np.argmax(self.trained_Q[state])
                elif random_agent:
                    action = self.env.action_space.sample() 
                elif use_debug:
                    action = np.argmax(self.debug_Q[state])
                elif use_inverse:
                    action = np.argmax(self.Q_inverse[state])
                    action = self.act(state, 0, True)
                next_state, reward, done, _ = self.env.step(action)
                state = next_state
                if self.render_env:
                score += reward
                if reward == -10:
                    penelty += 1
                if done:
                    total_steps += steps
                    total_reward += score
                    total_penetlies += penelty
        if self.render_env:
        aver_steps = total_steps / episode
        average_reward = total_reward / episode
        aver_penelties = total_penetlies / episode
        if use_expert:
            print("Expert avge steps {} average reward  {:.2f}  average penelty {} ".format(aver_steps, average_reward, aver_penelties))

        elif random_agent:
            print("Random Eval avge steps {} average reward  {:.2f}  average penelty {} ".format(aver_steps, average_reward, aver_penelties))
        elif use_inverse:
            print("Inverse q Eval avge steps {} average reward  {:.2f}  average penelty {} ".format(aver_steps, average_reward, aver_penelties))
            print("Eval avge steps {} average reward  {:.2f}  average penelty {} ".format(aver_steps, average_reward, aver_penelties))
            self.writer.add_scalar('Eval_Average_steps', aver_steps, self.steps)
            self.writer.add_scalar('Eval_Average_reward', average_reward, self.steps)
            self.writer.add_scalar('Eval_Average_penelties', aver_penelties, self.steps)
    def save_q_table(self, table="Q", filename="policy"):
        """Write ``Q`` or ``Q_inverse`` as a ``.npy`` file under ``filename``.

        Args:
            table: "Q" writes ``self.Q`` to ``<filename>/Q.npy``; "inverse_Q"
                writes ``self.Q_inverse`` to ``<filename>/Inverse_Q.npy``.
            filename: target directory, passed to ``mkdir`` first.
        """
        mkdir("", filename)
        if table == "Q":
            with open(filename + '/Q.npy', 'wb') as f:
                np.save(f, self.Q)
        if table == "inverse_Q":
            with open(filename + '/Inverse_Q.npy', 'wb') as f:
                np.save(f, self.Q_inverse)

    def load_q_table(self, table="Q", filename="policy"):
        if table == "Q":
            with open(filename + '/Q.npy', 'rb') as f:
                self.Q = np.load(f)
        if table == "inverse_Q":
            with open(filename + '/Inverse_Q.npy', 'rb') as f:
                self.Q_inverse = np.load(f)

        self.trained_Q = self.Q
    def save_r_table(self, filename="reward_function"):
        """Write the reward table ``self.r`` to ``<filename>/r.npy``.

        ``filename`` is passed to ``mkdir`` before the file is opened.
        """
        mkdir("", filename)
        with open(filename + '/r.npy', 'wb') as f:
            np.save(f, self.r)

    def load_r_table(self, filename="reward_function"):
        with open(filename + '/r.npy', 'rb') as f:
            self.r = np.load(f)

    def eval_inverse(self):
        self.load_q_table(table= "inverse_Q")
        for i_episode in range(1, 11):
            score = 0
            steps = 0
            penelties = 0
            state = self.env.reset()
            done  = False
            while not done:
                steps += 1
                action = np.argmax(self.Q_inverse[state])
                next_state, reward, done, _ = self.env.step(action)
                score += reward
                if reward == -10:
                    penelties += 1
                state = next_state
            print("Inverse  steps {} reward  {:.2f}  penelty {} ".format(steps, score, penelties))

    def policy_diff(self, state, expert_action):
        # NOTE(review): a second policy_diff is defined later in this class and
        # replaces this one; neither parameter is used here - confirm whether
        # this definition can be removed.

        # Bind trained_Q to the current Q table (same object, not a copy).
        self.trained_Q = self.Q

    def create_expert_policy(self):
        self.trained_Q = self.Q
        for i_episode in range(1, self.expert_buffer_size + 1):
            text = "create Buffer {} of {}\r".format(i_episode, self.expert_buffer_size)
            print(text, end=" ")
            state = self.env.reset()
            if state == 184:
                print("yes ")
            done  = False
            score = 0
            while True:
                action = self.act(state, 0, True)
                next_state, reward, done, _ = self.env.step(action)
                score += reward
                self.memory.add(state, action, reward, next_state, done, done)
                state = next_state
                if done:
                    #print("reward ", score)

    def policy_diff(self, state, expert_action):
        # Greedy action of the inverse Q table for this state.
        action = np.argmax(self.Q_inverse[state])
        if action == expert_action:
            # NOTE(review): i_episode, score, average_reward and steps are not
            # assigned in this method, so unless module-level names exist this
            # branch raises NameError.  These lines look like they belong to
            # train() - confirm the intended body.
            print("Episode {} Reward {:.2f} Average Reward {:.2f} steps {}  epsilon {:.2f}".format(i_episode, score, average_reward, steps, self.epsilon))
            self.writer.add_scalar('Average_reward', average_reward, self.steps)
            self.writer.add_scalar('Train_reward', score, self.steps)
        # Bind trained_Q to the current Q table (same object, not a copy).
        self.trained_Q = self.Q
    def debug_train(self):

        use the trained reward function to train the agent

        state = self.env.reset()
        done  = False
        score = 0
        self.steps += 1
        epsiode_steps =  0
        while True:
            action = self.act(state, 0, True)
            next_state, _, done, _ = self.env.step(action)
            reward = self.r[state, action]
            self.optimize(state, action, reward, next_state, debug=True)

            score += reward
            epsiode_steps += 1
            if done:
            state = next_state

        self.total_reward += score
        average_reward = self.total_reward / self.steps
        print("Episode {} Reward {:.2f} Average Reward {:.2f}  epi steps {}".format(self.steps, score, average_reward, epsiode_steps))

    def train(self):
        total_timestep = 0
        for i_episode in range(1, self.episode + 1):
            score = 0
            state = self.env.reset()
            done  = False
            steps = 0
            while not done:
                self.steps +=1
                steps += 1
                total_timestep += 1
                action = self.act(state, self.epsilon)
                next_state, reward, done, _ = self.env.step(action)
                score += reward
                self.optimize(state, action, reward, next_state)
                self.epsilon = self.min_epsilon + (self.max_epsilon - self.min_epsilon)*np.exp(-self.decay * i_episode)
                if done:
                state = next_state
            if i_episode % self.eval_frq == 0:
            self.total_reward += score
            average_reward = self.total_reward / i_episode
            print("Episode {} Reward {:.2f} Average Reward {:.2f} steps {}  epsilon {:.2f}".format(i_episode, score, average_reward, steps, self.epsilon))
            self.writer.add_scalar('Average_reward', average_reward, self.steps)
            self.writer.add_scalar('Train_reward', score, self.steps)
        self.trained_Q = self.Q
# Example #3
def main():

    # Define environment
    env = envstandalone.BlockArrange()

    # Dictionary-based value function
    q_func_dict = {}

    # cols of vectorKey must be boolean less than 64 bits long
    def getTabularKeys(vectorKey):
        """Pack each boolean row of ``vectorKey`` into one unsigned integer key.

        Bits are packed along axis 1 into bytes; byte ``j`` of every row is
        weighted by ``256**j`` and the weighted bytes are summed.  Rows must
        encode to at most 64 bits, otherwise distinct rows can collide.
        """
        packedBytes = np.packbits(vectorKey, 1)
        numBytes = packedBytes.shape[1]
        keys = 0
        for byteIdx in range(numBytes):
            # Widen to uint64 before scaling so the weighted byte is not truncated.
            column = np.uint64(packedBytes[:, byteIdx])
            keys = keys + (256**byteIdx) * column
        return keys

    def getTabular(vectorKey):
        """Return the stored value array for every row of ``vectorKey``.

        Rows whose key has no entry in ``q_func_dict`` yield a zero array of
        shape ``[num_cascade, num_states]``.
        """
        keys = getTabularKeys(vectorKey)
        return np.array([
            q_func_dict[x] if x in q_func_dict
            else np.zeros([num_cascade, num_states])
            for x in keys
        ])

    def trainTabular(vectorKey, qCurrTargets):
        """Move the table entry of each row of ``vectorKey`` toward its target.

        A key already present in ``q_func_dict`` is blended with its target
        using step size ``alpha``; a key seen for the first time is
        initialised directly to its target.
        """
        keys = getTabularKeys(vectorKey)
        alpha = 0.3
        for key, target in zip(keys, qCurrTargets):
            if key in q_func_dict:
                q_func_dict[key] = (1 - alpha) * q_func_dict[key] + alpha * target
            else:
                # First visit: without this branch the table would stay empty.
                q_func_dict[key] = target

    # Standard DQN parameters
    max_timesteps = 40000
    #    max_timesteps=80000
    #    max_timesteps=160000
    learning_starts = 1000
    #    buffer_size=50000
    buffer_size = 10000
    #    buffer_size=1000
    #    buffer_size=100
    #    buffer_size=2
    #    exploration_fraction=0.4
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 1
    #    gamma=.98
    gamma = .96
    target_network_update_freq = 1
    #    batch_size=32
    batch_size = 64
    #    batch_size=128
    #    batch_size=256
    #    batch_size=8
    #    train_freq=1
    train_freq = 2
    #    train_freq=4
    #    train_freq=8
    #    train_freq=16
    num_train_iter = 1
    num_cpu = 16
    lr = 0.001
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
    replay_buffer = ReplayBuffer(buffer_size)

    # Deictic state/action parameters
    deicticShape = (3, 3, 2
                    )  # IMPORTANT: first two elts of deicticShape must be odd
    deicticActionShape = (
        3, 3, 4)  # IMPORTANT: first two elts of deicticShape must be odd
    num_cascade = 5
    num_states = env.num_blocks + 1  # one more state than blocks to account for not holding anything
    num_patches = env.maxSide**2
    num_actions = 2 * num_patches

    # ******* Build tensorflow functions ********

    q_func = models.cnn_to_mlp(
        #    q_func = models.cnn_to_mlp_2pathways(
        convs=[(32, 3, 1)],
        #        convs=[(16,3,1)],

    def make_obs_ph(name):
        # Calls U.BatchInput with the shape of the first space in
        # env.observation_space and the given name.
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        # Calls U.BatchInput with deicticActionShape and the given name.
        return U.BatchInput(deicticActionShape, name=name)

    def make_target_ph(name):
        # Calls U.BatchInput with shape [num_cascade, num_states] and the
        # given name.
        #        return U.BatchInput([num_actions], name=name)
        return U.BatchInput([num_cascade, num_states], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(
        make_obs_ph=make_obs_ph, deicticShape=deicticActionShape)

    getq = build_getq(make_actionDeic_ph=make_actionDeic_ph,

    targetTrain = build_targetTrain(
        #        optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
        #        grad_norm_clipping=0.1

    # Start tensorflow session
    sess = U.make_session(num_cpu)

    episode_rewards = [0.0]
    timerStart = time.time()
    obs = env.reset()
    for t in range(max_timesteps):

        # Get state: in range(0,env.num_blocks)
        stateDeictic = obs[1]  # obj in hand

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions

        moveDescriptors = getMoveActionDescriptors([obs[0]])
        #        actionsPickDescriptors = np.concatenate([np.zeros(np.shape(moveDescriptors)),moveDescriptors],axis=3)
        #        actionsPlaceDescriptors = np.concatenate([np.ones(np.shape(moveDescriptors)),moveDescriptors],axis=3)
        actionsPickDescriptors = np.concatenate(
             np.zeros(np.shape(moveDescriptors))], axis=3)
        actionsPlaceDescriptors = np.concatenate(
            [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors,

        #        # TABULAR version
        #        actionDescriptors = np.reshape(actionDescriptors,[-1,deicticActionShape[0]*deicticActionShape[1]*deicticActionShape[2]]) == 1
        #        qCurr = getTabular(actionDescriptors)

        # DQN version
        qCurr = getq(actionDescriptors)

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise[:, -1, stateDeictic])  # USE CASCADE
        #        action = np.argmax(qCurrNoise[:,0,stateDeictic]) # NO CASCADE
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(num_actions)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(stateDeictic, actionDescriptors[action, :], rew,
                          new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            for iter in range(num_train_iter):

                states_t, actions, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(

                moveDescriptorsNext = getMoveActionDescriptors(images_tp1)
                #                actionsPickDescriptorsNext = np.concatenate([np.zeros(np.shape(moveDescriptorsNext)),moveDescriptorsNext],axis=3)
                #                actionsPlaceDescriptorsNext = np.concatenate([np.ones(np.shape(moveDescriptorsNext)),moveDescriptorsNext],axis=3)
                actionsPickDescriptorsNext = np.concatenate([
                actionsPlaceDescriptorsNext = np.concatenate([
                actionDescriptorsNextFlat = np.stack(
                    [actionsPickDescriptorsNext, actionsPlaceDescriptorsNext],

                #            # TABULAR version
                #            actionDescriptorsNext = np.reshape(actionDescriptorsNextFlat,[batch_size*2*num_patches,-1]) == 1
                #            qNext = getTabular(actionDescriptorsNext)

                # DQN version
                actionDescriptorsNext = np.reshape(actionDescriptorsNextFlat, [
                    batch_size * 2 * num_patches, deicticActionShape[0],
                    deicticActionShape[1], deicticActionShape[2]
                ]) == 1
                qNext = getq(actionDescriptorsNext)

                states_tp1Full = np.repeat(states_tp1, 2 * num_patches)

                qNextTiled = np.reshape(
                    qNext[range(2 * batch_size * num_patches), -1,
                    [batch_size, 2, num_patches, -1])  # USE CASCADE
                #            qNextTiled = np.reshape(qNext[range(2*batch_size*num_patches),0,states_tp1Full],[batch_size,2,num_patches,-1]) # NO CASCADE
                qNextmax = np.max(np.max(np.max(qNextTiled, 3), 2), 1)

                targets = rewards + (1 - dones) * gamma * qNextmax

                #            # TABULAR version
                #            qCurr = getTabular(actions)

                # DQN version
                qCurr = getq(actions)

                qCurrTarget = np.copy(qCurr)
                qCurrTarget[range(batch_size), 0, states_tp1] = targets
                for i in range(num_cascade - 1):
                    mask = targets < qCurr[range(batch_size), i, states_tp1]
                    qCurrTarget[range(batch_size),i+1,states_tp1] = \
                        mask*targets + \

    #            # TABULAR version
    #            trainTabular(actions,qCurrTarget)

    # DQN version
                targetTrain(actions, qCurrTarget)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
# Example #4
def main():
    """Run tabular Q-learning over deictic action descriptors and print the result.

    At every step the observation image is turned into one descriptor per
    patch by ``getMoveActionDescriptors``.  Each descriptor is expanded into a
    "pick" and a "place" variant, packed into an integer key, and looked up in
    a dictionary that maps keys to a vector of ``num_states`` values (one per
    holding state).  Actions are chosen epsilon-greedily, transitions are
    stored in a replay buffer, and the dictionary is updated from sampled
    batches.  After ``max_timesteps`` steps the values of all pick and place
    actions in a freshly reset environment are printed as grids.

    NOTE(review): several multi-line statements in this function were
    incomplete (unbalanced brackets).  They are restored here from the
    parallel code in the display section and from the shapes the following
    statements require; each restored spot that involved a judgement call is
    marked with its own NOTE(review).
    """

    # Define environment
    env = envstandalone.BlockArrange()

    # Dictionary-based value function: integer descriptor key -> array of
    # num_states values.
    q_func = {}

    # cols of vectorKey must be boolean less than 64 bits long
    def getTabularKeys(vectorKey):
        """Pack every boolean row of ``vectorKey`` into a single integer key."""
        obsBits = np.packbits(vectorKey, 1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i])
        return obsKeys

    def getTabular(vectorKey):
        """Return the stored value vector for each row of ``vectorKey``.

        Rows whose key has never been trained yield a zero vector of length
        ``num_states``.
        """
        keys = getTabularKeys(vectorKey)
        return np.array([
            q_func[x] if x in q_func else np.zeros(num_states) for x in keys
        ])

    def trainTabular(vectorKey, qCurrTargets):
        """Update the table entry of each row of ``vectorKey`` toward its target.

        Known keys are blended with step size ``alpha``; unseen keys are
        initialised directly with their target.
        """
        keys = getTabularKeys(vectorKey)
        alpha = 1.0
        for key, target in zip(keys, qCurrTargets):
            if key in q_func:
                q_func[key] = (1 - alpha) * q_func[key] + alpha * target
            else:
                # Without this branch unseen keys would never enter the table
                # and nothing would ever be learned.
                q_func[key] = target

    # Standard DQN parameters
    max_timesteps = 40000
    learning_starts = 10  # previously tried: 1000
    buffer_size = 8  # previously tried: 50000, 10000, 1000, 100, 32, 1
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 1
    gamma = .98
    target_network_update_freq = 1
    batch_size = 1  # previously tried: 32
    train_freq = 1  # previously tried: 2
    num_cpu = 16
    # NOTE(review): only the schedule_timesteps argument was present; the
    # remaining arguments assume the OpenAI Baselines LinearSchedule signature
    # (initial_p, final_p) -- confirm against the imported class.
    exploration = LinearSchedule(
        schedule_timesteps=int(exploration_fraction * max_timesteps),
        initial_p=1.0,
        final_p=exploration_final_eps)
    replay_buffer = ReplayBuffer(buffer_size)

    # Deictic state/action parameters
    deicticShape = (3, 3, 2)  # IMPORTANT: first two elts of deicticShape must be odd
    deicticActionShape = (3, 3, 4)
    num_cascade = 5
    num_states = env.num_blocks + 1  # one more state than blocks to account for not holding anything
    num_patches = env.maxSide**2
    num_actions = 2 * num_patches
    num_actions_discrete = 2
    # Number of boolean entries in one flattened action descriptor.
    descriptorLen = (deicticActionShape[0] * deicticActionShape[1] *
                     deicticActionShape[2])

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(
        make_obs_ph=make_obs_ph, deicticShape=deicticShape)

    # Start tensorflow session
    # NOTE(review): the session is created but never entered in this function;
    # confirm whether getMoveActionDescriptors needs it installed as the
    # default session.
    sess = U.make_session(num_cpu)

    episode_rewards = [0.0]
    timerStart = time.time()
    obs = env.reset()
    for t in range(max_timesteps):

        # Get state: in range(0,env.num_blocks)
        stateDeictic = obs[1]  # holding

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptors = getMoveActionDescriptors([obs[0]])
        actionsPickDescriptors = np.concatenate(
            [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
        actionsPlaceDescriptors = np.concatenate(
            [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors,
                                  actionsPlaceDescriptors]
        actionDescriptors = np.reshape(actionDescriptors,
                                       [-1, descriptorLen]) == 1

        # Get q-values
        qCurr = getTabular(actionDescriptors)

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise[:, stateDeictic])
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(num_actions)

        # take action
        new_obs, rew, done, _ = env.step(action)
        # NOTE(review): add() receives five values while sample() below is
        # unpacked into six; presumably the project's ReplayBuffer splits
        # new_obs into image and holding state -- confirm.
        replay_buffer.add(stateDeictic, actionDescriptors[action, :], rew,
                          new_obs, float(done))

        if t > learning_starts and t % train_freq == 0:

            states_t, actions, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(
                batch_size)

            # Build pick/place descriptors for every patch of every sampled
            # next image.
            moveDescriptorsNext1 = getMoveActionDescriptors(images_tp1)
            actionsPickDescriptorsNext1 = np.concatenate([
                moveDescriptorsNext1, np.zeros(np.shape(moveDescriptorsNext1))
            ], axis=3)
            actionsPlaceDescriptorsNext1 = np.concatenate([
                np.zeros(np.shape(moveDescriptorsNext1)), moveDescriptorsNext1
            ], axis=3)
            # NOTE(review): axis=1 is inferred from the reshape of qNext1
            # below, which expects the order (batch, patch, pick/place) and
            # therefore assumes the descriptors' leading axis is batch*patch
            # -- confirm.
            actionDescriptorsNext1 = np.stack(
                [actionsPickDescriptorsNext1, actionsPlaceDescriptorsNext1],
                axis=1)
            actionDescriptorsNextFlat1 = np.reshape(
                actionDescriptorsNext1,
                [batch_size * num_patches * num_actions_discrete, -1]) == 1

            # One-step Q-learning target: best next value in the next holding
            # state over all patches and both action types.
            qNextFlat1 = getTabular(actionDescriptorsNextFlat1)
            qNext1 = np.reshape(
                qNextFlat1,
                [batch_size, num_patches, num_actions_discrete, num_states])
            qNextmax1 = np.max(
                np.max(qNext1[range(batch_size), :, :, states_tp1], 2), 1)
            targets1 = rewards + (1 - dones) * gamma * qNextmax1

            # Only the entry for the holding state at time t is replaced.
            qCurrTarget1 = getTabular(actions)
            qCurrTarget1[range(batch_size), states_t] = targets1
            trainTabular(actions, qCurrTarget1)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            # Open a new accumulator so that the running mean and the episode
            # count below refer to completed episodes.
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs

    # display value function
    obs = env.reset()
    moveDescriptors = getMoveActionDescriptors([obs[0]])
    actionsPickDescriptors = np.concatenate(
        [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
    actionsPlaceDescriptors = np.concatenate(
        [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)

    print(str(obs[0][:, :, 0]))

    # One value per patch, so the grid is maxSide x maxSide.
    gridShape = [env.maxSide, env.maxSide]

    qPick = getTabular(
        np.reshape(actionsPickDescriptors, [num_patches, -1]) == 1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:, 0], gridShape)))

    qPlace = getTabular(
        np.reshape(actionsPlaceDescriptors, [num_patches, -1]) == 1)
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:, 1], gridShape)))

    print("Value function for place action in hold-2 state:")
    print(str(np.reshape(qPlace[:, 2], gridShape)))