Example no. 1
def get_demo_greedy(env, file_name):
    demo_buffer = deque()
    demo = []
    demo_sas = []
    REWARDS, REWARD100, reward100 = [], [], 0
    state = env.reset()
    vision = state['vision']
    n_tongs = env._agent._items[1]  # number of tongs carried
    q_value = value_iteration(vision, n_tongs)
    action = np.argmax(q_value[5, 5, 0] +
                       1e-2 * np.random.rand(3))  # add noise to break ties
    state = trans_state(state)
    print(DQfDConfig.demo_buffer_size)
    for steps in range(DQfDConfig.demo_buffer_size):
        next_state, reward, done, _ = env.step(action)
        reward100 += reward
        REWARDS.append(reward)
        vision = next_state['vision']
        n_tongs = env._agent._items[1]
        q_value = expert_VI(vision, n_tongs)
        action = np.argmax(q_value[5, 5, 0] + 1e-2 * np.random.rand(3))
        next_state = trans_state(next_state)
        demo.append([state, action, reward, next_state, done, 1.0])
        state = next_state

        if (steps + 1) % DQfDConfig.eps_gap == 0:
            print("demo - steps: {}  reward100: {}".format(steps, reward100))
            REWARD100.append(reward100)
            reward100 = 0

        if (steps + 1) % (DQfDConfig.eps_gap * 10) == 0:
            with open(file_name + 'REWARD100.p', 'wb') as f:
                pickle.dump(REWARD100, f, protocol=2)
            with open(file_name + 'REWARD100.txt', 'w') as f:
                f.write(str(REWARD100))
            with open(file_name + 'REWARDS.p', 'wb') as f:
                pickle.dump(REWARDS, f, protocol=2)
            with open(file_name + 'REWARDS.txt', 'w') as f:
                f.write(str(REWARDS))
            plot(1, REWARDS, file_name)

    with open(file_name + 'REWARD100.p', 'wb') as f:
        pickle.dump(REWARD100, f, protocol=2)
    with open(file_name + 'REWARD100.txt', 'w') as f:
        f.write(str(REWARD100))
    with open(file_name + 'REWARDS.p', 'wb') as f:
        pickle.dump(REWARDS, f, protocol=2)
    with open(file_name + 'REWARDS.txt', 'w') as f:
        f.write(str(REWARDS))
    plot(1, REWARDS, file_name)

    demo = set_n_step(demo, DQfDConfig.trajectory_n)
    demo_buffer.extend(demo)
    with open(file_name + 'demo.txt', "w") as file:
        file.write(str(demo_buffer))
    with open(file_name + 'demo.p', 'wb') as f:
        pickle.dump(demo_buffer, f, protocol=2)
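
The set_n_step helper called above is not shown in any of these examples. The sketch below is only an inference from the online counterpart in Example no. 5, which extends each transition with the n-step return, the n-th next state, the done flag and the actual window length; the transition layout matches the demo list built here, gamma stands in for DQfDConfig.GAMMA, and everything else is an assumption.

def set_n_step(transitions, n, gamma=0.99):
    # Sketch only: append [n_step_reward, nth_next_state, done, actual_n]
    # to each [state, action, reward, next_state, done, is_demo] transition,
    # mirroring what Example no. 5 does online with its t_q deque.
    result = [list(t) for t in transitions]
    for i, t in enumerate(result):
        window = result[i:i + n]  # up to n transitions starting at i
        n_step_reward = sum(w[2] * gamma**k for k, w in enumerate(window))
        t.extend([n_step_reward, window[-1][3], window[-1][4], len(window)])
    return result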
Example no. 2
def main(filename):
    with open(filename, 'r') as f:
        rewards = f.read().splitlines()

    rewards = [float(r) for r in rewards]
    #rew = [0]*len(rewards)*100
    #for i, r in enumerate(rewards):
    # rew[(i+1)*100 - 1] = r*100

    plot('./', rewards[:50000])
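
The plot helper itself does not appear in any of these examples, and it is not called with a single signature (Examples no. 1 and 5 pass three arguments, the others two). As a purely hypothetical stand-in for the two-argument plot(prefix, rewards) form used here, something along these lines would mirror the inline matplotlib code of Examples no. 3, 4 and 9:

import matplotlib
matplotlib.use('Agg')  # render to file, no display required
import matplotlib.pyplot as plt


def plot(prefix, rewards):
    # Hypothetical stand-in, not the helper these repositories actually use:
    # save the reward curve to a PNG named after the given prefix.
    plt.plot(range(len(rewards)), rewards, '-b')
    plt.xlabel('Step')
    plt.ylabel('Reward')
    plt.savefig(str(prefix) + '_rewards.png')
    plt.close()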
Example no. 3
    def plot_train_stats(self):
        for tr in self.train_rewards[-1000:]:
            self.train_file.write(str(tr))
            self.train_file.write('\n')
        self.train_file.flush()
        if not self.test and self.train_rewards[-1] > 0:
            self.net.A.save("checkpoint.pth")
            self.net.Ensemble.save()
        if self.train_rewards[-1] > 0:
            print('[%d] Train Reward: %.4f' %
                  (len(self.train_rewards), self.train_rewards[-1]))
        self.steps = 0

        x = list(range(len(self.train_rewards)))
        plt.plot(x, self.train_rewards, '-bo')
        plt.xlabel('Time')
        plt.ylabel('Average Reward')
        plt.title('Training Curve')
        plt.savefig(self.dump_dir + 'Training_Curve_' + self.method + '.png')
        plt.close()

        plot(self.dump_dir + self.method, self.train_rewards)
Example no. 4
    def plot_train_stats(self):
        self.cum_reward = self.cum_reward / float(self.log_time)
        self.train_rewards.append(self.cum_reward)
        self.train_file.write(str(self.cum_reward))
        self.train_file.write('\n')
        self.train_file.flush()
        self.cum_reward = 0.0
        if self.train_rewards[-1] > 0:
            self.net.A.save("checkpoint.pth")
            print('[%d] Train Reward: %.4f' %
                  (len(self.train_rewards), self.train_rewards[-1]))
        self.steps = 0

        x = list(range(len(self.train_rewards)))
        plt.plot(x, self.train_rewards, '-bo')
        plt.xlabel('Time')
        plt.ylabel('Average Reward')
        plt.title('Training Curve')
        plt.savefig(self.dump_dir + 'Training_Curve_' + self.method + '.png')
        plt.close()

        plot(self.dump_dir + self.method, self.train_rewards)
Example no. 5
def run_DQfD(index, env, file_demo, file_name):
    with open(file_demo + 'demo.p', 'rb') as f:
        demo_transitions = pickle.load(f)
        demo_transitions = deque(
            itertools.islice(demo_transitions, 0, DQfDConfig.demo_buffer_size))
        assert len(demo_transitions) == DQfDConfig.demo_buffer_size
    with tf.variable_scope('DQfD_' + str(index)):
        agent = DQfD(env, DQfDConfig(), demo_transitions=demo_transitions)

    agent.pre_train()  # use the demo data to pre-train network

    REWARDS, REWARD100, episode, replay_full_episode = [], [], 0, None
    reward100, n_step_reward, state = 0, None, env.reset()
    state = trans_state(state)
    t_q = deque(maxlen=DQfDConfig.trajectory_n)
    for steps in range(DQfDConfig.episode):
        action = agent.egreedy_action(state)  # e-greedy action for train
        next_state, reward, done, _ = env.step(action)
        next_state = trans_state(next_state)
        reward100 += reward
        REWARDS.append(reward)
        t_q.append([state, action, reward, next_state, done, 0.0])

        # record the earliest reward for the sub-sequence
        if len(t_q) < t_q.maxlen:
            reward_to_sub = 0.
        else:
            reward_to_sub = t_q[0][2]
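            # Maintain the discounted n-step return of the current window,
            # R = sum_i gamma^i * r_i: it is computed in full the first time
            # the window fills, then updated incrementally by dropping the
            # oldest reward and adding the newest one at position n-1.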
            if n_step_reward is None:  # only compute once when t_q first filled
                n_step_reward = sum(
                    [t[2] * DQfDConfig.GAMMA**i for i, t in enumerate(t_q)])
            else:
                n_step_reward = (n_step_reward -
                                 reward_to_sub) / DQfDConfig.GAMMA
                n_step_reward += reward * DQfDConfig.GAMMA**(
                    DQfDConfig.trajectory_n - 1)

            t_q[0].extend([n_step_reward, next_state, done,
                           t_q.maxlen])  # actual_n is max_len here
            update_eps = (steps + 1) % DQfDConfig.eps_gap == 0
            agent.perceive(t_q[0], update_eps=update_eps
                           )  # perceive when a transition is completed
            if (steps + 1) % DQfDConfig.UPDATE_ESTIMATE_NET == 0:
                agent.train_Q_network(
                    update=False)  # train along with generation
            replay_full_episode = replay_full_episode or episode

        state = next_state

        if (steps + 1) % DQfDConfig.UPDATE_TARGET_NET == 0:
            if agent.replay_memory.full():
                agent.sess.run(agent.update_target_net)

        if (steps + 1) % DQfDConfig.eps_gap == 0:
            episode += 1
            if replay_full_episode is not None:
                print(
                    "episode: {}  trained-episode: {}  reward100: {}  memory length: {}  epsilon: {}"
                    .format(episode, episode - replay_full_episode, reward100,
                            len(agent.replay_memory), agent.epsilon))
            REWARD100.append(reward100)
            reward100 = 0

        if (steps + 1) % (DQfDConfig.eps_gap * 100) == 0:
            with open(file_name + 'REWARD100.p', 'wb') as f:
                pickle.dump(REWARD100, f, protocol=2)
            with open(file_name + 'REWARD100.txt', 'w') as f:
                f.write(str(REWARD100))
            with open(file_name + 'REWARDS.p', 'wb') as f:
                pickle.dump(REWARDS, f, protocol=2)
            with open(file_name + 'REWARDS.txt', 'w') as f:
                f.write(str(REWARDS))
            plot(1, REWARDS, file_name)

    with open(file_name + 'REWARD100.p', 'wb') as f:
        pickle.dump(REWARD100, f, protocol=2)
    with open(file_name + 'REWARD100.txt', 'w') as f:
        f.write(str(REWARD100))
    with open(file_name + 'REWARDS.p', 'wb') as f:
        pickle.dump(REWARDS, f, protocol=2)
    with open(file_name + 'REWARDS.txt', 'w') as f:
        f.write(str(REWARDS))
    plot(1, REWARDS, file_name)
Example no. 6
    return q_value


def run_policy(env):
    env.reset()
    observation, _, _, _ = env.step(0)
    reward_list = []
    for step in range(1000000):
        # env.render()
        vision = observation['vision']
        n_tongs = env._agent._items[1]  # number of tongs carried
        q_value = value_iteration(vision, n_tongs)
        action = np.argmax(q_value[5, 5, 0] +
                           1e-2 * np.random.rand(3))  # add noise to break ties
        observation, reward, _, _ = env.step(action)
        reward_list.append(reward)
        if step % 100 == 0:
            print('step = {:d}'.format(step))
        if (step + 1) % 10000 == 0:
            num = (step + 1) // 10000
            with open('greedy_{:d}.pickle'.format(num), 'wb') as f:
                pickle.dump(reward_list, f)
    return reward_list


if __name__ == '__main__':
    env = gym.make('NEL-v0')
    # env_r = gym.make('NEL-render-v0')
    reward_list = run_policy(env)
    canonical_plot.plot('greedy', reward_list)
Example no. 7
if __name__ == '__main__':

    # Setting the session to allow growth, so it doesn't allocate all GPU memory.
    gpu_ops = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(gpu_options=gpu_ops)
    sess = tf.Session(config=config)

    # Setting this as the default tensorflow session.
    keras.backend.tensorflow_backend.set_session(sess)

    # Gather commandline args
    args = parse_arguments()
    environment_name = args.env
    model_name = args.model_name

    agent = Deep_Agent(environment_name,
                       model_name,
                       num_episodes=num_episodes,
                       curve_episodes=1000)
    agent.burn_in_memory()
    training_step_rewards = agent.train(n_steps_to_reset=100000000000000000)
    plot("tsr", training_step_rewards)
    rewards = agent.get_rewards(10000)
    plot("pr", rewards)
    filename = "rewards_file"
    with open(str(filename) + ".pkl", 'wb') as f:  # Python 3: open(..., 'wb')
        pickle.dump([training_step_rewards, rewards], f, protocol=2)
#    u, std = agent.test_stats(10,100)  # 6.e
#    agent.performance_curves_from_weight_files(10, 100)
#    agent.plots()
Example no. 8
    rewards = []
    state = env.reset()
    state = trans_state(state)
    #    state = np.expand_dims(state,0)
    for step in range(n_steps):
        #        env.render()
        action = np.rint(
            yhat.eval(feed_dict={x: state[None, :]})[0][0]).astype(int)
        #        print action
        next_state, reward, done, info = env.step(action)
        next_state = trans_state(next_state)
        #        next_state = np.expand_dims(next_state,0)
        state = next_state

        rewards.append(reward)
dagger_results = {
    'means': save_mean,
    'stds': save_std,
    'train_size': save_train_size,
    'expert_mean': save_expert_mean,
    'expert_std': save_expert_std
}
print('DAgger iterations finished!')

print(dagger_results)
#plot("dagger_tr",tr_rewards)
plot("dagger_pr", rewards)
filename = "dagger_rewards_file"
with open(str(filename) + ".pkl", 'wb') as f:  # Python 3: open(..., 'wb')
    pickle.dump([rewards, dagger_results], f, protocol=2)
Example no. 9
    def train(self):
        train_rewards = []
        test_rewards = []
        count = 0
        steps = 0
        test_steps = 0

        cum_reward = 0.0
        elapsed = 0.0

        curr_state = self.env.reset()
        curr_state = self.burn_in_memory(curr_state)
        prev_action = -1
        if self.render:
            self.env.render()
        for i in range(self.training_time):
            # Get q_values based on the current state
            Vt, St = self.get_input_tensor(curr_state)
            q_values = self.net.get_Q_output(Vt, St)

            # Selecting an action based on the policy
            action = self.epsilon_greedy_policy(q_values, self.epsilon)
            #if not curr_state['moved'] and action == prev_action and self.epsilon > 0.1:
            #  action = self.epsilon_greedy_policy(q_values, 0.5)

            # Executing action in simulator
            nextstate, reward, _, _ = self.env.step(action)
            steps = steps + 1
            test_steps = test_steps + 1
            if self.render:
                self.env.render()

            # Store Transition
            if nextstate['moved'] or prev_action != action:
                self.replay_buffer.add(curr_state, action, reward / 100.0,
                                       nextstate, 0)
            prev_action = action

            # Sample random minibatch from experience replay
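            # (weights are the importance-sampling corrections returned by the
            # prioritized buffer, all ones for uniform replay; they are passed
            # on to self.net.train together with the batch tensors below)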
            if self.prioritized_replay:
                batch, weights, batch_idxes = self.replay_buffer.sample(
                    self.batch_size, beta=self.beta_schedule.value(i))
            else:
                batch = self.replay_buffer.sample(self.batch_size)
                weights, batch_idxes = np.ones(self.batch_size), None

            # Train the Network with mini batches
            xVT, xST = self.get_input_tensors(batch)
            yT = self.get_output_tensors(batch)

            # Mask to select the actions from the Q network output
            mT = torch.zeros(self.batch_size, self.an, dtype=torch.uint8)
            for k, tran in enumerate(batch):
                mT[k, tran[1]] = 1
            td_errors = self.net.train(xVT, xST, yT, mT, weights)

            if self.prioritized_replay:
                #new_priorities = np.abs(td_errors) + self.prioritized_replay_eps
                #new_priorities = []
                #for i, tran in enumerate(batch):
                #  new_priorities.append(tran[2] + self.prioritized_replay_eps)
                self.replay_buffer.update_priorities(batch_idxes, weights)

            # Decay epsilon
            self.update_epsilon()

            cum_reward += reward
            curr_state = nextstate

            if steps == 100:
                cum_reward = cum_reward / float(self.log_time)
                train_rewards.append(cum_reward)
                self.train_file.write(str(cum_reward))
                self.train_file.write('\n')
                self.train_file.flush()
                cum_reward = 0.0
                print('Train Reward: %.4f' % (train_rewards[-1]))
                steps = 0

                x = list(range(len(train_rewards)))
                plt.plot(x, train_rewards, '-bo')
                plt.xlabel('Time')
                plt.ylabel('Average Reward')
                plt.title('Training Curve')
                plt.savefig(self.dump_dir + 'Training_Curve_' + self.method +
                            '.png')
                plt.close()

                plot(self.dump_dir + self.method, train_rewards)


#      if test_steps == 500:
#        self.net.set_eval()
#        test_rewards.append(self.test())
#        self.test_file.write(str(test_rewards[-1]))
#        self.test_file.write('\n')
#        self.test_file.flush()
#        self.net.set_train()
#        count = count + 1
#        print('\nTest Reward: %.4f\n' % (test_rewards[-1]))
#        test_steps = 0
#
#        x = list(range(len(test_rewards)))
#        plt.plot(x, test_rewards, '-bo')
#        plt.xlabel('Time')
#        plt.ylabel('Average Reward')
#        plt.title('Testing Curve')
#        plt.savefig(self.dump_dir + 'Testing_Curve_' + self.method + '.png')
#        plt.close()

            # NOTE: count is only incremented inside the commented-out test
            # block above, so this periodic weight save never fires as written.
            if count > 0 and count % 30 == 0:
                self.net.save_model_weights(count, self.dump_dir)