def play_deterministic(self, n_tot):

        self.model.eval()
        env = Env()
        render = args.render

        n_human = 60
        humans_trajectories = iter(self.data)
        reverse_excitation_index = consts.reverse_excitation_index

        for i in range(n_tot):

            env.reset()

            observation = next(humans_trajectories)
            print("Observation %s" % observation)
            trajectory = self.data[observation]

            j = 0

            ims = []
            # fig = plt.figure()
            while not env.t:

                if j < n_human:
                    a = trajectory[j, self.meta['action']]

                else:

                    # im = plt.imshow(np.rollaxis(env.s.numpy().squeeze(0)[:3], 0, 3), animated=True)
                    # ims.append([im])
                    if self.cuda:
                        s = Variable(env.s.cuda(), requires_grad=False)
                    else:
                        s = Variable(env.s, requires_grad=False)
                    _, _, beta, _, _, _ = self.model(s)

                    beta = beta.squeeze(0)
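                    # keep only strong excitations (|beta| > 0.5), preserve their sign,
                    # and map the resulting pattern to a discrete action index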
                    beta = beta.sign().int() * (beta.abs() > 0.5).int()
                    a = reverse_excitation_index[tuple(beta.data)]

                env.step(a)

                j += 1

            # if render:
            #     ani = animation.ArtistAnimation(fig, ims, interval=10, blit=True,
            #                                     repeat=False)
            #     plt.show()

            yield env.score
Example #2
    def play_episode_deterministic(self, n_tot):
        self.model.eval()
        env = Env()

        n_human = 300
        humans_trajectories = iter(self.data)
        reverse_excitation_index = consts.reverse_excitation_index

        for i in range(n_tot):

            env.reset()
            observation = next(humans_trajectories)
            trajectory = self.data[observation]

            j = 0

            while not env.t:

                s = Variable(env.s.cuda(), requires_grad=False)
                v, q, beta, r, p, phi = self.model(s)
                beta = beta.squeeze(0)

                if j < n_human:
                    a = trajectory[j, self.meta['action']]

                else:

                    beta_index = (beta.sign().int() *
                                  (beta.abs() > 0.5).int()).data.cpu().numpy()
                    beta_index[0] = abs(beta_index[0])
                    a = reverse_excitation_index[tuple(beta_index.data)]

                env.step(a)

                # x = phi.squeeze(0).data.cpu().numpy()
                # print(np.mean(abs(x)))
                # yield v, q, beta, r, p, s
                yield {
                    'o': env.s.cpu().numpy(),
                    'v': v.data.cpu().numpy(),
                    's': phi.data.cpu().numpy(),
                    'score': env.score,
                    'beta': beta.data.cpu().numpy(),
                    'phi': phi.squeeze(0).data.cpu().numpy()
                }

                j += 1

        raise StopIteration
Example #3
def cartpole():
    env = Env('localhost:32822')
    env.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.box.shape[0]
    action_space = env.action_space.discrete.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    while True:
        run += 1
        state = env.reset()
        # print(state)
        state = np.reshape(state, [1, observation_space])
        step = 0
        while True:
            step += 1
            # env.render()
            print("acting on state: ", state)
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                score_logger.add_score(step, run)
                plt.plot(dqn_solver.loss)
                plt.title('Model loss')
                plt.ylabel('Loss')
                plt.xlabel('Episode')
                plt.savefig("loss.png")
                break
            dqn_solver.experience_replay()
Example #4
def experiment1_test(
    output_folder,
    word_vectors,
    agent,
    episode_index,
    testset_path='./dataset/conll2003/en/eng.testb',
):
    # initialize environment
    env = Env(testset_path, word_vectors)
    step = 0
    s = env.reset()
    print('[' + util.now_time() + "] start testing...")
    while True:
        # check task is ended
        if env.end():
            print('[' + util.now_time() + "] testing...done")
            result_file = '%03d_episode_test.txt' % (episode_index + 1)
            env.save_all_newlines_to_file(output_folder, result_file)
            return evaluate.conlleval(output_folder, result_file)

        # Choose Action a
        a = agent.choose_action(s)

        # Execute action
        s_, r = env.step(a)

        # Next status
        step += 1
        s = s_
Example #5
    def test(self, num_actions):
        self.saver.restore(self.session, FLAGS.checkpoint_path)
        print "Restored model weights from ", FLAGS.checkpoint_path
        monitor_env = gym.make(FLAGS.game)
        monitor_env.monitor.start("/tmp/" + FLAGS.game, force=True)
        env = Env(monitor_env, FLAGS.width, FLAGS.height, FLAGS.history_length, FLAGS.game_type)
        
        for i_episode in xrange(FLAGS.num_eval_episodes):
            state = env.get_initial_state()
            episode_reward = 0
            done = False
            
            # create state sequence
            state_sequence = np.zeros((t_max, FLAGS.history_length, FLAGS.width, FLAGS.height))
            state_sequence[t_max - 1, :, :, :] = state
            
            while not done:
                monitor_env.render()
                q_values = self.q_values.eval(session = self.session, feed_dict = {self.state : [state_sequence]})
                action_index = np.argmax(q_values)
                new_state, reward, done = env.step(action_index)
                state = new_state

                # update state sequence
                state_sequence = np.delete(state_sequence, 0, 0)
                state_sequence = np.insert(state_sequence, t_max-1, state, 0)
                episode_reward += reward
            print "Finished episode " + str(i_episode + 1) + " with score " + str(episode_reward)
        
        monitor_env.monitor.close()
Example #6
    def play_deterministic(self, n_tot):

        self.model.eval()
        env = Env()
        render = args.render

        n_human = 60
        humans_trajectories = iter(self.data)
        reverse_excitation_index = consts.reverse_excitation_index

        for i in range(n_tot):

            env.reset()

            observation = next(humans_trajectories)
            print("Observation %s" % observation)
            trajectory = self.data[observation]

            j = 0

            while not env.t:

                if j < n_human:
                    a = trajectory[j, self.meta['action']]

                else:

                    if self.cuda:
                        s = Variable(env.s.cuda(), requires_grad=False)
                    else:
                        s = Variable(env.s, requires_grad=False)
                    _, _, beta, _, _, _ = self.model(s)

                    beta = beta.squeeze(0)
                    beta = (beta.sign().int() * (beta.abs() > 0.5).int()).data
                    if self.cuda:
                        beta = beta.cpu().numpy()
                    else:
                        beta = beta.numpy()
                    beta[0] = abs(beta[0])
                    a = reverse_excitation_index[tuple(beta)]

                env.step(a)

                j += 1

            yield {'o': env.s.cpu().numpy(), 'score': env.score}
Example #7
def experiment1_train(
    output_folder,
    word_vectors,
    n_episodes=300,
    trainset_path='./dataset/conll2003/en/eng.train',
):
    # initialize environment
    print('[' + util.now_time() + "] init environment...")
    env = Env(trainset_path, word_vectors)
    print('[' + util.now_time() + "] environment initialized")

    # initialize DQN
    print('[' + util.now_time() + "] init agent...")
    agent = DQN(n_actions=env.n_actions,
                status_dim=env.status_dim,
                action_dim=env.action_dim,
                reward_dim=env.reward_dim)
    print('[' + util.now_time() + "] agent initialized")

    # iterate over episodes
    for i in range(n_episodes):
        print('[' + util.now_time() + "] start episode %03d of learning..." %
              (i + 1))
        step = 0
        s = env.reset()

        while True:
            # check task is ended
            if env.end():
                print('[' + util.now_time() +
                      "] episode %03d of learning...done" % (i + 1))
                result_file = '%03d_episode_train.txt' % (i + 1)
                env.save_all_newlines_to_file(output_folder, result_file)
                train_eval = evaluate.conlleval(output_folder, result_file)
                test_eval = experiment1_test(output_folder, word_vectors,
                                             agent, i)
                break

            # Choose Action a
            a = agent.choose_action(s)

            # Execute action
            # print('step %d' % step)
            s_, r = env.step(a)

            agent.store_transition(s, a, r, s_)

            step += 1
            s = s_

            if step > 200 and step % 5 == 0:
                agent.learn()

    # plot and compare train and test set TODO
    # plot(train_evals,test_evals)
    agent.eval_network.save(output_folder + os.path.sep + 'ex1_eval_model',
                            overwrite=True)
Example #8
    def test(self, env):

        # initialize environment
        env = Env(env, 84, 84, 4)

        terminal = False
        # Get initial game observation
        state = env.get_initial_state()

        # episode's reward and cost
        episode_reward = 0

        for _ in range(100):
            while not terminal:

                # forward pass of network. Get probability of all actions
                probs, v = self.sess.run((self.policy, self.state_value),
                                         feed_dict={self.input_state: [state]})

                probs = probs[0]
                v = v[0][0]

                if random.random() < 0.01:
                    action_index = random.choice([0, 1, 2, 3])
                else:
                    action_index = np.argmax(probs)

                # Gym executes action in game environment on behalf of actor-learner
                new_state, reward, terminal = env.step(action_index)
                env.env.render()
                # clip reward to -1, 1
                # Update the state and global counters
                state = new_state
                # update episode's counter
                episode_reward += reward

            if terminal:

                terminal = False
                print "THREAD:", self.thread_id, "/ TIME", T, "/ REWARD", \
                    episode_reward, "/ COST"
                episode_reward = 0
                counter = 0
                # Get initial game observation
                state = env.get_initial_state()
Example #9
    def test(self, num_actions):
        self.saver.restore(self.session, FLAGS.checkpoint_path)
        print "Restored model weights from ", FLAGS.checkpoint_path
        monitor_env = gym.make(FLAGS.game)
        monitor_env.monitor.start("/tmp/" + FLAGS.game, force=True)
        env = Env(monitor_env, FLAGS.width, FLAGS.height, FLAGS.history_length, FLAGS.game_type)

        for i_episode in xrange(FLAGS.num_eval_episodes):
            state = env.get_initial_state()
            episode_reward = 0
            done = False
            while not done:
                monitor_env.render()
                probs = self.session.run(self.policy_values, feed_dict={self.state: [state]})[0]
                action_index = sample_policy_action(num_actions, probs)
                new_state, reward, done = env.step(action_index)
                state = new_state
                episode_reward += reward
            print "Finished episode " + str(i_episode + 1) + " with score " + str(episode_reward)
        
        monitor_env.monitor.close()
Example #10
    def train(self,
              env,
              checkpoint_interval,
              checkpoint_dir,
              saver,
              gamma=0.99):
        global T
        self.saver = saver

        # initialize environment
        time.sleep(3 * self.thread_id)
        env = Env(env, 84, 84, 4)

        print 'Starting thread ' + str(self.thread_id)

        terminal = False
        # Get initial game observation

        state = env.get_initial_state()

        # episode's reward and cost
        episode_reward = 0
        total_cost = 0
        counter = 0

        while T < self.TMAX:

            # lists for feeding placeholders
            states = []
            actions = []
            prev_reward = []
            state_values = []

            t = 0
            t_start = t
            self.sess.run(self.sync_op)
            while not (terminal or ((t - t_start) == self.tmax)):

                # forward pass of network. Get probability of all actions
                probs, v = self.sess.run((self.policy, self.state_value),
                                         feed_dict={self.input_state: [state]})

                probs = probs[0]
                v = v[0][0]
                # print the outputs of the neural network for a sanity check
                if T % 2000 == 0:
                    print probs
                    print v

                # define a one-hot action list: all values are zero except the
                # entry for the executed action
                action_list = np.zeros([self.output_size])

                # choose action based on policy
                action_index = sample_policy_action(probs)

                action_list[action_index] = 1

                # add state and action to list
                actions.append(action_list)
                states.append(state)

                state_values.append(v)

                # Gym executes action in game environment on behalf of actor-learner
                new_state, reward, terminal = env.step(action_index)

                # clip reward to -1, 1
                clipped_reward = np.clip(reward, -1, 1)
                prev_reward.append(clipped_reward)

                # Update the state and global counters
                state = new_state
                T += 1
                t += 1
                counter += 1

                # update episode's counter
                episode_reward += reward

                # Save model progress
                if T % checkpoint_interval < 200:
                    T += 200
                    self.saver.save(self.sess,
                                    checkpoint_dir + "/breakout.ckpt",
                                    global_step=T)

            if terminal:
                R_t = 0
            else:
                R_t = self.sess.run(self.state_value,
                                    feed_dict={self.input_state: [state]})
                R_t = R_t[0][0]

            state_values.append(R_t)
            targets = np.zeros((t - t_start))

            for i in range(t - t_start - 1, -1, -1):
                R_t = prev_reward[i] + gamma * R_t
                targets[i] = R_t

            # compute the advantage based on GAE
            # code from https://github.com/openai/universe-starter-agent
            delta = np.array(prev_reward) + gamma * np.array(
                state_values[1:]) - np.array(state_values[:-1])
            advantage = scipy.signal.lfilter([1], [1, -gamma],
                                             delta[::-1],
                                             axis=0)[::-1]

            # update the global network
            cost, _ = self.sess.run(
                (self.loss, self.opt),
                feed_dict={
                    self.input_state: states,
                    self.actions: actions,
                    self.targets: targets,
                    self.advantage: advantage
                })
            total_cost += cost

            if terminal:

                terminal = False
                print "THREAD:", self.thread_id, "/ TIME", T, "/ REWARD", \
                    episode_reward, "/ COST", total_cost/counter
                episode_reward = 0
                total_cost = 0
                counter = 0

                # Get initial game observation
                state = env.get_initial_state()
Example #11
            score = 0
            step = 0

            state = env.reset()
            state = np.reshape(state, [1, state_size])
            while not done:
                # if episode < agent.initial_train_episodes and agent.load_model == False:              # explore in the first episode
                #     if step % 4 < 2:
                #         action_index = random.randrange(6, 11)
                #     else:
                #         action_index = random.randrange(0, 5)
                # else:
                #     action_index = agent.get_action(state)
                action_index = agent.get_action(state)

                next_state, reward, done, info = env.step(action_index)
                next_state = np.reshape(next_state, [1, state_size])
                step += 1

                rad = math.acos(next_state[0][0])
                print(
                    "episode:{0} || step:{1} || action:{2} || pendulum radian:{3} || reward:{4} || done:{5}"
                    .format(
                        episode,
                        step,
                        action_index,
                        # round(next_state[0][0], 4),
                        rad,
                        round(reward, 2),
                        done))
Example #12
def main():
    rospy.init_node('ddpg_stage_1')
    env = Env(is_training)
    agent = DDPG(env, state_dim, action_dim)
    past_action = np.array([0., 0.])
    print('State Dimensions: ' + str(state_dim))
    print('Action Dimensions: ' + str(action_dim))
    print('Action Max: ' + str(action_linear_max) + ' m/s and ' +
          str(action_angular_max) + ' rad/s')

    if is_training:
        print('Training mode')
        avg_reward_his = []
        total_reward = 0
        var = 1.

        while True:
            state = env.reset()
            one_round_step = 0

            while True:
                a = agent.action(state)
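                # add Gaussian exploration noise (std = var) and clip to the action bounds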
                a[0] = np.clip(np.random.normal(a[0], var), 0., 1.)
                a[1] = np.clip(np.random.normal(a[1], var), -0.5, 0.5)

                state_, r, done, arrive = env.step(a, past_action)
                time_step = agent.perceive(state, a, r, state_, done)

                if arrive:
                    result = 'Success'
                else:
                    result = 'Fail'

                if time_step > 0:
                    total_reward += r

                if time_step % 10000 == 0 and time_step > 0:
                    print(
                        '---------------------------------------------------')
                    avg_reward = total_reward / 10000
                    print('Average_reward = ', avg_reward)
                    avg_reward_his.append(round(avg_reward, 2))
                    print('Average Reward:', avg_reward_his)
                    total_reward = 0

                if time_step % 5 == 0 and time_step > exploration_decay_start_step:
                    var *= 0.9999

                past_action = a
                state = state_
                one_round_step += 1

                if arrive:
                    print('Step: %3i' % one_round_step, '| Var: %.2f' % var,
                          '| Time step: %i' % time_step, '|', result)
                    one_round_step = 0

                if done or one_round_step >= 500:
                    print('Step: %3i' % one_round_step, '| Var: %.2f' % var,
                          '| Time step: %i' % time_step, '|', result)
                    break

    else:
        print('Testing mode')
        while True:
            state = env.reset()
            one_round_step = 0

            while True:
                a = agent.action(state)
                a[0] = np.clip(a[0], 0., 1.)
                a[1] = np.clip(a[1], -0.5, 0.5)
                state_, r, done, arrive = env.step(a, past_action)
                past_action = a
                state = state_
                one_round_step += 1

                if arrive:
                    print('Step: %3i' % one_round_step, '| Arrive!!!')
                    one_round_step = 0

                if done:
                    print('Step: %3i' % one_round_step, '| Collision!!!')
                    break
Example #13
class Training:
    def __init__(self):
        self.n_episode = []
        self.n_epsilon = []
        self.n_dist = []
        self.avg_err = []
        self.logging_data = []

        # Parameters
        self.n_episodes = rospy.get_param("/n_episodes")
        self.n_step = rospy.get_param("/n_steps")
        self.mode_action = rospy.get_param('/mode_action')
        self.mem_size = rospy.get_param('/mem_size')
        self.batch_size = rospy.get_param('/batch_size')
        self.mode_optimize = rospy.get_param('/mode_optimize')
        self.avg_err_fre = rospy.get_param('/avg_err_fre')
        self.save_fre = rospy.get_param("/save_fre")
        self.load_checkpoint = rospy.get_param("/load_checkpoint")

        # create environment
        self.env = Env()
        self.n_states = self.env.observation_space
        self.n_actions = self.env.action_space.n

        # create Deep Q-Network
        self.dqn = DQN(self.n_states, self.n_actions)
        self.memory = ExperienceReplay(self.mem_size)

        # plot
        self.color1 = 'tab:green'
        self.color2 = 'tab:blue'
        self.color3 = 'tab:orange'
        self.color4 = 'tab:red'

        self.style_plot = random.choice(plt.style.available)
        plt.style.use(self.style_plot)
        plt.ion()

        ###########
        # Figure 1 - Rewards
        self.fig1 = plt.figure(1)
        # fig = plt.figure(figsize=(12,5))
        self.ax1 = self.fig1.add_subplot(1, 1, 1)
        self.ax2 = self.ax1.twinx()

        title_1 = 'Rewards - (Mode: Training)'
        self.ax1.set_title(title_1)
        self.ax1.set_xlabel('Episode')
        self.ax1.set_ylabel('Reward', color=self.color1)
        self.ax2.set_ylabel('Epsilon', color=self.color2)
        self.ax1.tick_params(axis='y', labelcolor=self.color1)
        self.ax2.tick_params(axis='y', labelcolor=self.color2)

        ###########
        # Figure 2 - Error
        self.fig2 = plt.figure(2)
        self.ax3 = self.fig2.add_subplot(1, 1, 1)

        title_2 = 'Error Distance - (Mode: Training)'
        self.ax3.set_title(title_2)
        self.ax3.set_xlabel('Episode')
        self.ax3.set_ylabel('Meter')

        self.init_file()

    def moving_average(self, x, w):
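        # simple moving average over a window of w values ('valid' mode drops the edges)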
        return np.convolve(x, np.ones(w), 'valid') / w

    def init_file(self):
        rospack = rospkg.RosPack()
        data_path = rospack.get_path("pioneer_dragging") + "/data"
        username = getpass.getuser()
        # n_folder  = len(os.walk(data_path).__next__()[1])
        n_folder = glob("{}/{}*".format(data_path, username))
        n_folder = len(n_folder) + 1

        if self.load_checkpoint:
            n_folder -= 1

        self.data_path = "{}/{}-{}".format(data_path, username, n_folder)
        if not os.path.exists(self.data_path):
            os.mkdir(self.data_path)

        # config file
        if not self.load_checkpoint:
            config_path = rospack.get_path(
                "pioneer_dragging") + "/config/dragging_params.yaml"
            config_log = '{}/{}-params.yaml'.format(self.data_path, n_folder)
            os.system('cp {} {}'.format(config_path, config_log))

            plot_style = {'plot_style': self.style_plot}
            with open(config_log, 'r') as yamlfile:
                cur_yaml = yaml.safe_load(yamlfile)  # Note the safe_load
                cur_yaml.update(plot_style)

            if cur_yaml:
                with open(config_log, 'w') as yamlfile:
                    yaml.safe_dump(cur_yaml,
                                   yamlfile)  # Also note the safe_dump

        # history file
        self.history_log = '{}/{}-log.txt'.format(self.data_path, n_folder)

        # model file
        self.dqn.file_models = '{}/{}-pytorch-RL.tar'.format(
            self.data_path, n_folder)

        # memory file
        self.memory.file_mem = '{}/{}-memory.data'.format(
            self.data_path, n_folder)

        # figures file
        self.figure1 = '{}/{}-Rewards(Training).png'.format(
            self.data_path, n_folder)
        self.figure2 = '{}/{}-Error(Training).png'.format(
            self.data_path, n_folder)

    def plot_result(self,
                    i_episode,
                    cumulated_reward,
                    epsilon,
                    error_dist,
                    loaded=False):
        ### Figure 1
        # plot bar (cumulated reward)
        self.ax1.bar(i_episode, cumulated_reward, color=self.color1)

        # plot line (epsilon decay )
        if loaded:
            self.ax2.plot(i_episode, epsilon, color=self.color2)

            self.n_episode = i_episode.tolist()
            self.n_epsilon = epsilon.tolist()
            self.n_dist = error_dist.tolist()
        else:
            self.n_episode.append(i_episode)
            self.n_epsilon.append(epsilon)
            self.ax2.plot(self.n_episode, self.n_epsilon, color=self.color2)

            self.n_dist.append(error_dist)

        ### Figure 2
        # plot bar (error distance)
        self.ax3.bar(i_episode, error_dist, color=self.color3)

        # window_err = np.array(self.n_dist)
        # window_err = np.mean(window_err)
        # self.avg_err.append(window_err)
        # self.ax3.plot(self.n_episode, self.avg_err, color=self.color4)

        # plot line (average error distance)
        if len(self.n_dist) % self.avg_err_fre == 0:
            avg_err = self.moving_average(np.array(self.n_dist),
                                          self.avg_err_fre)
            self.ax3.plot(avg_err, color=self.color4)

        plt.draw()
        plt.pause(0.1)

    def run(self):
        start_time = time.time()

        if self.load_checkpoint:
            self.memory.load()
            self.dqn.load_model()

            # history log loaded
            self.logging_data = [
                line.rstrip('\n') for line in open(self.history_log)
            ]

            hist_data = pd.read_csv(self.history_log, sep=",")
            i_episode = hist_data['i_episode']
            cumulated_reward = hist_data['cumulated_reward']
            epsilon = hist_data['epsilon']
            error_dist = hist_data['error_dist']

            self.plot_result(i_episode,
                             cumulated_reward,
                             epsilon,
                             error_dist,
                             loaded=True)
            i_episode = hist_data['i_episode'].iloc[-1] + 1
            self.dqn.epsilon = hist_data['epsilon'].iloc[-1]
            rospy.loginfo('[RL] Loaded checkpoint')
        else:
            i_episode = 0

        #########################################
        ###### Reinforcement Training loop ######
        for i_episode in range(i_episode, self.n_episodes):
            state = self.env.reset(i_episode)
            cumulated_reward = 0

            steps = 0
            step_time = time.time()

            while not rospy.is_shutdown():
                steps += 1
                action, epsilon = self.dqn.select_action(state, i_episode)
                # print('num_steps: {}, epsilon: {}, steps_done: {}'.format(steps, epsilon, dqn.steps_done))

                # action = env.action_space.sample()
                rospy.loginfo('[RL] action: {}'.format(action))

                next_state, reward, done, info = self.env.step(action)
                self.memory.push(state, action, next_state, reward, done)
                cumulated_reward += reward

                ################################
                ######### optimize #############

                if self.mode_optimize == 'normal_dqn':
                    # without experience replay memory
                    self.dqn.optimize(state, action, next_state, reward, done)

                elif self.mode_optimize == 'dqn_replay_memory':
                    # with experience replay memory
                    if len(self.memory) > self.batch_size:
                        state_mem, action_mem, next_state_mem, reward_mem, done_mem = self.memory.sample(
                            self.batch_size)
                        self.dqn.optimize_with_replay_memory(
                            state_mem, action_mem, next_state_mem, reward_mem,
                            done_mem)

                elif self.mode_optimize == 'dqn_taget_net':
                    # with experience target net
                    if len(self.memory) > self.batch_size:
                        state_mem, action_mem, next_state_mem, reward_mem, done_mem = self.memory.sample(
                            self.batch_size)
                        self.dqn.optimize_with_DQN(state_mem, action_mem,
                                                   next_state_mem, reward_mem,
                                                   done_mem)

                elif self.mode_optimize == 'dueling_dqn':
                    # with double DQN
                    if len(self.memory) > self.batch_size:
                        state_mem, action_mem, next_state_mem, reward_mem, done_mem = self.memory.sample(
                            self.batch_size)
                        self.dqn.optimize_with_dueling_DQN(
                            state_mem, action_mem, next_state_mem, reward_mem,
                            done_mem)

                if not done:
                    state = next_state
                else:
                    break

            # DQN update param
            self.dqn.update_param(i_episode)

            # Plotting
            error_dist = self.env.calc_dist()
            self.plot_result(i_episode, cumulated_reward, epsilon, error_dist)

            # Save Checkpoint
            temp_data = "{},{},{},{}".format(i_episode, cumulated_reward,
                                             epsilon, error_dist)
            self.logging_data.append(temp_data)

            if i_episode % self.save_fre == 0:
                rospy.loginfo('[RL] Save checkpoint: {}'.format(i_episode))

                self.dqn.save_model()  # save models
                self.memory.save()  # save replay memory

                # logging file
                with open(self.history_log, 'w') as f:
                    if not self.load_checkpoint:
                        f.write(
                            "i_episode,cumulated_reward,epsilon,error_dist\n")

                    for item in self.logging_data:
                        f.write("%s\n" % item)

                # save figures
                self.fig1.savefig(self.figure1, dpi=self.fig1.dpi)
                self.fig2.savefig(self.figure2, dpi=self.fig2.dpi)
                rospy.loginfo('[RL] Save figure1: {}'.format(self.figure1))
                rospy.loginfo('[RL] Save figure2: {}'.format(self.figure2))

            # Timing
            elapsed_time = time.time() - step_time
            total_time = time.time() - start_time
            print('\n********')
            print("Elapsed time: {}".format(
                time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))
            print("Total time: {}".format(
                time.strftime("%H:%M:%S", time.gmtime(total_time))))

        # Finish Training
        self.env.close()
        print()
        rospy.loginfo('[RL] Exit ...')

        total_time = time.time() - start_time
        print('\n*********************')
        print("Total time: ", time.strftime("%H:%M:%S",
                                            time.gmtime(total_time)))

        rospy.loginfo('[RL] Style plot: {}'.format(self.style_plot))
        plt.show(block=True)
Example #14
        his_hdqn, his_sto = [], []
        his_gtd, his_gcd, his_gac = [], [], []

        state = env.init()
        while True:
            # 1. DQN
            # action = dqn.choose_action(state)   # RL choose action based on state
            # next_state, reward, done, env_data = env.step(state, action)    # RL take action and get next observation and reward
            # dqn.store_transition(state, action, reward, next_state)
            # if dqn.memory_counter > dqn.memory_size and step % 5 == 0:
            #     dqn.learn()

            # HDQN
            goal = hdqn.ac_agent.choose_goal(state)
            action = hdqn.ts_agent.choose_action(state, [goal])
            next_state, reward, done, env_data = env.step(state, action)

            # 2.Stochastic
            a_sto = sto.choose_action()
            r_sto, env_sto = env.step(state, a_sto, is_update=False)
            # 3.Greedy Trans Delay
            a_gtd = greed.choose_action(state, trans_delay=True)
            r_gtd, env_gtd = env.step(state, a_gtd, is_update=False)
            # 4.Greedy Calcul Delay
            a_gcd = greed.choose_action(state, cal_delay=True)
            r_gcd, env_gcd = env.step(state, a_gcd, is_update=False)
            # 5.Greedy Accuracy
            a_gac = greed.choose_action(state, accuracy=True)
            r_gac, env_gac = env.step(state, a_gac, is_update=False)

            # history record
Example #15
    def actor_learner_thread(self, env, thread_id, num_actions):

        # create instance of Doom environment
        env = Env(env, FLAGS.width, FLAGS.height, FLAGS.history_length, FLAGS.game_type)
   
        print 'Starting thread ' + str(thread_id) 
        time.sleep(3*thread_id)
        
        # Get initial game observation
        state = env.get_initial_state()

        # episode's counter
        episode_reward = 0
        counter = 0

        while self.T < self.TMAX:
                    
            done = False
            
            # clear gradients
            states = []
            actions = []
            prev_reward = []

            t = 0
            t_start = t
            
            # synchronize policy and value network
            self.session.run(self.update_policy[thread_id])
            self.session.run(self.update_value[thread_id])
            
            while not (done or ((t - t_start)  == t_max)):
                
                # forward pass of network. Get probability of all actions
                probs = self.session.run(self.local_policy[thread_id], feed_dict={self.local_states[thread_id]: [state]})[0]

                # define a one-hot action list: all values are zero except the
                # entry for the executed action
                action_list = np.zeros([num_actions])

                # choose action based on policy
                action_index = sample_policy_action(num_actions, probs)
                action_list[action_index] = 1

                # add state and action to list
                actions.append(action_list)
                states.append(state)
                
                # Gym executes action in game environment on behalf of actor-learner
                new_state, reward, done = env.step(action_index)

                # clip reward to -1, 1
                clipped_reward = np.clip(reward, -1, 1)
                prev_reward.append(clipped_reward)

                # Update the state and global counters
                state = new_state
                self.T += 1
                t += 1
                counter += 1
                # update episode's counter
                episode_reward += reward
    
    
                # Save model progress
                if counter % FLAGS.checkpoint_interval == 0:
                    if FLAGS.game_type == 'Doom':
                        self.saver.save(self.session, FLAGS.checkpoint_dir+"/" + FLAGS.game.split("/")[1] + ".ckpt" , global_step = counter)
                    else:
                        self.saver.save(self.session, FLAGS.checkpoint_dir+"/" + FLAGS.game + ".ckpt" , global_step = counter)

            if done:
                R_t = 0
            else:
                R_t = self.session.run(self.local_value[thread_id], feed_dict = {self.local_states[thread_id] : [state]})[0][0]

            targets = np.zeros((t - t_start))
                
            # accumulate discounted n-step returns backwards, bootstrapping from R_t
            for i in range(t - t_start - 1, -1, -1):
                R_t = prev_reward[i] + FLAGS.gamma * R_t
                targets[i] = R_t

            #update q value network
            self.session.run(self.grad_update, feed_dict = {self.state: states,
                                                          self.actions: actions,
                                                          self.targets: targets})
                
            if done:
                print "THREAD:", thread_id, "/ TIME", self.T, "/ TIMESTEP", counter, "/ REWARD", episode_reward
                episode_reward = 0
                
                # Get initial game observation
                state = env.get_initial_state()
Example #16
0
from environment import Env
import pandas as pd
import random

data = pd.read_csv("data/target.csv")["Y"]

env = Env(data)

done = False
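# random baseline: pick a uniformly random action each step until the environment reports done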
while (done == False):
    price = env.getState()
    print(price)
    action = random.randint(0, 3)  # 0 == buy, 1 == sell, 2 == do nothing
    _, done = env.step(action)
    env.render()
Example #17
    def play_episode(self, n_tot):

        self.beta_net.eval()
        self.beta_target.eval()

        self.pi_net.eval()
        self.pi_target.eval()

        self.vb_net.eval()
        self.vb_target.eval()

        self.q_net.eval()
        self.q_target.eval()

        self.qb_net.eval()
        self.qb_target.eval()

        env = Env()

        n_human = 120
        humans_trajectories = iter(self.data)
        softmax = torch.nn.Softmax()

        for i in range(n_tot):

            env.reset()
            observation = next(humans_trajectories)
            trajectory = self.data[observation]
            choices = np.arange(self.global_action_space, dtype=np.int)
            mask = Variable(torch.FloatTensor(
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0]),
                            requires_grad=False).cuda()
            j = 0
            temp = 1

            while not env.t:

                s = Variable(env.s.cuda(), requires_grad=False)

                beta, phi = self.beta_net(s)
                pi, _ = self.pi_net(s)
                q, _ = self.q_net(s)
                vb, _ = self.vb_net(s)

                pi = beta.squeeze(0)
                self.greedy = False

                if j < n_human:
                    a = trajectory[j, self.meta['action']]

                else:
                    # eps = np.random.rand()
                    eps = 1
                    # a = np.random.choice(choices)
                    if self.greedy and eps > 0.01:
                        a = pi.data.cpu().numpy()
                        a = np.argmax(a)
                    else:
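                        # sample an action from the temperature-scaled softmax over the policy output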
                        a = softmax(pi / temp).data.cpu().numpy()
                        a = np.random.choice(choices, p=a)

                q = q[0, a]
                q = q.squeeze(0)

                env.step(a)

                yield {
                    'o': env.s.cpu().numpy(),
                    'v': vb.squeeze(0).data.cpu().numpy(),
                    'vb': vb.squeeze(0).data.cpu().numpy(),
                    'qb': q.squeeze(0).data.cpu().numpy(),
                    # 's': x[0, :512].data.cpu().numpy(),
                    'score': env.score,
                    'beta': pi.data.cpu().numpy(),
                    'phi': phi.squeeze(0).data.cpu().numpy(),
                    'q': q.squeeze(0).data.cpu().numpy()
                }

                j += 1

        raise StopIteration
Example #18
def main():
    rospy.init_node('ddpg_stage_1')
    env = Env(is_training)
    agent = DDPG(env, state_dim, action_dim)

    # import ipdb
    # ipdb.set_trace()

    past_action = np.array([0., 0.])
    print('State Dimensions: ' + str(state_dim))
    print('Action Dimensions: ' + str(action_dim))
    print('Action Max: ' + str(action_linear_max) + ' m/s and ' +
          str(action_angular_max) + ' rad/s')
    print('Action Min: ' + str(action_linear_min) + ' m/s and ' +
          str(action_angular_min) + ' rad/s')

    #########################################################################################
    #                                 Training
    #########################################################################################
    if is_training:
        print('Training mode')
        avg_reward_his = []
        total_reward = 0
        action_var = 0.2
        success_rate = 0

        # Log path setting
        now = datetime.datetime.now()
        logdir = now.strftime('%Y-%m-%d') + '_' + now.strftime('%H-%M')
        logdir = os.path.join(log_dir, logdir)
        # tb_writer = SummaryWriter(logdir)

        # Start training
        start_time = time.time()
        for itr in range(10000):
            state = env.reset()

            # episode_reward = 0.0
            # For each episode
            for cur_step in range(max_episode_length):
                action = agent.action(state)
                action[0] = np.clip(np.random.normal(action[0], action_var),
                                    action_linear_min, action_linear_max)
                action[1] = np.clip(np.random.normal(action[1], action_var),
                                    action_angular_min, action_angular_max)

                state_, reward, done, arrive = env.step(action, past_action)
                time_step = agent.perceive(state, action, reward, state_, done)

                ########################################################################################
                #                                   debugging environment
                ########################################################################################
                if is_debugging:
                    print('cur_step: {}'.format(cur_step))
                    print('action: {}'.format(action))
                    print('goal position: x:{}, y:{}'.format(
                        env.goal_position.position.x,
                        env.goal_position.position.y))
                    print('r: {}, done: {}, arrive: {}'.format(
                        reward, done, arrive))
                ########################################################################################

                result = 'Success' if arrive else 'Fail'

                if time_step > 0:
                    total_reward += reward

                if time_step % 10000 == 0 and time_step > 0:
                    print(
                        '---------------------------------------------------')
                    avg_reward = total_reward / 10000
                    print('Average_reward: {}'.format(avg_reward))
                    avg_reward_his.append(round(avg_reward, 2))
                    # writer.add_scalar('avg_reward', avg_reward, time_step)
                    print('Overall average Reward: {}'.format(avg_reward_his))
                    total_reward = 0

                if time_step % 5 == 0 and time_step > exploration_decay_start_step:
                    action_var *= 0.9999

                past_action = action
                state = state_

                if arrive or done or cur_step >= max_episode_length:
                    if result == 'Success':
                        success_rate += 1
                    sec = time.time() - start_time
                    elapsed_time = str(
                        datetime.timedelta(seconds=sec)).split('.')[0]
                    print(
                        'Num_episode: {}, Full steps: {}, Result: {}, Elapsed time: {}'
                        .format(itr, cur_step, result, elapsed_time))

                    if itr % 20 == 0 and itr > 0:
                        print('Total: {}/20, Success rate: {}'.format(
                            success_rate, round(success_rate / 20, 2)))
                        success_rate = 0

                    break


#########################################################################################
#                                 Testing
#########################################################################################
    else:
        print('Testing mode')
        while True:
            state = env.reset()
            one_round_step = 0

            while True:
                a = agent.action(state)
                a[0] = np.clip(a[0], 0., 1.)
                a[1] = np.clip(a[1], -0.5, 0.5)
                state_, reward, done, arrive = env.step(a, past_action)
                past_action = a
                state = state_
                one_round_step += 1

                if arrive:
                    print('Step: %3i' % one_round_step, '| Arrive!!!')
                    one_round_step = 0

                if done:
                    print('Step: %3i' % one_round_step, '| Collision!!!')
                    break
Example #19
def main():
    expert_demo = pickle.load(open('./Ree1_expert.p', "rb"))
    # Ree1 : action 1
    # Ree2 : action 100
    # Ree3 : action 50
    # Ree4 : action 10
    # Ree5 : action 4
    # Ree6 : action 0.5

    # print('expert_demo_shape : ', np.array(expert_demo).shape)
    expert_x = int(expert_demo[1][0])
    expert_y = int(expert_demo[1][1])
    env = Env(expert_x, expert_y)
    # env = Env(0,0)

    # env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = 2
    num_actions = 8
    running_state = ZFilter((num_inputs, ), clip=5)

    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    discrim = Discriminator(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(),
                              lr=args.learning_rate,
                              weight_decay=args.l2_rate)
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)

    # load demonstrations
    # expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))

    demonstrations = np.array(expert_demo[0])

    # print("demonstrations.shape", demonstrations.shape)

    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model',
                                       str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    episodes = 0
    train_discrim_flag = True

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size:
            state = env.reset()
            score = 0

            state = running_state(state)

            for _ in range(1000):
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action2 = np.argmax(get_action(mu, std)[0])
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action2)
                # next_state, reward, done, _ = env.step(action)
                irl_reward = get_reward(discrim, state, action)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, irl_reward, mask])

                next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break

            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes,
                                                       score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), discrim.train()
        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory,
                                                    discrim_optim,
                                                    demonstrations, args)
            print("Expert: %.2f%% | Learner: %.2f%%" %
                  (expert_acc * 100, learner_acc * 100))

            temp_learner.append(learner_acc * 100)
            temp_expert.append(expert_acc * 100)

            if ((expert_acc > args.suspend_accu_exp
                 and learner_acc > args.suspend_accu_gen and iter % 55 == 0)
                    or iter % 50 == 0):
                # train_discrim_flag = False
                plt.plot(temp_learner, label='learner')
                plt.plot(temp_expert, label='expert')
                plt.xlabel('Episode')
                plt.ylabel('Accuracy')
                plt.xticks([])
                plt.legend()
                plt.savefig('accuracy{}.png'.format(iter))
                # plt.show()

                model_path = 'C:/Users/USER/9 GAIL/lets-do-irl/mujoco/gail'
                ckpt_path = os.path.join(model_path,
                                         'ckpt_' + str(score_avg) + '.pth.tar')

                print("check path", ckpt_path)
                save_checkpoint(
                    {
                        'actor': actor.state_dict(),
                        'critic': critic.state_dict(),
                        'discrim': discrim.state_dict(),
                        'z_filter_n': running_state.rs.n,
                        'z_filter_m': running_state.rs.mean,
                        'z_filter_s': running_state.rs.sum_square,
                        'args': args,
                        'score': score_avg
                    },
                    filename=ckpt_path)

        train_actor_critic(actor, critic, memory, actor_optim, critic_optim,
                           args)

        if iter % 100:
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            model_path = 'C:/Users/USER/9 GAIL/lets-do-irl/mujoco/gail'
            ckpt_path = os.path.join(model_path,
                                     'ckpt_' + str(score_avg) + '.pth.tar')

            save_checkpoint(
                {
                    'actor': actor.state_dict(),
                    'critic': critic.state_dict(),
                    'discrim': discrim.state_dict(),
                    'z_filter_n': running_state.rs.n,
                    'z_filter_m': running_state.rs.mean,
                    'z_filter_s': running_state.rs.sum_square,
                    'args': args,
                    'score': score_avg
                },
                filename=ckpt_path)
    plt.plot(temp_learner)
    plt.plot(temp_expert)
    plt.xlabel('Episode')
    plt.ylabel('Accuracy')
    plt.xticks([])
    plt.savefig('accuracy.png')
Example #20
class DDPGStage:
    def __init__(self, model, is_training=False, var=1.):
        self.max_step = 200
        self.exploration_decay_start_step = 50000
        state_dim = 366
        action_dim = 2
        self.action_linear_max = 0.25  # m/s
        self.action_angular_max = 0.5  # rad/s
        rospy.init_node('ddpg_stage_1')
        rospy.on_shutdown(self.clear_vel)
        self.is_training = is_training
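        # use the Gazebo simulation environment when its model_states topic is published,
        # otherwise fall back to the real-world environment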
        if ['/gazebo/model_states', 'gazebo_msgs/ModelStates'] in rospy.get_published_topics():
            self.env = SimEnv(self.is_training)
            print("Gazebo mode")
        else:
            self.env = Env(self.is_training)
            print("Real world mode")

        self.agent = DDPG(model, self.env, state_dim, action_dim)
        self.past_action = np.array([0., 0.])
        print('State Dimensions: ' + str(state_dim))
        print('Action Dimensions: ' + str(action_dim))
        print('Action Max: ' + str(self.action_linear_max) + ' m/s and ' + str(self.action_angular_max) + ' rad/s')

        self.var = var

    def _train(self):
        print('Training mode')
        avg_reward_his = []
        total_reward = 0

        while not rospy.is_shutdown():
            state = self.env.reset()
            one_round_step = 0

            while not rospy.is_shutdown():
                a = self.agent.action(state)
                a[0] = np.clip(np.random.normal(a[0], self.var), 0., 1.)
                a[1] = np.clip(np.random.normal(a[1], self.var), -0.5, 0.5)

                state_, r, collision, arrive = self.env.step(a)
                time_step = self.agent.perceive(state, a, r, state_, collision)

                if time_step > 0:
                    total_reward += r

                if time_step % 10000 == 0 and time_step > 0:
                    print('---------------------------------------------------')
                    avg_reward = total_reward / 10000
                    print('Average_reward = ', avg_reward)
                    avg_reward_his.append(round(avg_reward, 2))
                    print('Average Reward:', avg_reward_his)
                    total_reward = 0

                if time_step % 5 == 0 and time_step > self.exploration_decay_start_step and self.var > 0.1:
                    self.var *= 0.9999

                state = state_
                one_round_step += 1

                plt.title("STEP %d, Reward: %.2f" % (one_round_step, r))
                result = 'Step: %3i | Reward: %.2f | Var: %.2f | Time step: %i |' % (one_round_step, r, self.var, time_step)
                if arrive:
                    print(result, 'Success')
                    one_round_step = 0
                    self.env.common_reset()
                elif collision:
                    print(result, 'Collision')
                    break
                elif one_round_step >= self.max_step:
                    print(result, 'Failed')
                    break

    def _evaluate(self):
        print('Testing mode')
        self.env.goal_range["x"] = [-1, 1]
        self.env.goal_range["y"] = [-1, 1]
        while not rospy.is_shutdown():
            state = self.env.reset()
            one_round_step = 0

            while not rospy.is_shutdown():

                a = self.agent.action(state)
                print("action: %s" % a)
                a[0] = np.clip(a[0], 0., 1.)
                a[1] = np.clip(a[1], -0.5, 0.5)
                state_, r, collision, arrive = self.env.step(a)
                state = state_
                one_round_step += 1

                plt.title("STEP %d, Reward: %.2f" % (one_round_step, r))
                result = 'Step: %3i | Reward: %.2f | Var: %.2f |' % (
                one_round_step, r, self.var)
                if arrive:
                    print(result, 'Success')
                    one_round_step = 0
                    self.env.common_reset()
                    # input()
                elif collision:
                    print(result, 'Collision')
                    break
                elif one_round_step >= self.max_step:
                    print(result, 'Failed')
                    break

    def run(self):
        # try:
        if self.is_training:
            self._train()
        else:
            self._evaluate()
        self.env.pub_cmd_vel.publish(Twist())

    def clear_vel(self):
        self.env.pub_cmd_vel.publish(Twist())
Example #21
    def play_episode(self, n_tot):

        self.model.eval()
        self.model_b.eval()
        env = Env()

        n_human = 120
        humans_trajectories = iter(self.data)
        softmax = torch.nn.Softmax()

        for i in range(n_tot):

            env.reset()
            observation = next(humans_trajectories)
            trajectory = self.data[observation]
            choices = np.arange(self.global_action_space, dtype=int)
            mask = Variable(torch.FloatTensor([0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
                             requires_grad=False).cuda()
            j = 0
            temp = 1

            while not env.t:

                s = Variable(env.s.cuda(), requires_grad=False)

                beta, vb, qb, _, _ = self.model_b(s, self.actions_matrix)
                pi, v, q, adv, x = self.model(s, self.actions_matrix, beta.detach())

                pi = pi.squeeze(0)
                self.greedy = False

                if j < n_human:
                    a = trajectory[j, self.meta['action']]

                else:
                    eps = np.random.rand()
                    # a = np.random.choice(choices)
                    if self.greedy and eps > 0.1:
                        a = pi.data.cpu().numpy()
                        a = np.argmax(a)
                    else:
                        a = softmax(pi/temp).data.cpu().numpy()
                        a = np.random.choice(choices, p=a)

                q = q[0, a, 0]
                q = q.squeeze(0)

                qb = qb[0, a, 0]
                qb = qb.squeeze(0)

                env.step(a)

                yield {'o': env.s.cpu().numpy(),
                       'v': v.squeeze(0).data.cpu().numpy(),
                       'vb': vb.squeeze(0).data.cpu().numpy(),
                       'qb': qb.squeeze(0).data.cpu().numpy(),
                       's': x[0, :512].data.cpu().numpy(),
                       'score': env.score,
                       'beta': pi.data.cpu().numpy(),
                       'phi': x[0, :512].data.cpu().numpy(),
                       'q': q.squeeze(0).data.cpu().numpy()}

                j += 1

        return
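
The action selection above samples from a temperature-scaled softmax over the policy logits, falling back to argmax in the greedy branch. A minimal NumPy sketch of that rule (logits, temp, and eps are hypothetical inputs standing in for the torch tensors):

import numpy as np

def sample_action(logits, temp=1.0, greedy=False, eps=0.1):
    z = logits / temp
    z = z - z.max()                              # subtract max for numerical stability
    p = np.exp(z) / np.exp(z).sum()              # softmax over the policy logits
    if greedy and np.random.rand() > eps:
        return int(np.argmax(p))                 # exploit: greedy action
    return int(np.random.choice(len(p), p=p))    # explore: sample from the distribution

a = sample_action(np.array([0.1, 2.0, -1.0, 0.5]), temp=1.0)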
Example #22
    for i_episode in range(default_config["max_iteration"]):
        attack_mode = random.randint(0, 6)
        state_new = env.reset(attack_mode)
        agent.update_current_channel(state_new)
        done = False


        for t in range(default_config["max_episode_length"]):
            # Get current channel
            x = np.zeros(default_config["max_channel"])
            x[agent.cur_channel] = 1
            # Put into the NN
            action_c = agent.c_policy.select_action(x).cpu().detach().numpy()[0]
            action_s = agent.s_policy.select_action(x).cpu().detach().numpy()[0]
            # print(int(action_c), " ", int(action_s))
            state_new, reward, done, info = env.step(int(action_c), int(action_s))
            agent.update_current_channel(state_new)
            reward_sum += reward

            agent.c_policy.rewards.append(reward)
            agent.s_policy.rewards.append(reward)


            if done:
                # tracking log
                running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
                print('REINFORCE ep %03d done. reward: %f. reward running mean: %f' % (i_episode, reward_sum, running_reward))
                if i_episode % default_config["log_freq"] == 0:
                    with open(filename, 'a') as file_object:
                        file_object.write('REINFORCE ep %d done. reward: %f. reward running mean: %f\n' % (i_episode, reward_sum, running_reward))
                        file_object.close()
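
The running-mean bookkeeping above is an exponential moving average of the episode return (99% old estimate, 1% new return). A tiny sketch of the update with made-up values:

def update_running_reward(running_reward, reward_sum, alpha=0.01):
    # exponential moving average of the episode return
    if running_reward is None:
        return reward_sum
    return (1 - alpha) * running_reward + alpha * reward_sum

running = None
for episode_return in [10.0, 12.0, 8.0]:
    running = update_running_reward(running, episode_return)
print(running)  # 10.0 -> 10.02 -> 9.9998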
Example #23
    def actor_learner_thread(self, env, thread_id, num_actions):

        # create instance of Doom environment
        env = Env(env, FLAGS.width, FLAGS.height, FLAGS.history_length, FLAGS.game_type)

        # Initialize network gradients
        states = []
        actions = []
        targets = []

        initial_epsilon = 1
        epsilon = 1
        final_epsilon = self.sample_final_epsilon()
        print('Starting thread ' + str(thread_id) + ' with final epsilon ' + str(final_epsilon))

        time.sleep(3 * thread_id)
        t = 0

        while self.T < self.TMAX:

            # Get initial game observation
            state = env.get_initial_state()
            done = False

            # episode's counter
            episode_reward = 0
            mean_q = 0
            frames = 0

            while not done:
                # forward pass of network. Get Q(s,a)
                q_values = self.q_values.eval(session=self.session, feed_dict={self.state: [state]})

                # define a list of actions. All values are zeros except the
                # value of the action that is executed
                action_list = np.zeros([num_actions])

                action_index = 0

                # choose an action based on the current policy
                if random.random() <= epsilon:
                    action_index = random.randrange(num_actions)
                else:
                    action_index = np.argmax(q_values)
                action_list[action_index] = 1

                # reduce epsilon
                if epsilon > final_epsilon:
                    epsilon -= (initial_epsilon - final_epsilon) / FLAGS.anneal_epsilon_timesteps

                # decrease learning rate
                if self.lr > 0:
                    self.lr -= FLAGS.learning_rate / self.TMAX

                # Gym executes the action in the game environment on behalf of the actor-learner
                new_state, reward, done = env.step(action_index)

                # forward pass of target network. Get Q(s',a)
                target_q_values = self.target_q_values.eval(session=self.session,
                                                            feed_dict={self.new_state: [new_state]})

                # clip reward to -1, 1
                clipped_reward = np.clip(reward, -1, 1)

                # compute targets based on Q-learning update rule
                # targets = r + gamma*max(Q(s',a))
                if done:
                    targets.append(clipped_reward)
                else:
                    targets.append(clipped_reward + FLAGS.gamma * np.max(target_q_values))

                actions.append(action_list)
                states.append(state)

                # Update the state and global counters
                state = new_state
                self.T += 1
                t += 1

                # update episode's counter
                frames += 1
                episode_reward += reward
                mean_q += np.max(q_values)

                # update_target_network
                if self.T % FLAGS.target_network_update_frequency == 0:
                    self.session.run(self.update_target)

                # train online network
                if t % FLAGS.network_update_frequency == 0 or done:
                    if states:
                        self.session.run(self.grad_update, feed_dict={self.state: states,
                                                                      self.actions: actions,
                                                                      self.targets: targets,
                                                                      self.learning_rate: self.lr})
                    # Clear gradients
                    states = []
                    actions = []
                    targets = []

                # Save model progress
                if t % FLAGS.checkpoint_interval == 0:
                    if FLAGS.game_type == 'Doom':
                        self.saver.save(self.session, FLAGS.checkpoint_dir + "/" + FLAGS.game.split("/")[1] + ".ckpt",
                                        global_step=t)
                    else:
                        self.saver.save(self.session, FLAGS.checkpoint_dir + "/" + FLAGS.game + ".ckpt", global_step=t)

                # Print end of episode stats
                if done:
                    print("THREAD:", thread_id, "/ TIME", self.T, "/ TIMESTEP", t, "/ EPSILON", epsilon, "/ REWARD",
                          episode_reward, "/ Q_MAX %.4f" % (mean_q / float(frames)), "/ EPSILON PROGRESS",
                          t / float(FLAGS.anneal_epsilon_timesteps))
                    break
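
The target construction described in the comments above (clipped reward plus a discounted bootstrap from the target network, dropped at terminal states) can be isolated into a small helper; gamma and the sample inputs below are hypothetical stand-ins for the FLAGS values:

import numpy as np

def q_learning_target(reward, done, target_q_values, gamma=0.99):
    # target = r + gamma * max_a' Q_target(s', a'), with the reward clipped to [-1, 1]
    clipped_reward = float(np.clip(reward, -1, 1))
    if done:
        return clipped_reward                    # terminal state: no bootstrap
    return clipped_reward + gamma * float(np.max(target_q_values))

print(q_learning_target(2.0, False, np.array([0.3, 1.2, -0.4])))  # 1.0 + 0.99 * 1.2 = 2.188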
Example #24
N = 20
env = Env(dt=np.pi / N)

RL = PolicyGradient(
    n_actions=env.n_actions,
    n_features=env.n_states,
    learning_rate=0.002,
    reward_decay=0.99,
)

fid_10 = 0
ep_max = 500
for episode in range(ep_max):

    observation = env.reset()

    for ii in range(N):

        action = RL.choose_action(observation)
        observation_, reward, done, fid = env.step(action)

        RL.store_transition(observation, action, reward)
        observation = observation_
        if done:
            if episode >= ep_max - 11:
                fid_10 = max(fid_10, fid)
            break

    RL.learn()

print('Final_fidelity=', fid_10)
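
RL.learn() above is where a REINFORCE-style agent with reward_decay=0.99 typically turns the stored per-step rewards into discounted, normalized returns before the gradient step. A generic sketch of that computation (this is not the PolicyGradient class's actual code, just the standard recipe it presumably follows):

import numpy as np

def discounted_returns(rewards, gamma=0.99):
    # G_t = r_t + gamma * G_{t+1}, computed backwards over one episode
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    # normalizing the returns is a common variance-reduction step
    return (returns - returns.mean()) / (returns.std() + 1e-8)

print(discounted_returns([0.0, 0.0, 1.0]))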
Example #25
    def play_episode(self, n_tot):

        self.model.eval()
        env = Env()

        n_human = 120
        humans_trajectories = iter(self.data)
        softmax = torch.nn.Softmax()

        # mask = torch.FloatTensor(consts.actions_mask[args.game])
        # mask = Variable(mask.cuda(), requires_grad=False)

        vsx = torch.FloatTensor(consts.short_bins[args.game])
        vlx = torch.FloatTensor(consts.long_bins[args.game])

        for i in range(n_tot):

            env.reset()
            observation = next(humans_trajectories)
            trajectory = self.data[observation]
            choices = np.arange(self.global_action_space, dtype=int)

            j = 0

            while not env.t:

                s = Variable(env.s.cuda(), requires_grad=False)
                vs, vl, beta, qs, ql, phi, pi_s, pi_l, pi_s_tau, pi_l_tau = self.model(
                    s, self.actions_matrix)
                beta = beta.squeeze(0)
                pi_l = pi_l.squeeze(0)
                pi_s = pi_s.squeeze(0)
                pi_l_tau = pi_l_tau.squeeze(0)
                pi_s_tau = pi_s_tau.squeeze(0)

                temp = 1

                # consider only 3 most frequent actions
                beta_np = beta.data.cpu().numpy()
                indices = np.argsort(beta_np)

                maskb = Variable(torch.FloatTensor(
                    [0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
                                 requires_grad=False).cuda()
                # maskb = Variable(torch.FloatTensor([0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
                #                  requires_grad=False).cuda()

                # pi = maskb * (beta / beta.max())

                pi = beta
                self.greedy = False

                beta_prob = pi

                if j < n_human:
                    a = trajectory[j, self.meta['action']]

                else:
                    eps = np.random.rand()
                    # a = np.random.choice(choices)
                    if self.greedy and eps > 0.1:
                        a = pi.data.cpu().numpy()
                        a = np.argmax(a)
                    else:
                        a = softmax(pi / temp).data.cpu().numpy()
                        a = np.random.choice(choices, p=a)

                env.step(a)

                vs = softmax(vs)
                vl = softmax(vl)
                vs = torch.sum(vsx * vs.data.cpu())
                vl = torch.sum(vlx * vl.data.cpu())

                yield {
                    'o': env.s.cpu().numpy(),
                    'vs': np.array([vs]),
                    'vl': np.array([vl]),
                    's': phi.data.cpu().numpy(),
                    'score': env.score,
                    'beta': beta_prob.data.cpu().numpy(),
                    'phi': phi.squeeze(0).data.cpu().numpy(),
                    'qs': qs.squeeze(0).data.cpu().numpy(),
                    'ql': ql.squeeze(0).data.cpu().numpy(),
                }

                j += 1

        return
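
The short- and long-horizon value heads above (vs, vl) are categorical over fixed bins (consts.short_bins / consts.long_bins); the scalar value is recovered as the expectation of the bin values under the softmax. A minimal sketch with made-up bins and logits:

import torch

def expected_value(logits, bin_values):
    probs = torch.softmax(logits, dim=-1)   # categorical distribution over the bins
    return torch.sum(bin_values * probs)    # expectation of the bin values

bins = torch.tensor([-1.0, 0.0, 1.0, 2.0])  # hypothetical bin centers
print(expected_value(torch.tensor([0.1, 0.2, 1.5, 0.3]), bins).item())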
Example #26
    global_step = 0
    scores, episodes = [], []

    for e in range(EPISODES):
        done = False
        score = 0
        # fresh env
        state = env.reset()
        state = np.reshape(state, [1, 15])

        while not done:
            global_step += 1
            # get action for the current state and go one step in environment
            action = agent.get_action(state)
            next_state, reward, done = env.step(action)
            next_state = np.reshape(next_state, [1, 15])

            agent.append_sample(state, action, reward)
            score += reward
            state = copy.deepcopy(next_state)

            if done:
                # update policy neural network for each episode
                agent.train_model()
                scores.append(score)
                episodes.append(e)
                score = round(score, 2)
                print("episode:", e, "  score:", score, "  time_step:",
                      global_step)
Example #27
for i in range(num_episodes):
    print("Episode {} of {}".format(i + 1, num_episodes))
    eps *= decay_factor
    r_sum = 0
    done = False
    diag_action = 0
    diag_reward = 0
    state = env.reset((i, num_episodes))
    while not done:
        env.reset((i, num_episodes))
        rand = np.random.random()
        if rand < eps:
            action = np.random.randint(0, 2)
        else:
            action = np.argmax(model.predict(np.identity(10)[state:state + 1]))
        new_s, r, done, _ = env.step(action=action, num=(i, num_episodes))
        target = r + y * np.max(model.predict(
            np.identity(10)[new_s:new_s + 1]))
        target_vec = model.predict(np.identity(10)[state:state + 1])[0]
        target_vec[action] = target
        model.fit(np.identity(10)[state:state + 1],
                  target_vec.reshape(-1, 2),
                  epochs=1,
                  verbose=0)
        state = new_s
        r_sum += r
        print('Action: {}, Reward: {}'.format(action, r))
        file.write('Action: {}, Reward: {}\n'.format(action, round(r, 2)))
        diag_action += action
        diag_reward += r
    r_avg_list.append(r_sum)
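
The loop above drives a small Keras Q-network with one-hot state inputs; each fit keeps the current predictions and overwrites only the entry for the taken action with the TD target. A sketch of that bookkeeping with a dummy predictor standing in for model.predict (the sizes and the discount GAMMA are assumptions, mirroring y in the original):

import numpy as np

N_STATES, N_ACTIONS, GAMMA = 10, 2, 0.95   # hypothetical sizes and discount

def one_hot(state, n=N_STATES):
    # the loop above feeds the network one-hot state vectors via np.identity(n)[s:s+1]
    return np.identity(n)[state:state + 1]

def td_target_vector(q_predict, state, action, reward, new_state):
    # keep current predictions, overwrite only the taken action with the TD target
    target = reward + GAMMA * np.max(q_predict(one_hot(new_state)))
    target_vec = q_predict(one_hot(state))[0].copy()
    target_vec[action] = target
    return target_vec.reshape(-1, N_ACTIONS)

dummy_predict = lambda x: np.zeros((x.shape[0], N_ACTIONS))
print(td_target_vector(dummy_predict, state=3, action=1, reward=1.0, new_state=4))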
Example #28
                        #     add.append(i)


                        action = []
                        for dim in range(2):
                            action.append(int(env.cars_posit[dic_state[2][num][dim][3]]))
                        dic_action[2].append(action)

            draw_action = [0 for l in range(len(env.cars_posit))]
            for x in dic_state:
                for num in range(len(dic_state[x])):
                    for dim in range(len(dic_state[x][num])):
                        draw_action[dic_state[x][num][dim][3]] = dic_action[x][num][dim]
            draw.piant(env.cars_posit,env.road_range,ax1,env.frame_slot,draw_action)

            dic_state_, dic_reward = env.step(dic_action, tools)
            print(dic_reward)

            for x in dic_reward:
                for num in range(len(dic_reward[x])):
                    for dim in range(x):
                        suss += dic_reward[x][num][dim]
                        total += env.beam_slot
            print('Success rate', suss / total)

            dic_state = dic_state_

            success += suss
            totally += total
            zongzhou.append(success/totally)
Example #29
    for i_episode in range(default_config["max_iteration"]):
        attack_mode = random.randint(0, 6)
        state_new = env.reset(attack_mode)
        agent.update_current_channel(state_new)
        done = False


        for t in range(default_config["max_episode_length"]):
            # Get current channel
            x = np.zeros(default_config["max_channel"])
            x[agent.cur_channel] = 1
            # Put into the NN
            action_c = agent.c_policy.select_action(x).cpu().detach().numpy()[0]
            action_s = agent.s_policy.select_action(x).cpu().detach().numpy()[0]
            state_new, reward, done, info = env.step(action_c, action_s)
            agent.update_current_channel(state_new)
            reward_sum += reward

            agent.c_policy.rewards.append(reward)
            agent.s_policy.rewards.append(reward)


            if done:
                # tracking log
                running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
                print('REINFORCE POLICY GRADIENT WITH BASELINE ep %03d done. reward: %f. reward running mean: %f' % (i_episode, reward_sum, running_reward))
                if i_episode % default_config["log_freq"] == 0:
                    with open(filename, 'a') as file_object:
                        file_object.write('REINFORCE POLICY GRADIENT WITH BASELINE ep %d done. reward: %f. reward running mean: %f\n' % (i_episode, reward_sum, running_reward))
                        file_object.close()
Example #30
    for e in range(EPISODES):
        done = False
        score = 0
        state = env.reset()
        state = np.reshape(state, [1, 20])

        while not done:
            # fresh env
            if agent.render:
                env.render()
            global_step += 1

            # get action for the current state and go one step in environment
            action = agent.get_action(state)
            next_state, reward, done = env.step(action)
            next_state = np.reshape(next_state, [1, 20])

            agent.replay_memory(state, action, reward, next_state, done)
            # every time step we do training
            agent.train_replay()
            score += reward

            state = copy.deepcopy(next_state)

            # every 100 time steps update the target model to be same with model
            if global_step % 100 == 0:
                agent.update_target_model()

            if done:
                scores.append(score)
Example #31
    def play(self, n_tot, action_offset, player):

        self.beta_net.eval()
        self.beta_target.eval()

        self.pi_net.eval()
        self.pi_target.eval()

        self.vb_net.eval()
        self.vb_target.eval()

        self.q_net.eval()
        self.q_target.eval()

        self.qb_net.eval()
        self.qb_target.eval()

        env = Env(action_offset)

        n_human = 90

        episodes = list(self.data.keys())
        random.shuffle(episodes)
        humans_trajectories = iter(episodes)

        for i in range(n_tot):

            env.reset()
            trajectory = self.data[next(humans_trajectories)]
            choices = np.arange(self.global_action_space, dtype=int)
            random_choices = self.mask_q.data.cpu().numpy()
            random_choices = random_choices / random_choices.sum()

            j = 0

            while not env.t:

                s = Variable(env.s.cuda(), requires_grad=False)

                if player == 'beta':
                    pi, _ = self.beta_net(s)
                    pi = pi.squeeze(0)
                    self.greedy = False

                elif player == 'q_b':
                    pi, _ = self.qb_net(s)
                    pi = pi.squeeze(0)
                    self.greedy = True

                elif player == 'pi':
                    pi, _ = self.pi_net(s)
                    pi = pi.squeeze(0)
                    self.greedy = False

                elif player == 'q_pi':
                    pi, _ = self.q_net(s)
                    pi = pi.squeeze(0)
                    self.greedy = True

                else:
                    raise NotImplementedError

                if j < n_human:
                    a = trajectory[j, self.meta['action']]

                else:
                    eps = np.random.rand()
                    # eps = 1
                    # a = np.random.choice(choices)
                    if self.greedy:
                        if eps > 0.01:
                            a = (pi * self.mask_q).data.cpu().numpy()
                            a = np.argmax(a)
                        else:
                            a = np.random.choice(choices, p=random_choices)
                    else:
                        a = F.softmax(pi + self.mask_beta,
                                      dim=0).data.cpu().numpy()
                        a = np.random.choice(choices, p=a)

                env.step(a)

                j += 1

            yield {'score': env.score, 'frames': j}

        return
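
In play above, the greedy players pick argmax(pi * self.mask_q) while the stochastic players sample from softmax(pi + self.mask_beta). The masks are defined elsewhere in the class; the sketch below assumes a multiplicative 0/1 mask and an additive log-mask that suppresses invalid actions, with made-up values:

import numpy as np
import torch
import torch.nn.functional as F

pi = torch.tensor([0.3, 1.2, -0.7, 0.1])        # hypothetical action scores
mask_q = torch.tensor([1.0, 1.0, 0.0, 1.0])     # 1 for valid actions, 0 for invalid
mask_beta = torch.log(mask_q)                   # 0 for valid actions, -inf for invalid

greedy_a = int(torch.argmax(pi * mask_q))               # greedy, multiplicatively masked
probs = F.softmax(pi + mask_beta, dim=0).numpy()        # stochastic, additively masked
sampled_a = int(np.random.choice(len(probs), p=probs))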