Example #1
def run(**kwargs):
    '''
    Setup TF, gym environment, etc.
    '''

    iterations=kwargs['iterations']
    discount=kwargs['discount']
    batch_size=kwargs['batch_size']
    num_batches=kwargs['num_batches']
    max_seq_length=kwargs['max_seq_length']
    learning_rate=kwargs['learning_rate']
    animate=kwargs['animate']
    logdir=kwargs['logdir']
    seed=kwargs['seed']
    games_played_per_epoch=kwargs['games_played_per_epoch']
    load_model = False
    mcts_iterations=kwargs['mcts_iterations']
    batches_per_epoch=kwargs['batches_per_epoch']
    headless=kwargs['headless']
    update_freq=kwargs['update_freq']
    buffer_size=kwargs['buffer_size']

    if headless:
        import matplotlib
        matplotlib.use('Agg')  # assumed intent: select a non-interactive backend for headless runs

    ################################################################
    # SEEDS
    ################################################################
    tf.set_random_seed(seed)
    np.random.seed(seed)

    
    ################################################################
    # SETUP GYM + RL ALGO
    ################################################################
    env = gym.make('snake-v0') # Make the gym environment
    maximum_number_of_steps = max_seq_length  # maximum episode length (alternatively env.max_episode_steps)
   

    ################################################################
    # TF BOILERPLATE
    ################################################################

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)

    summary_writers = []
    for idx in np.arange(env.n_actors):
        summary_writers.append(tf.summary.FileWriter(os.path.join(logdir,'tensorboard','snake_%s' % idx) ))

    summary_writers.append(tf.summary.FileWriter(os.path.join(logdir,'tensorboard','training_stats') ))    

    def rgb2gray(rgb):
        return np.dot(rgb[...,:3], [0.299, 0.587, 0.114])

    with tf.Session(config=tf_config) as sess:
        network = DQN( 
                     sess,
                     create_basic([16,16,64], transpose=True),
                     [1,env.world.screen_width,env.world.screen_height], 
                     summary_writers[-1],
                     n_actions=4, 
                     batch_size=batch_size,
                     gamma=.99,
                     update_freq=update_freq,
                     ddqn=True, # double dqn
                     buffer_size = buffer_size,
                     clip_grad = None,
                     batches_per_epoch = batches_per_epoch,
                     is_sparse = False
                     )

        monitor = Monitor(os.path.join(logdir,'gifs'))
        epsilon_schedule = LinearSchedule(iterations * 9 // 10, 1.0, 0.01)
        learning_rate_schedule = PiecewiseSchedule([(0,1e-3),(20000,5e-4),(50000,1e-4)], outside_value=1e-4)

        saver = tf.train.Saver(max_to_keep=2)
        # summary_writer = tf.summary.FileWriter(logdir) 

        ## Load model from where you left off
        ## Does not play nice w/ plots in tensorboard at the moment
        ## TODO: FIX
        if load_model:
            try:
                print('Loading model...')
                ckpt = tf.train.get_checkpoint_state(logdir)
                saver.restore(sess, ckpt.model_checkpoint_path)
                iteration_offset = int(ckpt.model_checkpoint_path.split('-')[-1].split('.')[0])
            except Exception:
                print('Failed to load. Starting from scratch')
                sess.run(tf.global_variables_initializer())
                sess.run(tf.local_variables_initializer())
                iteration_offset = 0
        else:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            iteration_offset = 0

        summary_writers[0].add_graph(sess.graph)

        ################################################################
        # Fill Buffer
        ################################################################

        tic = time.time()
        total_timesteps = 0

        while not network.buffer.full(N=buffer_size // 2):
            network.buffer.games_played += 1
            print('Game number: %s. Buffer size: %s' % (network.buffer.games_played, network.buffer.buffer_size))
            _ = env.reset()
            obs = env.render('rgb_array', headless = headless).astype(float)
            obs /= obs.max()
            obs = rgb2gray(obs)

            done_n = np.array([False]*env.n_actors)
            steps = 0
            while not done_n.all():
                last_obs = obs
                acts = network.greedy_select([[last_obs]], 1.) 
                acts = [str(x) for x in acts]
      
                # Next step
                _, reward_n, done_n = env.step(acts[-1])
                obs = env.render('rgb_array', headless = headless).astype(float)
                obs /= obs.max()
                obs = rgb2gray(obs)

                steps += 1

                network.store(np.array([[last_obs]]), # state
                                  np.array(acts), # action
                                  np.array(reward_n), #rewards
                                  np.array([[obs]]), #new state
                                  np.array(done_n) #done
                                  )

                if steps > maximum_number_of_steps:
                    done_n[:] = True

        print('Filled buffer')

        ################################################################
        # Train Loop
        ################################################################
        network.buffer.soft_reset()
        total_number_of_steps_in_iteration = 0

        for iteration in range(iteration_offset, iteration_offset + iterations):
            print('{0} Iteration {1} {0}'.format('*'*10, iteration))
            timesteps_in_iteration = 0

            if (iteration % update_freq == 0):
                saver.save(sess,os.path.join(logdir,'model-'+str(iteration)+'.cptk'))
                print('Saved model. Iteration: %s' % iteration)

            total_reward = np.array([0]*env.n_actors)

            while True:
                network.buffer.games_played += 1
                if network.buffer.games_played % 10 == 0:
                    print('Epoch: %s. Game number: %s' % (iteration, network.buffer.games_played))
                _ = env.reset()
                rgb = obs = env.render('rgb_array', headless = headless).astype(float)
                obs /= obs.max()
                obs = rgb2gray(obs)

                animate_episode = (iteration % (update_freq) == 0) and animate

                done_n = np.array([False]*env.n_actors)
                steps = 0
                
                # Runs policy, collects observations and rewards
                viewer = None

                while not done_n.all():

                    if animate_episode:
                        if (not viewer) and (not headless):
                            from gym.envs.classic_control import rendering
                            viewer = rendering.SimpleImageViewer()

                        rgb = env.render('rgb_array', headless = headless)
                        scaler = 10
                        rgb=repeat_upsample(rgb,scaler,scaler)

                        if not headless:
                            
                            viewer.imshow(rgb)
                            time.sleep(.01)

                        monitor.add(rgb, iteration, network.buffer.games_played)

                    
                    # ob = get_data(np.array(raw_observations)[-2:])
                    last_obs = obs

                    # Control the exploration
                    acts = network.greedy_select([[last_obs]], epsilon_schedule.value(network.epoch)) # epsilon greedy

                    acts = [str(x) for x in acts]
          
                    # Next step
                    _, reward_n, done_n = env.step(acts[-1])
                    obs = env.render('rgb_array', headless = headless).astype(float)
                    obs /= obs.max()
                    obs = rgb2gray(obs)

                    total_reward += np.array(reward_n)

                    if total_number_of_steps_in_iteration % 4 == 0:
                        network.train_step(learning_rate_schedule)
                    
                    total_number_of_steps_in_iteration += 1
                    steps += 1

                    network.store(np.array([[last_obs]]), # state
                                  np.array(acts), # action
                                  np.array(reward_n), #rewards
                                  np.array([[obs]]), #new state
                                  np.array(done_n) #done
                                  )

                    # terminate the collection of data if the controller shows stability
                    # for a long time. This is a good thing.
                    if steps > maximum_number_of_steps:
                        done_n[:] = True

                if viewer:
                    viewer.close()

                if network.buffer.games_played >= 1:
                    break

            monitor.make_gifs(iteration)
            
            
            for count, writer in enumerate(summary_writers):
                if count < (len(summary_writers) - 1):
                    summary = tf.Summary()
                    summary.value.add(tag='Average Reward', simple_value=(total_reward[count]))
                    summary.value.add(tag='Steps Taken', simple_value=(steps))
                    writer.add_summary(summary, iteration)
                writer.flush()
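
The run() function above is driven entirely by keyword arguments. For orientation, a hypothetical invocation is sketched below; the argument names are taken from the kwargs lookups at the top of run(), but every value is an illustrative placeholder rather than a setting used by the original author.

run(iterations=50000,
    discount=0.99,
    batch_size=32,
    num_batches=4,
    max_seq_length=500,
    learning_rate=1e-3,
    animate=False,
    logdir='./logs/snake',
    seed=0,
    games_played_per_epoch=1,
    mcts_iterations=0,
    batches_per_epoch=100,
    headless=True,
    update_freq=1000,
    buffer_size=100000)
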
Example #2
class DQNLinearAgent(BaseAgent):
    def __init__(self, game, sess, nb_actions, global_step):
        BaseAgent.__init__(self, game, sess, nb_actions, global_step)
        self.name = "DQN_linear_agent"
        self.model_path = os.path.join(FLAGS.checkpoint_dir, FLAGS.algorithm)
        self.nb_action = nb_actions
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.episode_max_values = []
        self.episode_min_values = []
        self.episode_mean_returns = []
        self.episode_max_returns = []
        self.episode_min_returns = []
        self.exploration = LinearSchedule(FLAGS.explore_steps, FLAGS.final_random_action_prob,
                                          FLAGS.initial_random_action_prob)
        self.summary_writer = tf.summary.FileWriter(os.path.join(FLAGS.summaries_dir, FLAGS.algorithm))
        self.summary = tf.Summary()
        self.nb_states = game.nb_states
        self.q_net = DQLinearNetwork(nb_actions, self.nb_states, 'orig')
        self.target_net = DQLinearNetwork(nb_actions, self.nb_states, 'target')

        self.targetOps = self.update_target_graph('orig', 'target')

        self.probability_of_random_action = self.exploration.value(0)

    def train(self):
        minibatch = random.sample(self.episode_buffer, FLAGS.batch_size)
        rollout = np.array(minibatch)
        observations = rollout[:, 0]
        actions = rollout[:, 1]
        rewards = rollout[:, 2]
        next_observations = rollout[:, 3]
        done = rollout[:, 4]

        state_features = np.identity(self.nb_states)

        target_action_values_evaled = self.sess.run(self.target_net.action_values,
                                                    feed_dict={self.target_net.inputs: state_features[next_observations]})
        target_action_values_evaled_max = np.max(target_action_values_evaled, axis=1)

        target_action_values_evaled_new = []

        for i in range(FLAGS.batch_size):
            if done[i]:
                target_action_values_evaled_new.append(rewards[i])
            else:
                target_action_values_evaled_new.append(
                    rewards[i] + FLAGS.gamma * target_action_values_evaled_max[i])

        feed_dict = {self.q_net.target_q: target_action_values_evaled_new,
                     self.q_net.inputs: state_features[observations],
                     self.q_net.actions: actions}
        l, _, ms, returns = self.sess.run(
            [self.q_net.action_value_loss,
             self.q_net.train_op,
             self.q_net.merged_summary,
             self.q_net.action_values],
            feed_dict=feed_dict)

        # self.updateTarget()

        return l / len(rollout), ms, returns

    def updateTarget(self):
        for op in self.targetOps:
            self.sess.run(op)

    def eval(self, saver):
        self.saver = saver
        total_steps = 0
        episode_rewards = []

        print("Starting eval agent")
        with self.sess.as_default(), self.graph.as_default():
            while total_steps < FLAGS.test_episodes:
                episode_reward = 0
                episode_step_count = 0
                d = False
                s = self.env.get_initial_state()

                while not d:
                    a = self.policy_evaluation_eval(s)

                    s1, r, d, info = self.env.step(a)

                    r = np.clip(r, -1, 1)
                    episode_reward += r
                    episode_step_count += 1

                    s = s1
                print("Episode reward was {}".format(episode_reward))
                episode_rewards.append(episode_reward)
                total_steps += 1
        print("Mean reward is {}".format(np.mean(np.asarray(episode_rewards))))

    def play(self, saver):
        self.saver = saver
        train_stats = None

        # self.episode_count = self.sess.run(self.global_episode)
        self.total_steps = self.sess.run(self.global_step)
        if self.total_steps == 0:
            self.updateTarget()


        print("Starting agent")
        _t = {'episode': Timer(), "step": Timer()}
        with self.sess.as_default(), self.graph.as_default():
            while self.total_steps < FLAGS.max_total_steps:
                _t["episode"].tic()
                if self.total_steps % FLAGS.target_update_freq == 0:
                    self.updateTarget()
                episode_reward = 0
                episode_step_count = 0
                q_values = []

                d = False
                # self.probability_of_random_action = self.exploration.value(self.total_steps)
                s = self.env.get_initial_state()

                while not d:
                    _t["step"].tic()
                    a, max_action_values_evaled = self.policy_evaluation(s)

                    if max_action_values_evaled is not None:
                        q_values.append(max_action_values_evaled)

                    s1, r, d = self.env.step(a)

                    r = np.clip(r, -1, 1)
                    episode_reward += r
                    episode_step_count += 1
                    self.total_steps += 1
                    self.episode_buffer.append([s, a, r, s1, d])

                    s = s1

                    if len(self.episode_buffer) == FLAGS.memory_size:
                        self.episode_buffer.popleft()

                    if self.total_steps > FLAGS.observation_steps and len(
                            self.episode_buffer) > FLAGS.observation_steps and self.total_steps % FLAGS.update_freq == 0:
                        l, ms, returns = self.train()
                        train_stats = l, ms, returns

                    _t["step"].toc()

                    self.sess.run(self.increment_global_step)

                    if episode_step_count == 23:
                        d = True


                self.add_summary(episode_reward, episode_step_count, q_values, train_stats)


                _t["episode"].toc()

        print('Avg time per step is {:.3f}'.format(_t["step"].average_time()))
        print('Avg time per episode is {:.3f}'.format(_t["episode"].average_time()))

        # fps = self.total_steps / _t['Total'].duration
        # print('Average time per episod is {}'.format(_t['episode'].average_time))

    def add_summary(self, episode_reward, episode_step_count, q_values, train_stats):
        self.episode_rewards.append(episode_reward)
        self.episode_lengths.append(episode_step_count)
        if len(q_values):
            self.episode_mean_values.append(np.mean(np.asarray(q_values)))
            self.episode_max_values.append(np.max(np.asarray(q_values)))
            self.episode_min_values.append(np.min(np.asarray(q_values)))

        if self.total_steps % FLAGS.summary_interval == 0 and self.total_steps != 0 and self.total_steps > FLAGS.observation_steps:
            if self.total_steps % FLAGS.checkpoint_interval == 0:
                self.save_model(self.saver, self.total_steps)

            l, ms, returns = train_stats

            self.episode_mean_returns.append(np.mean(np.asarray(returns)))
            self.episode_max_returns.append(np.max(np.asarray(returns)))
            self.episode_min_returns.append(np.min(np.asarray(returns)))

            mean_reward = np.mean(self.episode_rewards[-FLAGS.summary_interval:])
            mean_length = np.mean(self.episode_lengths[-FLAGS.summary_interval:])
            mean_value = np.mean(self.episode_mean_values[-FLAGS.summary_interval:])
            max_value = np.mean(self.episode_max_values[-FLAGS.summary_interval:])
            min_value = np.mean(self.episode_min_values[-FLAGS.summary_interval:])

            mean_return = np.mean(self.episode_mean_returns[-FLAGS.summary_interval:])
            max_return = np.mean(self.episode_max_returns[-FLAGS.summary_interval:])
            min_return = np.mean(self.episode_min_returns[-FLAGS.summary_interval:])
            # if episode_count % FLAGS.test_performance_interval == 0:
            #     won_games = self.episode_rewards[-FLAGS.test_performance_interval:].count(1)
            #     self.summary.value.add(tag='Perf/Won Games/1000', simple_value=float(won_games))


            self.summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward))
            self.summary.value.add(tag='Perf/Length', simple_value=float(mean_length))
            self.summary.value.add(tag='Perf/Value_Mean', simple_value=float(mean_value))
            self.summary.value.add(tag='Perf/Value_Max', simple_value=float(max_value))
            self.summary.value.add(tag='Perf/Value_Min', simple_value=float(min_value))
            self.summary.value.add(tag='Perf/Return_Mean', simple_value=float(mean_return))
            self.summary.value.add(tag='Perf/Return_Max', simple_value=float(max_return))
            self.summary.value.add(tag='Perf/Return_Min', simple_value=float(min_return))
            self.summary.value.add(tag='Perf/Probability_random_action', simple_value=float(self.probability_of_random_action))
            self.summary.value.add(tag='Losses/Loss', simple_value=float(l))

            self.write_summary(ms, None)

    def policy_evaluation(self, s):
        action_values_evaled = None
        self.probability_of_random_action = self.exploration.value(self.total_steps)
        if random.random() <= self.probability_of_random_action:
            a = np.random.choice(range(self.nb_actions))
        else:
            state_features = np.identity(self.nb_states)
            feed_dict = {self.q_net.inputs: state_features[s:s+1]}
            action_values_evaled = self.sess.run(self.q_net.action_values, feed_dict=feed_dict)[0]

            a = np.argmax(action_values_evaled)

        if action_values_evaled is None:
            return a, None
        return a, np.max(action_values_evaled)

    def policy_evaluation_eval(self, s):
        feed_dict = {self.q_net.inputs: [s]}
        action_values_evaled = self.sess.run(self.q_net.action_values, feed_dict=feed_dict)[0]

        a = np.argmax(action_values_evaled)

        return a
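
The agents in this collection touch LinearSchedule only through its constructor and its value(t) method; the class itself is not included here. Below is a minimal sketch of an implementation consistent with how it is called (linear annealing from initial_p to final_p over schedule_timesteps, then holding final_p). This is an assumption about the helper, not the original code.

class LinearSchedule(object):
    """Minimal sketch (assumed): linear interpolation from initial_p to final_p."""

    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # Fraction of the schedule completed, clipped to [0, 1].
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)
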
Example #3
class SFAgent(BaseAgent):
    def __init__(self, game, sess, nb_actions, global_step):
        BaseAgent.__init__(self, game, sess, nb_actions, global_step)
        self.name = "SF_agent"
        self.model_path = os.path.join(FLAGS.checkpoint_dir, FLAGS.algorithm)

        self.nb_states = self.env.nb_states
        if FLAGS.matrix_type == "incidence":
            self.sf_buffer = np.zeros(
                [self.nb_states * self.nb_states, self.nb_states])
        else:
            self.sf_buffer = np.zeros([self.nb_states, self.nb_states])
        self.seen_states = set()
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.episode_max_values = []
        self.episode_min_values = []
        self.episode_mean_returns = []
        self.episode_max_returns = []
        self.episode_min_returns = []
        self.exploration = LinearSchedule(FLAGS.explore_steps,
                                          FLAGS.final_random_action_prob,
                                          FLAGS.initial_random_action_prob)
        self.summary_writer = tf.summary.FileWriter(
            os.path.join(FLAGS.summaries_dir, FLAGS.algorithm))
        self.summary = tf.Summary()

        self.sf_table = np.zeros([self.nb_states, self.nb_states])

        # self.q_net = SFNetwork(self.nb_actions, self.nb_states, 'orig')
        # self.target_net = SFNetwork(self.nb_actions, self.nb_states, 'target')
        #
        # self.targetOps = self.update_target_graph('orig', 'target')
        #
        self.probability_of_random_action = self.exploration.value(0)

    def train(self):
        minibatch = random.sample(self.episode_buffer, FLAGS.batch_size)
        rollout = np.array(minibatch)
        observations = rollout[:, 0]
        actions = rollout[:, 1]
        rewards = rollout[:, 2]
        next_observations = rollout[:, 3]
        done = rollout[:, 4]

        state_features = np.identity(self.nb_states)

        target_sf_evaled = self.sess.run(self.target_net.sf,
                                         feed_dict={
                                             self.target_net.features:
                                             state_features[next_observations]
                                         })
        # target_sf_evaled_exp = np.mean(target_sf_evaled, axis=1)

        # gamma = np.tile(np.expand_dims(np.asarray(np.logical_not(done), dtype=np.int32) * FLAGS.gamma, 1),
        #                 [1, self.nb_states])
        #
        # target_sf_evaled_new = state_features[next_observations] + gamma * target_sf_evaled_exp
        #
        feed_dict = {
            self.q_net.target_sf:
            target_sf_evaled,
            # self.q_net.target_reward: np.stack(rewards, axis=0),
            self.q_net.features:
            state_features[observations]
        }
        # self.q_net.actions: actions}
        sf_l, _, ms = self.sess.run(
            [
                self.q_net.sf_loss,
                # self.q_net.reward_loss,
                # self.q_net.total_loss,
                self.q_net.train_op,
                self.q_net.merged_summary
            ],
            feed_dict=feed_dict)

        # self.updateTarget()

        return sf_l / len(rollout), ms

    def updateTarget(self):
        for op in self.targetOps:
            self.sess.run(op)

    def eval(self, saver):
        self.saver = saver
        total_steps = 0
        episode_rewards = []

        print("Starting eval agent")
        with self.sess.as_default(), self.graph.as_default():
            while total_steps < FLAGS.test_episodes:
                episode_reward = 0
                episode_step_count = 0
                d = False
                s = self.env.get_initial_state()

                while not d:
                    a = self.policy_evaluation_eval(s)

                    s1, r, d, info = self.env.step(a)

                    r = np.clip(r, -1, 1)
                    episode_reward += r
                    episode_step_count += 1

                    s = s1
                print("Episode reward was {}".format(episode_reward))
                episode_rewards.append(episode_reward)
                total_steps += 1
        print("Mean reward is {}".format(np.mean(np.asarray(episode_rewards))))

    def play(self, saver):
        self.saver = saver
        train_stats = None
        d = True
        # self.episode_count = self.sess.run(self.global_episode)
        self.total_steps = self.sess.run(self.global_step)
        # if self.total_steps == 0:
        #     self.updateTarget()
        self.nb_episodes = 0
        state_features = np.identity(self.nb_states)
        episode_reward = 0
        episode_step_count = 0
        q_values = []
        td_error = 0
        print("Starting agent")
        _t = {'episode': Timer(), "step": Timer()}
        with self.sess.as_default(), self.graph.as_default():
            while self.total_steps < FLAGS.max_total_steps:

                if self.total_steps == 0 or d or episode_step_count % 30 == 0:
                    _t["episode"].tic()
                    # if self.total_steps % FLAGS.target_update_freq == 0:
                    #     self.updateTarget()
                    self.add_summary(episode_reward, episode_step_count,
                                     q_values, train_stats)
                    episode_reward = 0
                    episode_step_count = 0
                    q_values = []
                    if self.total_steps != 0:
                        self.nb_episodes += 1
                    d = False
                    # self.probability_of_random_action = self.exploration.value(self.total_steps)
                    s = self.env.get_initial_state()

                _t["step"].tic()
                a, max_action_values_evaled = self.policy_evaluation(s)

                # if max_action_values_evaled is None:
                #     q_values.append(0)
                # else:
                #     q_values.append(max_action_values_evaled)

                s1, r, d = self.env.step(a)
                self.env.render()

                r = np.clip(r, -1, 1)
                episode_reward += r
                episode_step_count += 1
                self.total_steps += 1
                td_error = (state_features[s] +
                            FLAGS.gamma * self.sf_table[s1]) - self.sf_table[s]
                q_values.append(td_error)
                # print(sum(td_error))
                # self.episode_buffer.append([s, a, r, s1, d])
                self.sf_table[s] = self.sf_table[s] + FLAGS.lr * td_error

                s = s1

                # if len(self.episode_buffer) == FLAGS.memory_size:
                #     self.episode_buffer.popleft()

                # if self.total_steps > FLAGS.observation_steps and len(
                #         self.episode_buffer) > FLAGS.observation_steps and self.total_steps % FLAGS.update_freq == 0:# and FLAGS.task != "discover":
                #     sf_l, ms = self.train()
                #     train_stats = sf_l, ms

                if self.total_steps > FLAGS.nb_steps_sf:
                    s, v = self.discover_options()
                    # self.sf_buffer.popleft()

                # if self.total_steps > FLAGS.nb_steps_sf:

                # if FLAGS.matrix_type == "incidence":
                #     self.construct_incidence_matrix()
                # else:
                #     self.construct_successive_matrix()
                # self.add_successive_feature(s, a)

                _t["step"].toc()

                self.sess.run(self.increment_global_step)

            _t["episode"].toc()
            # print('Avg time per step is {:.3f}'.format(_t["step"].average_time()))
            # print('Avg time per episode is {:.3f}'.format(_t["episode"].average_time()))

            # fps = self.total_steps / _t['Total'].duration
            # print('Average time per episod is {}'.format(_t['episode'].average_time))

    def construct_successive_matrix(self):
        for s in range(self.nb_states):
            state_features = np.identity(self.nb_states)
            sf_feat = self.sess.run(
                self.q_net.sf,
                feed_dict={self.q_net.features: state_features[s:s + 1]})
            # a = np.random.choice(range(self.nb_actions))
            # a_one_hot = np.zeros(shape=(1, self.nb_actions, self.nb_states), dtype=np.int32)
            # a_one_hot[0, a] = 1
            # sf_feat_a = np.sum(np.multiply(sf_feat, a_one_hot), axis=1)
            self.sf_buffer[s] = sf_feat
            if s not in self.seen_states:
                self.seen_states.add(s)

    def construct_incidence_matrix(self):
        i = 0
        for s in range(self.nb_states):
            for s1 in range(self.nb_states):
                state_features = np.identity(self.nb_states)

                sf_feat = self.sess.run(
                    self.q_net.sf,
                    feed_dict={self.q_net.features: state_features[s:s + 1]})
                sf_feat1 = self.sess.run(
                    self.q_net.sf,
                    feed_dict={self.q_net.features: state_features[s1:s1 + 1]})

                trans = state_features[s1:s1 + 1] - state_features[s:s + 1]

                self.sf_buffer[i] = trans
                i += 1
            if s not in self.seen_states:
                self.seen_states.add(s)

    # def add_successive_feature(self, s, a):
    #     state_features = np.identity(self.nb_states)
    #     sf_feat = self.sess.run(self.q_net.sf,
    #                                      feed_dict={self.q_net.features: state_features[s:s+1]})
    #     a_one_hot = np.zeros(shape=(1, self.nb_actions, self.nb_states), dtype=np.int32)
    #     a_one_hot[0, a] = 1
    #     sf_feat_a = np.sum(np.multiply(sf_feat, a_one_hot), axis=1)
    #     if s not in self.seen_states:
    #         self.seen_states.add(s)
    #     self.sf_buffer[s] = sf_feat_a

    def discover_options(self):
        sf_matrix = tf.convert_to_tensor(np.squeeze(np.array(self.sf_table)),
                                         dtype=tf.float32)
        s, u, v = tf.svd(sf_matrix)

        # discard noise, get first 10
        # s = s[:10]
        # v = v[:10]

        if FLAGS.task == "discover":
            # Plotting all the basis
            plot = Visualizer(self.env)
            s_evaled, v_evaled = self.sess.run([s, v])
            idx = s_evaled.argsort()[::-1]
            s_evaled = s_evaled[idx]
            v_evaled = v_evaled[:, idx]
            plot.plotBasisFunctions(s_evaled, v_evaled)

            guard = len(s_evaled)
            epsilon = 0
            options = []
            actionSetPerOption = []
            for i in range(guard):
                idx = guard - i - 1
                print('Solving for eigenvector #' + str(idx))
                polIter = PolicyIteration(0.9, self.env, augmentActionSet=True)
                self.env.define_reward_function(v_evaled[:, idx])
                V, pi = polIter.solvePolicyIteration()

                # Now I will eliminate any actions that give us only a small improvement.
                # This is where the epsilon parameter is important: if it is not set,
                # this step effectively never triggers, since epsilon defaults to a very small value.
                for j in range(len(V)):
                    if V[j] < epsilon:
                        pi[j] = len(self.env.get_action_set())

                # if plotGraphs:
                plot.plotValueFunction(V[0:self.nb_states], str(idx) + '_')
                plot.plotPolicy(pi[0:self.nb_states], str(idx) + '_')

                options.append(pi[0:self.nb_states])
                optionsActionSet = self.env.get_action_set()
                optionsActionSet = np.append(optionsActionSet, ['terminate'])
                actionSetPerOption.append(optionsActionSet)

        exit(0)
        return s, v

    def add_summary(self, episode_reward, episode_step_count, q_values,
                    train_stats):
        self.episode_rewards.append(episode_reward)
        self.episode_lengths.append(episode_step_count)
        if len(q_values):
            self.episode_mean_values.append(np.mean(np.asarray(q_values)))
            self.episode_max_values.append(np.max(np.asarray(q_values)))
            self.episode_min_values.append(np.min(np.asarray(q_values)))

        if self.nb_episodes % FLAGS.summary_interval == 0 and self.nb_episodes != 0 and self.total_steps > FLAGS.observation_steps:
            if self.nb_episodes % FLAGS.checkpoint_interval == 0:
                self.save_model(self.saver, self.total_steps)

            mean_reward = np.mean(
                self.episode_rewards[-FLAGS.summary_interval:])
            mean_length = np.mean(
                self.episode_lengths[-FLAGS.summary_interval:])
            mean_value = np.mean(
                self.episode_mean_values[-FLAGS.summary_interval:])
            max_value = np.mean(
                self.episode_max_values[-FLAGS.summary_interval:])
            min_value = np.mean(
                self.episode_min_values[-FLAGS.summary_interval:])

            self.summary.value.add(tag='Perf/Reward',
                                   simple_value=float(mean_reward))
            self.summary.value.add(tag='Perf/Length',
                                   simple_value=float(mean_length))
            self.summary.value.add(tag='Perf/Value_Mean',
                                   simple_value=float(mean_value))
            self.summary.value.add(tag='Perf/Value_Max',
                                   simple_value=float(max_value))
            self.summary.value.add(tag='Perf/Value_Min',
                                   simple_value=float(min_value))
            self.summary.value.add(tag='Perf/Probability_random_action',
                                   simple_value=float(
                                       self.probability_of_random_action))
            # if train_stats is not None:
            #     sf_l, ms = train_stats
            # self.summary.value.add(tag='Losses/SF_Loss', simple_value=float(sf_l))
            # self.summary.value.add(tag='Losses/R_Loss', simple_value=float(r_l))
            # self.summary.value.add(tag='Losses/T_Loss', simple_value=float(t_l))

            self.write_summary(None)

    def policy_evaluation(self, s):
        action_values_evaled = None
        self.probability_of_random_action = self.exploration.value(
            self.total_steps)
        # if random.random() <= self.probability_of_random_action:
        a = np.random.choice(range(self.nb_actions))
        # else:
        #     state_features = np.identity(self.nb_states)
        #     feed_dict = {self.q_net.features: state_features[s:s+1]}
        #     action_values_evaled = self.sess.run(self.q_net.q, feed_dict=feed_dict)[0]
        #
        #     a = np.argmax(action_values_evaled)

        return a, 0

    def policy_evaluation_eval(self, s):
        feed_dict = {self.q_net.inputs: [s]}
        action_values_evaled = self.sess.run(self.q_net.action_values,
                                             feed_dict=feed_dict)[0]

        a = np.argmax(action_values_evaled)

        return a
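
The tabular successor-feature update inside play() above reduces to a few lines of NumPy. The sketch below isolates that TD rule on a toy problem; the number of states, discount, learning rate, and transition are made-up illustrative values, not the agent's FLAGS.

import numpy as np

nb_states, gamma, lr = 4, 0.95, 0.1          # illustrative values only
sf_table = np.zeros((nb_states, nb_states))  # psi(s): expected discounted future state occupancies
phi = np.identity(nb_states)                 # one-hot state features

s, s1 = 1, 2                                 # one observed transition s -> s1
td_error = (phi[s] + gamma * sf_table[s1]) - sf_table[s]
sf_table[s] = sf_table[s] + lr * td_error
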
Example #4
def main(_):
    # create visualizer
    #visualizer = TensorboardVisualizer()
    monitor = Monitor(FLAGS)
    #log_dir = monitor.log_dir
    #visualizer.initialize(log_dir, None)
    saved_mean_reward = None
    # openAI logger
    L.configure(monitor.log_dir, format_strs=['stdout', 'csv'])

    # initialize env
    atari_env = AtariEnv(monitor)
    #screen_shot_subgoal(atari_env)

    # we should probably follow deepmind style env
    # stack 4 frames and scale float
    env = wrapper.wrap_deepmind(atari_env, frame_stack=True, scale=True)

    # get default tf_session
    sess = U.get_session()

    # create q networks for controller
    controller_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    controller_network = Q_network(env.observation_space, env.action_space.n, controller_optimizer, scope='controller')
    controller = Controller(controller_network, env.action_space.n)

    # create q networks for meta-controller
    num_goals = env.unwrapped.goals_space.n
    metacontroller_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    metacontroller_network = Q_network(env.observation_space, num_goals, metacontroller_optimizer, scope='metacontroller')
    metacontroller = MetaController(metacontroller_network, num_goals)
    # Create the schedule for exploration starting from 1.
    exploration2 = LinearSchedule(schedule_timesteps=int(EXPLORATION_FRACTION * monitor.num_timesteps),
                                 initial_p=1.0,
                                 final_p=EXPLORATION_FINAL_EPS)
    # initialize experience replay
    controller_replay_buffer = ReplayBuffer(D1_MEMORY_SIZE)
    metacontroller_replay_buffer = ReplayBuffer(D2_MEMORY_SIZE)
    
    # initialize critic
    critic = Critic(env.unwrapped)

    total_extrinsic_reward = []
    # for success rate
    total_goal_reached = np.zeros(num_goals, dtype=np.int32) 
    total_goal_sampled = np.zeros(num_goals, dtype=np.int32)
    total_goal_epsilon = np.ones(num_goals, dtype=np.float32)
    ep = 0
    total_step = 0
    init_ob = env.reset()

    U.initialize()
    # initialize target network in both controller and meta
    sess.run(metacontroller.network.update_target_op)
    sess.run(controller.network.update_target_op)

    # load checkpoint if present
    model_path = tf.train.latest_checkpoint(monitor.ckpt_dir)
    model_saved = False
    model_file = os.path.join(monitor.ckpt_dir, 'model')
    if model_path is not None:
        U.load_variables(model_file)
        L.log('loaded model from %s' % model_file)
        model_saved = True

    while ep < MAX_EPISODE:  # iterate over episodes
        # init environment game play variables
        
        init_ob = env.reset()
        observation = np.reshape(init_ob['observation'], (1, )+init_ob['observation'].shape)
        desired_goal = metacontroller.sample_act(sess, observation, update_eps=1.0)[0]
        env.unwrapped.desired_goal = desired_goal
        total_goal_sampled[desired_goal] += 1

        # given predicted goal, we encode this goal bounding mask to the observation np array
        ob_with_g = env.unwrapped._add_goal_mask(init_ob['observation'], desired_goal)

        # NOTE: the code below verifies the goal mask was added correctly
        # for i in range(ob_with_g.shape[-1]):
        #     ob = ob_with_g[:,:,i]
        #     image = Image.fromarray(ob)
        #     image = image.convert('RGB')
        #     image.save('test_%i.png' % i)

        done = False
        reached_goal = False

        while not done:
            extrinsic_rewards = 0
            s0 = init_ob['observation']

            while not (done or reached_goal):
                update_eps1_with_respect_to_g = get_epsilon(total_goal_epsilon, total_goal_reached, total_goal_sampled, desired_goal, total_step, EXPLORATION_WARM_UP)
                ob_with_g_reshaped = np.reshape(ob_with_g, (1, )+ob_with_g.shape)
                primitive_action_t = controller.sample_act(sess, ob_with_g_reshaped, update_eps=update_eps1_with_respect_to_g)[0]
                # obtain extrinsic reward from environment
                ob_tp1, extrinsic_reward_t, done_t, info = env.step(primitive_action_t)
                reached_goal = env.unwrapped.reached_goal(desired_goal)
                ob_with_g_tp1 = env.unwrapped._add_goal_mask(ob_tp1['observation'], desired_goal)
                
                intrinsic_reward_t = critic.criticize(desired_goal, reached_goal, primitive_action_t, done_t)
                controller_replay_buffer.add(ob_with_g, primitive_action_t, intrinsic_reward_t, ob_with_g_tp1, done_t)
                
                # sample from replay_buffer1 to train controller
                obs_with_g_t, primitive_actions_t, intrinsic_rewards_t, obs_with_g_tp1, dones_t = controller_replay_buffer.sample(TRAIN_BATCH_SIZE)
                weights, batch_idxes = np.ones_like(intrinsic_rewards_t), None
                # get q estimate for tp1 as 'supervised'
                ob_with_g_tp1_reshaped = np.reshape(ob_with_g_tp1, (1, )+ob_with_g.shape)
                q_tp1 = controller.get_q(sess, ob_with_g_tp1_reshaped)[0]
                td_error = controller.train(sess, obs_with_g_t, primitive_actions_t, intrinsic_rewards_t, obs_with_g_tp1, dones_t, weights, q_tp1)
                # joint training: after warm-up, also sample from replay_buffer2 to train the meta-controller
                if total_step >= WARMUP_STEPS:
                    L.log('joint training has started ----- step %d' % total_step)
                    # sample from replay_buffer2 to train meta-controller
                    init_obs, goals_t, extrinsic_rewards_t, obs_terminate_in_g, dones_t = metacontroller_replay_buffer.sample(TRAIN_BATCH_SIZE)
                    weights, batch_idxes = np.ones_like(extrinsic_rewards_t), None
                    # get q estimate for tp1 as 'supervised'
                    obs_terminate_in_g_reshaped = np.reshape(obs_terminate_in_g, (1, )+obs_terminate_in_g.shape)
                    q_tp1 = metacontroller.get_q(sess, obs_terminate_in_g_reshaped)[0]
                    td_error = metacontroller.train(sess, init_obs, goals_t, extrinsic_rewards_t, obs_terminate_in_g, dones_t, weights, q_tp1)

                if total_step % UPDATE_TARGET_NETWORK_FREQ == 0:
                    #L.log('UPDATE BOTH CONTROLLER Q NETWORKS ----- step %d', step)
                    sess.run(controller.network.update_target_op)
                    # its fine, we aren't really training meta dqn until after certain steps.
                    sess.run(metacontroller.network.update_target_op)

                extrinsic_rewards += extrinsic_reward_t
                ob_with_g = ob_with_g_tp1
                done = done_t
                total_step += 1
            # we are done / reached_goal
            # store transitions of init_ob, goal, all the extrinsic rewards, current ob in D2
            # print("ep %d : step %d, goal extrinsic total %d" % (ep, step, extrinsic_rewards))
            # clean observation without goal encoded
            metacontroller_replay_buffer.add(init_ob['observation'], desired_goal, extrinsic_rewards, ob_tp1['observation'], done)

            # if we are here then we have finished the desired goal
            if not done:
                #print("ep %d : goal %d reached, not yet done, extrinsic %d" % (ep, desired_goal, extrinsic_rewards))
                exploration_ep = 1.0
                total_goal_reached[env.unwrapped.achieved_goal] += 1
                if total_step >= WARMUP_STEPS:
                    t = total_step - WARMUP_STEPS
                    exploration_ep = exploration2.value(t)
                ob_with_g_reshaped = np.reshape(ob_with_g, (1, )+ob_with_g.shape)
                
                while env.unwrapped.achieved_goal == desired_goal:
                    desired_goal = metacontroller.sample_act(sess, ob_with_g_reshaped, update_eps=exploration_ep)[0]

                env.unwrapped.desired_goal = desired_goal
                total_goal_sampled[desired_goal] += 1
                L.log('ep %d : achieved goal was %d ----- new goal --- %d' % (ep, env.unwrapped.achieved_goal, desired_goal))

                # start again
                reached_goal = False
        
        # finish an episode
        total_extrinsic_reward.append(extrinsic_rewards)
        ep += 1

        mean_100ep_reward = round(np.mean(total_extrinsic_reward[-101:-1]), 1)
        if ep % monitor.print_freq == 0 :
            L.record_tabular("steps", total_step)
            L.record_tabular("episodes", ep)
            L.record_tabular("mean 100 episode reward", mean_100ep_reward)
            L.dump_tabular()

        if total_step % monitor.ckpt_freq == 0:
            if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                L.log("Saving model due to mean reward increase: {} -> {}".format(
                    saved_mean_reward, mean_100ep_reward))
                U.save_variables(model_file)
                model_saved = True
                saved_mean_reward = mean_100ep_reward
    
    # verify the best model was saved, then restore it
    if model_saved:
        L.log('restored model with mean reward: %.1f' % saved_mean_reward)
        U.load_variables(model_file)
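
The Critic used above supplies the controller's intrinsic reward through critic.criticize(desired_goal, reached_goal, primitive_action_t, done_t); its implementation is not shown in this example. A minimal sketch consistent with that call signature is given below. It simply rewards reaching the sampled goal, which is an assumption about the class, not the author's actual code.

class Critic(object):
    """Minimal sketch (assumed): binary intrinsic reward for reaching the current goal."""

    def __init__(self, env):
        self.env = env

    def criticize(self, desired_goal, reached_goal, action, done):
        # 1.0 when the controller reaches the meta-controller's goal, else 0.0.
        return 1.0 if reached_goal else 0.0
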
Example #5
def train_DQNs (sess, DQNs, spec_params, tester, curriculum, show_print, render):
	# Initializing parameters
	dqns = DQNs[spec_params.ltl_spec]
	training_params = tester.training_params
	testing_params = tester.testing_params

	env = Game(spec_params)
	obs_proxy = Obs_Proxy(env)
	agents = env.agents
	action_set = env.get_actions(agents[0]) # NOTE: only if they all have the same action set
	# All the agents have the same observation
	num_features = len(obs_proxy.get_observation(env, env.agents[0]))
	max_steps = training_params.max_timesteps_per_spec
	Replay_buffers = {}
	for agent in agents:
		Replay_buffers[str(agent)] = IDQNReplayBuffer(
			training_params.replay_size)
	exploration = LinearSchedule(
		schedule_timesteps = int(training_params.exploration_frac \
			* max_steps), initial_p=1.0, 
		final_p = training_params.final_exploration)

	training_reward = 0
	last_ep_rew = 0	
	episode_count = 0  # episode counter
	rew_batch = np.zeros(100)

	if show_print: print("Executing ", max_steps, " steps...")
	if render: env.show_map()

	# We start interacting with the environment
	for t in range(max_steps):
		actions = []
		for agent, dqn in zip(agents.values(), dqns.values()):
			# Getting the current state and ltl goal
			s1 = obs_proxy.get_observation(env, agent) 

			# Choosing an action to perform
			if random.random() < exploration.value(t): 
				act = random.choice(action_set) # take random actions
			else: 
				act = Actions(dqn.get_best_action(s1.reshape((1,num_features))))
				# print("Act", act)
			actions.append(act)
			dqn.add_step()
		# updating the curriculum
		curriculum.add_step()

		# Executing the action
		reward = env.execute_actions(actions)
		if render and episode_count % 30 == 0:
			time.sleep(0.01)
			clear_screen()
			env.show_map()

		training_reward += reward

		for agent, dqn, act in zip(agents.values(), dqns.values(),
									actions):
			# Saving this transition
			s2 = obs_proxy.get_observation(env, agent) # adding the DFA state to the features
			done = env.ltl_game_over or env.env_game_over
			dqn.save_transition(s1, act, reward, s2, done)

			# Learning
			if dqn.get_steps() > training_params.learning_starts and \
				dqn.get_steps() % training_params.values_network_update_freq \
				== 0:
				dqn.learn()

			# Updating the target network
			if dqn.get_steps() > training_params.learning_starts and \
				dqn.get_steps() % training_params.target_network_update_freq\
				== 0: dqn.update_target_network()

		# Printing
		if show_print and (dqns['0'].get_steps()+1) \
							% training_params.print_freq == 0:
			print("Step:", dqns['0'].get_steps()+1, "\tTotal reward:",
				last_ep_rew, "\tSucc rate:",
				"%0.3f"%curriculum.get_succ_rate(), 
				"\tNumber of episodes:", episode_count)	

		# Testing
		if testing_params.test and (curriculum.get_current_step() \
				% testing_params.test_freq == 0):
					tester.run_test(curriculum.get_current_step(),
						sess, _test_DQN, DQNs)

		# Restarting the environment (Game Over)
		if done:
			# Game over occurs for one of three reasons: 
			# 1) DFA reached a terminal state, 
			# 2) DFA reached a deadend, or 
			# 3) The agent reached an environment deadend (e.g. a PIT)
			# Restarting
			env = Game(spec_params) 
			obs_proxy = Obs_Proxy(env)
			agents = env.agents
			rew_batch[episode_count%100]= training_reward
			episode_count+=1
			last_ep_rew = training_reward
			training_reward = 0

			# updating the hit rates
			curriculum.update_succ_rate(t, reward)
			# Uncomment if want to stop learning according to succ. rate
			# if curriculum.stop_spec(t):
			# 	last_ep_rew = 0
			# 	if show_print: print("STOP SPEC!!!")
			# 	break
		
		# checking the steps time-out
		if curriculum.stop_learning():
			if show_print: print("STOP LEARNING!!!")
			break

	if show_print: 
		print("Done! Last reward:", last_ep_rew)
Example #6
def fit(
        env,
        q_func,
        lr=5e-4,
        max_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        train_freq=1,
        batch_size=32,
        print_freq=100,
        checkpoint_freq=10000,
        checkpoint_path=None,
        learning_starts=1000,
        gamma=1.0,
        target_network_update_freq=500,
        prioritized_replay=False,
        prioritized_replay_alpha=0.6,
        prioritized_replay_beta0=0.4,
        prioritized_replay_beta_iters=None,
        prioritized_replay_eps=1e-6,
        param_noise=False,
        callback=None
):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to run the optimizer for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration
        rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batched sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version
        is restored at the end of the training. If you do not wish to
        restore the best version at the end of the training set this
        variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before
        learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from
        initial value to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    callback: (locals, globals) -> bool
        function called at every steps with state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load
        it.  See header of baselines/deepq/categorical.py for details
        on the act function.
    """
    # Create all the functions necessary to train the model

    model = DeepDQN()
    sess = model.init_session().__enter__()

    # capture the shape outside the closure so that the env object is
    # not serialized by cloudpickle when serializing make_obs_ph

    def make_obs_ph(name):
        return ObservationInput(env.observation_space, name=name)

    act, train, update_target, debug = model.build_train(
        make_obs_ph,
        q_func,
        env.action_space.n,
        tf.train.AdamOptimizer(learning_rate=lr),
        10,
        gamma,
        param_noise
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(
            buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(
            prioritized_replay_beta_iters,
            initial_p=prioritized_replay_beta0,
            final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(
        schedule_timesteps=int(exploration_fraction * max_timesteps),
        initial_p=1.0,
        final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    model.init_vars()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True

    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td

        model_file = os.path.join(td, "model")
        model_saved = False
        if tf.train.latest_checkpoint(td) is not None:
            model.load_state(model_file)
            logger.log('Loaded model from {}'.format(model_file))
            model_saved = True

        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence
                # between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with
                # eps = exploration.value(t).  See Appendix C.1 in
                # Parameter Space Noise for Exploration, Plappert et
                # al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(
                    t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = \
                    update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            action = act(
                np.array(obs)[None], update_eps=update_eps, **kwargs
            )[0]
            env_action = action
            reset = False
            new_obs, rew, done, _ = env.step(env_action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t)
                    )
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = \
                        replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(
                    obses_t,
                    actions,
                    rewards,
                    obses_tp1,
                    dones,
                    weights
                )
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(
                        batch_idxes,
                        new_priorities
                    )

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}".
                            format(saved_mean_reward, mean_100ep_reward)
                        )
                    model.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward)
                )
            model.load_state(model_file)

    return act
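
The loop above drives epsilon with `exploration.value(t)` and, when parameter-space noise is enabled, converts that epsilon into a KL-based perturbation threshold. Below is a minimal, self-contained sketch of both calculations, assuming a hypothetical stand-alone `LinearSchedule` with the same `schedule_timesteps` / `initial_p` / `final_p` arguments used throughout these examples (the real class comes from the surrounding project):

import numpy as np

class LinearSchedule:
    """Minimal stand-in for the schedule used in these examples: linear
    interpolation from initial_p to final_p over schedule_timesteps, then
    clamped at final_p."""
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
n_actions = 4  # hypothetical env.action_space.n

for t in (0, 5000, 10000, 20000):
    eps = exploration.value(t)
    # Threshold used above when param_noise is enabled: chosen so that the
    # KL between perturbed and unperturbed policies roughly matches
    # eps-greedy exploration with this eps (Plappert et al., 2017, App. C.1).
    update_param_noise_threshold = -np.log(1. - eps + eps / float(n_actions))
    print(t, round(eps, 3), round(update_param_noise_threshold, 3))

Past `schedule_timesteps` the schedule stays at `final_p`, so late-training exploration (and the printed threshold) stops changing.
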
Exemple #7
0
class CategoricalDQNAgent(BaseAgent):
    def __init__(self, game, sess, nb_actions, global_step):
        BaseAgent.__init__(self, game, sess, nb_actions, global_step)
        self.name = "CategoricalDQN_agent"
        self.model_path = os.path.join(FLAGS.checkpoint_dir, FLAGS.algorithm)
        self.support = np.linspace(FLAGS.v_min, FLAGS.v_max, FLAGS.nb_atoms)
        self.delta_z = (FLAGS.v_max - FLAGS.v_min) / (FLAGS.nb_atoms - 1)

        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.episode_max_values = []
        self.episode_min_values = []
        self.episode_mean_returns = []
        self.episode_max_returns = []
        self.episode_min_returns = []
        self.exploration = LinearSchedule(FLAGS.explore_steps, FLAGS.final_random_action_prob,
                                          FLAGS.initial_random_action_prob)
        self.summary_writer = tf.summary.FileWriter(os.path.join(FLAGS.summaries_dir, FLAGS.algorithm))
        self.summary = tf.Summary()

        self.q_net = CategoricalDQNetwork(nb_actions, 'orig')
        self.target_net = CategoricalDQNetwork(nb_actions, 'target')

        self.targetOps = self.update_target_graph('orig', 'target')

        self.probability_of_random_action = self.exploration.value(0)

    def train(self):
        minibatch = random.sample(self.episode_buffer, FLAGS.batch_size)
        rollout = np.array(minibatch)
        observations = rollout[:, 0]
        actions = rollout[:, 1]
        rewards = rollout[:, 2]
        next_observations = rollout[:, 3]
        done = rollout[:, 4]

        # Compute target distribution of Q(s_,a)
        target_probs_reprojected = self.get_target_distribution(rewards, done, next_observations, observations, actions)

        feed_dict = {self.q_net.target_q: target_probs_reprojected,
                     self.q_net.inputs: np.stack(observations, axis=0),
                     self.q_net.actions: actions}
        l, _, ms, img_summ, q, q_distrib = self.sess.run(
            [self.q_net.action_value_loss,
             self.q_net.train_op,
             self.q_net.merged_summary,
             self.q_net.image_summaries,
             self.q_net.action_value,
             self.q_net.action_values_soft],
            feed_dict=feed_dict)

        # self.updateTarget()

        return l / len(rollout), ms, img_summ

    def updateTarget(self):
        for op in self.targetOps:
            self.sess.run(op)

    def eval(self, saver):
        self.saver = saver
        total_steps = 0
        episode_rewards = []

        print("Starting eval agent")
        with self.sess.as_default(), self.graph.as_default():
            while total_steps < FLAGS.test_episodes:
                episode_reward = 0
                episode_step_count = 0
                d = False
                s = self.env.get_initial_state()

                while not d:
                    a = self.policy_evaluation_eval(s)

                    s1, r, d, info = self.env.step(a)

                    r = np.clip(r, -1, 1)
                    episode_reward += r
                    episode_step_count += 1

                    s = s1
                print("Episode reward was {}".format(episode_reward))
                episode_rewards.append(episode_reward)
                total_steps += 1
        print("Mean reward is {}".format(np.mean(np.asarray(episode_rewards))))

    def play(self, saver):
        self.saver = saver
        train_stats = None

        # self.episode_count = self.sess.run(self.global_episode)
        self.total_steps = self.sess.run(self.global_step)
        if self.total_steps == 0:
            self.updateTarget()


        print("Starting agent")
        _t = {'episode': Timer(), "step": Timer()}
        with self.sess.as_default(), self.graph.as_default():
            while self.total_steps < FLAGS.max_total_steps:
                _t["episode"].tic()
                if self.total_steps % FLAGS.target_update_freq == 0:
                    self.updateTarget()
                episode_reward = 0
                episode_step_count = 0
                q_values = []
                d = False
                # self.probability_of_random_action = self.exploration.value(self.total_steps)
                s = self.env.get_initial_state()

                while not d:
                    _t["step"].tic()
                    a, max_action_values_evaled = self.policy_evaluation(s)

                    if max_action_values_evaled is None:
                        q_values.append(0)
                    else:
                        q_values.append(max_action_values_evaled)

                    s1, r, d, info = self.env.step(a)

                    r = np.clip(r, -1, 1)
                    episode_reward += r
                    episode_step_count += 1
                    self.total_steps += 1
                    self.episode_buffer.append([s, a, r, s1, d])

                    s = s1

                    if len(self.episode_buffer) == FLAGS.memory_size:
                        self.episode_buffer.popleft()

                    if (self.total_steps > FLAGS.observation_steps
                            and len(self.episode_buffer) > FLAGS.observation_steps
                            and self.total_steps % FLAGS.update_freq == 0):
                        l, ms, img_summ = self.train()
                        train_stats = l, ms, img_summ

                    _t["step"].toc()
                    self.sess.run(self.increment_global_step)


                self.add_summary(episode_reward, episode_step_count, q_values, train_stats)

                if self.total_steps % FLAGS.eval_interval == 0:
                    self.evaluate_episode()

                # self.sess.run(self.increment_global_episode)

                _t["episode"].toc()

        print('Avg time per step is {:.3f}'.format(_t["step"].average_time()))
        print('Avg time per episode is {:.3f}'.format(_t["episode"].average_time()))

        # fps = self.total_steps / _t['Total'].duration
        # print('Average time per episod is {}'.format(_t['episode'].average_time))

    def add_summary(self, episode_reward, episode_step_count, q_values, train_stats):
        self.episode_rewards.append(episode_reward)
        self.episode_lengths.append(episode_step_count)
        if len(q_values):
            self.episode_mean_values.append(np.mean(np.asarray(q_values)))
            self.episode_max_values.append(np.max(np.asarray(q_values)))
            self.episode_min_values.append(np.min(np.asarray(q_values)))

        if self.total_steps % FLAGS.summary_interval == 0 and self.total_steps != 0 and self.total_steps > FLAGS.observation_steps:
            if self.total_steps % FLAGS.checkpoint_interval == 0:
                self.save_model(self.saver, self.total_steps)

            l, ms, img_summ = train_stats

            # self.episode_mean_returns.append(np.mean(np.asarray(returns)))
            # self.episode_max_returns.append(np.max(np.asarray(returns)))
            # self.episode_min_returns.append(np.min(np.asarray(returns)))

            mean_reward = np.mean(self.episode_rewards[-FLAGS.summary_interval:])
            mean_length = np.mean(self.episode_lengths[-FLAGS.summary_interval:])
            mean_value = np.mean(self.episode_mean_values[-FLAGS.summary_interval:])
            max_value = np.mean(self.episode_max_values[-FLAGS.summary_interval:])
            min_value = np.mean(self.episode_min_values[-FLAGS.summary_interval:])

            # if episode_count % FLAGS.test_performance_interval == 0:
            #     won_games = self.episode_rewards[-FLAGS.test_performance_interval:].count(1)
            #     self.summary.value.add(tag='Perf/Won Games/1000', simple_value=float(won_games))


            self.summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward))
            self.summary.value.add(tag='Perf/Length', simple_value=float(mean_length))
            self.summary.value.add(tag='Perf/Value_Mean', simple_value=float(mean_value))
            self.summary.value.add(tag='Perf/Value_Max', simple_value=float(max_value))
            self.summary.value.add(tag='Perf/Value_Min', simple_value=float(min_value))
            # self.summary.value.add(tag='Perf/Return_Mean', simple_value=float(mean_return))
            # self.summary.value.add(tag='Perf/Return_Max', simple_value=float(max_return))
            # self.summary.value.add(tag='Perf/Return_Min', simple_value=float(min_return))
            self.summary.value.add(tag='Perf/Probability_random_action',
                                   simple_value=float(self.probability_of_random_action))
            self.summary.value.add(tag='Losses/Loss', simple_value=float(l))

            self.write_summary(ms, img_summ)

    def policy_evaluation(self, s):
        q = None
        self.probability_of_random_action = self.exploration.value(self.total_steps)

        if random.random() <= self.probability_of_random_action:
            a = np.random.choice(range(len(self.env.gym_actions)))
        else:
            feed_dict = {self.q_net.inputs: [s]}
            probs = self.sess.run(self.q_net.action_values_soft, feed_dict=feed_dict)[0]

            q = np.sum(
                np.multiply(probs, np.tile(np.expand_dims(self.support, 0), [self.nb_actions, 1])), 1)
            a = np.argmax(q)
            # a_one_hot = np.zeros(shape=(self.q_net.nb_actions, FLAGS.nb_atoms), dtype=np.int32)
            # a_one_hot[a] = 1
            # p_a_star = np.sum(np.multiply(probs, a_one_hot), 0)

            # import matplotlib.pyplot as plt
            # ax = plt.subplot(111)
            # p1 = ax.step(self.support, p_a_star, color='blue')
            # # p2 = ax.step(skewed_support[0], p_a_star[0], color='magenta')
            # # p3 = ax.step(bellman[0], p_a_star[0], color='green')
            # # p4 = ax.step(self.support, m[0], color='red')
            # ax.autoscale(tight=True)
            #
            # plt.show()

        return a, (np.max(q) if q is not None else None)

    def policy_evaluation_eval(self, s):
        feed_dict = {self.q_net.inputs: [s]}
        action_values_evaled = self.sess.run(self.q_net.action_values_soft, feed_dict=feed_dict)[0]

        action_values_q = np.sum(
            np.multiply(action_values_evaled, np.tile(np.expand_dims(self.support, 0), [self.nb_actions, 1])), 1)
        a = np.argmax(action_values_q)
        a_one_hot = np.zeros(shape=(self.q_net.nb_actions, FLAGS.nb_atoms), dtype=np.int32)
        a_one_hot[a] = 1
        p_a_star = np.sum(np.multiply(action_values_evaled, a_one_hot), 0)

        return a


    def get_target_distribution(self, rewards, done, next_observations, observations, actions):
        target_probs, probs = self.sess.run([self.target_net.action_values_soft, self.q_net.action_values_soft],
                                                     feed_dict={
                                                         self.target_net.inputs: np.stack(next_observations, axis=0),
                                                         self.q_net.inputs: np.stack(observations, axis=0)
                                                     })
        # a_one_hot = np.zeros(shape=(FLAGS.batch_size, self.q_net.nb_actions, FLAGS.nb_atoms), dtype=np.int32)
        # a_one_hot[np.arange(FLAGS.batch_size), np.asarray(actions, dtype=np.int32)] = 1
        # pt_a_star = np.sum(np.multiply(action_values_evaled, a_one_hot), axis=1)

        target_q = np.sum(
            target_probs * np.tile(np.expand_dims(np.expand_dims(self.support, 0), 0),
                                                   [FLAGS.batch_size, self.q_net.nb_actions, 1]), 2)

        target_a = np.argmax(target_q, axis=1)
        # target_a = np.tile(np.expand_dims(np.expand_dims(target_a, 1), 2), [1, 1, FLAGS.nb_atoms])
        target_a_one_hot = np.zeros(shape=(FLAGS.batch_size, self.q_net.nb_actions, FLAGS.nb_atoms), dtype=np.int32)
        target_a_one_hot[np.arange(FLAGS.batch_size), target_a] = 1
        # target_a_one_hot = np.tile(np.expand_dims(target_a_one_hot, 2), [1, 1, FLAGS.nb_atoms])
        # a_one_hot = np.reshape(a_one_hot, (FLAGS.batch_size, self.q_net.nb_actions, FLAGS.nb_atoms))
        # p_a_star = np.squeeze(np.take(target_probs, target_a), 1)
        p_a_star = np.sum(np.multiply(target_probs, target_a_one_hot), axis=1)

        rewards = np.tile(np.expand_dims(np.asarray(rewards, dtype=np.float32), 1), [1, FLAGS.nb_atoms])
        gamma = np.tile(np.expand_dims(np.asarray(np.logical_not(done), dtype=np.int32) * FLAGS.gamma, 1),
                        [1, FLAGS.nb_atoms])
        # Compute projection of the application of the Bellman operator.
        skewed_support = gamma * np.tile(np.expand_dims(self.support, 0), [FLAGS.batch_size, 1])
        bellman = rewards + skewed_support
        bellman = np.clip(bellman, FLAGS.v_min, FLAGS.v_max)

        # Compute categorical indices for distributing the probability
        m = np.zeros(shape=(FLAGS.batch_size, FLAGS.nb_atoms))
        b = (bellman - FLAGS.v_min) / self.delta_z
        l = np.asarray(np.floor(b), dtype=np.int32)
        u = np.asarray(np.ceil(b), dtype=np.int32)

        # Distribute probability
        # for j in range(FLAGS.nb_atoms):
        #     m[:, l[:, j]] += target_actionv_values_evaled_max[:, j] * (u[:, j] - b[:, j])
        #     m[:, u[:, j]] += target_actionv_values_evaled_max[:, j] * (b[:, j] - l[:, j])

        for i in range(FLAGS.batch_size):
            for j in range(FLAGS.nb_atoms):
                uidx = u[i][j]
                lidx = l[i][j]
                m[i][lidx] = m[i][lidx] + p_a_star[i][j] * (uidx - b[i][j])
                m[i][uidx] = m[i][uidx] + p_a_star[i][j] * (b[i][j] - lidx)

        # if self.total_steps > FLAGS.explore_steps:
        #     import matplotlib.pyplot as plt
        #     ax = plt.subplot(111)
        #     # p1 = ax.step(self.support, p_a_star[0], color='blue')
        #     # p2 = ax.step(skewed_support[0], p_a_star[0], color='magenta')
        #     # p3 = ax.step(bellman[0], p_a_star[0], color='green')
        #     # p4 = ax.step(self.support, m[0], color='red')
        #     p4 = ax.step(self.support, pt_a_star[1], color='cyan')
        #     ax.autoscale(tight=True)
        #
        #     plt.show()
        return m

    def evaluate_episode(self):
        episode_reward = 0
        episode_step_count = 0
        d = False
        s = self.env.get_initial_state()

        while not d:
            a = self.policy_evaluation_eval(s)

            s1, r, d, info = self.env.step(a)

            r = np.clip(r, -1, 1)
            episode_reward += r
            episode_step_count += 1

            s = s1
        print("Episode reward was {}".format(episode_reward))
Exemple #8
0
def train():
    start_time = time.time()
    if seed is not None:
        set_seed(seed)
    # setup env
    envs = EnvWrapper(num_processes, simulator, env, env_config,
                      planner_config)

    # setup agent
    agent = createAgent()

    if load_model_pre:
        agent.loadModel(load_model_pre)
    agent.train()

    # logging
    simulator_str = copy.copy(simulator)
    if simulator == 'pybullet':
        simulator_str += ('_' + robot)
    log_dir = os.path.join(log_pre, '{}'.format(alg))
    if note:
        log_dir += '_'
        log_dir += note

    logger = Logger(log_dir, env, 'train', num_processes, max_episode, log_sub)
    hyper_parameters['model_shape'] = agent.getModelStr()
    logger.saveParameters(hyper_parameters)

    if buffer_type == 'expert':
        replay_buffer = QLearningBufferExpert(buffer_size)
    else:
        replay_buffer = QLearningBuffer(buffer_size)
    exploration = LinearSchedule(schedule_timesteps=explore,
                                 initial_p=init_eps,
                                 final_p=final_eps)

    states, in_hands, obs = envs.reset()

    if load_sub:
        logger.loadCheckPoint(os.path.join(log_dir, load_sub, 'checkpoint'),
                              envs, agent, replay_buffer)

    # pre train
    if load_buffer is not None and not load_sub:
        logger.loadBuffer(replay_buffer, load_buffer, load_n)
    if pre_train_step > 0:
        pbar = tqdm(total=pre_train_step)
        while len(logger.losses) < pre_train_step:
            t0 = time.time()
            train_step(agent, replay_buffer, logger)
            if logger.num_training_steps % 1000 == 0:
                logger.saveLossCurve(100)
                logger.saveTdErrorCurve(100)
            if not no_bar:
                pbar.set_description('loss: {:.3f}, time: {:.2f}'.format(
                    float(logger.getCurrentLoss()),
                    time.time() - t0))
                pbar.update(len(logger.losses) - pbar.n)

            if (time.time() - start_time) / 3600 > time_limit:
                logger.saveCheckPoint(args, envs, agent, replay_buffer)
                exit(0)
        pbar.close()
        logger.saveModel(0, 'pretrain', agent)
        # agent.sl = sl

    if not no_bar:
        pbar = tqdm(total=max_episode)
        pbar.set_description(
            'Episodes:0; Reward:0.0; Explore:0.0; Loss:0.0; Time:0.0')
    timer_start = time.time()

    obs = obs.permute(0, 3, 1, 2)
    in_hands = in_hands.permute(0, 3, 1, 2)

    while logger.num_episodes < max_episode:
        # add noise
        if perlin:
            addPerlinNoiseToObs(obs, perlin)
            addPerlinNoiseToInHand(in_hands, perlin)

        if fixed_eps:
            if logger.num_episodes < planner_episode:
                eps = 1
            else:
                eps = final_eps
        else:
            eps = exploration.value(logger.num_episodes)
        if planner_episode > logger.num_episodes:
            if np.random.random() < eps:
                is_expert = 1
                plan_actions = envs.getNextAction()
                actions_star_idx, actions_star = agent.getActionFromPlan(
                    plan_actions)
            else:
                is_expert = 0
                q_value_maps, actions_star_idx, actions_star = agent.getEGreedyActions(
                    states, in_hands, obs, final_eps)
        else:
            is_expert = 0
            q_value_maps, actions_star_idx, actions_star = agent.getEGreedyActions(
                states, in_hands, obs, eps)

        if alg.find('dagger') >= 0:
            plan_actions = envs.getNextAction()
            planner_actions_star_idx, planner_actions_star = agent.getActionFromPlan(
                plan_actions)

        buffer_obs = getCurrentObs(in_hands, obs)
        actions_star = torch.cat((actions_star, states.unsqueeze(1)), dim=1)
        envs.stepAsync(actions_star, auto_reset=False)

        if len(replay_buffer) >= training_offset:
            for training_iter in range(training_iters):
                train_step(agent, replay_buffer, logger)

        states_, in_hands_, obs_, rewards, dones = envs.stepWait()
        steps_lefts = envs.getStepLeft()
        obs_ = obs_.permute(0, 3, 1, 2)
        in_hands_ = in_hands_.permute(0, 3, 1, 2)

        done_idxes = torch.nonzero(dones).squeeze(1)
        if done_idxes.shape[0] != 0:
            reset_states_, reset_in_hands_, reset_obs_ = envs.reset_envs(
                done_idxes)
            reset_obs_ = reset_obs_.permute(0, 3, 1, 2)
            reset_in_hands_ = reset_in_hands_.permute(0, 3, 1, 2)
            for j, idx in enumerate(done_idxes):
                states_[idx] = reset_states_[j]
                in_hands_[idx] = reset_in_hands_[j]
                obs_[idx] = reset_obs_[j]

        buffer_obs_ = getCurrentObs(in_hands_, obs_)

        for i in range(num_processes):
            if alg.find('dagger') >= 0:
                replay_buffer.add(
                    ExpertTransition(states[i], buffer_obs[i],
                                     planner_actions_star_idx[i], rewards[i],
                                     states_[i], buffer_obs_[i], dones[i],
                                     steps_lefts[i], torch.tensor(is_expert)))
            else:
                replay_buffer.add(
                    ExpertTransition(states[i], buffer_obs[i],
                                     actions_star_idx[i], rewards[i],
                                     states_[i], buffer_obs_[i], dones[i],
                                     steps_lefts[i], torch.tensor(is_expert)))
        logger.stepBookkeeping(rewards.numpy(), steps_lefts.numpy(),
                               dones.numpy())

        states = copy.copy(states_)
        obs = copy.copy(obs_)
        in_hands = copy.copy(in_hands_)

        if (time.time() - start_time) / 3600 > time_limit:
            break

        if not no_bar:
            timer_final = time.time()
            description = 'Steps:{}; Reward:{:.03f}; Explore:{:.02f}; Loss:{:.03f}; Time:{:.03f}'.format(
                logger.num_steps, logger.getCurrentAvgReward(1000), eps,
                float(logger.getCurrentLoss()), timer_final - timer_start)
            pbar.set_description(description)
            timer_start = timer_final
            pbar.update(logger.num_episodes - pbar.n)
        logger.num_steps += num_processes

        if logger.num_steps % (num_processes * save_freq) == 0:
            saveModelAndInfo(logger, agent)

    saveModelAndInfo(logger, agent)
    logger.saveCheckPoint(args, envs, agent, replay_buffer)
    envs.close()
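
Unlike the other examples, this training loop indexes the schedule by episode count and uses epsilon both as the probability of querying the scripted planner (during the first `planner_episode` episodes) and as the usual epsilon-greedy rate afterwards. A small sketch of that gating, ignoring the `fixed_eps` branch; `get_planner_action` and `get_egreedy_action` are hypothetical stand-ins for `envs.getNextAction()` and `agent.getEGreedyActions()`:

import numpy as np

def choose_action(num_episodes, exploration, planner_episode, final_eps,
                  get_planner_action, get_egreedy_action):
    """Mirror of the action-selection branch above. Returns (action, is_expert)."""
    eps = exploration.value(num_episodes)  # schedule indexed by episodes
    if num_episodes < planner_episode and np.random.random() < eps:
        # Expert phase: with probability eps, ask the planner for an action.
        return get_planner_action(), 1
    # Otherwise fall back to epsilon-greedy; inside the planner phase the
    # loop above uses final_eps rather than the annealed eps.
    greedy_eps = final_eps if num_episodes < planner_episode else eps
    return get_egreedy_action(greedy_eps), 0
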
Exemple #9
0
def _run_ILPOPL(sess, policy_banks, spec_params, tester, curriculum,
                show_print, render):
    # Initializing parameters
    training_params = tester.training_params
    testing_params = tester.testing_params

    # Initializing the game
    env = Game(spec_params)
    agents = env.agents
    action_set = env.get_actions(agents[0])

    # Initializing experience replay buffers
    replay_buffers = {}
    for agent in range(env.n_agents):
        replay_buffers[str(agent)] = ReplayBuffer(training_params.replay_size)

    # Initializing parameters
    num_features = len(env.get_observation(agents[0]))
    max_steps = training_params.max_timesteps_per_spec
    exploration = LinearSchedule(schedule_timesteps=int(
        training_params.exploration_frac * max_steps),
                                 initial_p=1.0,
                                 final_p=training_params.final_exploration)
    last_ep_rew = 0
    training_reward = 0
    episode_count = 0
    # Starting interaction with the environment
    if show_print: print("Executing", max_steps, "actions...")
    if render: env.show_map()

    #We start iterating with the environment
    for t in range(max_steps):
        # Getting the current state and ltl goal
        actions = []
        ltl_goal = env.get_LTL_goal()
        for agent, policy_bank in zip(agents.values(), policy_banks.values()):
            s1 = env.get_observation(agent)

            # Choosing an action to perform
            if random.random() < exploration.value(t):
                act = random.choice(action_set)
            else:
                act = Actions(
                    policy_bank.get_best_action(ltl_goal,
                                                s1.reshape((1, num_features))))
            actions.append(act)
        # updating the curriculum
        curriculum.add_step()

        # Executing the action
        reward = env.execute_actions(actions)
        training_reward += reward

        if render and episode_count % 30 == 0:
            time.sleep(0.01)
            clear_screen()
            env.show_map()

        true_props = []
        for agent in agents.values():
            true_props.append(env.get_true_propositions(agent))
        # Saving this transition
        for agent, policy_bank, replay_buffer, act in zip(
                agents.values(), policy_banks.values(),
                replay_buffers.values(), actions):
            s2 = env.get_observation(agent)
            next_goals = np.zeros((policy_bank.get_number_LTL_policies(), ),
                                  dtype=np.float64)
            for ltl in policy_bank.get_LTL_policies():
                ltl_id = policy_bank.get_id(ltl)
                if env.env_game_over:
                    # env deadends are equivalent to achieving the 'False' formula
                    ltl_next_id = policy_bank.get_id("False")
                else:
                    for props in true_props:
                        ltl_next_id = policy_bank.get_id(\
                          policy_bank.get_policy_next_LTL(ltl, props))
                next_goals[ltl_id - 2] = ltl_next_id
            replay_buffer.add(s1, act.value, s2, next_goals)

            # Learning
            if curriculum.get_current_step() > training_params.learning_starts\
             and curriculum.get_current_step() %\
             training_params.values_network_update_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled
                # from replay buffer.
                S1, A, S2, Goal = replay_buffer.sample(
                    training_params.batch_size)
                policy_bank.learn(S1, A, S2, Goal)

            # Updating the target network
            if curriculum.get_current_step() > training_params.learning_starts\
             and curriculum.get_current_step() %\
             training_params.target_network_update_freq == 0:
                # Update target network periodically.
                policy_bank.update_target_network()

        # Printing
        if show_print and (curriculum.get_current_step()+1) \
             % training_params.print_freq == 0:
            print("Step:",
                  curriculum.get_current_step() + 1, "\tLast episode reward:",
                  last_ep_rew, "\tSucc rate:",
                  "%0.3f" % curriculum.get_succ_rate(),
                  "\tNumber of episodes:", episode_count)

        # Testing
        if testing_params.test and curriculum.get_current_step() %\
                  testing_params.test_freq == 0:
            tester.run_test(curriculum.get_current_step(), sess, _test_ILPOPL,
                            policy_banks, num_features)

        # Restarting the environment (Game Over)
        if env.ltl_game_over or env.env_game_over:
            # NOTE: Game over occurs for one of three reasons:
            # 1) DFA reached a terminal state,
            # 2) DFA reached a deadend, or
            # 3) The agent reached an environment deadend (e.g. a PIT)
            env = Game(spec_params)  # Restarting
            agents = env.agents
            episode_count += 1
            last_ep_rew = training_reward

            training_reward = 0
            # updating the hit rates
            curriculum.update_succ_rate(t, reward)
            # Uncomment if want to stop learning according to succ. rate
            # if curriculum.stop_spec(t):
            # 	last_ep_rew = 0
            # 	if show_print: print("STOP SPEC!!!")
            # 	break

        # checking the steps time-out
        if curriculum.stop_learning():
            if show_print: print("STOP LEARNING!!!")
            break

    if show_print:
        print("Done! Last reward:", last_ep_rew)
def main():
    L.configure('/home/metalabadmin/exp/freeway',
                format_strs=['stdout', 'csv', 'tensorboard'])
    env = gym.make('Freeway-v0')
    env = wrapper.wrap_deepmind(env, frame_stack=True, scale=True)

    optimizer = tf.train.AdamOptimizer(learning_rate=0.0001)
    network = Q_network(env.observation_space,
                        env.action_space.n,
                        optimizer,
                        gamma=0.99,
                        scope='freeway')
    m_controller = MetaController(network, env.action_space.n)
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(0.1 * 1e7),
                                 initial_p=1.0,
                                 final_p=0.02)
    replay = ReplayBuffer(50000)
    # get default tf_session
    sess = U.get_session()
    U.initialize()
    sess.run(m_controller.network.update_target_op)
    step = 0
    episodes = 0
    rewards = 0
    mean_100ep_reward = 0
    total_reward = []
    saved_mean_reward = None
    ob = env.reset()

    while step <= 1e7:
        ep = exploration.value(step)
        ob_reshaped = np.reshape(ob, (1, ) + env.observation_space.shape)
        act = m_controller.sample_act(sess, ob_reshaped, update_eps=ep)[0]
        ob_tp1, reward_t, done_t, info = env.step(act)
        env.render()
        rewards += reward_t
        replay.add(ob, act, reward_t, ob_tp1, float(done_t))
        ob = ob_tp1

        # train every 4 steps
        if step >= 1000 and step % 4 == 0:
            obs, acts, rewards_t, obs_tp1, dones_t = replay.sample(64)
            weights, batch_idxes = np.ones_like(rewards_t), None
            # get q estimate for tp1 as 'supervised'
            obs_tp1_reshaped = np.reshape(obs_tp1,
                                          (64, ) + env.observation_space.shape)
            q_tp1 = m_controller.get_q(sess, obs_tp1_reshaped)[0]
            td_error = m_controller.train(sess, obs, acts, rewards_t, obs_tp1,
                                          dones_t, weights, q_tp1)

        step += 1

        if step >= 1000 and step % 1000 == 0:
            sess.run(m_controller.network.update_target_op)

        if done_t:
            ob = env.reset()
            total_reward.append(rewards)
            episodes += 1
            rewards = 0
            print('step %d done %s, ep %.2f' % (step, str(done_t), ep))
            mean_100ep_reward = round(np.mean(total_reward[-101:-1]), 1)
            if episodes % 10 == 0 and episodes != 0:
                print('date time %s' % str(datetime.now()))
                L.record_tabular("steps", step)
                L.record_tabular("episodes", episodes)
                L.record_tabular("mean 100 episode reward", mean_100ep_reward)
                L.dump_tabular()

        if step % 1000 == 0:
            if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                L.log("Saving model due to mean reward increase: {} -> {}".
                      format(saved_mean_reward, mean_100ep_reward))
                U.save_variables('./freewaymodel.ckpt')
                model_saved = True
                saved_mean_reward = mean_100ep_reward
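
Both this Freeway loop and the first example checkpoint only when the 100-episode moving average of reward improves. A compact sketch of that bookkeeping, with `maybe_checkpoint` as a hypothetical helper and `save_fn` as a callback standing in for `U.save_variables` or `model.save_state`:

import numpy as np

def maybe_checkpoint(episode_rewards, saved_mean_reward, save_fn):
    """Save only when the 100-episode moving average improves; returns the
    best mean seen so far so the caller can keep passing it back in."""
    if len(episode_rewards) < 101:
        return saved_mean_reward  # not enough completed episodes yet
    mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
    if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
        print("Saving model due to mean reward increase: {} -> {}".format(
            saved_mean_reward, mean_100ep_reward))
        save_fn()
        return mean_100ep_reward
    return saved_mean_reward
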
class Agent_PER():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        # Create the replay buffer
        self.memory = PrioritizedReplayBuffer(BUFFER_SIZE,
                                              alpha=PRIORITIZED_REPLAY_ALPHA)
        if PRIORITIZED_REPLAY_BETA_ITERS is None:
            prioritized_replay_beta_iters = N_EPISODES
        else:
            prioritized_replay_beta_iters = PRIORITIZED_REPLAY_BETA_ITERS
        self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                            initial_p=PRIORITIZED_REPLAY_BETA0,
                                            final_p=1.0)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.t = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, float(done))
        self.t += 1

        # Learn every UPDATE_EVERY time steps
        self.t_step = (self.t_step + 1) % UPDATE_EVERY

        # Update target network periodically
        if self.t_step == 0:
            update_target_network_flag = True
        else:
            update_target_network_flag = False

        if self.t > LEARNING_STARTS and (self.t % TRAIN_FREQ) == 0:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer
            experiences = self.memory.sample(BATCH_SIZE,
                                             beta=self.beta_schedule.value(
                                                 self.t))
            td_errors = self.learn(experiences, GAMMA,
                                   update_target_network_flag)
            (states, actions, rewards, next_states, dones, weights,
             batch_idxes) = experiences
            new_priorities = np.abs(td_errors) + PRIORITIZED_REPLAY_EPS
            self.memory.update_priorities(batch_idxes, new_priorities)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

    Params
    ======
        state (array_like): current state
        eps (float): epsilon, for epsilon-greedy action selection
    """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma, update_target_network_flag):
        """Update value parameters using given batch of experience tuples.

    Params
    ======
        experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
        gamma (float): discount factor
    """
        states, actions, rewards, next_states, dones, weights, batch_idxes = experiences
        states = torch.from_numpy(np.vstack([state for state in states
                                             ])).float().to(device)
        actions = torch.from_numpy(np.vstack([action for action in actions
                                              ])).long().to(device)
        rewards = torch.from_numpy(np.vstack([reward for reward in rewards
                                              ])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([next_state
                       for next_state in next_states])).float().to(device)
        dones = torch.from_numpy(np.vstack([done for done in dones
                                            ])).float().to(device)
        weights = torch.from_numpy(np.vstack([weight for weight in weights
                                              ])).float().to(device)

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute td error
        td_error = Q_expected - Q_targets
        td_error_ = td_error.detach().numpy()
        # Compute loss
        loss = td_error**2
        #loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        (weights * loss).mean().backward()
        self.optimizer.step()

        if update_target_network_flag:
            # ------------------- update target network ------------------- #
            self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

        return td_error_

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
    θ_target = τ*θ_local + (1 - τ)*θ_target

    Params
    ======
        local_model (PyTorch model): weights will be copied from
        target_model (PyTorch model): weights will be copied to
        tau (float): interpolation parameter
    """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
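
`Agent_PER` anneals the prioritized-replay exponent beta towards 1.0 with the same `LinearSchedule` and multiplies the resulting importance-sampling weights into the squared TD error before averaging. A minimal PyTorch sketch of that weighted update, with `weighted_td_update` as a hypothetical helper and `q_expected` / `q_targets` assumed to be computed as in `learn()` above:

import torch

def weighted_td_update(q_expected, q_targets, weights, optimizer):
    """Importance-weighted TD update for prioritized replay: scale each
    sample's squared TD error by its IS weight before averaging, step the
    optimizer, and return |td_error| so the caller can refresh the buffer
    priorities (plus a small epsilon, as in step() above)."""
    td_error = q_expected - q_targets
    loss = (weights * td_error.pow(2)).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return td_error.abs().detach().cpu().numpy()
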