Example #1
    def run_epoch(self, sess, epoch, train_set, lr_schedule):
        """
        Performs an epoch of training
        """
        # for logging
        tic = time.time()
        losses = 0
        nbatches = (len(train_set) + self.config.batch_size - 1) // self.config.batch_size
        prog = Progbar(target=nbatches)
        # iterate over minibatches
        for i, (img, formula) in enumerate(minibatches(train_set, self.config.batch_size)):
            # get feed dict
            fd = self.get_feed_dict(img, training=True, formula=formula, lr=lr_schedule.lr,
                                    dropout=self.config.dropout)
            # update step
            loss_eval, _, summary = sess.run([self.loss, self.train_op, self.merged], feed_dict=fd)
            self.file_writer.add_summary(summary, epoch*nbatches + i)
            losses += loss_eval

            # logging
            prog.update(i + 1, 
                    values=[("loss", loss_eval), ("perplexity", np.exp(loss_eval))],
                    exact=[("lr", lr_schedule.lr)])

            # update learning rate
            lr_schedule.update(batch_no=epoch*nbatches + i)
        
        toc = time.time()
        self.config.logger.info("Epoch {} - time: {:04.2f}, loss: {:04.4f}, lr: {:04.5f}".format(
                        epoch, toc-tic, losses / float(max(nbatches, 1)), lr_schedule.lr))
    def train(self, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """
        self.init_averages()

        t = last_eval = curri_idx = 0  # time control of nb of steps
        scores_eval = []  # list of scores computed at iteration time

        prog = Progbar(target=self.config.nsteps_train)
        rcopy = RepeatCopy(num_bits=self.config.num_bits,
                           batch_size=self.config.batch_size,
                           min_length=self.config.min_length,
                           max_length=self.config.max_length,
                           min_repeats=self.config.min_repeats,
                           max_repeats=self.config.max_repeats)

        # interact with environment
        while t < self.config.nsteps_train:
            t += 1
            last_eval += 1
            config = self.config
            batch_data = rcopy()

            # perform a training step
            loss_eval, grad_eval = self.train_step(t, lr_schedule.epsilon,
                                                   batch_data)

            # logging stuff
            if ((t % config.log_freq == 0)
                    and (t % config.learning_freq == 0)):
                self.update_averages(scores_eval)
                lr_schedule.update(t)
                prog.update(t + 1,
                            exact=[("Loss", loss_eval), ("Grads", grad_eval),
                                   ("lr", lr_schedule.epsilon)])

            if t >= config.nsteps_train:
                break

            if last_eval >= config.eval_freq:
                # evaluate our policy
                last_eval = 0
                print("")
                self.logger.info("Global step: %d" % (t))
                scores_eval += [self.evaluate()]

        # last words
        self.logger.info("- Training done.")
        self.save(t)
        scores_eval += [self.evaluate()]
        export_plot(scores_eval, "Scores", self.config.plot_output)
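
The epoch loop in Example #1 above (and in Example #3 below) iterates with a minibatches(train_set, batch_size) helper that is not shown in these snippets. A minimal sketch of what such a generator might look like, assuming train_set is a sliceable sequence of (img, formula) pairs; the implementation below is illustrative, not the original:

def minibatches(dataset, batch_size):
    """Yield (imgs, formulas) lists of at most batch_size examples.

    Illustrative sketch: assumes dataset is a sliceable sequence of
    (img, formula) pairs, as consumed by run_epoch above.
    """
    for start in range(0, len(dataset), batch_size):
        batch = dataset[start:start + batch_size]
        imgs = [img for img, _ in batch]
        formulas = [formula for _, formula in batch]
        yield imgs, formulas
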
Example #3
    def _run_epoch(self, config, train_set, val_set, epoch, lr_schedule):
        """Performs an epoch of training

        Args:
            config: Config instance
            train_set: Dataset instance
            val_set: Dataset instance
            epoch: (int) id of the epoch, starting at 0
            lr_schedule: LRSchedule instance that takes care of learning proc

        Returns:
            score: (float) model will select weights that achieve the highest
                score

        """
        # logging
        batch_size = config.batch_size
        nbatches = (len(train_set) + batch_size - 1) // batch_size
        prog = Progbar(nbatches)

        # iterate over dataset
        for i, (img, formula) in enumerate(minibatches(train_set, batch_size)):
            # get feed dict
            fd = self._get_feed_dict(img,
                                     training=True,
                                     formula=formula,
                                     lr=lr_schedule.lr,
                                     dropout=config.dropout)

            # update step
            _, loss_eval = self.sess.run([self.train_op, self.loss],
                                         feed_dict=fd)
            prog.update(i + 1, [("loss", loss_eval),
                                ("perplexity", np.exp(loss_eval)),
                                ("lr", lr_schedule.lr)])

            # update learning rate
            lr_schedule.update(batch_no=epoch * nbatches + i)

        # logging
        self.logger.info("- Training: {}".format(prog.info))

        # evaluation
        config_eval = Config({
            "dir_answers": self._dir_output + "formulas_val/",
            "batch_size": config.batch_size
        })
        scores = self.evaluate(config_eval, val_set)
        score = scores[config.metric_val]
        lr_schedule.update(score=score)

        return score
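
The lr_schedule object used by these epoch loops exposes the current rate as lr_schedule.lr and is driven both by batch number (update(batch_no=...)) and by validation score (update(score=...)); the RL examples further down instead read lr_schedule.epsilon from a linearly annealed schedule. A rough stand-in under those assumptions, purely for illustration and not the original class:

class LRScheduleSketch(object):
    """Illustrative stand-in for the lr_schedule interface used above."""

    def __init__(self, lr_init=1e-3, decay_rate=0.999, shrink_factor=0.5):
        self.lr = lr_init
        self._lr_init = lr_init
        self._decay_rate = decay_rate
        self._shrink = 1.0               # accumulated plateau shrinkage
        self._shrink_factor = shrink_factor
        self._best_score = None

    def update(self, batch_no=None, score=None):
        if batch_no is not None:
            # time-based exponential decay, as in update(batch_no=...)
            self.lr = self._lr_init * (self._decay_rate ** batch_no) * self._shrink
        if score is not None:
            # plateau-based decay, as in update(score=...)
            if self._best_score is None or score > self._best_score:
                self._best_score = score
            else:
                self._shrink *= self._shrink_factor
                self.lr *= self._shrink_factor
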
Example #4
def train():
    #s, _, loss, y = autoencoder()
    s, _, loss, y, recon_loss, KL = vae()
    train_op, grad_norm = add_optimizer_op(loss)

    if not os.path.exists(config.output_path):
        os.makedirs(config.output_path)
    logger = get_logger(config.log_path)
    
    train_data, eval_data = load_data() 

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    for i in range(config.epoch_num):
        # each epoch

        #train
        prog = Progbar(target=1 + int(len(train_data) / config.batch_size)) 
        step = 1
        for batch in minibatches(train_data, config.batch_size):
            loss_eval, grad_norm_eval, y_train, _, recon_loss_train, KL_train = sess.run([loss, grad_norm, y, train_op, recon_loss, KL], feed_dict={s: batch})
            #prog.update(step, [("train loss", loss_eval), ("grad norm", grad_norm_eval)])
            prog.update(step, [("train loss", loss_eval), ("grad norm", grad_norm_eval), ('recon loss', recon_loss_train), ('VLBO', KL_train)])
            step += 1
        plt.imshow(y_train[0, :, :, 0], cmap='Greys')
        plt.savefig('y.png')

        #eval
        #prog = Progbar(target=1 + int(len(eval_data) / config.batch_size)) 
        #step = 1
        #losses = []
        #for batch in minibatches(eval_data, config.batch_size):
        #    loss_eval = sess.run(loss, feed_dict={s: batch})
        #    prog.update(step, [("eval loss", loss_eval)])
        #    losses.append(loss_eval)
        #    step += 1
        #avg_loss = np.mean(losses)
        #sigma_loss = np.sqrt(np.var(losses) / len(losses))
        #print ""
        #msg = "Average loss: {:04.2f} +/- {:04.2f}".format(avg_loss, sigma_loss)
        #logger.info(msg)

        save(sess)
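
Every snippet in this collection drives a Keras-style Progbar: update(current, values=[...]) keeps running averages, exact=[...] prints raw numbers, and Example #3 also reads back prog.info. A bare-bones stand-in that mimics that interface, illustrative only and not the class these projects actually import:

import sys


class ProgbarSketch(object):
    """Minimal stand-in for the Progbar interface used in these examples."""

    def __init__(self, target):
        self.target = target
        self.info = ""
        self._sums = {}
        self._counts = {}

    def update(self, current, values=None, exact=None):
        parts = []
        for name, value in (values or []):
            # running average, like the real Progbar's `values` entries
            self._sums[name] = self._sums.get(name, 0.0) + float(value)
            self._counts[name] = self._counts.get(name, 0) + 1
            parts.append("%s: %.4f" % (name, self._sums[name] / self._counts[name]))
        for name, value in (exact or []):
            # printed as-is, like the `exact` entries
            parts.append("%s: %s" % (name, value))
        self.info = " - ".join(parts)
        sys.stdout.write("\r%s/%s %s" % (current, self.target, self.info))
        sys.stdout.flush()
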
Example #5
  def train(self, sess, summary_op):
    allBatches = get_batches(self.all_data, self.batch_size, True, toy)
    if toy:
      prog = Progbar(target=(len(self.all_data) // 2) // self.batch_size)
    else:
      prog = Progbar(target=len(self.all_data[0]) // self.batch_size)

    fetches = [self.train_op, self.loss, summary_op]    # array of desired outputs

    for i, batch in enumerate(allBatches):
      # if i > 10:
      #   break
      if toy:
        questions, answers = batch[0], batch[1]
        enc_seq_len = get_sequence_length(questions)
        dec_seq_len = [self.max_dec_len for sen in answers]
        # get_sequence_length(answers)
        seq_len = {"enc": enc_seq_len, "dec": dec_seq_len}
        # print seq_len
        labels = [ [letter.index(1) for letter in word] for word in answers]
        labels = np.asarray(labels)
        feed_dict = self.create_feed_dict(questions, answers, labels, seq_len)
      else:
        questions_labels, answers_labels = batch[0], batch[1]
        seq_len = {"enc": [len(q) for q in questions_labels], "dec": [len(a) for a in answers_labels]}
        # Pad them to be of particular size.
        questions_labels = [self.addPaddingEnc(q) for q in questions_labels]
        answers_labels = [self.addPaddingDec(a) for a in answers_labels]
        feed_dict = self.create_feed_dict_embeddings(questions_labels, answers_labels, seq_len)

      _, loss, summary = sess.run(fetches, feed_dict)
      prog.update(i + 1, [("train loss", loss)])
    def train(self, model_i, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """
        self.init_averages()

        t = last_eval = curri_idx = 0 # time control of nb of steps
        scores_eval = [] # list of scores computed at iteration time

        prog = Progbar(target=self.config.nsteps_train)

        # interact with environment
        while t < self.config.nsteps_train:
            t += 1
            last_eval += 1
            encoding_batch = []
            predflag_batch = []
            target_action_batch = []
            slen_batch = []
            max_len = 0
            for i in range(self.config.batch_size):
                #config = self.config
                #config.n_node, config.k_ring, config.p_rewiring, config.path_len_limit, config.planning_len = cr_schedule[curri_idx]
                #self.env.reset(config) # h x w x c
                encoding, target_action, predflag = model_i.gen_sample_seq(self.config.ndigits, self.config.nway)
                encoding_batch.append(encoding[None])
                predflag_batch.append(predflag[None])
                target_action_batch.append(target_action[None])
                slen_batch.append(encoding.shape[0])
                if encoding.shape[0]>max_len:
                    max_len = encoding.shape[0]

            batch_data = DatasetTensors(np.concatenate([np.concatenate([x, np.zeros([1, max_len-x.shape[1], x.shape[2]])], axis=1) for x in encoding_batch], axis=0),
                np.concatenate([np.concatenate([x, np.zeros([1, max_len-x.shape[1], x.shape[2]])], axis=1) for x in target_action_batch], axis=0),
                np.concatenate([np.concatenate([x, np.zeros([1, max_len-x.shape[1]])], axis=1) for x in predflag_batch], axis=0), np.array(slen_batch).astype('int32'))

            # perform a training step
            loss_eval, grad_eval = self.train_step(t, lr_schedule.epsilon, batch_data)

            # logging stuff
            if ((t % self.config.log_freq == 0) and (t % self.config.learning_freq == 0)):
                self.update_averages(scores_eval)
                lr_schedule.update(t)
                prog.update(t + 1, exact=[("Loss", loss_eval), ("Grads", grad_eval), ("lr", lr_schedule.epsilon)])

            if t >= self.config.nsteps_train:
                break

            if last_eval >= self.config.eval_freq:
                # evaluate our policy
                last_eval = 0
                print("")
                self.logger.info("Global step: %d"%(t))
                scores_eval += [self.evaluate(model_i)]
                '''
                if scores_eval[-1]>0.8:
                    curri_idx += 1
                    msg = "Upgrade to lesson {:d}".format(int(curri_idx))
                    self.logger.info(msg)
                    self.logger.info("----------Start Computing Final Score----------")
                    scores_eval += [self.evaluate(model_i)]
                    self.logger.info("----------Finish Computing Final Score----------")
                '''

        # last words
        self.logger.info("- Training done.")
        self.save(t)
        scores_eval += [self.evaluate(model_i)]
        export_plot(scores_eval, "Scores", self.config.plot_output)
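
The DatasetTensors construction in the example above pads every per-sample array with zeros along the time axis up to the longest sequence in the batch before stacking. The same idea as a small hypothetical helper (pad_and_stack is not part of the original code):

import numpy as np


def pad_and_stack(seqs, max_len):
    """Zero-pad arrays of shape [1, T, D] (or [1, T]) along the time axis to
    max_len, then stack them into one batch along axis 0. Illustrative only.
    """
    padded = []
    for x in seqs:
        pad_width = [(0, 0), (0, max_len - x.shape[1])] + [(0, 0)] * (x.ndim - 2)
        padded.append(np.pad(x, pad_width, mode="constant"))
    return np.concatenate(padded, axis=0)
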
  def train(self, exp_schedule, lr_schedule):
    # Initialize replay buffer and variables
    replay_buffer = ReplayBufferAC(self.FLAGS.buffer_size, self.FLAGS.state_hist)
    rewards = deque(maxlen=self.FLAGS.num_test)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    self.init_averages()

    t = 0 # time control of nb of steps
    loss_eval = grad_eval = 0
    scores_eval = [] # list of scores computed at iteration time
    #scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)]

    self.prog = Progbar(target=self.FLAGS.train_steps)
    self.prog2 = Progbar(target=self.FLAGS.train_steps)

    # Train for # of train steps
    while t < self.FLAGS.train_steps:
      total_reward = 0
      ep_len = 0
      state = self.env.reset()
      reward = 0
      first = 1
      q_input = None
      # Run for 1 episode and update the buffer
      while True:
        ep_len += 1
        # replay memory stuff
        if first == 1:
          first = 0
          idx     = replay_buffer.store_frame(state)
          q_input = replay_buffer.encode_recent_observation()
        # choose action according to current Q and exploration
        best_action, q_vals = self.network.get_best_action(q_input)
        action              = exp_schedule.get_action(best_action)
        orig_val = self.network.calcState(q_input)

        # store q values
        max_q_values.append(max(q_vals))
        q_values += list(q_vals)

        # perform action in env
        new_state, new_reward, done, info = self.env.step(action)
        idx = replay_buffer.store_frame(state)
        q_input = replay_buffer.encode_recent_observation()
        new_val = self.network.calcState(q_input)
        orig_val = orig_val[0][0]
        new_val = new_val[0][0]
        print (orig_val, new_reward, done, new_val, ep_len)


        if not done: # Non-terminal state.
          target = reward + ( self.FLAGS.gamma * new_val)
        else:
          target = reward + ( self.FLAGS.gamma * new_reward )

        best_val = max((orig_val), target)

        actor_delta = new_val - orig_val

        replay_buffer.store_effect(idx-1, action, new_reward, done, best_val, actor_delta)
        state = new_state

        if done:
          replay_buffer.store_effect(idx, action, 0, done, 0, 0)

        # Count reward
        total_reward += new_reward

        reward=new_reward

        # Stop at end of episode
        if done: break

      old_t = t
      temp_ep_len = ep_len
      while True:
        t += 1
        temp_ep_len -= 1

        if ((t > self.FLAGS.learn_start) and (t % self.FLAGS.learn_every == 0)):
          if replay_buffer.can_sample(self.FLAGS.batch_size) == True: 
            loss_eval, grad_eval = self.network.update_critic_step(t, replay_buffer, lr_schedule.epsilon, self.summary)


        # Update logs if necessary
        if ((t > self.FLAGS.learn_start) and (t % self.FLAGS.log_every == 0)):
          self.update_logs2(t, loss_eval, rewards, exp_schedule.epsilon, grad_eval, lr_schedule.epsilon)

        if temp_ep_len <= 0 or t >= self.FLAGS.train_steps: break


      rewards.append(total_reward)

      # Learn using replay
      while True:

        t += 1
        ep_len -= 1

        # Make train step if necessary
        if ((t > self.FLAGS.learn_start) and (t % self.FLAGS.learn_every == 0)):
          if replay_buffer.can_sample(self.FLAGS.batch_size) == True: 
            loss_eval, grad_eval = self.network.update_actor_step(t, replay_buffer, lr_schedule.epsilon, self.summary)
            exp_schedule.update(t)
            lr_schedule.update(t)

        # Update logs if necessary
        if ((t > self.FLAGS.learn_start) and (t % self.FLAGS.log_every == 0)):
          self.update_averages(rewards, max_q_values, q_values, scores_eval)
          self.update_logs(t, loss_eval, rewards, exp_schedule.epsilon, grad_eval, lr_schedule.epsilon)

        # Update logs if necessary
        elif (t < self.FLAGS.learn_start) and (t % self.FLAGS.log_every == 0):
          sys.stdout.write("\rPopulating the memory {}/{}...".format(t, self.FLAGS.learn_start))
          sys.stdout.flush()

        if ((t > self.FLAGS.learn_start) and (t % self.FLAGS.check_every == 0)):
          # Evaluate current model
          scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)]

          # Save current Model
          self.network.save()

          # Record video of current model
          if self.FLAGS.record:
            self.record()

        if ep_len <= 0 or t >= self.FLAGS.train_steps: break

      # Update episodic rewards

    # End of training
    self.logger.info("- Training done.")
    self.network.save()
    scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)]
    export_plot(scores_eval, "Scores", self.FLAGS.plot_path)
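
The Q-learning loops above and below hand action selection to an exp_schedule whose get_action(best_action) swaps in a random action with probability epsilon, and whose update(t) anneals epsilon over training. A minimal sketch under that assumption; the class name and defaults below are made up for illustration:

import numpy as np


class LinearExplorationSketch(object):
    """Illustrative epsilon-greedy schedule matching the exp_schedule interface."""

    def __init__(self, num_actions, eps_begin=1.0, eps_end=0.1, nsteps=100000):
        self.num_actions = num_actions
        self.epsilon = eps_begin
        self._eps_begin = eps_begin
        self._eps_end = eps_end
        self._nsteps = float(nsteps)

    def update(self, t):
        # linear interpolation from eps_begin to eps_end over nsteps
        frac = min(t / self._nsteps, 1.0)
        self.epsilon = self._eps_begin + frac * (self._eps_end - self._eps_begin)

    def get_action(self, best_action):
        # with probability epsilon explore, otherwise act greedily
        if np.random.random() < self.epsilon:
            return np.random.randint(self.num_actions)
        return best_action
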
    def train(self, model_a, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """
        # initialize replay buffer and variables
        replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = last_record = 0 # time control of nb of steps
        scores_eval = [] # list of scores computed at iteration time
        #scores_eval += [self.evaluate()]

        prog = Progbar(target=self.config.nsteps_train)

        self.env.state.is_render_image = self.config.render_train
        model_a.env.state.is_render_image = model_a.config.render_train
        orientation_map = [np.array([0, 1]), np.array([-1, 0]), np.array([0, -1]), np.array([1, 0])]

        npath = self.config.npath # paths to generate in each environment
        nquery = self.config.nquery # query to generate in each environment
        max_plan_len = self.config.max_plan_len
        ndigits = self.config.ndigits
        nway = self.config.nway

        num_classes = len(self.env.state.xmap.item_class_id)

        # three steps:
        #   1. sample paths from the teacher environment and pass to dnc
        #   2. get immediate reward from whether agent could reach the subgoal
        #   3. sample query paths and ask agent to follow the plan, get the final big reward
        #   -- train one step after each teacher's move
        # interact with environment
        while t < self.config.nsteps_train:
            total_reward = 0
            self.env.reset()
            model_a.env.reset()
            model_a.env.state.copy_state(model_a.env.agent, self.env.state)
            dnc_state = DNC.zero_state(self.config, batch_size=1)
            h_state = (np.zeros([1,self.config.h_size]),np.zeros([1,self.config.h_size]))
            slen = np.ones(1).astype('int32')
            action = 0

            # sample paths
            for i in range(npath):
                state_seq, path_loc, path_ori = self.env.teacher.gen_sample_seq(self.env.state)
                state_seq_encoding = DRQN_planner.encode_state(state_seq, ndigits, nway)
                goal_state_seq = np.reshape(state_seq, [state_seq.shape[0], 4, 3, 3, num_classes+2]).astype('bool')
                #### missing could be everything ####
                goal_state_seq = np.tile(goal_state_seq[:,:,:,:,[num_classes]], [1,1,1,1,num_classes+2])+goal_state_seq
                #### treat missing observation as correct observation ####
                goal_state_seq[:,:,:,:,num_classes] = True
                #### transpose
                goal_state_seq = np.transpose(goal_state_seq, [0,2,3,4,1])
                path_len = state_seq.shape[0]
                mask_seq = np.logical_not(state_seq[:,:3,:,num_classes])
                flag_seq = np.zeros([path_len])
                flag_seq[-1] = 1
                model_a.env.state.teleport(model_a.env.agent, path_loc[0], orientation_map[path_ori[0]])

                for j in range(path_len):
                    # get agate from dnc
                    cur_dnc_in = np.concatenate([state_seq_encoding[j].reshape(-1),mask_seq[j].reshape(-1), np.array([0, flag_seq[j]])], axis=0)
                    agate_dnc_val = self.sess.run(self.agate_dnc, feed_dict={self.s_dnc: cur_dnc_in[None], self.hs_dnc: dnc_state})
                    agate_dnc_val = agate_dnc_val[0,0]
                    # get q value and sample action
                    idx = replay_buffer.store_frame(state_seq[j])
                    q_input = replay_buffer.encode_recent_observation()
                    best_action, q_vals, h_state = self.get_best_action([q_input], h_state, slen, [action], [agate_dnc_val])
                    action = exp_schedule.get_action(best_action)
                    # store q values
                    max_q_values.append(max(q_vals))
                    q_values += list(q_vals)
                    # take action and update dnc
                    cur_dnc_in[-2] = action
                    dnc_state = self.sess.run(self.hs_out_dnc, feed_dict={self.s_dnc: cur_dnc_in[None], self.hs_dnc: dnc_state})
                    # acquire reward
                    reward = 0
                    done = False
                    if action==1:
                        h_state_a = (np.zeros([1,model_a.config.h_size]),np.zeros([1,model_a.config.h_size]))
                        model_a.env.teacher.set_goal(goal_state_seq[j], path_loc[j])
                        reward_a = model_a.navi_goal(h_state_a, goal_state_seq[j])
                        if not model_a.env.teacher.goal_finish:
                            reward += -0.05
                        reward += -0.05
                        model_a.env.state.teleport(model_a.env.agent, path_loc[j], orientation_map[path_ori[j]])
                    # acquire final reward
                    if i==npath-1 and j==path_len-1:
                        done = True
                        reward_list = list()
                        for k in range(nquery):
                            reward_list.append(0)
                            src_inputs, tgt_inputs, src_loc, tgt_loc, goal_obs_onehot_state = self.env.teacher.gen_sample_query(self.env.state)
                            src_inputs = DRQN_planner.encode_state(src_inputs, ndigits, nway)
                            tgt_inputs = DRQN_planner.encode_state(tgt_inputs, ndigits, nway)
                            path_dnc_val, target_ldm_dnc_val = self.sess.run([self.path_dnc, self.target_ldm_dnc], feed_dict={self.hs_dnc: dnc_state, self.src_inputs_dnc: src_inputs[None], 
                                self.tgt_inputs_dnc: tgt_inputs[None], self.max_len_dnc: max_plan_len})
                            path_dnc_val = DRQN_planner.decode_state(np.reshape(path_dnc_val[0], [max_plan_len, 3, 3, -1]), ndigits, nway, num_classes+2)
                            target_ldm_dnc_val = DRQN_planner.decode_state(np.reshape(target_ldm_dnc_val[0], [3, 3, -1]), ndigits, nway, num_classes+2)
                            path_dnc_val_inner = np.argmax(path_dnc_val, axis=3)
                            target_ldm_dnc_val_inner = np.argmax(target_ldm_dnc_val, axis=2)
                            cur_len = max_plan_len
                            for l in range(max_plan_len):
                                if (path_dnc_val_inner[l]==target_ldm_dnc_val_inner).all():
                                    cur_len = l+1
                                    break
                            path_dnc_val = path_dnc_val[:cur_len]
                            path_dnc_val = np.concatenate([path_dnc_val, goal_obs_onehot_state[None]], 0)
                            #### modify goal state ####
                            #### missing could be everything ####
                            path_dnc_val = np.tile(path_dnc_val[:,:,:,[num_classes]], [1,1,1,num_classes+2])+path_dnc_val
                            #### treat missing observation as correct observation ####
                            path_dnc_val[:,:,:,num_classes] = True
                            model_a.env.state.teleport(model_a.env.agent, src_loc, np.array([0,1]))
                            h_state_a = (np.zeros([1,model_a.config.h_size]),np.zeros([1,model_a.config.h_size]))
                            for l in range(path_dnc_val.shape[0]):
                                cur_goal_state = path_dnc_val[l]
                                cur_goal_state = np.expand_dims(cur_goal_state, 3)
                                cur_goal_state = np.concatenate([np.rot90(cur_goal_state, 0), np.rot90(cur_goal_state, 1),
                                    np.rot90(cur_goal_state, 2), np.rot90(cur_goal_state, 3)], 3)
                                model_a.env.teacher.set_goal(cur_goal_state, tgt_loc)
                                reward_list[-1] += model_a.navi_goal(h_state_a, cur_goal_state)
                            if model_a.env.teacher.goal_finish:
                                reward_list[-1] += 10
                        reward += sum(reward_list)/len(reward_list)
                    # store everything into replay buffer
                    replay_buffer.store_effect(idx, action, agate_dnc_val, reward, done)

                    t += 1
                    last_eval += 1
                    last_record += 1

                    # perform a training step
                    loss_eval, grad_eval = self.train_step(t, replay_buffer, lr_schedule.epsilon)

                    # logging stuff
                    if ((t > self.config.learning_start) and (t % self.config.log_freq == 0) and
                       (t % self.config.learning_freq == 0)):
                        self.update_averages(rewards, max_q_values, q_values, scores_eval)
                        exp_schedule.update(t)
                        lr_schedule.update(t)
                        if len(rewards) > 0:
                            prog.update(t + 1, exact=[("Loss", loss_eval), ("Avg R", self.avg_reward), 
                                            ("Max R", np.max(rewards)), ("eps", exp_schedule.epsilon), 
                                            ("Grads", grad_eval), ("Max Q", self.max_q), 
                                            ("lr", lr_schedule.epsilon)])

                    elif (t < self.config.learning_start) and (t % self.config.log_freq == 0):
                        sys.stdout.write("\rPopulating the memory {}/{}...".format(t, 
                                                            self.config.learning_start))
                        sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)          

            if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                self.logger.info("Global step: %d"%(t))
                scores_eval += [self.evaluate(model_a)]

        # last words
        self.logger.info("- Training done.")
        self.save(t)
        scores_eval += [self.evaluate(model_a)]
        export_plot(scores_eval, "Scores", self.config.plot_output)
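
The DQN-style agents in this collection all assume a replay buffer with store_frame / encode_recent_observation / store_effect / sample / can_sample (the DNC planner above additionally stores a gate value in store_effect). A much-simplified sketch of the plain interface, without frame-history stacking or the memory optimisations of the real buffer, illustrative only:

import numpy as np


class ReplayBufferSketch(object):
    """Much-simplified sketch of the replay-buffer interface used above."""

    def __init__(self, size, state_history=1):
        self._size = size
        self._state_history = state_history  # kept for interface parity, unused here
        self._states, self._actions, self._rewards, self._dones = [], [], [], []

    def store_frame(self, state):
        if len(self._states) >= self._size:
            # drop the oldest transition once the buffer is full
            for buf in (self._states, self._actions, self._rewards, self._dones):
                buf.pop(0)
        self._states.append(np.asarray(state))
        self._actions.append(0)
        self._rewards.append(0.0)
        self._dones.append(False)
        return len(self._states) - 1

    def encode_recent_observation(self):
        # the real buffer stacks the last `state_history` frames; here we
        # simply return the most recent one
        return self._states[-1]

    def store_effect(self, idx, action, reward, done):
        self._actions[idx], self._rewards[idx], self._dones[idx] = action, reward, done

    def can_sample(self, batch_size):
        return len(self._states) - 1 >= batch_size

    def sample(self, batch_size):
        idxs = np.random.choice(len(self._states) - 1, batch_size, replace=False)
        s = np.array([self._states[i] for i in idxs])
        a = np.array([self._actions[i] for i in idxs])
        r = np.array([self._rewards[i] for i in idxs])
        sp = np.array([self._states[i + 1] for i in idxs])
        done_mask = np.array([self._dones[i] for i in idxs])
        return s, a, r, sp, done_mask
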
Example #10
    def run_epoch(self, session, train, val, logger):
        num_samples = len(train["context"])
        num_batches = int(
            np.ceil(num_samples * 1.0 / self.config.training.batch_size))

        progress = Progbar(target=num_batches)
        best_f1 = 0
        losses = []
        for i, train_batch in enumerate(
                batches(train,
                        is_train=True,
                        batch_size=self.config.training.batch_size,
                        window_size=self.config.training.window_size)):
            _, loss = self.optimize(session, train_batch)
            losses.append(loss)
            progress.update(i + 1, [("training loss", np.mean(losses))])

            if i % self.config.training.eval_num == 0 or i == num_batches - 1:

                # Randomly get some samples from the dataset
                train_samples = get_random_samples(
                    train, self.config.training.samples_used_for_evaluation)
                val_samples = get_random_samples(
                    val, self.config.training.samples_used_for_evaluation)

                # First evaluate on the training set for not using best span
                f1_train, EM_train = self.evaluate_answer(session,
                                                          train_samples,
                                                          use_best_span=False)

                # Then evaluate on the val set
                f1_val, EM_val = self.evaluate_answer(session,
                                                      val_samples,
                                                      use_best_span=False)

                logging.info("Not using best span")
                logging.info("F1: {}, EM: {}, for {} training samples".format(
                    f1_train, EM_train,
                    self.config.training.samples_used_for_evaluation))
                logging.info(
                    "F1: {}, EM: {}, for {} validation samples".format(
                        f1_val, EM_val,
                        self.config.training.samples_used_for_evaluation))

                # First evaluate on the training set
                f1_train, EM_train = self.evaluate_answer(session,
                                                          train_samples,
                                                          use_best_span=True)

                # Then evaluate on the val set
                f1_val, EM_val = self.evaluate_answer(session,
                                                      val_samples,
                                                      use_best_span=True)

                logging.info("Using best span")
                logging.info("F1: {}, EM: {}, for {} training samples".format(
                    f1_train, EM_train,
                    self.config.training.samples_used_for_evaluation))
                logging.info(
                    "F1: {}, EM: {}, for {} validation samples".format(
                        f1_val, EM_val,
                        self.config.training.samples_used_for_evaluation))

                summaries_dict = {
                    "f1_train": f1_train,
                    "EM_train": EM_train,
                    "f1_val": f1_val,
                    "EM_val": EM_val,
                    "training_loss": np.mean(losses)
                }

                logger.add_scalar_summary(
                    self.cur_epoch_tensor.eval(session) * num_batches + i,
                    summaries_dict)

                if f1_val > best_f1:
                    self.save(session)
                    best_f1 = f1_val
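
get_random_samples(dataset, n) is used above to draw a small subset for the periodic F1/EM checks. Assuming the dataset is a dict of parallel lists keyed by fields such as "context", a plausible stand-in might look like this (illustrative, not the original helper):

import numpy as np


def get_random_samples_sketch(dataset, num_samples):
    """Draw the same random subset of indices from every field of a
    dict-of-parallel-lists dataset. Illustrative stand-in only.
    """
    size = len(dataset["context"])
    idxs = np.random.choice(size, min(num_samples, size), replace=False)
    return {key: [values[i] for i in idxs] for key, values in dataset.items()}
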
    def train(self, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        # initialize replay buffer and variables
        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        last_frames = deque(maxlen=4)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = last_record = 0  # time control of nb of steps
        scores_eval = []  # list of scores computed at iteration time
        scores_eval += []
        embeddings = []
        extractor = PongExtractor()

        prog = Progbar(target=self.config.nsteps_train)

        # interact with environment
        while t < 2000:
            total_reward = 0
            state = self.env.reset()
            last_frame = state
            last_frames.append(state)
            while True:
                t += 1
                last_eval += 1
                last_record += 1
                if self.config.render_train: self.env.render()

                feats = extractor.extract(np.squeeze(state))
                # replay memory stuff
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                # choose action according to current Q and exploration
                best_action, q_values = self.get_best_action(q_input)
                embedding = self.sess.run(self.hidden,
                                          feed_dict={self.s: [q_input]})[0]
                # embedding = self.sess.run(self.q, feed_dict={self.s: [q_input]})[0]
                # print embedding.shape
                embeddings.append(embedding)

                action = best_action

                frame = np.squeeze(state)
                scipy.misc.imsave(
                    'embeddings/breakout/breakout{}.png'.format(t), frame)

                # store q values
                max_q_values.append(max(q_values))
                q_values += list(q_values)
                # perform action in env
                new_state, reward, done, info = self.env.step(action)

                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state
                total_reward += reward
                if done or t >= 2000:
                    print(total_reward, t)
                    break
            # updates to perform at the end of an episode
            rewards.append(total_reward)

        # last words
        print('Saving embeddings')
        np.save('embeddings/breakout/breakout.npy', np.vstack(embeddings))
class DQNAgent(BaseAgent):
    non_terminal_reward = 0

    def __init__(self, env, config, exp_schedule, lr_schedule, is_training_agent, train_from_scratch=False,
                 reward_after_somebody_died=False,
                 logger=None):
        """
        Initialize Q Network and env

        :param env: Game environment
        :param config: config(hyper-parameters) instance
        :param logger: logger instance from logging module
        :param exp_schedule: exploration strategy for epsilon
        :param lr_schedule: schedule for learning rate
        """
        super(DQNAgent, self).__init__()

        # Variables initialized in _build
        self._states = None
        self._actions = None
        self._rewards = None
        self._next_states = None
        self._done_mask = None
        self._learning_rate = None
        self._q_values = None
        self._target_q_values = None
        self._next_q_values = None
        self._update_target_op = None
        self._loss = None
        self._train_op = None
        self._grad_norm = None

        # Variables initialized in init_agent
        self._session = None
        self._avg_reward_placeholder = None
        self._max_reward_placeholder = None
        self._std_reward_placeholder = None
        self._avg_q_placeholder = None
        self._max_q_placeholder = None
        self._std_q_placeholder = None
        # TODO: Commented due to lack of evaluate()
        # self._eval_reward_placeholder = None
        self._merged = None
        self._file_writer = None
        self._saver = None
        self._train_replay_buffer = None
        self._train_rewards = None
        self._train_max_q_values = None
        self._train_q_values = None
        self._avg_reward = None
        self._max_reward = None
        self._std_reward = None
        self._avg_q = None
        self._max_q = None
        self._std_q = None
        # TODO: Commented due to lack of evaluate()
        # self._eval_reward = None
        self._time_step = None
        self._progress_bar = None
        self._has_episode_started = None

        # Variables initialized in act.
        self._last_action = None
        self._last_idx = None
        self._enemy_count = None

        # Directory for training outputs
        if not os.path.exists(config.output_path):
            os.makedirs(config.output_path)

        self._logger = logger
        if logger is None:
            self._logger = get_logger(config.log_path)

        self._config = config
        self._env = env
        self._exp_schedule = exp_schedule
        self._lr_schedule = lr_schedule
        self._is_training_agent = is_training_agent
        self._train_from_scratch = train_from_scratch
        self._reward_after_somebody_died = reward_after_somebody_died
        self._total_reward = 0

        # Build model.
        self._build()

    def init_agent(self, id_, game_type):
        super(DQNAgent, self).init_agent(id_, game_type)

        # Assume the graph has been constructed.
        # Create a tf Session and run initializer of variables.
        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True
        self._session = tf.Session(config=tf_config)

        # Tensorboard
        self._add_summary()

        # Initialize all variables.
        init = tf.global_variables_initializer()
        self._session.run(init)

        # Synchronise q and target_q networks.
        self._session.run(self._update_target_op)

        # for saving networks weights
        self._saver = tf.train.Saver()

        # Initialize replay buffer and variables.
        self._train_replay_buffer = ReplayBuffer(self._config.buffer_size, self._config.state_history)
        self._train_rewards = deque(maxlen=self._config.num_episodes_test)
        self._train_max_q_values = deque(maxlen=1000)
        self._train_q_values = deque(maxlen=1000)
        self._init_averages()

        self._time_step = 0
        self._progress_bar = Progbar(target=self._config.nsteps_train)

        self._has_episode_started = False

        if not self._train_from_scratch:
            self._load()

    def act(self, obs, action_space):
        state = obs['board'][:, :, None]

        if not self._is_training_agent:
            # Act greedily when testing.
            if self._has_episode_started:
                self._train_replay_buffer.store_effect(
                    self._last_idx,
                    self._last_action,
                    0,
                    done=False
                )

            self._last_idx = self._train_replay_buffer.store_frame(state)
            q_input = self._train_replay_buffer.encode_recent_observation()
            action = self._get_action(q_input)
            self._last_action = action

            return action

        if self._has_episode_started:
            reward = DQNAgent.non_terminal_reward

            if self._reward_after_somebody_died:
                if len(self._character.enemies) < self._enemy_count:
                    reward = 1

            self._train(reward, done=False)

        self._enemy_count = len(self._character.enemies)
        self._time_step += 1

        # Replay buffer
        idx = self._train_replay_buffer.store_frame(state)
        q_input = self._train_replay_buffer.encode_recent_observation()

        # Choose action according to current Q and exploration
        best_action, self._train_q_values = self._get_best_action(q_input)
        action = self._exp_schedule.get_action(best_action)

        self._train_max_q_values.append(max(self._train_q_values))
        self._train_q_values += list(self._train_q_values)

        self._last_action = action
        self._last_idx = idx

        if not self._has_episode_started:
            self._has_episode_started = True

        return action

    def episode_end(self, reward):
        """
        Updates to perform at the end of an episode
        """
        # Reset episode.
        self._has_episode_started = False

        if not self._is_training_agent:
            return

        self._train(reward, done=True)
        self._train_rewards.append(self._total_reward)

        # Reset total reward.
        self._total_reward = 0

        # TODO: Commented due to lack of evaluate() and record()
        # if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
        #     # evaluate our policy
        #     last_eval = 0
        #     print("")
        #     scores_eval += [self.evaluate()]
        #
        # if (t > self.config.learning_start) and self.config.record and (last_record > self.config.record_freq):
        #     self.logger.info("Recording...")
        #     last_record = 0
        #     self.record()

    def shutdown(self):
        """
        Save trained results
        """
        if not self._is_training_agent:
            return

        self._logger.info("- Training done.")
        self._save()

        # TODO: Commented due to lack of evaluate()
        # scores_eval += [self.evaluate()]
        # DQNAgent.export_plot(scores_eval, "Scores", self.config.plot_output)

    def _train(self, reward, done):
        # Store the transition.
        self._train_replay_buffer.store_effect(
            self._last_idx,
            self._last_action,
            reward,
            done=done
        )

        # Perform a training step.
        loss_eval, grad_eval = self._train_step(
            self._time_step,
            self._train_replay_buffer,
            self._lr_schedule.epsilon
        )

        # Logging
        if self._time_step > self._config.learning_start \
                and self._time_step % self._config.log_freq == 0 \
                and self._time_step % self._config.learning_freq == 0:

            self._update_averages(self._train_rewards, self._train_max_q_values, self._train_q_values)
            self._exp_schedule.update(self._time_step)
            self._lr_schedule.update(self._time_step)
            if len(self._train_rewards) > 0:
                self._progress_bar.update(
                    self._time_step + 1,
                    exact=[
                        ("Loss", loss_eval), ("Avg R", self._avg_reward),
                        ("Max R", np.max(self._train_rewards)),
                        ("eps", self._exp_schedule.epsilon),
                        ("Grads", grad_eval), ("Max Q", self._max_q),
                        ("lr", self._lr_schedule.epsilon)
                    ]
                )

        elif self._time_step < self._config.learning_start and self._time_step % self._config.log_freq == 0:
            sys.stdout.write("\rPopulating the memory {}/{}...".format(self._time_step, self._config.learning_start))
            sys.stdout.flush()

        # Accumulate reward
        self._total_reward += reward

    def _build(self):
        """
        Build model by adding all necessary variables.
        """
        # Add placeholders.
        self._add_placeholders_op()

        # Compute Q values of state.
        states = self._process_state(self._states)
        self._q_values = self._get_q_values_op(states, scope='q', reuse=False)

        # Compute Q values of next state.
        next_states = self._process_state(self._next_states)
        self._target_q_values = self._get_q_values_op(next_states, scope='target_q', reuse=False)

        # for Double DQN
        self._next_q_values = self._get_q_values_op(next_states, scope='q', reuse=True)

        # Add update operator for target network.
        self._add_update_target_op('q', 'target_q')

        # Add square loss.
        self._add_loss_op(self._q_values, self._target_q_values, self._next_q_values)

        # Add optimizer for the main networks.
        self._add_optimizer_op('q')

    def _add_placeholders_op(self):
        """
        Adds placeholders to the graph

        These placeholders are used as inputs by the rest of the model building and will be fed
        data during training.  Note that when "None" is in a placeholder's shape, it's flexible
        (so we can use different batch sizes without rebuilding the model).
        """
        state_shape = list(self._env.observation_space.shape)

        self._states = tf.placeholder(tf.uint8, (None, 11, 11, self._config.state_history))
        self._actions = tf.placeholder(tf.int32, (None,))
        self._rewards = tf.placeholder(tf.float32, (None,))
        self._next_states = tf.placeholder(tf.uint8, (None, 11, 11, self._config.state_history))
        self._done_mask = tf.placeholder(tf.bool, (None,))
        self._learning_rate = tf.placeholder(tf.float32, ())

    def _process_state(self, state):
        """
        Processing of state

        State placeholders are tf.uint8 for fast transfer to GPU
        Need to cast it to float32 for the rest of the tf graph.

        :param state:
                Node of the tf graph of shape (batch_size, height, width, nchannels) and type tf.uint8;
                values between 0 and 255 are rescaled to between 0 and 1
        """
        state = tf.cast(state, tf.float32)
        state /= self._config.high

        return state

    def _get_q_values_op(self, state, scope, reuse=False):
        """
        Returns Q values for all actions

        :param state: (tf tensor) shape = (batch_size, img height, img width, nchannels)
        :param scope: (string) scope name, that specifies if target network or not
        :param reuse: (bool) reuse of variables in the scope
        :return out: (tf tensor) of shape = (batch_size, num_actions)
        """
        num_actions = self._env.action_space.n
        out = state

        with tf.variable_scope(scope, reuse=reuse) as _:
            x = layers.conv2d(state, 32, 5, stride=2, padding='SAME')
            x = layers.conv2d(x, 64, 4, stride=2, padding='SAME')
            x = layers.conv2d(x, 64, 3, stride=1, padding='SAME')
            x = layers.flatten(x)
            x = layers.fully_connected(x, 512)
            out = layers.fully_connected(x, num_actions, activation_fn=None)

        return out

    def _add_update_target_op(self, q_scope, target_q_scope):
        """
        update_target_op will be called periodically
        to copy Q network weights to target Q network

        Remember that in DQN, we maintain two identical Q networks with
        2 different set of weights. In tensorflow, we distinguish them
        with two different scopes. One for the target network, one for the
        regular network. If you're not familiar with the scope mechanism
        in tensorflow, read the docs
        https://www.tensorflow.org/programmers_guide/variable_scope

        Periodically, we need to update all the weights of the Q network
        and assign them with the values from the regular network. Thus,
        what we need to do is to build a tf op, that, when called, will
        assign all variables in the target network scope with the values of
        the corresponding variables of the regular network scope.

        :param q_scope: (string) name of the scope of variables for q
        :param target_q_scope: (string) name of the scope of variables
                for the target network
        """
        tar_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=target_q_scope)
        q_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=q_scope)
        self._update_target_op = tf.group(*[tf.assign(tar_vars[i], q_vars[i]) for i in range(len(tar_vars))])

    def _add_loss_op(self, q, target_q, next_q):
        """
        Sets the loss of a batch, self.loss is a scalar

        :param q: (tf tensor) shape = (batch_size, num_actions)(Q(s, a))
        :param target_q: (tf tensor) shape = (batch_size, num_actions)(Q_target(s', a'))
        :param next_q: Q(s', a') for Double DQN
        """
        num_actions = self._env.action_space.n
        not_done = 1 - tf.cast(self._done_mask, tf.float32)

        # Double DQN
        # need q_next(Q(s', a')), then find argmax in it
        max_a = tf.argmax(next_q, axis=1)
        q_max = tf.reduce_sum(target_q * tf.one_hot(max_a, num_actions), axis=1)
        q_samp = self._rewards + not_done * self._config.gamma * q_max

        # nature DQN
        q_s = tf.reduce_sum(q * tf.one_hot(self._actions, num_actions), axis=1)
        self._loss = tf.reduce_mean(tf.square(q_samp - q_s))

    def _add_optimizer_op(self, scope):
        """
        Set self.train_op and self.grad_norm
        """
        optimizer = tf.train.AdamOptimizer(self._learning_rate)
        vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
        grads_and_vars = optimizer.compute_gradients(self._loss, vars)

        if self._config.grad_clip:
            clip_grads_and_vars = [(tf.clip_by_norm(gv[0], self._config.clip_val), gv[1]) for gv in grads_and_vars]
        else:
            clip_grads_and_vars = grads_and_vars
        self._train_op = optimizer.apply_gradients(clip_grads_and_vars)
        self._grad_norm = tf.global_norm([gv[0] for gv in clip_grads_and_vars])

    def _add_summary(self):
        """
        Tensorflow stuff
        """
        # extra placeholders to log stuff from python
        self._avg_reward_placeholder = tf.placeholder(tf.float32, shape=(), name="avg_reward")
        self._max_reward_placeholder = tf.placeholder(tf.float32, shape=(), name="max_reward")
        self._std_reward_placeholder = tf.placeholder(tf.float32, shape=(), name="std_reward")

        self._avg_q_placeholder = tf.placeholder(tf.float32, shape=(), name="avg_q")
        self._max_q_placeholder = tf.placeholder(tf.float32, shape=(), name="max_q")
        self._std_q_placeholder = tf.placeholder(tf.float32, shape=(), name="std_q")

        # TODO: Commented due to lack of evaluate()
        # self._eval_reward_placeholder = tf.placeholder(tf.float32, shape=(), name="eval_reward")

        # add placeholders from the graph
        tf.summary.scalar("loss", self._loss)
        tf.summary.scalar("grads norm", self._grad_norm)

        # extra summaries from python -> placeholders
        tf.summary.scalar("Avg Reward", self._avg_reward_placeholder)
        tf.summary.scalar("Max Reward", self._max_reward_placeholder)
        tf.summary.scalar("Std Reward", self._std_reward_placeholder)

        tf.summary.scalar("Avg Q", self._avg_q_placeholder)
        tf.summary.scalar("Max Q", self._max_q_placeholder)
        tf.summary.scalar("Std Q", self._std_q_placeholder)

        # TODO: Commented due to lack of evaluate()
        # tf.summary.scalar("Eval Reward", self._eval_reward_placeholder)

        # logging
        self._merged = tf.summary.merge_all()
        self._file_writer = tf.summary.FileWriter(self._config.output_path,
                                                  self._session.graph)

    def _init_averages(self):
        """
        Define extra attributes for tensorboard.
        """
        self._avg_reward = -21.
        self._max_reward = -21.
        self._std_reward = 0

        self._avg_q = 0
        self._max_q = 0
        self._std_q = 0

        # TODO: Commented due to lack of evaluate()
        # self._eval_reward = -21.

    def _get_action(self, obs):
        """
        Returns action with some epsilon strategy

        :param obs: observation from gym
        """
        if np.random.random() < self._config.soft_epsilon:
            return self._env.action_space.sample()
        else:
            return self._get_best_action(obs)[0]

    def _get_best_action(self, obs):
        """
        Return best action

        :param obs: 4 consecutive observations from gym
        :return action: (int)
        :return action_values: (np array) q values for all actions
        """
        action_values = self._session.run(self._q_values, feed_dict={self._states: [obs]})[0]
        return np.argmax(action_values), action_values

    def _train_step(self, t, replay_buffer, lr):
        """
        Perform training step

        :param t: (int) nth step
        :param replay_buffer: buffer for sampling
        :param lr: (float) learning rate
        """
        loss_eval, grad_eval = 0, 0

        # Perform training step
        if t > self._config.learning_start and t % self._config.learning_freq == 0:
            loss_eval, grad_eval = self._update_step(t, replay_buffer, lr)

        # Occasionally update target network with q network
        if t % self._config.target_update_freq == 0:
            self._update_target_params()

        # Occasionally save the weights
        if t % self._config.saving_freq == 0:
            self._save()

        return loss_eval, grad_eval

    def _update_step(self, t, replay_buffer, lr):
        """
        Performs an update of parameters by sampling from replay_buffer

        :param t: number of iteration (episode and move)
        :param replay_buffer: ReplayBuffer instance .sample() gives batches
        :param lr: (float) learning rate
        :return loss: (Q - Q_target) ^ 2
        """
        s_batch, a_batch, r_batch, sp_batch, done_mask_batch = replay_buffer.sample(self._config.batch_size)

        fd = {
            # Inputs
            self._states: s_batch,
            self._actions: a_batch,
            self._rewards: r_batch,
            self._next_states: sp_batch,
            self._done_mask: done_mask_batch,
            self._learning_rate: lr,

            # Extra info
            self._avg_reward_placeholder: self._avg_reward,
            self._max_reward_placeholder: self._max_reward,
            self._std_reward_placeholder: self._std_reward,
            self._avg_q_placeholder: self._avg_q,
            self._max_q_placeholder: self._max_q,
            self._std_q_placeholder: self._std_q,

            # TODO: Commented due to lack of evaluate()
            # self._eval_reward_placeholder: self.eval_reward,
        }

        loss_eval, grad_norm_eval, summary, _ = self._session.run(
            [self._loss, self._grad_norm, self._merged, self._train_op],
            feed_dict=fd
        )

        # Tensorboard
        self._file_writer.add_summary(summary, t)

        return loss_eval, grad_norm_eval

    def _update_target_params(self):
        """
        Update the parameters of the target Q network with the parameters of the Q network
        """
        self._session.run(self._update_target_op)

    def _load(self):
        """
        Loads session
        """
        ckpt = tf.train.get_checkpoint_state(self._config.model_output)
        self._saver.restore(self._session, ckpt.model_checkpoint_path)

    def _save(self):
        """
        Saves session
        """
        if not os.path.exists(self._config.model_output):
            os.makedirs(self._config.model_output)

        model_path = os.path.join(self._config.model_output, 'model.ckpt')
        self._saver.save(self._session, model_path)

    def _update_averages(self, rewards, max_q_values, q_values, scores_eval=None):
        """
        Update the averages

        :param rewards: deque
        :param max_q_values: deque
        :param q_values: deque
        :param scores_eval: list
        """
        self._avg_reward = np.mean(rewards)
        self._max_reward = np.max(rewards)
        self._std_reward = np.sqrt(np.var(rewards) / len(rewards))

        self._max_q = np.mean(max_q_values)
        self._avg_q = np.mean(q_values)
        self._std_q = np.sqrt(np.var(q_values) / len(q_values))

        # TODO: Commented due to lack of evaluate()
        # if len(scores_eval) > 0:
        #     self.eval_reward = scores_eval[-1]

    @staticmethod
    def export_plot(y, y_label, filename):
        """
        Export a plot of y to filename

        :param y: (list) of float / int to plot
        :param y_label: (string) label for the y-axis
        :param filename: (string) path where the figure is saved
        """
        plt.figure()
        plt.plot(range(len(y)), y)
        plt.xlabel("Epoch")
        plt.ylabel(y_label)
        plt.savefig(filename)
        plt.close()
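The training loops in these examples drive both exploration and the learning rate through schedule objects that expose an epsilon attribute and an update(t) method. A minimal linear-annealing sketch consistent with that interface (the actual schedule classes are not shown in the snippets):

class LinearSchedule(object):
    """Linearly anneal a value from eps_begin to eps_end over nsteps steps."""

    def __init__(self, eps_begin, eps_end, nsteps):
        self.epsilon = eps_begin
        self.eps_begin = eps_begin
        self.eps_end = eps_end
        self.nsteps = nsteps

    def update(self, t):
        # Interpolate until nsteps, then stay at the final value.
        if t < self.nsteps:
            frac = float(t) / self.nsteps
            self.epsilon = self.eps_begin + frac * (self.eps_end - self.eps_begin)
        else:
            self.epsilon = self.eps_end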
Beispiel #13
0
    def train(self, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        # initialize replay buffer and variables
        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        last_frames = deque(maxlen=4)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = last_record = 0  # time control of nb of steps
        scores_eval = []  # list of scores computed at iteration time

        extractor = PongExtractor()

        prog = Progbar(target=self.config.nsteps_train)

        # interact with environment
        while t < self.config.nsteps_train:
            total_reward = 0
            state = self.env.reset()
            last_frame = state
            last_frames.append(state)
            while True:
                t += 1
                last_eval += 1
                last_record += 1
                if self.config.render_train: self.env.render()

                feats = extractor.extract(np.squeeze(state))
                # replay memory stuff
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                # chose action according to current Q and exploration
                best_action, q_vals = self.get_best_action(q_input)  # avoid shadowing the q_values deque
                embedding = self.sess.run(self.hidden,
                                          feed_dict={self.s: [q_input]})[0]
                action = exp_schedule.get_action(best_action)

                # store q values
                max_q_values.append(max(q_vals))
                q_values += list(q_vals)
                if t % 100 == 0:
                    # print state.shape
                    # frame = np.zeros(np.squeeze(state).shape)
                    # for f in last_frames:
                    #     frame = frame + np.squeeze(f)
                    # frame = frame / len(last_frames)
                    frame = np.squeeze(state)
                    last_frame = np.squeeze(last_frame)
                    # pickle needs a binary file handle; use a context manager to close it
                    with open('frames/embedding/atari{}.p'.format(t), 'wb') as pf:
                        pickle.dump(last_frames, pf)
                    for i in range(4):
                        f = np.squeeze(last_frames[i])
                        scipy.misc.imsave(
                            'frames/embedding/atari{}.png'.format(t - 3 + i),
                            f)

                    # scipy.misc.imsave('frames/atari{}.png'.format(t-1),last_frame)
                    # posfile = open('frames/atari{}.txt'.format(t),'w')
                    # posfile.write('Opp Paddle:\t{}\n'.format(oppY))
                    # posfile.write('Player Paddle:\t{}\n'.format(playerY))
                    # posfile.write('ball x:\t{}\n'.format(ballX))
                    # posfile.write('ball y:\t{}\n'.format(ballY))
                    # posfile.close()
                    np.savetxt('frames/embedding/pong{}.txt'.format(t),
                               feats,
                               fmt='%.2f')

                # perform action in env
                new_state, reward, done, info = self.env.step(action)
                # print "state shape:",state.shape()

                # store the transition
                replay_buffer.store_effect(idx, action, reward, done)
                last_frame = state
                state = new_state
                last_frames.append(state)

                # perform a training step
                loss_eval, grad_eval = self.train_step(t, replay_buffer,
                                                       lr_schedule.epsilon)

                # logging stuff
                if ((t > self.config.learning_start)
                        and (t % self.config.log_freq == 0)
                        and (t % self.config.learning_freq == 0)):
                    self.update_averages(rewards, max_q_values, q_values,
                                         scores_eval)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if len(rewards) > 0:
                        prog.update(t + 1,
                                    exact=[("Loss", loss_eval),
                                           ("Avg R", self.avg_reward),
                                           ("Max R", np.max(rewards)),
                                           ("eps", exp_schedule.epsilon),
                                           ("Grads", grad_eval),
                                           ("Max Q", self.max_q),
                                           ("lr", lr_schedule.epsilon)])

                elif (t < self.config.learning_start) and (
                        t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(
                        t, self.config.learning_start))
                    sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

            if (t > self.config.learning_start) and (last_eval >
                                                     self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                scores_eval += [self.evaluate()]

            if (t > self.config.learning_start) and self.config.record and (
                    last_record > self.config.record_freq):
                self.logger.info("Recording...")
                last_record = 0
                self.record()

        # last words
        self.logger.info("- Training done.")
        self.save()
        scores_eval += [self.evaluate()]
        export_plot(scores_eval, "Scores", self.config.plot_output)
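These loops all rely on a ReplayBuffer exposing store_frame, encode_recent_observation, store_effect and sample. A simplified sketch of that interface, ignoring frame stacking (state_history) for brevity; the real buffer used by these examples is more involved:

import numpy as np

class SimpleReplayBuffer(object):
    """Ring buffer exposing the interface used above, without frame stacking."""

    def __init__(self, size):
        self.size = size
        self.next_idx = 0
        self.num_in_buffer = 0
        self.obs = self.action = self.reward = self.done = None

    def store_frame(self, frame):
        if self.obs is None:
            # Lazily allocate storage once the frame shape/dtype is known.
            self.obs = np.empty([self.size] + list(frame.shape), dtype=frame.dtype)
            self.action = np.empty([self.size], dtype=np.int32)
            self.reward = np.empty([self.size], dtype=np.float32)
            self.done = np.empty([self.size], dtype=np.bool_)
        idx = self.next_idx
        self.obs[idx] = frame
        self.next_idx = (self.next_idx + 1) % self.size
        self.num_in_buffer = min(self.size, self.num_in_buffer + 1)
        return idx

    def encode_recent_observation(self):
        # Most recently stored frame (a real buffer would stack state_history frames).
        return self.obs[(self.next_idx - 1) % self.size]

    def store_effect(self, idx, action, reward, done):
        self.action[idx] = action
        self.reward[idx] = reward
        self.done[idx] = done

    def sample(self, batch_size):
        assert self.num_in_buffer > 1, "need at least one complete transition"
        newest = (self.next_idx - 1) % self.size
        # Exclude the newest frame: its successor has not been stored yet.
        candidates = [i for i in range(self.num_in_buffer) if i != newest]
        idxs = np.random.choice(candidates, batch_size)
        next_idxs = (idxs + 1) % self.size
        done_mask = self.done[idxs].astype(np.float32)
        return (self.obs[idxs], self.action[idxs], self.reward[idxs],
                self.obs[next_idxs], done_mask)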
Beispiel #14
0
    def train(self, exp_schedule, lr_schedule, exp_schedule1, env=None):
        """
        Performs training of Q only on agent 0

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
            exp_schedule1: Exploration instance for agent 1
            env: (optional) environment to interact with; defaults to self.env
        """
        if env is None:
            env = self.env

        # initialize replay buffer and variables
        rewards = deque(maxlen=self.config.num_episodes_test)
        rewardsB = deque(maxlen=self.config.num_episodes_test)
        self.model_0.rewards = rewards
        self.model_1.rewards = rewardsB
        # self.init_averages()

        t = last_eval = last_record = 0  # time control of nb of steps
        scores_eval = []  # list of scores computed at iteration time
        scores_eval += [self.evaluate()]

        prog = Progbar(target=self.config.nsteps_train)
        self.model_0.train_init()
        self.model_1.train_init()

        # next_fire_B = False

        # interact with environment
        while t < self.config.nsteps_train:
            total_reward = 0
            state = self.env.reset()
            # need_new_ball = False
            while True:
                t += 1
                last_eval += 1
                last_record += 1
                if self.config.render_train: env.render()

                action_0 = self.model_0.train_step_pre(state, exp_schedule)
                # if exp_schedule.epsilon == 1:
                #     action_1 = exp_schedule.get_action(0,3)  # agent altogether
                # else:
                action_1 = self.model_1.train_step_pre(state[:, ::-1],
                                                       exp_schedule1)
                cur_action = actions.trans(action_0, action_1)

                # perform action in env
                new_state, reward, done, info = env.step(cur_action)

                # print("Reward", reward)

                # Problem
                loss_e0, grad_e0 = self.model_0.train_step_post(
                    reward, done, t, lr_schedule, True)
                self.model_1.train_step_post(-reward, done, t, lr_schedule,
                                             False)
                state = new_state

                # logging stuff
                if ((t > self.config.learning_start)
                        and (t % self.config.log_freq == 0)
                        and (t % self.config.learning_freq == 0)):
                    # self.update_averages(rewards, max_q_values, q_values, scores_eval)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if len(rewards) > 0:
                        prog.update(t + 1,
                                    exact=[
                                        ("Loss", loss_e0),
                                        ("Avg R", np.mean(rewards)),
                                        ("Max R", np.max(rewards)),
                                        ("Min R", np.min(rewards)),
                                        ("eps", exp_schedule.epsilon),
                                        ("Grads", grad_e0),
                                        ("Max Q",
                                         np.mean(self.model_0.max_q_values)),
                                        ("lr", lr_schedule.epsilon)
                                    ])

                elif (t < self.config.learning_start) and (
                        t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(
                        t, self.config.learning_start))
                    sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)
            rewardsB.append(-total_reward)

            if (t > self.config.learning_start) and (last_eval >
                                                     self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                scores_eval += [self.evaluate()]

            if (t > self.config.learning_start) and self.config.record and (
                    last_record > self.config.record_freq):
                self.logger.info("Recording...")
                last_record = 0
                self.record(exp_schedule)
                self.model_0.save(t)  # save the models
                self.model_1.save(t)  # save the models

        # last words
        self.logger.info("- Training done.")
        self.model_0.save()  # save the models
        self.model_1.save()  # save the models
        scores_eval += [self.evaluate()]
        export_plot(scores_eval, "Scores", self.config.plot_output)
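actions.trans above packs the two agents' discrete choices into a single joint action for the environment. A plausible sketch of such a helper (the real actions module is not shown, and the per-agent action count of 6 is an assumption):

def trans(action_0, action_1, n_actions_1=6):
    # Encode the pair (action_0, action_1) as a single joint action index.
    return action_0 * n_actions_1 + action_1

def untrans(joint_action, n_actions_1=6):
    # Inverse mapping back to the per-agent actions.
    return joint_action // n_actions_1, joint_action % n_actions_1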
Beispiel #15
0
    def train(self, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        # initialize replay buffer and variables

        if self.config.use_memory:
            replay_buffer = ReplayBuffer(
                self.config.buffer_size,
                self.config.state_history,
                memory_size=self.config.memory_unit_size)
        else:
            replay_buffer = ReplayBuffer(self.config.buffer_size,
                                         self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = last_record = 0  # time control of nb of steps
        scores_eval = []  # list of scores computed at iteration time
        scores_eval += [self.evaluate()[0]]

        prog = Progbar(target=self.config.nsteps_train)

        evaluation_result_list = []
        oos_evalution_result_list = []

        # interact with environment
        prev_time = time.time()
        while t < self.config.nsteps_train:
            total_reward = 0
            state = self.env.reset()
            while True:
                t += 1
                last_eval += 1
                last_record += 1
                if self.config.render_train: self.env.render()
                # replay memory stuff
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                if self.config.use_memory:
                    prev_memory = replay_buffer.encode_recent_memory()
                    best_action, q_vals, _, next_memory = self.get_best_action_with_memory(
                        q_input, prev_memory)  # q_vals: avoid shadowing the q_values deque
                    next_memory = np.squeeze(next_memory)
                else:
                    best_action, q_vals = self.get_best_action(q_input)
                # chose action according to current Q and exploration
                action = exp_schedule.get_action(best_action)

                # store q values
                max_q_values.append(max(q_vals))
                q_values += list(q_vals)

                # perform action in env
                new_state, reward, done, info = self.env.step(action)

                # store the transition
                replay_buffer.store_effect(idx, action, reward, done)
                if self.config.use_memory:
                    replay_buffer.store_memory(idx, next_memory)
                state = new_state

                # perform a training step
                loss_eval, grad_eval = self.train_step(t, replay_buffer,
                                                       lr_schedule.epsilon)

                # logging stuff
                time_log_freq = 1000
                if t % time_log_freq == 0:
                    with open(self.config.output_path + 'time_log.txt',
                              'a') as of:
                        of.write('{}\n'.format(time.time() - prev_time))
                        of.write('\n')
                    prev_time = time.time()

                if ((t > self.config.learning_start)
                        and (t % self.config.log_freq == 0)
                        and (t % self.config.learning_freq == 0)):
                    self.update_averages(rewards, max_q_values, q_values,
                                         scores_eval)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if len(rewards) > 0:
                        prog.update(t + 1,
                                    exact=[("Loss", loss_eval),
                                           ("Avg_R", self.avg_reward),
                                           ("Max_R", np.max(rewards)),
                                           ("eps", exp_schedule.epsilon),
                                           ("Grads", grad_eval),
                                           ("Max_Q", self.max_q),
                                           ("lr", lr_schedule.epsilon)])

                elif (t < self.config.learning_start) and (
                        t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(
                        t, self.config.learning_start))
                    sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

            if (t > self.config.learning_start) and (last_eval >
                                                     self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                score, complete, length = self.evaluate()
                if complete > 0:
                    evaluation_result_list += [(score, complete, length)]
                if score > self.config.extended_eval_threshold:
                    self.logger.info('Extended in-sample evaluation...')
                    self.evaluate(num_episodes=1000)
                    for _ in range(10):
                        self.logger.info(
                            'Extended out-of-sample evaluation...')
                        oos_result = self.evaluate(
                            EnvMaze(n=self.config.maze_size), num_episodes=100)
                        oos_evalution_result_list += [oos_result]
                scores_eval += [score]

            if (t > self.config.learning_start) and self.config.record and (
                    last_record > self.config.record_freq):
                self.logger.info("Recording...")
                last_record = 0
                self.record()

        # last words
        self.logger.info("- Training done.")
        self.save()
        scores_eval += [self.evaluate()[0]]
        export_plot(scores_eval, "Scores", self.config.plot_output)

        return evaluation_result_list, oos_evalution_result_list
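Every loop in this collection picks actions with exp_schedule.get_action(best_action). One natural implementation is an epsilon-greedy wrapper on top of the linear schedule sketched earlier (a sketch; env is assumed to be a gym-style environment with a discrete action space):

import numpy as np

class LinearExploration(LinearSchedule):
    """Epsilon-greedy action selection with a linearly annealed epsilon."""

    def __init__(self, env, eps_begin, eps_end, nsteps):
        self.env = env
        super(LinearExploration, self).__init__(eps_begin, eps_end, nsteps)

    def get_action(self, best_action):
        # With probability epsilon explore uniformly, otherwise exploit.
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return best_action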
    def train(self, model_a, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            model_a: navigation agent that is sent to the goals issued by this model
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """
        # initialize replay buffer and variables
        replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = last_record = 0 # time control of nb of steps
        scores_eval = [] # list of scores computed at iteration time
        #scores_eval += [self.evaluate()]

        prog = Progbar(target=self.config.nsteps_train)

        self.env.state.is_render_image = self.config.render_train
        model_a.env.state.is_render_image = model_a.config.render_train
        orientation_map = [np.array([0, 1]), np.array([-1, 0]), np.array([0, -1]), np.array([1, 0])]

        # interact with environment
        while t < self.config.nsteps_train:
            total_reward = 0
            flag = True
            while flag:
                state = self.env.reset() # h x w x c
                agent_location = self.env.state.agent_location
                if self.env.teacher.dist_map[agent_location[1], agent_location[0]] != np.inf:
                    flag = False
            model_a.env.reset()
            model_a.env.state.copy_state(model_a.env.agent, self.env.state)
            h_state_fw = (np.zeros([1, self.config.h_size]), np.zeros([1, self.config.h_size]))
            h_state_bw = (np.zeros([1, self.config.h_size]), np.zeros([1, self.config.h_size]))
            state_batch = list()
            goal_state_batch = list()
            goal_obs_image_batch = list()
            path_loc = list()
            path_ori = list()
            done_batch = list()
            width, height = self.env.state.xmap.dim['width'], self.env.state.xmap.dim['height']
            side_radius = min(self.config.visible_radius_unit_side, max(width - 1, height - 1))
            block_size = self.env.state.image_block_size
            for i in range(200):
                #### teacher rotate ####
                agent_location = self.env.state.agent_location
                agent_orientation = self.env.state.agent_orientation
                goal_location = agent_location+agent_orientation
                gt_action = self.env.teacher.action_map[agent_location[1], agent_location[0]]
                if np.dot(agent_orientation, orientation_map[gt_action]) != 1:
                    tmp = np.cross(agent_orientation, orientation_map[gt_action])
                    if tmp == 1:
                        state, reward_i, done = self.env.step(3)
                    else:
                        state, reward_i, done = self.env.step(2)
                    continue
                path_loc.append(copy.deepcopy(goal_location))
                path_ori.append(copy.deepcopy(agent_orientation))
                raw_goal_state, goal_state = self.convert_state_to_goal_state(state)
                state_batch.append(raw_goal_state[None][None])
                goal_state_batch.append(goal_state)
                if self.config.render_train:
                    goal_obs_image_batch.append(self.env.state.image[:3*block_size, (side_radius-1)*block_size:(side_radius+2)*block_size, :])
                state, reward_i, done = self.env.step(0)
                done_batch.append(done)
                if done:
                    break

            slen = np.array([len(state_batch)]).astype('int32')
            state_batch = np.concatenate(state_batch, axis=1)
            best_action_batch, q_values_batch, h_state_fw, h_state_bw = self.get_best_action_batch(state_batch, h_state_fw, h_state_bw, slen)
            action_batch = exp_schedule.get_action_batch(best_action_batch)
            for i in range(q_values_batch.shape[0]):
                max_q_values.append(max(q_values_batch[i]))
                q_values += list(q_values_batch[i])

            reward_batch = list()
            for i, action in enumerate(action_batch):
                if action==0:
                    reward_batch.append(0)
                else:
                    if self.config.render_train:
                        model_a.env.teacher.goal_obs_image = goal_obs_image_batch[i]
                    h_state_a = (np.zeros([1,model_a.config.h_size]),np.zeros([1,model_a.config.h_size]))
                    model_a.env.teacher.set_goal(goal_state_batch[i], path_loc[i])
                    reward_a = model_a.navi_goal(h_state_a, goal_state_batch[i])
                    if model_a.env.teacher.goal_finish:
                        reward_batch.append(-0.05)
                    else:
                        reward_batch.append(-0.1)
                    #model_a.env.state.teleport(model_a.env.agent, path_loc[i], path_ori[i])
            if action_batch[-1] == 1 and model_a.env.teacher.goal_finish:
                reward_batch[-1] += 1
            else:
                if self.config.render_train:
                    model_a.env.teacher.goal_obs_image = goal_obs_image_batch[-1]
                h_state_a = (np.zeros([1,model_a.config.h_size]),np.zeros([1,model_a.config.h_size]))
                model_a.env.teacher.set_goal(goal_state_batch[-1], path_loc[-1])
                reward_a = model_a.navi_goal(h_state_a, goal_state_batch[-1])
                if model_a.env.teacher.goal_finish:
                    reward_batch[-1] += 1

            for i in range(action_batch.shape[0]):
                idx = replay_buffer.store_frame(state_batch[0][i])
                replay_buffer.store_effect(idx, action_batch[i], reward_batch[i], done_batch[i])

            for i in range(action_batch.shape[0]):
                t += 1
                last_eval += 1
                last_record += 1
                # perform a training step
                loss_eval, grad_eval = self.train_step(t, replay_buffer, lr_schedule.epsilon)

                # logging stuff
                if ((t > self.config.learning_start) and (t % self.config.log_freq == 0) and
                   (t % self.config.learning_freq == 0)):
                    self.update_averages(rewards, max_q_values, q_values, scores_eval)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if len(rewards) > 0:
                        prog.update(t + 1, exact=[("Loss", loss_eval), ("Avg R", self.avg_reward), 
                                        ("Max R", np.max(rewards)), ("eps", exp_schedule.epsilon), 
                                        ("Grads", grad_eval), ("Max Q", self.max_q), 
                                        ("lr", lr_schedule.epsilon)])

                elif (t < self.config.learning_start) and (t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(t, 
                                                        self.config.learning_start))
                    sys.stdout.flush()

            # count reward
            total_reward = sum(reward_batch)
            # updates to perform at the end of an episode
            rewards.append(total_reward)          

            if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                self.logger.info("Global step: %d"%(t))
                scores_eval += [self.evaluate(model_a)]

        # last words
        self.logger.info("- Training done.")
        self.save(t)
        scores_eval += [self.evaluate(model_a)]
        export_plot(scores_eval, "Scores", self.config.plot_output)
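The recurrent agents in these examples repeatedly build zero hidden states as a pair of [1, h_size] arrays, matching an LSTM (cell, hidden) state. A small helper capturing that pattern (a sketch; the h_size value below is only an example):

import numpy as np

def zero_lstm_state(h_size, batch_size=1):
    # Pair of zero arrays in the shape of an LSTM (cell, hidden) state tuple.
    return (np.zeros([batch_size, h_size], dtype=np.float32),
            np.zeros([batch_size, h_size], dtype=np.float32))

h_state_fw = zero_lstm_state(h_size=128)  # example value for h_size
h_state_bw = zero_lstm_state(h_size=128)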
Beispiel #17
0
    def train(self, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        # initialize replay buffer and variables
        if not self.config.batch:
            replay_buffer = ReplayBuffer(
                self.config.buffer_size, self.config.state_history
            )
        else:
            self.logger.info(
                'Loading replay buffer from {}'.format(self.config.buffer_path)
            )
            replay_buffer = ReplayBuffer.load(self.config.buffer_path)
            self.logger.info(
                'Loaded buffer with {} observations and {} in buffer'.format(
                    len(replay_buffer.obs), replay_buffer.num_in_buffer
                )
            )

        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        episode_lengths = deque(maxlen=1000)
        max_episode_length = 0
        self.init_averages()

        t = last_eval = last_record = 0  # time control of nb of steps
        scores_eval = []  # list of scores computed at iteration time
        scores_eval += [self.evaluate()]

        prog = Progbar(target=self.config.nsteps_train)

        # interact with environment
        while t < self.config.nsteps_train:
            total_reward = 0

            if not self.config.batch:
                state = self.env.reset()

            episode_step = 0
            avg_episode_length = (
                np.nan if len(episode_lengths) == 0 else np.mean(episode_lengths)
            )

            while True:
                t += 1
                episode_step += 1
                last_eval += 1
                last_record += 1
                if self.config.render_train:
                    self.env.render()

                if not self.config.batch:
                    get_action = functools.partial(
                        exp_schedule.get_action,
                        episode_num=len(episode_lengths),
                        episode_step=episode_step,
                        avg_episode_length=avg_episode_length
                    )
                    state, reward, done, _q_values = self.interact(
                        replay_buffer, state, get_action
                    )
                else:
                    reward = 0
                    done = True
                    _q_values = [0]

                # store q values
                max_q_values.append(max(_q_values))
                q_values.extend(list(_q_values))

                # perform a training step
                loss_eval, grad_eval = self.train_step(
                    t, replay_buffer, lr_schedule.epsilon
                )

                # logging stuff
                learning = (t > self.config.learning_start)
                learning_and_logging = (
                    learning and
                    (t % self.config.log_freq == 0) and
                    (t % self.config.learning_freq == 0)
                )
                if learning_and_logging:
                    self.update_averages(
                        rewards, max_q_values, q_values,
                        scores_eval, episode_lengths, max_episode_length
                    )
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if len(rewards) > 0:
                        if self.config.batch:
                            exact = [
                                ("Loss", loss_eval),
                                ("Grads", grad_eval),
                                ("lr", lr_schedule.epsilon),
                            ]
                        else:
                            exact = [
                                ("Loss", loss_eval),
                                ("Avg_R", self.avg_reward),
                                ("Max_R", np.max(rewards)),
                                ("eps", exp_schedule.epsilon),
                                ("Grads", grad_eval),
                                ("Max_Q", self.max_q),
                                ("lr", lr_schedule.epsilon),
                                ("avg_ep_len", avg_episode_length)
                            ]

                        prog.update(t + 1, exact=exact)

                elif not learning and (t % self.config.log_freq == 0):
                    sys.stdout.write(
                        "\rPopulating the memory {}/{}...".format(
                            t, self.config.learning_start
                        )
                    )
                    sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    episode_lengths.append(episode_step)
                    if episode_step > max_episode_length:
                        max_episode_length = episode_step

                        # retrain the clusters every time the max episode
                        # length changes
                        if hasattr(self, 'reset_counts'):
                            self.reset_counts(
                                n_clusters=max_episode_length,
                                states=replay_buffer.get_encoded_states(),
                                actions=replay_buffer.get_actions()
                            )

                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

            should_evaluate = (
                (t > self.config.learning_start) and
                (last_eval > self.config.eval_freq)
            )
            if should_evaluate:
                # evaluate our policy
                last_eval = 0
                print("")
                scores_eval.append(self.evaluate())

            should_record = (
                (t > self.config.learning_start) and
                self.config.record and
                (last_record > self.config.record_freq)
            )
            if should_record:
                self.logger.info("Recording...")
                last_record = 0
                self.record()

        # last words
        self.logger.info("- Training done.")
        self.save()
        scores_eval.append(self.evaluate())
        export_plot(scores_eval, "Scores", self.config.plot_output)

        if not self.config.batch:
            # save replay buffer
            self.logger.info(
                'Saving buffer to {}'.format(self.config.buffer_path)
            )
            replay_buffer.save(self.config.buffer_path)
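ReplayBuffer.load and replay_buffer.save used in this example persist the buffer between runs for batch-mode training. A simple pickle-based sketch of methods one might add to a buffer class for this (an assumption, not the original implementation; pickling a large buffer is memory-heavy):

import pickle

class PersistentBufferMixin(object):
    """Adds pickle-based save/load to a replay buffer class."""

    def save(self, path):
        with open(path, 'wb') as f:
            pickle.dump(self, f)

    @staticmethod
    def load(path):
        with open(path, 'rb') as f:
            return pickle.load(f)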
    def train(self, beta_schedule, lr_schedule, cr_schedule):
        """
        Performs training of Q

        Args:
            beta_schedule: Schedule whose epsilon controls how often the ground-truth
                action is mixed into get_action
            lr_schedule: Schedule for learning rate
            cr_schedule: curriculum schedule; cr_schedule[curri_idx] yields the lesson
                parameters and n_curriculum gives the number of lessons
        """

        self.init_averages()

        t = last_eval = curri_idx = 0  # time control of nb of steps
        scores_eval = []  # list of scores computed at iteration time
        curriculum_batch_size = np.ceil(
            self.config.nsteps_train /
            cr_schedule.n_curriculum).astype('int32')

        prog = Progbar(target=self.config.nsteps_train)

        # interact with environment
        while t < self.config.nsteps_train:
            t += 1
            last_eval += 1
            config = self.config
            config.n_node, config.k_ring, config.p_rewiring, config.path_len_limit = cr_schedule[
                curri_idx]
            self.env.reset(config)  # h x w x c
            h_state = DNC.zero_state(config, batch_size=1)
            encoding, predflag, target_action = self.env.prepare_seq()
            slen = np.array(encoding.shape[0]).astype('int32')
            # describe graph, query and planning
            h_state = self.sess.run(self.hs_out,
                                    feed_dict={
                                        self.s: encoding[None],
                                        self.hs: h_state,
                                        self.slen: slen
                                    })
            past_state = -1
            past_action_onehot = -1
            encoding_a = np.zeros([config.max_step_len, encoding.shape[1]])
            predflag_a = np.zeros(config.max_step_len)
            target_action_a = np.zeros(
                [config.max_step_len, target_action.shape[1]])
            for i in range(config.max_step_len):
                current_encoding = GraphWorld.convert_triplets_to_encoding(
                    np.array([[
                        past_state, self.env.current_state, past_action_onehot
                    ]]).astype('int32'), config.ndigits, config.nway)
                current_encoding = np.concatenate(
                    [current_encoding, np.array([[1, 0]])], axis=1)
                pred_action, h_state = self.sess.run(
                    [self.q, self.hs_out],
                    feed_dict={
                        self.s: current_encoding[None],
                        self.hs: h_state,
                        self.slen: np.ones(1).astype('int32')
                    })
                gt_action = self.env.get_gt_action()
                action = self.get_action(pred_action[0], gt_action,
                                         beta_schedule.epsilon)
                past_state = self.env.current_state
                _, done, past_action_onehot = self.env.step(action)
                encoding_a[i, :] = current_encoding[0]
                predflag_a[i] = 1
                target_action_a[i] = gt_action
                slen += 1
                if done:
                    break
            batch_data = (np.concatenate([encoding, encoding_a], axis=0)[None],
                          np.concatenate([predflag, predflag_a], axis=0),
                          np.concatenate([target_action, target_action_a],
                                         axis=0), slen)

            # perform a training step
            loss_eval, grad_eval = self.train_step(t, lr_schedule.epsilon,
                                                   batch_data)

            # logging stuff
            if ((t % config.log_freq == 0)
                    and (t % config.learning_freq == 0)):
                self.update_averages(scores_eval)
                beta_schedule.update(t)
                lr_schedule.update(t)
                prog.update(t + 1,
                            exact=[("Loss", loss_eval), ("Grads", grad_eval),
                                   ("lr", lr_schedule.epsilon)])

            if t >= config.nsteps_train:
                break

            if last_eval >= config.eval_freq:
                # evaluate our policy
                last_eval = 0
                print("")
                self.logger.info("Global step: %d" % (t))
                scores_eval += [self.evaluate(cr_schedule, curri_idx)]
                if scores_eval[-1] > 0.8:
                    curri_idx += 1
                    msg = "Upgrade to lesson {:d}".format(int(curri_idx))
                    self.logger.info(msg)
                    self.logger.info(
                        "----------Start Computing Final Score----------")
                    scores_eval += [self.evaluate(cr_schedule)]
                    self.logger.info(
                        "----------Finish Computing Final Score----------")

        # last words
        self.logger.info("- Training done.")
        self.save(t)
        scores_eval += [self.evaluate(cr_schedule)]
        export_plot(scores_eval, "Scores", self.config.plot_output)
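cr_schedule above is indexed by the current lesson and exposes n_curriculum. A minimal sketch of such a curriculum schedule (the lesson tuples below are illustrative values, not taken from the example):

class CurriculumSchedule(object):
    """Ordered list of lesson configs: (n_node, k_ring, p_rewiring, path_len_limit)."""

    def __init__(self, lessons):
        self.lessons = list(lessons)
        self.n_curriculum = len(self.lessons)

    def __getitem__(self, idx):
        # Clamp so training keeps using the last lesson once the curriculum is exhausted.
        return self.lessons[min(idx, self.n_curriculum - 1)]

# Example: graphs grow and planning paths get longer lesson by lesson (values assumed).
cr_schedule = CurriculumSchedule([(6, 3, 0.1, 2), (10, 4, 0.2, 3), (15, 4, 0.3, 4)])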
Beispiel #19
0
    def evaluate(self, env=None, num_episodes=None):
        """
        Evaluation with same procedure as the training
        """
        # log our activity only if default call
        if num_episodes is None:
            self.logger.info("Evaluating...")
            prog_bar = True
        else:
            prog_bar = False

        # arguments defaults
        if num_episodes is None:
            num_episodes = self.config.num_episodes_test

        if env is None:
            env = self.env

        # replay memory to play
        replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
        rewards = []
        
        if prog_bar:
            prog = Progbar(target=num_episodes)

        for i in range(num_episodes):
            total_reward = 0
            state = env.reset()
            while True:
                if self.config.render_test: env.render()

                # store last state in buffer
                idx     = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                action = self.get_action(q_input)

                # perform action in env
                new_state, reward, done, info = env.step(action)

                # store in replay memory
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state

                # count reward
                total_reward += reward
                if done:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)
            
            if prog_bar:
                prog.update(i + 1, exact=[("Reward", total_reward)])

        avg_reward = np.mean(rewards)
        sigma_reward = np.sqrt(np.var(rewards) / len(rewards))

        if num_episodes > 1:
            msg = "Average reward: {:04.2f} +/- {:04.2f}".format(avg_reward, sigma_reward)
            self.logger.info(msg)

        return avg_reward
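The sigma_reward reported by evaluate is the standard error of the mean reward rather than the raw standard deviation; the two formulations below are equivalent (toy values for illustration only):

import numpy as np

rewards = [1.0, -1.0, 2.0, 0.5]  # toy episode returns
sigma_reward = np.sqrt(np.var(rewards) / len(rewards))
assert np.isclose(sigma_reward, np.std(rewards) / np.sqrt(len(rewards)))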
Beispiel #20
0
    def train(self, exp_schedule, lr_schedule):
        # Initialize replay buffer and variables
        replay_buffer = ReplayBuffer(self.FLAGS.buffer_size,
                                     self.FLAGS.state_hist)
        rewards = deque(maxlen=self.FLAGS.num_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = 0  # time control of nb of steps
        loss_eval = grad_eval = 0
        scores_eval = []  # list of scores computed at iteration time
        scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)]

        self.prog = Progbar(target=self.FLAGS.train_steps)

        # Train for # of train steps
        continual_crash = 0  # count consecutive env crashes; reset after a clean iteration
        while t < self.FLAGS.train_steps:
            try:
                total_reward = 0
                ep_len = 0
                state = self.env.reset()

                # Run for 1 episode and update the buffer
                while True:
                    ep_len += 1

                    # replay memory stuff
                    idx = replay_buffer.store_frame(state)
                    q_input = replay_buffer.encode_recent_observation()

                    # chose action according to current Q and exploration
                    best_action, q_vals = self.network.get_best_action(
                        q_input)  # q_vals: avoid shadowing the q_values deque
                    action = exp_schedule.get_action(best_action)

                    # store q values
                    max_q_values.append(max(q_vals))
                    q_values += list(q_vals)

                    # perform action in env
                    new_state, reward, done, info = self.env.step(action)

                    # store the transition
                    replay_buffer.store_effect(idx, action, reward, done)
                    state = new_state

                    # Count reward
                    total_reward += reward

                    # Stop at end of episode
                    if done: break

                #Store episodic rewards
                if ep_len > 1: rewards.append(total_reward)

                # Learn using replay
                while True:
                    t += 1
                    ep_len -= 1

                    # Make train step if necessary
                    if ((t > self.FLAGS.learn_start)
                            and (t % self.FLAGS.learn_every == 0)):
                        loss_eval, grad_eval = self.network.update_step(
                            t, replay_buffer, lr_schedule.epsilon,
                            self.summary)
                        exp_schedule.update(t)
                        lr_schedule.update(t)

                    if (t % self.FLAGS.target_every == 0):
                        self.network.update_target_params()

                    # Update logs if necessary
                    if ((t > self.FLAGS.learn_start)
                            and (t % self.FLAGS.log_every == 0)
                            and (len(rewards) > 0)):
                        self.update_averages(rewards, max_q_values, q_values,
                                             scores_eval)
                        self.update_logs(t, loss_eval, rewards,
                                         exp_schedule.epsilon, grad_eval,
                                         lr_schedule.epsilon)

                    # Update logs if necessary
                    elif (t < self.FLAGS.learn_start) and (
                            t % self.FLAGS.log_every == 0):
                        sys.stdout.write(
                            "\rPopulating the memory {}/{}...".format(
                                t, self.FLAGS.learn_start))
                        sys.stdout.flush()

                    if ((t > self.FLAGS.learn_start)
                            and (t % self.FLAGS.check_every == 0)):
                        # Evaluate current model
                        scores_eval += [
                            self.evaluate(self.env, self.FLAGS.num_test)
                        ]

                        # Save current Model
                        self.network.save()

                        # Record video of current model
                        if self.FLAGS.record:
                            self.record()

                    if ep_len <= 0 or t >= self.FLAGS.train_steps: break
                continual_crash = 0

            except Exception as e:
                continual_crash += 1
                self.logger.info(e)
                if continual_crash >= 10:
                    self.logger.info("Crashed 10 times -- stopping u suck")
                    raise e
                else:
                    t -= 1
                    self.logger.info("Env crash, making new env")
                    time.sleep(60)
                    self.env = create_slither_env(self.FLAGS.state_type)
                    self.env = Unvectorize(self.env)
                    self.env.configure(fps=self.FLAGS.fps,
                                       remotes=self.FLAGS.remotes,
                                       start_timeout=15 * 60,
                                       vnc_driver='go',
                                       vnc_kwargs={
                                           'encoding': 'tight',
                                           'compress_level': 0,
                                           'fine_quality_level': 50
                                       })
                    time.sleep(60)

        # End of training
        self.logger.info("- Training done.")
        self.network.save()
        scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)]
        export_plot(scores_eval, "Scores", self.FLAGS.plot_path)
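The crash handling in the loop above (count consecutive failures, rebuild the environment, give up after ten) can be factored into a small retry helper. A generic sketch, with the helper name, cooldown and limit chosen here rather than taken from the example:

import time

def run_with_env_retries(run_episode, make_env, max_failures=10, cooldown=60):
    # Retry run_episode(env), rebuilding the environment after each crash and
    # giving up once max_failures consecutive crashes have been seen.
    failures = 0
    env = make_env()
    while True:
        try:
            return run_episode(env)
        except Exception:
            failures += 1
            if failures >= max_failures:
                raise
            time.sleep(cooldown)
            env = make_env()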
    def train(self, exp_schedule, lr_schedule, choose_teacher_strategy=None):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
            choose_teacher_strategy: (optional) strategy object that is fed rewards via
                store_reward, updated via update_schedule and saved during training
        """

        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = last_record = 0  # time control of nb of steps
        scores_eval = []  # list of scores computed at iteration time
        scores_eval += [self.evaluate()]

        prog = Progbar(target=self.config.nsteps_train)

        # interact with environment
        allsteps = []
        while t < self.config.nsteps_train:
            total_reward = 0
            state = self.env.reset()
            action = 0  # default so the out-of-bounds fast-forward below has a defined action
            while True:
                if self.config.state_subspace is not None:
                    out_of_bounds = False
                    if self.config.state_subspace in [
                            'ball_top_half', 'ball_bottom_half'
                    ]:
                        image = self.env.unwrapped._get_obs()
                        ball_position = ball_half_screen_position(image)
                        # check if ball is in top half but we're restricted to bottom half
                        if ball_position == 1 and self.config.state_subspace == 'ball_bottom_half':
                            out_of_bounds = True
                        # check if ball is in bottom half but we're restricted to top half
                        elif ball_position == 0 and self.config.state_subspace == 'ball_top_half':
                            out_of_bounds = True
                    else:
                        raise NotImplementedError
                    if out_of_bounds:  # current state is outside of this agent's state subspace
                        # perform action in env
                        state, reward, done, info = self.env.step(action)

                t += 1
                last_eval += 1
                last_record += 1
                if self.config.render_train: self.env.render()
                # replay memory stuff
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()
                # self.q_inputs.append(q_input)

                # chose action according to current Q and exploration
                best_action, q_vals = self.get_best_action(q_input)  # avoid shadowing the q_values deque
                action = exp_schedule.get_action(best_action)

                # store q values
                max_q_values.append(max(q_vals))
                q_values += list(q_vals)

                # perform action in env
                new_state, reward, done, info = self.env.step(action)

                # store the transition
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state

                if choose_teacher_strategy is not None:
                    # store the reward with the teacher choice strategy
                    choose_teacher_strategy.store_reward(reward, q_input)

                # perform a training step
                loss_eval, grad_eval = self.train_step(
                    t, replay_buffer, lr_schedule.epsilon,
                    choose_teacher_strategy)

                # logging stuff
                if ((t > self.config.learning_start)
                        and (t % self.config.log_freq == 0)
                        and (t % self.config.learning_freq == 0)):
                    self.update_averages(rewards, max_q_values, q_values,
                                         scores_eval)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if choose_teacher_strategy is not None:
                        choose_teacher_strategy.update_schedule(t)
                    if len(rewards) > 0:
                        exact = [("Loss", loss_eval),
                                 ("Avg R", self.avg_reward),
                                 ("Max R", np.max(rewards)),
                                 ("eps", exp_schedule.epsilon),
                                 ("Grads", grad_eval), ("Max Q", self.max_q),
                                 ("lr", lr_schedule.epsilon)]
                        if choose_teacher_strategy is not None and hasattr(
                                choose_teacher_strategy, 'eps_schedule'):
                            exact.append(
                                ("Choose teacher eps",
                                 choose_teacher_strategy.eps_schedule.epsilon))
                        prog.update(t + 1, exact=exact)

                elif ((t > self.config.learning_start)
                      and (t % self.config.save_teacher_choice_freq == 0)
                      and (choose_teacher_strategy is not None)):
                    choose_teacher_strategy.save(
                        self.config.teacher_choice_output_path)

                elif (t < self.config.learning_start) and (
                        t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(
                        t, self.config.learning_start))
                    sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

            if (t > self.config.learning_start) and (last_eval >
                                                     self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                scores_eval += [self.evaluate()]

            if (t > self.config.learning_start) and self.config.record and (
                    last_record > self.config.record_freq):
                self.logger.info("Recording...")
                last_record = 0
                self.record()

        # last words
        self.logger.info("- Training done.")
        self.save()
        scores_eval += [self.evaluate()]
        export_plot(scores_eval, "Scores", self.config.plot_output)
        if choose_teacher_strategy is not None:
            choose_teacher_strategy.save(
                self.config.teacher_choice_output_path)
Beispiel #22
0
    def train(self, model_a, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            model_a: navigation agent that is sent to the goals issued by this model
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """
        # initialize replay buffer and variables
        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = last_record = 0  # time control of nb of steps
        scores_eval = []  # list of scores computed at iteration time
        #scores_eval += [self.evaluate()]

        prog = Progbar(target=self.config.nsteps_train)

        self.env.state.is_render_image = self.config.render_train
        orientation_map = [
            np.array([0, 1]),
            np.array([-1, 0]),
            np.array([0, -1]),
            np.array([1, 0])
        ]

        # interact with environment
        while t < self.config.nsteps_train:
            total_reward = 0
            flag = True
            while flag:
                state = self.env.reset()  # h x w x c
                agent_location = self.env.state.agent_location
                if self.env.teacher.dist_map[agent_location[1],
                                             agent_location[0]] != np.inf:
                    flag = False
            model_a.env.reset()
            model_a.env.state.copy_state(model_a.env.agent, self.env.state)
            h_state = (np.zeros([1, self.config.h_size]),
                       np.zeros([1, self.config.h_size]))
            h_state_a = (np.zeros([1, model_a.config.h_size]),
                         np.zeros([1, model_a.config.h_size]))
            slen = np.ones(1).astype('int32')
            action = 0
            for i in range(200):
                t += 1
                last_eval += 1
                last_record += 1

                raw_goal_state, goal_state = self.convert_state_to_goal_state(
                    state)
                #### for replay_buffer
                # replay memory stuff
                idx = replay_buffer.store_frame(raw_goal_state)
                q_input = replay_buffer.encode_recent_observation()

                # chose action according to current Q and exploration
                best_action, q_vals, h_state = self.get_best_action(
                    [q_input], h_state, slen, [action])  # q_vals: avoid shadowing the deque
                action = exp_schedule.get_action(best_action)

                # store q values
                max_q_values.append(max(q_vals))
                q_values += list(q_vals)

                reward = 0
                #### perform action in env ####
                #### update goal obs image ####
                if action == 1:
                    if self.config.render_train:
                        self.env.teacher.update_goal_obs_image(self.env.state)
                if self.config.render_train: self.env.render()
                #### teacher move ####
                agent_location = self.env.state.agent_location
                agent_orientation = self.env.state.agent_orientation
                goal_location = agent_location + agent_orientation
                gt_action = self.env.teacher.action_map[agent_location[1],
                                                        agent_location[0]]
                if np.dot(agent_orientation, orientation_map[gt_action]) == 1:
                    new_state, reward_i, done = self.env.step(0)
                else:
                    tmp = np.cross(agent_orientation,
                                   orientation_map[gt_action])
                    if tmp == 1:
                        new_state, reward_i, done = self.env.step(3)
                    else:
                        new_state, reward_i, done = self.env.step(2)
                #### issue command ####
                if action == 1:
                    model_a.env.teacher.set_goal(goal_state, goal_location)
                    reward_a = model_a.navi_goal(h_state_a, goal_state)
                    if model_a.env.teacher.goal_finish:
                        reward += reward_i
                    reward += reward_a
                    reward += -1
                    self.env.state.teleport(
                        self.env.agent, model_a.env.state.agent_location,
                        model_a.env.state.agent_orientation)
                    new_state = self.env.state.onehot_state

                # store the transition
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state

                # perform a training step
                loss_eval, grad_eval = self.train_step(t, replay_buffer,
                                                       lr_schedule.epsilon)

                # logging stuff
                if ((t > self.config.learning_start)
                        and (t % self.config.log_freq == 0)
                        and (t % self.config.learning_freq == 0)):
                    self.update_averages(rewards, max_q_values, q_values,
                                         scores_eval)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if len(rewards) > 0:
                        prog.update(t + 1,
                                    exact=[("Loss", loss_eval),
                                           ("Avg R", self.avg_reward),
                                           ("Max R", np.max(rewards)),
                                           ("eps", exp_schedule.epsilon),
                                           ("Grads", grad_eval),
                                           ("Max Q", self.max_q),
                                           ("lr", lr_schedule.epsilon)])

                elif (t < self.config.learning_start) and (
                        t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(
                        t, self.config.learning_start))
                    sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

            if (t > self.config.learning_start) and (last_eval >
                                                     self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                self.logger.info("Global step: %d" % (t))
                scores_eval += [self.evaluate(model_a)]

        # last words
        self.logger.info("- Training done.")
        self.save(t)
        scores_eval += [self.evaluate(model_a)]
        export_plot(scores_eval, "Scores", self.config.plot_output)
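
The training loops in these listings drive exp_schedule and lr_schedule through a very small interface: both expose an epsilon attribute and an update(t) method, and the exploration schedule additionally provides get_action(best_action). Below is a minimal sketch of that contract, assuming a linear interpolation from a start value to an end value over nsteps; the class names and the num_actions argument are illustrative and not taken from the listings (some variants also return an (action, explored) tuple from get_action).

import numpy as np


class LinearSchedule(object):
    """Linearly anneals epsilon (used both for exploration and learning rate)."""

    def __init__(self, eps_begin, eps_end, nsteps):
        self.epsilon = eps_begin
        self.eps_begin = eps_begin
        self.eps_end = eps_end
        self.nsteps = nsteps

    def update(self, t):
        # clip progress to the schedule horizon and interpolate linearly
        frac = min(float(t) / self.nsteps, 1.0)
        self.epsilon = self.eps_begin + frac * (self.eps_end - self.eps_begin)


class LinearExploration(LinearSchedule):
    """Epsilon-greedy action selection on top of the linear schedule."""

    def __init__(self, num_actions, eps_begin, eps_end, nsteps):
        self.num_actions = num_actions
        super(LinearExploration, self).__init__(eps_begin, eps_end, nsteps)

    def get_action(self, best_action):
        # with probability epsilon take a random action, otherwise the greedy one
        if np.random.random() < self.epsilon:
            return np.random.randint(self.num_actions)
        return best_action

With this contract, the exp_schedule.update(t) and lr_schedule.update(t) calls in the loops simply move the two epsilons along their annealing schedules as training progresses.
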
Example #23
0
    def run_epoch(self, session, train, val, log):
        num_samples = len(train["context"])
        num_batches = int(np.ceil(num_samples * 1.0 / self.config.batch_size))
        self.result_saver.save("batch_size", self.config.batch_size)

        progress = Progbar(target=num_batches)
        best_f1 = 0
        for i, train_batch in enumerate(
                batches(train,
                        is_train=True,
                        batch_size=self.config.batch_size,
                        window_size=self.config.window_size)):
            _, loss = self.optimize(session, train_batch)
            progress.update(i, [("training loss", loss)])
            self.result_saver.save("losses", loss)

            if i % self.config.eval_num == 0 or i == num_batches - 1:

                # Randomly get some samples from the dataset
                train_samples = get_random_samples(
                    train, self.config.samples_used_for_evaluation)
                val_samples = get_random_samples(
                    val, self.config.samples_used_for_evaluation)

                # First evaluate on the training set for not using best span
                f1_train, EM_train = self.evaluate_answer(session,
                                                          train_samples,
                                                          use_best_span=False)

                # Then evaluate on the val set
                f1_val, EM_val = self.evaluate_answer(session,
                                                      val_samples,
                                                      use_best_span=False)

                if log:
                    print()
                    print("Not using best span")
                    logging.info(
                        "F1: {}, EM: {}, for {} training samples".format(
                            f1_train, EM_train,
                            self.config.samples_used_for_evaluation))
                    logging.info(
                        "F1: {}, EM: {}, for {} validation samples".format(
                            f1_val, EM_val,
                            self.config.samples_used_for_evaluation))

                # First evaluate on the training set
                f1_train, EM_train = self.evaluate_answer(session,
                                                          train_samples,
                                                          use_best_span=True)

                # Then evaluate on the val set
                f1_val, EM_val = self.evaluate_answer(session,
                                                      val_samples,
                                                      use_best_span=True)

                if log:
                    print()
                    print("Using best span")
                    logging.info(
                        "F1: {}, EM: {}, for {} training samples".format(
                            f1_train, EM_train,
                            self.config.samples_used_for_evaluation))
                    logging.info(
                        "F1: {}, EM: {}, for {} validation samples".format(
                            f1_val, EM_val,
                            self.config.samples_used_for_evaluation))

                self.result_saver.save("f1_train", f1_train)
                self.result_saver.save("EM_train", EM_train)
                self.result_saver.save("f1_val", f1_val)
                self.result_saver.save("EM_val", EM_val)
                batches_trained = 1 if self.result_saver.is_empty("batch_indices") \
                    else self.result_saver.get("batch_indices")[-1] + min(i + 1, self.config.eval_num)

                self.result_saver.save("batch_indices", batches_trained)

                save_graphs(self.result_saver.data, path=self.config.train_dir)
                if f1_val > best_f1:
                    saver = tf.train.Saver()
                    saver.save(
                        session,
                        pjoin(self.config.train_dir,
                              "BATCH-{}".format(batches_trained)))
                    best_f1 = f1_val
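
evaluate_answer only appears here through its return values: a token-level F1 and an exact-match (EM) score over the sampled questions. The helpers below are a minimal sketch of how those two metrics are typically computed for a single predicted/reference answer pair; the function names are illustrative and not taken from the listing.

from collections import Counter


def f1_score(prediction, ground_truth):
    # token-level F1 between the predicted answer string and the reference
    pred_tokens = prediction.split()
    gt_tokens = ground_truth.split()
    common = Counter(pred_tokens) & Counter(gt_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / float(len(pred_tokens))
    recall = num_same / float(len(gt_tokens))
    return 2 * precision * recall / (precision + recall)


def exact_match_score(prediction, ground_truth):
    # 1.0 iff the two strings are identical after whitespace normalization
    return float(" ".join(prediction.split()) == " ".join(ground_truth.split()))

Averaging such per-sample scores over samples_used_for_evaluation examples would give numbers comparable to the f1_train/f1_val values logged above.
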
    def train(self, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        # initialize replay buffer and variables
        replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history, self.config)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = last_record = 0 # time control of nb of steps
        scores_eval = [] # list of scores computed at iteration time
        scores_eval += [self.evaluate()]
        
        prog = Progbar(target=self.config.nsteps_train)

        # interact with environment
        while t < self.config.nsteps_train:
            total_reward = 0
            state = self.env.reset()
            while True:
                t += 1
                last_eval += 1
                last_record += 1
                if self.config.render_train: self.env.render()
                # replay memory stuff
                idx      = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                # choose action according to current Q and exploration
                best_action, q_vals = self.get_best_action(q_input)
                action, explore = exp_schedule.get_action(best_action)

                # store q values (q_vals keeps the deque q_values from being shadowed)
                max_q_values.append(max(q_vals))
                q_values += list(q_vals)

                # perform action in env
                new_state, reward, done, info = self.env.step(action)

                # store the transition
                replay_buffer.store_effect(idx, action, reward, done, explore)
                state = new_state

                # perform a training step
                loss_eval, grad_eval = self.train_step(t, replay_buffer, lr_schedule.epsilon, exp_schedule.epsilon)

                # logging stuff
                if ((t > self.config.learning_start) and (t % self.config.log_freq == 0) and
                   (t % self.config.learning_freq == 0)):
                    self.update_averages(rewards, max_q_values, q_values, scores_eval)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if len(rewards) > 0:
                        prog.update(t + 1, exact=[("Loss", loss_eval), ("Avg R", self.avg_reward), 
                                        ("Max R", np.max(rewards)), ("eps", exp_schedule.epsilon), 
                                        ("Grads", grad_eval), ("Max Q", self.max_q), 
                                        ("lr", lr_schedule.epsilon)])

                elif (t < self.config.learning_start) and (t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(t, 
                                                        self.config.learning_start))
                    sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)          

            if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                scores_eval += [self.evaluate()]

            if (t > self.config.learning_start) and self.config.record and (last_record > self.config.record_freq):
                self.logger.info("Recording...")
                last_record = 0
                self.record()

        # last words
        self.logger.info("- Training done.")
        self.save()
        scores_eval += [self.evaluate()]
        export_plot(scores_eval, "Scores", self.config.plot_output)
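
The DQN-style loops above all drive the replay buffer through the same three calls: store_frame returns an index for the newest observation, encode_recent_observation builds the stacked history fed to the Q-network, and store_effect attaches the action, reward and done flag of that step to the stored frame (signatures vary slightly between the listings, e.g. an extra goal_state or explore argument). The sketch below implements that interface in a deliberately simplified form, with single-frame history and fixed-size numpy storage; it illustrates the contract rather than the buffer class used by these repositories.

import numpy as np


class SimpleReplayBuffer(object):
    """Minimal circular replay buffer exposing the interface used above."""

    def __init__(self, size, frame_shape):
        self.size = size
        self.next_idx = 0
        self.num_in_buffer = 0
        self.obs = np.zeros((size,) + frame_shape, dtype=np.float32)
        self.action = np.zeros(size, dtype=np.int32)
        self.reward = np.zeros(size, dtype=np.float32)
        self.done = np.zeros(size, dtype=np.bool_)

    def store_frame(self, frame):
        # write the newest observation and return its slot index
        idx = self.next_idx
        self.obs[idx] = frame
        self.next_idx = (self.next_idx + 1) % self.size
        self.num_in_buffer = min(self.num_in_buffer + 1, self.size)
        return idx

    def encode_recent_observation(self):
        # with a history length of 1 this is just the most recent frame
        return self.obs[(self.next_idx - 1) % self.size]

    def store_effect(self, idx, action, reward, done):
        # record what happened after the frame stored at idx
        self.action[idx] = action
        self.reward[idx] = reward
        self.done[idx] = done

    def sample(self, batch_size):
        # uniform minibatch of (s, a, r, s', done); s' is the next stored frame.
        # assumes at least two frames are stored and ignores wrap-around edge
        # cases for brevity
        idxs = np.random.randint(0, self.num_in_buffer - 1, size=batch_size)
        next_idxs = (idxs + 1) % self.size
        return (self.obs[idxs], self.action[idxs], self.reward[idxs],
                self.obs[next_idxs], self.done[idxs])

A train_step would then call sample(batch_size) to build the targets for the Q-learning update, which is why the loops only start calling it after learning_start frames have been populated.
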
    def train(self, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        # initialize replay buffer and variables
        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = last_record = 0  # time control of nb of steps
        scores_eval = []  # list of scores computed at iteration time
        #scores_eval += [self.evaluate()]

        prog = Progbar(target=self.config.nsteps_train)

        self.env.state.is_render_image = self.config.render_train

        # interact with environment
        while t < self.config.nsteps_train:
            total_reward = 0
            state = self.env.reset()  # h x w x c
            goal_state = self.env.teacher.goal_obs_onehot_state  # h x w x c
            h_state = (np.zeros([1, self.config.h_size]),
                       np.zeros([1, self.config.h_size]))
            slen = np.ones(1).astype('int32')
            action = 0
            for i in range(200):
                t += 1
                last_eval += 1
                last_record += 1
                if self.config.render_train: self.env.render()

                #### for replay_buffer
                # replay memory stuff
                idx = replay_buffer.store_frame(state, goal_state)
                q_input = replay_buffer.encode_recent_observation()

                # choose action according to current Q and exploration;
                # curr_attention marks the positions whose one-hot channels
                # exactly match the goal encoding
                curr_attention = np.equal(
                    np.sum(np.equal(q_input, goal_state[None][None][None]), 3),
                    q_input.shape[3])
                best_action, q_vals, h_state = self.get_best_action(
                    [q_input], curr_attention[None], h_state, slen, [action])
                #best_action, q_vals, h_state = self.get_best_action([q_input], goal_state[None][None], h_state, slen, [action])
                action = exp_schedule.get_action(best_action)

                # store q values (q_vals keeps the deque q_values from being shadowed)
                max_q_values.append(max(q_vals))
                q_values += list(q_vals)
                # perform action in env
                new_state, reward, done = self.env.step(action)

                # store the transition
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state

                # perform a training step
                loss_eval, grad_eval = self.train_step(t, replay_buffer,
                                                       lr_schedule.epsilon)

                # logging stuff
                if ((t > self.config.learning_start)
                        and (t % self.config.log_freq == 0)
                        and (t % self.config.learning_freq == 0)):
                    self.update_averages(rewards, max_q_values, q_values,
                                         scores_eval)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if len(rewards) > 0:
                        prog.update(t + 1,
                                    exact=[("Loss", loss_eval),
                                           ("Avg R", self.avg_reward),
                                           ("Max R", np.max(rewards)),
                                           ("eps", exp_schedule.epsilon),
                                           ("Grads", grad_eval),
                                           ("Max Q", self.max_q),
                                           ("lr", lr_schedule.epsilon)])

                elif (t < self.config.learning_start) and (
                        t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(
                        t, self.config.learning_start))
                    sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

            if (t > self.config.learning_start) and (last_eval >
                                                     self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                self.logger.info("Global step: %d" % (t))
                scores_eval += [self.evaluate()]

        # last words
        self.logger.info("- Training done.")
        self.save(t)
        scores_eval += [self.evaluate()]
        export_plot(scores_eval, "Scores", self.config.plot_output)
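
Every listing ends with export_plot(scores_eval, "Scores", self.config.plot_output); only the call site is visible here. A minimal re-implementation consistent with that three-argument signature might look as follows, using matplotlib with a headless backend so it can run on a training server; this is an illustrative sketch, not the helper shipped with these repositories.

import matplotlib
matplotlib.use("Agg")  # headless backend so figures can be saved without a display
import matplotlib.pyplot as plt


def export_plot(ys, ylabel, filename):
    # save a simple line plot of the recorded evaluation scores
    plt.figure()
    plt.plot(range(len(ys)), ys)
    plt.xlabel("Evaluation")
    plt.ylabel(ylabel)
    plt.savefig(filename)
    plt.close()
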