Example #1
    def train(self, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
                Instance of LinearExploration
            lr_schedule: Schedule for learning rate
                Instance of LinearSchedule
        """

        # initialize replay buffer and variables

        replay_buffer = ReplayBuffer(self.config.buffer_size, self.board_size)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_p_values = deque(maxlen=1000)
        p_values = deque(maxlen=1000)
        self.init_averages()
        t = last_eval = last_record = 0  # step counters (total, and steps since last eval / last record)
        episode = last_checkpoint = 0
        opponents = []  # opponents used to play against in training

        # load model from checkpoint if necessary
        if self.config.checkpoint != -1:
            self.saver.restore(self.sess, self.config.load_checkpoint_file)
            opponents += [self.generate_opponent(t)]
            t = self.config.checkpoint

        scores_eval = [self.evaluate(t)[0]]  # list of scores computed at iteration time

        prog = Progbar(target=self.config.nsteps_train)

        # log files for train/eval game lengths
        train_game_length_f = open("train_game_lengths.txt", 'w', buffering=1)
        eval_game_length_f = open("eval_game_lengths.txt", 'w', buffering=1)

        # per episode training loop
        while t < self.config.nsteps_train:
            # variables for this episode
            state = self.env.reset()
            states = []
            actions = []
            training_agent_is_black = random.choice(
                [True, False]
            )  # if the agent being trained is playing as black or not (playing first)
            episode += 1
            last_checkpoint += 1

            # Add opponent to pool if it's time to
            if t == 0 or last_checkpoint > self.config.checkpoint_freq:
                opponents += [self.generate_opponent(t)]
                last_checkpoint = 0

            # randomly sample an opponent for this episode
            self.opponent_out = random.sample(opponents, 1)[0]

            # If our agent should play as white, let the opponent make a move!
            # We know that the game won't end in one move, so don't worry about that!
            if not training_agent_is_black:
                # Let the opponent make a move
                player = self.env.state.color
                player_perspective_board = self._board_from_player_perspective(
                    state, player)
                best_action, _, _ = self.get_best_valid_action(
                    player_perspective_board)
                state, _, _, _ = self.env.step(best_action)

            # per action training loop
            while True:
                # increment counters
                t += 1
                last_eval += 1
                last_record += 1

                # render?
                if self.config.render_train:
                    print("Board before agent moves:")
                    self.env.render()

                # Whose turn is it?
                player = self.env.state.color

                # choose an action according to the current state and exploration
                player_perspective_board = self._board_from_player_perspective(
                    state, player)
                action, action_dist, valid_actions = self.sample_valid_action(
                    player_perspective_board)
                action = exp_schedule.get_action(action, valid_actions)

                # store p values
                max_p_values.append(max(action_dist))
                p_values += list(action_dist)

                # perform action in env, and remember if the player just managed to lose the game
                new_state, _, done, _ = self.env.step(action)
                training_agent_made_last_move = done

                # Render?
                if self.config.render_train:
                    print("Board after agent moves:")
                    self.env.render()

                # Store the s, a, for later use in replay buffer
                states.append(
                    self._board_from_player_perspective(state, player))
                actions.append(action)

                # if the game hasn't ended, let the opponent move
                if not done:
                    player = self.env.state.color
                    player_perspective_board = self._board_from_player_perspective(
                        new_state, player)
                    best_action, _, _ = self.get_opponent_best_valid_action(
                        player_perspective_board)
                    new_state, _, done, _ = self.env.step(best_action)

                # now if we're done, keep track of some data (compute reward + write game length to file)
                # manually compute who (should have) won using the game state

                # Manually compute the reward; it's non-zero only if the game has finished.
                # The OpenAI env's reward is unreliable because of invalid moves (the winning player 'resigns'),
                # so we just take the sign of the 'official score', which is positive iff white is winning,
                # and adjust it to whoever 'we' are playing as
                reward = 0.0
                if done:
                    reward = np.sign(self.env.state.board.official_score)
                    if training_agent_is_black: reward *= -1.0

                # advance to the next state
                state = new_state

                # now that we know the true reward (after the opponent's move) we can record it
                rewards.append(reward)

                # perform a training step
                loss_eval, grad_eval = self.train_step(t, replay_buffer,
                                                       lr_schedule.epsilon)

                # Update schedules
                exp_schedule.update(t)
                lr_schedule.update(t)

                # logging stuff
                if ((t > self.config.learning_start)
                        and (t % self.config.log_freq == 0)):
                    self.update_averages(rewards, max_p_values, p_values,
                                         scores_eval)
                    if len(rewards) > 0:
                        prog.update(t + 1,
                                    exact=[("Loss", loss_eval),
                                           ("Avg R", self.avg_reward),
                                           ("Max R", np.max(rewards)),
                                           ("eps", exp_schedule.epsilon),
                                           ("Grads", grad_eval),
                                           ("Max P", self.max_p),
                                           ("lr", lr_schedule.epsilon)])

                elif (t < self.config.learning_start) and (
                        t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(
                        t, self.config.learning_start))
                    sys.stdout.flush()

                # If the game finished, log the game length and add the episode to the replay buffer
                if done:
                    # Logging (for some graphs)
                    game_length = len(states)
                    train_game_length_f.write(str(game_length) + '\n')

                    # Compute the values (discounted sum of rewards) for this game
                    backpropagated_rewards = np.array([reward] * len(states))
                    discounts = np.array(
                        list(reversed([self.config.gamma**i for i in range(len(states))])))
                    discounted_values = backpropagated_rewards * discounts

                    # If the training agent lost the game, we want to make sure that their
                    # LOSING move has a negative value...
                    if training_agent_made_last_move and discounted_values[-1] > 0:
                        discounted_values[-1] *= -1.0

                    # Store the episode in the replay buffer
                    replay_buffer.store_example_batch(states, actions,
                                                      discounted_values)

                    # Break from the step training loop
                    break

            # If it's time to eval, then evaluate
            if (t > self.config.learning_start) and (last_eval >
                                                     self.config.eval_freq):
                last_eval = 0
                print("")
                eval_avg_reward, eval_avg_length = self.evaluate(t)
                scores_eval += [eval_avg_reward]
                eval_game_length_f.write(str(eval_avg_length) + '\n')

        # final save and evaluation
        self.logger.info("- Training done.")
        self.save()
        eval_avg_reward, eval_avg_length = self.evaluate(t)
        scores_eval += [eval_avg_reward]
        eval_game_length_f.write(str(eval_avg_length) + '\n')
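
Note that Example #1 stores whole games at once via store_example_batch, while most of the later examples assume a transition-level buffer exposing add(s, a, r, terminal, s2), size() and sample_batch(batch_size). As a point of reference only, a minimal uniform-sampling sketch of that latter interface could look like the following; this is not the ReplayBuffer class used by any of these projects, and the name MinimalReplayBuffer is purely illustrative.

import random
from collections import deque

import numpy as np


class MinimalReplayBuffer:
    """Illustrative uniform-sampling replay buffer (not any project's actual class)."""

    def __init__(self, buffer_size, random_seed=123):
        self.buffer = deque(maxlen=buffer_size)
        random.seed(random_seed)

    def add(self, s, a, r, terminal, s2):
        # Store a single transition tuple.
        self.buffer.append((s, a, r, terminal, s2))

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        # Uniformly sample up to batch_size transitions and stack them into arrays.
        batch = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        s, a, r, t, s2 = map(np.array, zip(*batch))
        return s, a, r, t, s2
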
Example #2
def train(sess, env, Qnet, global_step):

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())

    # load model from a checkpoint if one exists
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(SUMMARY_DIR)

    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
        print("global step: ", global_step.eval())

    else:
        print("Could not find old network weights")

    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    Qnet.update_target_network()
    count_parameters()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    i = global_step.eval()

    eval_acc_reward = 0
    tic = time.time()
    eps = 1
    while True:
        i += 1
        eps = EPS_DECAY_RATE**i
        eps = max(eps, EPS_MIN)
        s = env.reset()
        # plt.imshow(s, interpolation='none')
        # plt.show()
        # s = prepro(s)
        ep_ave_max_q = 0

        if i % SAVE_STEP == 0:  # save a checkpoint every SAVE_STEP episodes
            sess.run(global_step.assign(i))
            save_path = saver.save(sess,
                                   SUMMARY_DIR + "model.ckpt",
                                   global_step=global_step)
            print("Model saved in file: %s" % save_path)
            print("Successfully saved global step: ", global_step.eval())

        for j in xrange(MAX_EP_STEPS):
            predicted_q_value = Qnet.predict(
                np.reshape(s, np.hstack((1, Qnet.s_dim))))
            predicted_q_value = predicted_q_value[0]

            np.random.seed()

            action = np.argmax(predicted_q_value)
            if np.random.rand() < eps:
                action = np.random.randint(4)
                # print('eps')
            # print'actionprob:', action_prob

            # print(action)
            # print(a)

            s2, r, terminal, info = env.step(action)
            # print r, info
            # plt.imshow(s2, interpolation='none')
            # plt.show()

            # s2 = prepro(s2)

            # print(np.reshape(s, (actor.s_dim,)).shape)
            action_vector = action_ecoder(action, Qnet.a_dim)
            replay_buffer.add(np.reshape(s, (Qnet.s_dim)), np.reshape(action_vector, (Qnet.a_dim)), r, \
                terminal, np.reshape(s2, (Qnet.s_dim)))

            s = s2
            eval_acc_reward += r

            if terminal:
                # Keep adding experience to the memory until
                # there are at least minibatch size samples
                if replay_buffer.size() > MINIBATCH_SIZE:
                    s_batch, a_batch, r_batch, t_batch, s2_batch = \
                        replay_buffer.sample_batch(MINIBATCH_SIZE)

                    # Calculate targets
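                    # y_i = r_i for terminal transitions, otherwise r_i + GAMMA * max_a' Q_target(s2_i, a')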
                    target_q = Qnet.predict_target(s2_batch)
                    y_i = []
                    for k in xrange(MINIBATCH_SIZE):
                        if t_batch[k]:
                            y_i.append(r_batch[k])
                        else:
                            y_i.append(r_batch[k] +
                                       GAMMA * np.max(target_q[k]))

                    # # Update the Qnet given the target
                    predicted_q_value, _ = Qnet.train(s_batch, a_batch, y_i)

                    ep_ave_max_q += np.amax(predicted_q_value)

                    # Update the actor policy using the sampled gradient

                    # Update target networks every 1000 iter
                    # if i%TARGET_UPDATE_STEP == 0:
                    Qnet.update_target_network()

                if i % EVAL_EPISODES == 0:
                    # summary
                    time_gap = time.time() - tic
                    summary_str = sess.run(
                        summary_ops,
                        feed_dict={
                            summary_vars[0]:
                            (eval_acc_reward + EVAL_EPISODES) / 2,
                            summary_vars[1]: ep_ave_max_q / float(j + 1),
                        })
                    writer.add_summary(summary_str, i)
                    writer.flush()

                    print ('| Success: %i %%' % ((eval_acc_reward+EVAL_EPISODES)/2), "| Episode", i, \
                        '| Qmax: %.4f' % (ep_ave_max_q / float(j+1)), ' | Time: %.2f' %(time_gap), ' | Eps: %.2f' %(eps))
                    tic = time.time()

                    # print(' 100 round reward: ', eval_acc_reward)
                    eval_acc_reward = 0

                break
Example #3
def train(sess, env, env_test, args, agent):
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()
    episode_R = []

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    # Initialize target network weights
    agent.update_actor_target_network()
    agent.update_critic_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                 int(args['random_seed']))

    total_step_cnt = 0
    test_iter = 0
    epi_cnt = 0
    return_test = np.zeros(
        (np.ceil(int(args['total_step_num']) /
                 int(args['sample_step_num'])).astype('int') + 1))

    result_name = 'TD3_' + args['env'] + '_trial_idx_' + str(
        int(args['trial_idx']))
    action_noise = float(args['action_noise'])
    trained_times_steps = 0
    save_cnt = 1
    policy_ite = 0

    #for i in range(int(args['max_episodes'])):
    while total_step_cnt < int(args['total_step_num']):

        state = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0
        T_end = False

        for j in range(int(args['max_episode_len'])):

            if args['render_env']:
                env.render()

            # Added exploration noise
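            # Warm-up: act uniformly at random for the first 1e4 steps, then use the
            # actor's output plus clipped Gaussian exploration noise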
            if total_step_cnt < 1e4:
                action = env.action_space.sample()
            else:
                action = agent.predict_actor(
                    np.reshape(state, (1, agent.state_dim)))  #+ actor_noise()
                clipped_noise = np.clip(
                    np.random.normal(0,
                                     action_noise,
                                     size=env.action_space.shape[0]), -0.5,
                    0.5)
                action = (action + clipped_noise).clip(env.action_space.low,
                                                       env.action_space.high)

            state2, reward, terminal, info = env.step(action[0])

            replay_buffer.add(np.reshape(state, (agent.state_dim, )),
                              np.reshape(action, (agent.action_dim, )),
                              reward, terminal,
                              np.reshape(state2, (agent.state_dim, )))

            if j == int(args['max_episode_len']) - 1:
                T_end = True

            state = state2
            ep_reward += reward
            total_step_cnt += 1

            if total_step_cnt >= test_iter * int(
                    args['sample_step_num']) or total_step_cnt == 1:
                print('total_step_cnt', total_step_cnt)
                print('evaluating the deterministic policy...')
                for nn in range(int(args['test_num'])):
                    state_test = env_test.reset()
                    return_epi_test = 0
                    for t_test in range(int(args['max_episode_len'])):
                        action_test = agent.predict_actor(
                            np.reshape(state_test, (1, agent.state_dim)))
                        state_test2, reward_test, terminal_test, info_test = env_test.step(
                            action_test[0])
                        state_test = state_test2
                        return_epi_test = return_epi_test + reward_test
                        if terminal_test:
                            break

                    print('test_iter:{:d}, nn:{:d}, return_epi_test: {:d}'.
                          format(int(test_iter), int(nn),
                                 int(return_epi_test)))
                    return_test[test_iter] = return_test[
                        test_iter] + return_epi_test / float(args['test_num'])

                print('return_test[{:d}] {:d}'.format(
                    int(test_iter), int(return_test[test_iter])))
                test_iter += 1

            if total_step_cnt > int(args['save_model_num']) * save_cnt:
                model_path = "./Model/"
                try:
                    import pathlib
                    pathlib.Path(model_path).mkdir(parents=True, exist_ok=True)
                except:
                    print(
                        "The model directory does not exist and could not be created; the policy models will not be saved."
                    )

                agent.save_model(iteration=test_iter,
                                 expname=result_name,
                                 model_path=model_path)
                save_cnt += 1

            if terminal or T_end:
                epi_cnt += 1

                print(
                    '| Reward: {:d} | Episode: {:d} | Total step num: {:d} |'.
                    format(int(ep_reward), epi_cnt, total_step_cnt))
                # episode_R.append(ep_reward)
                break

        if total_step_cnt != int(
                args['total_step_num']) and total_step_cnt > 1e3:
            update_num = total_step_cnt - trained_times_steps
            trained_times_steps = total_step_cnt
            print('update_num', update_num)
            update_policy(sess, env, env_test, args, agent, replay_buffer,
                          action_noise, update_num)
            policy_ite += 1

    return return_test
Example #4
def main_loop(handle, possible_actions: list, model: Model,
              target_model: Model):
    exp_schedule = ExplorationScheduler()
    target_model.load_state_dict(model.state_dict())
    optimizer = torch.optim.RMSprop(model.parameters())
    with mss() as sct:
        counter = 0
        frame_counter = 0
        frame_skip_counter = 0
        score = 0
        lives = 3
        frame_times = [0, 0, 0, 0]
        replay_buffer = ReplayBuffer(
            REPLAY_BUFFER_SIZE, (3 * FRAMES_FEED, RESIZE_HEIGHT, RESIZE_WIDTH),
            FRAMES_FEED,
            baseline_priority=1,
            gamma=GAMMA,
            reward_steps=N_STEP_REWARD)
        t = 0
        action = 0
        while True:
            if not active:
                time.sleep(
                    0.5
                )  # Wait some time and check if recording should be resumed.
                continue

            startMillis = time.time()  # Time

            # Grab frames
            frame, frame_cv2 = grab_screen(monitor, sct)

            # Show frame
            if DEBUG:
                cv2.imshow('window1', frame_cv2)
            # Check if frame will be skipped. Not skipped if counter is 0
            if frame_skip_counter == 0:
                reward, score, lives = get_reward(handle, lives, score)

                # print(action, reward)
                if replay_buffer.waiting_for_effect:
                    replay_buffer.add_effects(action, reward)
                replay_buffer.push_frame(frame)
                if replay_buffer.buffer_init(
                ) and np.random.random() > exp_schedule.value(t):
                    action = choose_action(replay_buffer.encode_last_frame(),
                                           model)
                else:
                    action = np.random.randint(0, len(possible_actions))

                execute_actions([possible_actions[int(action)]])  # dk.SCANCODES["z"]

                # Logic to deal with a ready datapoint
                if replay_buffer.can_sample(
                        BATCH_SIZE) and t % TRAIN_FREQ == 0:
                    if PAUSE_ON_TRAIN:
                        pause_game()
                    for _ in range(BATCHES_PER_TRAIN):
                        optimize_model(model,
                                       target_model,
                                       replay_buffer,
                                       optimizer,
                                       num_actions=len(possible_actions))
                    if PAUSE_ON_TRAIN:
                        pause_game()

                # Copy model weights to target
                if t % TARGET_MODEL_UPDATE_FREQ == 0:
                    print("Saving model")
                    state_dict = model.state_dict()
                    torch.save(state_dict, MODEL_PATH)
                    print("done pickling")
                    target_model.load_state_dict(state_dict)
                    target_model.eval()

            frame_skip_counter += 1
            frame_skip_counter = frame_skip_counter % FRAMES_SKIP

            # Frame timings and other utility
            endMillis = time.time()
            frame_time = endMillis - startMillis
            frame_times[counter % 4] = frame_time
            t += 1
            # if counter % 4 == 0:
            #    print("frame time: %s" % (np.mean(frame_times)))
            counter += 1
            if cv2.waitKey(25) & 0xFF == ord('q'):
                cv2.destroyAllWindows()
                break
Example #5
def train(sess, env, actor, global_step):
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())

    # load model from a checkpoint if one exists
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(SUMMARY_DIR)
    
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print ("Successfully loaded:", checkpoint.model_checkpoint_path)
        print("global step: ", global_step.eval())

    else:
        print ("Could not find old network weights")

    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    i = global_step.eval()


    eval_acc_reward = 0
    tic = time.time()
    eps = 1

    while True:
        i += 1
        s = env.reset()
        ep_ave_max_q = 0
        eps *= EPS_DECAY_RATE
        eps = max(eps, EPS_MIN)

        episode_s, episode_acts, episode_rewards = [], [], []

        if i % SAVE_STEP == 0:  # save a checkpoint every SAVE_STEP episodes
            sess.run(global_step.assign(i))
            save_path = saver.save(sess, SUMMARY_DIR + "model.ckpt", global_step=global_step)
            print("Model saved in file: %s" % save_path)
            print("Successfully saved global step: ", global_step.eval())


        for j in xrange(MAX_EP_STEPS):

            # print(s.shape)

            # Added exploration noise

            action = actor.predict(np.reshape(s, np.hstack((1, actor.s_dim))))
            # print action

            s2, r, terminal, info = env.step(action)
            # plt.imshow(s2, interpolation='none')
            # plt.show()
            episode_s.append(s)
            episode_acts.append(action)
            episode_rewards.append(r)

            s = s2
            eval_acc_reward += r

            if terminal:
                # stack together all rewards for this episode and compute discounted returns
                episode_rewards = np.asarray(episode_rewards)
                # print('episode_rewards', episode_rewards)

                episode_rewards = discount_rewards(episode_rewards)
                # print('after', episode_rewards)
                # update buffer
                for n in range(len(episode_rewards)):
                    replay_buffer.add(np.reshape(episode_s[n], (actor.s_dim)), episode_acts[n],
                     episode_rewards[n], terminal, np.reshape(episode_s[n], (actor.s_dim)))
                
                # Keep adding experience to the memory until
                # there are at least minibatch size samples
                if replay_buffer.size() > MINIBATCH_SIZE:     
                    s_batch, a_batch, r_batch, t_batch, _ = replay_buffer.sample_batch(MINIBATCH_SIZE)
                    # Update the actor policy using the sampled gradient
                    actor.train(s_batch, a_batch, r_batch)



                # print '| Reward: %.2i' % int(ep_reward), " | Episode", i, \
                #     '| Qmax: %.4f' % (ep_ave_max_q / float(j+1))

                if i%EVAL_EPISODES == 0:
                    # summary
                    time_gap = time.time() - tic
                    summary_str = sess.run(summary_ops, feed_dict={
                        summary_vars[0]: (eval_acc_reward+EVAL_EPISODES)/2,
                    })
                    writer.add_summary(summary_str, i)
                    writer.flush()

                    print ('| Success: %i %%' % ((eval_acc_reward+EVAL_EPISODES)/2), "| Episode", i, \
                         ' | Time: %.2f' %(time_gap), ' | Eps: %.2f' %(eps))
                    tic = time.time()

                    # print(' 100 round reward: ', eval_acc_reward)
                    eval_acc_reward = 0

                break
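
Example #5 calls a discount_rewards helper that is not shown in the snippet. A common implementation of such a helper (assuming a global GAMMA discount factor and the usual backward running sum; the example's actual version may differ, e.g. by also normalizing the returns) is sketched below.

import numpy as np

GAMMA = 0.99  # assumed discount factor


def discount_rewards(rewards):
    """Return discounted cumulative rewards: G_t = r_t + GAMMA * G_{t+1}."""
    discounted = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + GAMMA * running
        discounted[t] = running
    return discounted
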
Example #6
def main():

    env = envstandalone.TestRob3Env()

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
    #    buffer_size=1
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    #    target_network_update_freq=500
    target_network_update_freq = 1
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 1

    obsShape = (8, 8, 1)
    #    deicticShape = (3,3,1)
    deicticShape = (3, 3, 2)
    num_deictic_patches = 36

    num_actions = 4
    episode_rewards = [0.0]
    num_cpu = 16
    num_cascade = 5

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # CNN version
    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        convs=[(16, 3, 1)],
        #        convs=[(16,2,1)],
        hiddens=[16],
        dueling=True)

    # MLP version
    #    model = models.mlp([8, 16])
    #    model = models.mlp([16, 16])
    #    model = models.mlp([16, 32])
    #    model = models.mlp([16, 16])
    #    model = models.mlp([32, 32])

    q_func = model
    lr = 0.001

    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)

    def make_obsDeic_ph(name):

        # CNN version
        return U.BatchInput(deicticShape, name=name)

#        # MLP version
#        return U.BatchInput([deicticShape[0]*deicticShape[1]*deicticShape[2]], name=name)

    def make_target_ph(name):
        #        return U.BatchInput([num_actions], name=name)
        return U.BatchInput([num_cascade, num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq = build_getq(make_obsDeic_ph=make_obsDeic_ph,
                      q_func=q_func,
                      num_actions=num_actions,
                      num_cascade=num_cascade,
                      scope="deepq",
                      qscope="q_func")

    getqTarget = build_getq(make_obsDeic_ph=make_obsDeic_ph,
                            q_func=q_func,
                            num_actions=num_actions,
                            num_cascade=num_cascade,
                            scope="deepq",
                            qscope="q_func_target")

    update_target = build_update_target(scope="deepq",
                                        qscope="q_func",
                                        qscopeTarget="q_func_target")

    targetTrain = build_targetTrain(
        make_obsDeic_ph=make_obsDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func")

    getDeic = build_getDeic(make_obs_ph=make_obs_ph, deicticShape=deicticShape)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        #        obsDeictic = getDeicticObs(obs)
        obsDeictic = getDeic([obs])

        # CNN version
        qCurr = getq(np.array(obsDeictic))

        #        # MLP version
        #        qCurr = getq(np.reshape(obsDeictic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))

        # select action
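        # qCurrNoise is indexed as (patch, cascade level, action): take the per-action max
        # over patches at the last cascade level, then pick the greedy action (eps-greedy below)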
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise[:, -1, :], 0))
        selPatch = np.argmax(np.max(qCurrNoise[:, -1, :], 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeic(obses_t)
            obses_tp1_deic = getDeic(obses_tp1)

            # Reshape everything to (1152,) form
            donesTiled = np.repeat(dones, num_deictic_patches)
            rewardsTiled = np.repeat(rewards, num_deictic_patches)
            actionsTiled = np.repeat(actions, num_deictic_patches)

            # Get curr, next values: CNN version
            qNext = getq(obses_tp1_deic)
            qCurr = getq(obses_t_deic)

            #            # Get curr, next values: MLP version
            #            qNext = getq(np.reshape(obses_tp1_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))
            #            qCurr = getq(np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))

            # This version pairs a glimpse with the same glimpse on the next time step
            qNextmax = np.max(qNext[:, -1, :], 1)

            #            # This version takes the max over all glimpses
            #            qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions])
            #            qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches)

            # Compute Bellman estimate
            targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax

            #            targetsTiled = np.tile(np.reshape(targets,[-1,1]),[1,num_cascade])

            qCurrTargets = np.copy(qCurr)

            #            # Copy into cascade without pruning
            #            for i in range(num_cascade):
            #                qCurrTargets[range(batch_size*num_deictic_patches),i,actionsTiled] = targets

            # Copy into cascade with pruning.
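            # Level 0 gets the Bellman target; each deeper level i+1 only moves toward the target
            # where the target is below level i's estimate, otherwise it keeps its previous value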
            qCurrTargets[range(batch_size * num_deictic_patches), 0,
                         actionsTiled] = targets
            for i in range(num_cascade - 1):
                mask = targets < qCurrTargets[range(batch_size *
                                                    num_deictic_patches), i,
                                              actionsTiled]
                qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled] = \
                    mask*targets + \
                    (1-mask)*qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled]

            # CNN version
            td_error_out, obses_deic_out, targets_out = targetTrain(
                obses_t_deic, qCurrTargets)


#            # MLP version
#            td_error_out, obses_deic_out, targets_out = targetTrain(
#                    np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]),
#                    qCurrTargets
#                    )

        # Update target network periodically.
        if t > learning_starts and t % target_network_update_freq == 0:
            update_target()

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
Example #7
from networks import ActorNetwork, CriticNetwork
from replay_buffer import ReplayBuffer

MINIBATCH_SIZE = 64
GAMMA = 0.99

if __name__ == '__main__':
    env = gym.make('Pendulum-v0')
    max_steps = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_bound = env.action_space.high[0]

    actor = ActorNetwork(state_dim, action_dim, action_bound)
    critic = CriticNetwork(state_dim, action_dim)
    replay_buffer = ReplayBuffer(10000)

    total = 0
    for episode in range(1000):
        obs0 = env.reset()
        ep_reward = 0
        for t in range(max_steps):
            if episode % 25 == 0:
                env.render()
            action = actor.act(obs0)  # TODO add noise for exploration
            obs1, reward, done, info = env.step(action)
            replay_buffer.add(obs0.reshape(state_dim),
                              action.reshape(action_dim), reward, t,
                              obs1.reshape(state_dim))

            if replay_buffer.size() > MINIBATCH_SIZE:
Example #8
from copy import deepcopy
from random import random
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

algo_name = 'DDQN'  # Used for visualization
max_episodes = 2000  # after 40 episodes, SAC flatlines around [-200, -100] reward
max_steps = 1000  # auto-terminate episode after win

gamma = 0.99  # Discount
α = 0.1  # Entropy temperature -- relative importance vs. rewards
lr = 3e-4  # Determines how big of a gradient step to take when optimizing
tau = 0.995  # Target smoothing coefficient --> how much of the old network(s) to keep
ε = 0.01  # Random exploration factor in training

env = gym.make('LunarLander-v2')
replay_buffer = ReplayBuffer(1e6)
batch_size = 128

q1 = Q(env)
q1_target = deepcopy(q1)
q1_optim = torch.optim.Adam(q1.parameters(), lr=lr)

q2 = Q(env)
q2_target = deepcopy(q2)
q2_optim = torch.optim.Adam(q2.parameters(), lr=lr)


def train():
    explore(10000)  # Explore the environment by taking random actions
    episode = 0
    while episode < max_episodes:  # roughly begin algorithm from SAC+ERE
Example #9
def train(
        args,
        log_dir,
        seed,
        env_id,
        replay_buffer_len,
        memory_len,
        cores,
        trees,
        p,  # number of nearest-neighbor items; reported number is 50
        embed_size,  # embedding vector length; reported number is ?
        gamma,  # discount value; reported number is 0.99
        N,  # N-step bootstrapping; reported number is 100
        update_period,  # the reported number is 16//4 = 4
        batch_size,  # the reported number is 32
        init_eps,
        delta,
        lr,
        q_lr,
        epsilon,
        min_epsilon,
        epsilon_decay,  # exponential decay factor
        eval_period,
        save_period,
        **kwargs):
    # derived constants: N-step discount weights [gamma^0, ..., gamma^(N-1)]
    _gw = np.array([gamma**i for i in range(N)])

    # experiment setup
    Path(log_dir).mkdir(parents=True, exist_ok='temp' in log_dir)

    with open(os.path.join(log_dir, 'args.txt'), 'w') as f:
        f.write(str(args))

    np.random.seed(seed)
    tf.random.set_random_seed(seed)

    # Env
    env = wrap_deepmind(make_atari(env_id),
                        episode_life=False,
                        clip_rewards=False,
                        frame_stack=True,
                        scale=False)
    num_ac = env.action_space.n

    # ReplayBuffer
    replay_buffer = ReplayBuffer(replay_buffer_len)

    # Neural Episodic Controller
    nec = NEC(
        num_ac,
        p,
        embed_size,
        delta,
        lr,
        q_lr,
        dnd_params={
            'maxlen': memory_len,
            'seed': seed,
            'cores': cores,  # number of cores for the KD-Tree
            'trees': trees,  # number of trees for the KD-Tree
        })

    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())

    summary_writer = tf.summary.FileWriter(os.path.join(
        log_dir, 'tensorboard'))

    def _write_scalar(it, it_type, tag, value):
        summary = tf.Summary(value=[
            tf.Summary.Value(tag=f"{tag}/{it_type}", simple_value=value)
        ])
        summary_writer.add_summary(summary, global_step=it)

    ####### Setup Done

    num_steps = 0
    num_updates = 0

    # Fill up the memory and replay buffer with a random policy
    for ep in range(init_eps):
        ob = env.reset()

        obs, acs, rewards = [ob], [], []
        for _ in itertools.count():
            ac = np.random.randint(num_ac)

            ob, r, done, _ = env.step(ac)

            obs.append(ob)
            acs.append(ac)
            rewards.append(r)

            num_steps += 1

            if done:
                break

        Rs = [
            np.sum(_gw[:len(rewards[i:i + N])] * rewards[i:i + N])
            for i in range(len(rewards))
        ]

        obs = np.array(obs)
        es = nec._embed(obs)

        for ob, e, a, R in zip(obs, es, acs, Rs):
            nec.append(e, a, R)

            replay_buffer.append(ob, a, R)

    # Training!
    next_save_steps = save_period
    try:
        for ep in itertools.count(start=init_eps):
            ob = env.reset()

            obs, acs, rewards, es, Vs = [ob], [], [], [], []
            for t in itertools.count():
                # Epsilon Greedy Policy
                ac, (e, V) = nec.policy(ob)
                if np.random.random() < epsilon:
                    ac = np.random.randint(num_ac)

                ob, r, done, _ = env.step(ac)

                obs.append(ob)
                acs.append(ac)
                rewards.append(r)
                es.append(e)
                Vs.append(V)

                num_steps += 1

                # Train on a random minibatch from the replay buffer
                if num_steps % update_period == 0:
                    b_s, b_a, b_R = replay_buffer.sample(batch_size)
                    loss = nec.update(b_s, b_a, b_R)

                    num_updates += 1

                    if num_updates % 100 == 0:
                        print(f'[{num_steps*4}/{num_updates}] loss: {loss}')

                    _write_scalar(it=num_steps * 4,
                                  it_type='per_frames',
                                  tag='loss',
                                  value=loss)
                    _write_scalar(it=num_updates,
                                  it_type='per_updates',
                                  tag='loss',
                                  value=loss)
                    _write_scalar(it=num_steps * 4,
                                  it_type='per_frames',
                                  tag='num_updates',
                                  value=num_updates)

                if t >= N:
                    # N-Step Bootstrapping
                    # TODO: implement the efficient version
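                    # R_{t-N} = sum_{i=0..N-1} gamma^i * r_{t-N+i} + gamma^N * V(s_t)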
                    R = np.sum(
                        _gw * rewards[t - N:t]) + (gamma**N) * Vs[t]  #R_{t-N}

                    # append to memory
                    nec.append(es[t - N], acs[t - N], R)

                    # append to replay buffer
                    replay_buffer.append(obs[t - N], acs[t - N], R)

                if done:
                    break

            print(
                f'Episode {ep} -- Ep Len: {len(obs)} Acc Reward: {np.sum(rewards)} current epsilon: {epsilon}'
            )
            _write_scalar(tag='ep',
                          value=ep,
                          it=num_steps * 4,
                          it_type='per_frames')
            _write_scalar(tag='ep_len',
                          value=len(obs),
                          it=num_steps * 4,
                          it_type='per_frames')
            _write_scalar(tag='ep_len',
                          value=len(obs),
                          it=ep,
                          it_type='per_episode')
            _write_scalar(tag='eps_reward',
                          value=np.sum(rewards),
                          it=num_steps * 4,
                          it_type='per_frames')
            _write_scalar(tag='eps_reward',
                          value=np.sum(rewards),
                          it=ep,
                          it_type='per_episode')
            _write_scalar(tag='epsilon',
                          value=epsilon,
                          it=ep,
                          it_type='per_episode')

            # Remaining items which are not bootstrappable: the partial trajectory near the end of the episode
            # Append to memory & replay buffer
            for t in range(len(rewards) - N, len(rewards)):
                R = np.sum([
                    gamma**(i - t) * rewards[i]
                    for i in range(t, len(rewards))
                ])
                nec.append(es[t], acs[t], R)
                replay_buffer.append(obs[t], acs[t], R)

            # epsilon decay
            epsilon = max(min_epsilon, epsilon * epsilon_decay)

            # Save model & evaluate
            if ep % eval_period == 0:
                try:
                    ep_len, eps_reward = _run(env,
                                              nec,
                                              os.path.join(
                                                  log_dir, f'test-{ep}.mp4'),
                                              maxlen=len(obs) * 3)

                    print(
                        f'Evaluation -- Episode {ep} -- Ep Len: {ep_len} Acc Reward: {eps_reward}'
                    )
                    _write_scalar(tag='ep_len',
                                  value=ep_len,
                                  it=ep,
                                  it_type='per_episode_eval')
                    _write_scalar(tag='eps_reward',
                                  value=eps_reward,
                                  it=ep,
                                  it_type='per_episode_eval')
                except RuntimeError as e:
                    print(e)
                    print('Evaluation -- Skipped')

            if num_steps >= next_save_steps:
                nec.save(log_dir, it=next_save_steps *
                         4)  # iteration number -- num frames
                next_save_steps += save_period

    except KeyboardInterrupt:
        print('saving... please wait...')
        nec.save(log_dir)
        print('done!')
Example #10
def train(sess, env, actor, critic, global_step):
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())

    # load model from a checkpoint if one exists
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state("./results")

    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
        print("global step: ", global_step.eval())

    else:
        print("Could not find old network weights")

    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    i = global_step.eval()

    eval_acc_reward = 0
    tic = time.time()
    eps = 1

    while True:
        i += 1
        s = env.reset()
        ep_ave_max_q = 0
        eps *= EPS_DECAY_RATE
        eps = max(eps, EPS_MIN)

        if i % SAVE_STEP == 0:  # save a checkpoint every SAVE_STEP episodes
            sess.run(global_step.assign(i))
            save_path = saver.save(sess,
                                   "./results/model.ckpt",
                                   global_step=global_step)
            print("Model saved in file: %s" % save_path)
            print("Successfully saved global step: ", global_step.eval())

        for j in xrange(MAX_EP_STEPS):

            # print(s.shape)

            # Added exploration noise

            a = actor.predict(np.reshape(s, np.hstack((1, actor.s_dim))))
            action_score = a[0]
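            # Softmax over the actor's scores (max-subtracted for numerical stability)
            # to obtain action-sampling probabilities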
            probs = np.exp(action_score - np.max(action_score))
            # probs = np.exp(action_score)
            probs /= np.sum(probs)
            # np.random.seed()
            # epsilon = 0.5 # eps greedy
            # dice = np.random.uniform() # roll the dice!
            # if dice < epsilon:
            # action = np.argmax(probs)
            # else:
            action = np.random.choice(4, 1, p=probs)
            # print action
            s2, r, terminal, info = env.step(action)
            plt.imshow(s2, interpolation='none')
            plt.show()
            replay_buffer.add(np.reshape(s, (actor.s_dim)), np.reshape(a, (actor.a_dim)), r, \
                terminal, np.reshape(s2, (actor.s_dim)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in xrange(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                # print(s_batch.shape)
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            eval_acc_reward += r

            if terminal:

                # print '| Reward: %.2i' % int(ep_reward), " | Episode", i, \
                #     '| Qmax: %.4f' % (ep_ave_max_q / float(j+1))

                if i % EVAL_EPISODES == 0:
                    # summary
                    time_gap = time.time() - tic
                    summary_str = sess.run(
                        summary_ops,
                        feed_dict={
                            summary_vars[0]:
                            (eval_acc_reward + EVAL_EPISODES) / 2,
                            summary_vars[1]: ep_ave_max_q / float(j + 1),
                        })
                    writer.add_summary(summary_str, i)
                    writer.flush()

                    print ('| Success: %i %%' % ((eval_acc_reward+EVAL_EPISODES)/2), "| Episode", i, \
                        '| Qmax: %.4f' % (ep_ave_max_q / float(j+1)), ' | Time: %.2f' %(time_gap), ' | Eps: %.2f' %(eps))
                    tic = time.time()

                    # print(' 100 round reward: ', eval_acc_reward)
                    eval_acc_reward = 0

                break
Example #11
def train(sess, env, network):
    arr_reward = np.zeros(MAX_EPISODES)
    arr_qmax = np.zeros(MAX_EPISODES)

    actor = Actor(sess, network, ACTOR_LEARNING_RATE)
    actor_target = ActorTarget(sess, network, TAU)
    critic = Critic(sess, network, CRITIC_LEARNING_RATE)
    critic_target = CriticTarget(sess, network, TAU)
    
    s_dim, a_dim, _ = network.get_const()

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    actor_target.train()
    critic_target.train()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    for i in range(MAX_EPISODES):

        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(MAX_EP_STEPS):

            if RENDER_ENV:
                env.render()

            # Added exploration noise
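            # The "noise" here is a constant positive offset that decays as 1/(1 + episode index)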
            a = actor.predict(np.reshape(s, (1, s_dim))) + (1. / (1. + i))

            s2, r, terminal, info = env.step(a[0])

            replay_buffer.add(np.reshape(s, (s_dim,)), np.reshape(a, (a_dim,)), r,
                              terminal, np.reshape(s2, (s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic_target.predict(s2_batch, actor_target.predict(s2_batch))

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                #ep_ave_max_q += np.amax(predicted_q_value)
                ep_ave_max_q += np.mean(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor_target.train()
                critic_target.train()

            s = s2
            ep_reward += r

            if terminal:

                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: ep_reward,
                    summary_vars[1]: ep_ave_max_q / float(j)
                })

                writer.add_summary(summary_str, i)
                writer.flush()

                print('Reward: ' + str(ep_reward) + ',   Episode: ' + str(i) + ',    Qmax: ' +  str(ep_ave_max_q / float(j)))
                arr_reward[i] = ep_reward
                arr_qmax[i] = ep_ave_max_q / float(j)
                
                if i % 100 == 99:
                    np.savez(RESULTS_FILE, arr_reward[0:i], arr_qmax[0:i])

                break
Example #12
    def train_rollout(self, args, reward_result):
        # Set up summary Ops
        summary_ops, summary_vars = build_summaries()

        # Get dynamics and initialize prior controller
        prior = BasePrior()

        # Initialize target network weights
        self.actor.update_target_network()
        self.critic.update_target_network()

        # Initialize replay memory
        replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                     int(args['random_seed']))

        # Needed to enable BatchNorm.
        tflearn.is_training(True)

        paths = list()

        lambda_store = np.zeros((int(args['max_episode_len']), 1))

        for i in range(int(args['max_episodes'])):

            s = self.env.reset_inc()

            ep_reward = 0.
            ep_ave_max_q = 0

            obs, action, act_prior, rewards, obs_ref, prior_ref, collisions = [], [], [], [], [], [], []

            #Get reward using baseline controller
            s0 = np.copy(s)
            ep_reward_opt = 0.
            for kk in range(int(args['max_episode_len'])):
                a = self.env.getPrior()
                prior_ref.append(np.array([a]))
                s0, r, stop_c, act = self.env.step(a)
                ep_reward_opt += r
                obs_ref.append(s0)
                if (stop_c):
                    break

            # Get reward using regRL algorithm
            s = self.env.reset()

            for j in range(int(args['max_episode_len'])):

                # Set control prior regularization weight
                lambda_mix = 15.
                lambda_store[j] = lambda_mix

                # Get control prior
                a_prior = self.env.getPrior()

                # RL control with exploration noise
                ab = self.actor.predict(np.reshape(
                    s, (1, self.actor.s_dim))) + self.actor_noise()

                # Mix the actions (RL controller + control prior)
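                # Convex combination: weight 1/(1+lambda_mix) on the RL action, lambda_mix/(1+lambda_mix) on the prior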
                act = ab[0] / (1 + lambda_mix) + (lambda_mix /
                                                  (1 + lambda_mix)) * a_prior

                # Take action and observe next state/reward
                s2, r, terminal, act = self.env.step(act)
                collisions.append(self.env.collision_flag)
                act = np.array(act, ndmin=1)

                # Add info from time step to the replay buffer
                replay_buffer.add(np.reshape(s, (self.actor.s_dim, )),
                                  np.reshape(ab, (self.actor.a_dim, )),
                                  r, terminal,
                                  np.reshape(s2, (self.actor.s_dim, )))

                # Keep adding experience to the memory until
                # there are at least minibatch size samples
                if replay_buffer.size() > int(args['minibatch_size']):

                    #Sample a batch from the replay buffer
                    s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                        int(args['minibatch_size']))

                    # Calculate targets
                    target_q = self.critic.predict_target(
                        s2_batch, self.actor.predict_target(s2_batch))
                    y_i = []
                    for k in range(int(args['minibatch_size'])):
                        if t_batch[k]:
                            y_i.append(r_batch[k])
                        else:
                            y_i.append(r_batch[k] +
                                       self.critic.gamma * target_q[k])

                    # Update the critic given the targets
                    predicted_q_value, _ = self.critic.train(
                        s_batch, a_batch,
                        np.reshape(y_i, (int(args['minibatch_size']), 1)))
                    ep_ave_max_q += np.amax(predicted_q_value)

                    # Update the actor policy using the sampled gradient
                    a_outs = self.actor.predict(s_batch)
                    grads = self.critic.action_gradients(s_batch, a_outs)
                    self.actor.train(s_batch, grads[0])
                    # Update target networks
                    self.actor.update_target_network()
                    self.critic.update_target_network()

                s = s2
                ep_reward += r

                obs.append(s)
                rewards.append(r)
                action.append(act)
                act_prior.append(np.array([a_prior]))

                # Collect results at end of episode
                if terminal:
                    print(
                        '| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                            int(ep_reward - ep_reward_opt), i,
                            (ep_ave_max_q / float(j))))
                    reward_result[0, i] = ep_reward
                    reward_result[1, i] = ep_reward_opt
                    reward_result[2, i] = np.mean(lambda_store)
                    reward_result[3, i] = max(collisions)
                    path = {
                        "Observation": np.concatenate(obs).reshape((-1, 6)),
                        "Observation_ref": np.concatenate(obs_ref).reshape(
                            (-1, 6)),
                        "Action": np.concatenate(action),
                        "Action_Prior": np.concatenate(act_prior),
                        "Action_Prior_Ref": np.concatenate(prior_ref),
                        "Reward": np.asarray(rewards)
                    }
                    paths.append(path)

                    break

        return [summary_ops, summary_vars, paths, reward_result]
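
Note: the key step in the rollout above is the fixed-weight blend of the RL action with the control prior. A minimal standalone sketch of that mixing rule, with illustrative names that are not taken from the original code:

import numpy as np

def mix_with_prior(rl_action, prior_action, lambda_mix=15.0):
    # Same rule as above: the RL term gets weight 1/(1+lambda),
    # the control prior gets weight lambda/(1+lambda).
    rl_action = np.asarray(rl_action, dtype=float)
    prior_action = np.asarray(prior_action, dtype=float)
    return rl_action / (1.0 + lambda_mix) \
        + (lambda_mix / (1.0 + lambda_mix)) * prior_action

# With lambda_mix = 15 the prior dominates: weights are 15/16 vs 1/16.
blended = mix_with_prior([0.8], [-0.2], lambda_mix=15.0)
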
Exemple #13
0
def main(args):
    file_name = 'td3_lunalander_v2'

    writer = SummaryWriter(log_dir="logs/{}_{}".format(file_name, 'numeric'))

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    env = gym.make('LunarLanderContinuous-v2')
    env.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    max_action = float(env.action_space.high[0])
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    replay_buffer = ReplayBuffer(int(1e6), [state_dim], action_dim)

    policy = TD3(env, state_dim, action_dim, max_action)

    state, done = env.reset(), False
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0

    for t in range(int(args.max_timesteps)):

        episode_timesteps += 1

        # Select action randomly or according to policy
        if t < args.start_timesteps:
            action = env.action_space.sample()
        else:
            obs_tens = torch.from_numpy(state).float().reshape(1,
                                                               -1).to(device)
            # action = np.clip(policy.select_action(obs_tens) + np.random.normal(0, max_action * args.expl_noise, size=3), -max_action, max_action)
            # action = np.clip(policy.select_action(obs_tens.to(device)) + np.random.normal(0, max_action * args.expl_noise), -max_action, max_action)
            action = policy.select_action(obs_tens.to(device))

        if episode_num > int(1000):
            env.render()
        # Perform action
        next_state, reward, done, _ = env.step(action)
        done_bool = float(
            done) if episode_timesteps < env._max_episode_steps else 0

        # Store data in replay buffer
        replay_buffer.store_transition(state, action, reward, next_state,
                                       done_bool)

        state = next_state
        episode_reward += reward

        # Train agent after collecting sufficient data
        if t >= args.start_timesteps:
            policy.train(replay_buffer, args.batch_size)

        if done:
            # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
            print(
                f"Total T: {t+1} Episode Num: {episode_num+1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f}"
            )
            writer.add_scalar("reward", episode_reward, episode_num + 1)
            # Reset environment
            state, done = env.reset(), False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        if t % 100 == 0:
            policy.save(file_name)
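
Note: the commented-out lines above show the usual TD3 exploration variant, adding clipped Gaussian noise to the deterministic action. A small hedged sketch of that pattern, assuming max_action and args.expl_noise as defined in the snippet:

import numpy as np

def noisy_action(policy_action, max_action, expl_noise):
    # Zero-mean Gaussian noise scaled by max_action, clipped back to the valid action range.
    noise = np.random.normal(0.0, max_action * expl_noise, size=np.shape(policy_action))
    return np.clip(policy_action + noise, -max_action, max_action)
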
env = wrappers.NormalizeActions(env)
env = wrappers.MinimumDuration(env, len_time)
env = wrappers.MaximumDuration(env, len_time)
env = wrappers.ObservationDict(env, key='observation')
env = wrappers.PixelObservations(env, image_res, np.uint8, 'image')
env = wrappers.ConvertRewardToCost(env)
env = wrappers.ConvertTo32Bit(env)

# SEED EXPERIMENT TO CREATE REPRODUCIBLE RESULTS
seed_value = 0
seed_experiment(seed=seed_value)
env.seed(seed=seed_value)

# GET ENVIRONMENT DATA SHAPES
observation_shape = env.observation_space['image'].shape
action_shape = env.action_space.shape
state_shape = env.state_space.shape

# INITIALIZE INFRASTRUCTURE
logger = Logger('.')
replay_buffer = ReplayBuffer(observation_shape, action_shape, state_shape,
                             max_num_episodes, len_time)
driver = Driver(env, replay_buffer=replay_buffer)

# GATHER EXPERIENCE
print('Generating dataset. Generate %d episodes of length %d.' %
      (max_num_episodes, len_time + 1))
driver.run(render=True, num_steps=max_num_episodes * len_time, logger=logger)

# SAVE DATASET
replay_buffer.save_buffer('.', name_dataset='dataset')
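
Note: seed_experiment is not shown in this snippet. A plausible minimal version that seeds the usual RNG sources (an assumption, not the original helper):

import random
import numpy as np
import tensorflow as tf

def seed_experiment(seed=0):
    # Hypothetical helper: seed the Python, NumPy, and TensorFlow RNGs for reproducibility.
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)  # use tf.random.set_seed(seed) on TF2
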
def train(sess, env, args, actor, critic, actor_noise):

    # Set up summary operations
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                 int(args['random_seed']))

    # Needed to enable BatchNorm.
    # This hurts the performance on Pendulum but could be useful
    # in other environments.
    # tflearn.is_training(True)

    for i in range(int(args['max_episodes'])):

        # Reset the environment, initial action 0, and initialize the action list for observability during analysis
        s = env.reset()
        actor_noise.reset()
        a = 0
        a_list = []

        # Evaluation Period
        eval_time = 999

        # Episode reward and episode average max Q initializations
        ep_reward = 0
        ep_ave_max_q = 0

        # Mean and standard deviation used to normalize the states below
        mean = env.xs
        st_dev = [0.8, 15, 0.7]

        if i % 50 == 0 and i != 0:
            print("Evaluation Episode")

        for j in range(1, int(args['max_episode_len']) + 1):

            # Take action every "sampling time" time steps to ensure steady state is reached
            if j % int(args['sampling_time']) == 0:
                # Normalize the states by subtracting the mean and dividing by the standard deviation
                s -= mean
                s /= st_dev

                # Adding Ornstein-Uhlenbeck exploration noise to the action
                if i % 50 == 0 and i != 0:
                    # Every 50th episode, the action will have no noise to evaluate performance.
                    a = actor.predict(np.reshape(s, (1, actor.s_dim)))

                    if i == (args['max_episodes'] - 1):
                        a_list.append(a)

                else:
                    noise = actor_noise()
                    a = actor.predict(np.reshape(s, (1, actor.s_dim))) + noise

                    if i == (args['max_episodes'] - 1):
                        a_list.append(a - noise)

                # Take the action
                env.u[j, 0] = env.u[j - 1, 0] + a[0]

                # Define evaluation time for feedback
                eval_time = j + int(args['sampling_time']) - 1

            else:
                # If it is not the sampling time, keep input constant
                env.u[j, 0] = env.u[j - 1, 0]

            # Simulate the next step
            env.x[j, :] = env.cstr_sim.sim(env.x[j - 1, :], env.u[j, :])

            # Determine if it's the end of the current episode. If the input is very far from ideal, the episode ends.
            if j == env.Nsim or env.u[j, 0] < 150 or env.u[j, 0] > 450:
                terminal = True
            else:
                terminal = False

            # Feedback for RL
            if j == eval_time:

                # Ensure feedback is evaluated correctly
                assert ((j + 1) % int(args['sampling_time']) == 0)

                # Reward for RL
                r = env.reward_function(j, a[0][0])

                # Next state for RL
                s2 = deepcopy(env.x[j, :])

                # Add the latest states, action, reward, terminal, and new state to the replay memory
                replay_buffer.add(np.reshape(s, (actor.s_dim, )),
                                  np.reshape(a, (actor.a_dim, )), r, terminal,
                                  np.reshape(s2, (actor.s_dim, )))

                # Update the new state to be the current state
                s = s2
                # Add the step's reward towards the whole episodes' reward
                ep_reward += r

            # Keep adding experience to the memory until there are at least mini-batch size samples
            # Batch Training area
            if replay_buffer.size() > int(args['minibatch_size'] * 5):

                # mini-batch size
                mini_batch_size = np.power(i, 1 / 3) * int(
                    args['minibatch_size'])

                # Obtain a batch of data from replay buffer
                s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                    int(mini_batch_size))

                # Calculate critic target Q-value, feeding in the actor target action
                # States is the s2 from the replay buffer
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                # Calculate the Q values
                y_i = []
                for k in range(int(mini_batch_size)):
                    # Terminal state, Q = r because there is no additional trajectory beyond this point
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    # If the state is not terminal, Q = r + gamma * Q_target(s', a'), where a' comes from the target actor
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])
                """
                Update the critic given the targets
                Exact algorithm:  critic.train() returns predicted_q_value, optimize.
                Optimize takes MSE of y_i and predicted q value out.  Then does Adam Gradient Descent updating the
                critic network.
                """

                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i,
                                                 (int(mini_batch_size), 1)))

                # predicted_q_value has one entry per mini-batch sample; accumulate the max.
                ep_ave_max_q += np.amax(predicted_q_value)
                """
                Update the actor policy using the sampled gradient
                """

                # Scaled output action given the s_batch states.
                a_outs = actor.predict(s_batch)

                # Given the states and the actions taken in those states,
                # compute the gradient of the critic's Q-value with respect to the actions.
                grads = critic.action_gradients(s_batch, a_outs)

                # Updates actors given the gradients
                actor.train(s_batch, grads[0])

                # Update target networks by tau
                actor.update_target_network()
                critic.update_target_network()

            if terminal:
                # Update the summary ops
                summary_str = sess.run(summary_ops,
                                       feed_dict={
                                           summary_vars[0]: ep_reward,
                                           summary_vars[1]:
                                           ep_ave_max_q / float(j)
                                       })

                writer.add_summary(summary_str, i)
                writer.flush()

                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                    int(ep_reward), i, (ep_ave_max_q / float(j))))
                break

    return replay_buffer, a_list
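
Note: two details in the example above are easy to miss: states are normalized with a fixed mean and standard deviation, and the mini-batch size grows with the cube root of the episode index. A sketch of that batch-size schedule:

import numpy as np

def growing_minibatch_size(episode_idx, base_size=32):
    # Same schedule as above: cube root of the episode index times the base size.
    # Episode 0 yields 0, so effective batch training starts from episode 1.
    return int(np.power(episode_idx, 1.0 / 3.0) * base_size)

# base_size=32 -> 0, 32, 40, 46, 50 for episodes 0..4
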
def main():

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)
#        return U.BatchInput(env.observation_space.shape, name)

#    env = gym.make("CartPole-v0")
#    env = gym.make("CartPole-v1")
#    env = gym.make("Acrobot-v1")
    env = gym.make("MountainCar-v0")
#    model = models.mlp([32])
    model = models.mlp([64])
#    model = models.mlp([16, 16])

    # parameters
    q_func = model
    lr = 1e-3
    max_timesteps = 100000
#    max_timesteps=10000
    buffer_size = 50000
    exploration_fraction = 0.1
#    exploration_fraction=0.3
    exploration_final_eps = 0.02
    train_freq = 1
    batch_size = 32
    print_freq = 10
    checkpoint_freq = 10000
    learning_starts = 1000
    gamma = 1.0
    target_network_update_freq = 500
#    prioritized_replay=False
    prioritized_replay = True
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    num_cpu = 16

#    # try mountaincar w/ different input dimensions
#    inputDims = [50,2]
    
    sess = U.make_session(num_cpu)
    sess.__enter__()

    act, train, update_target, debug = build_graph.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()


    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
#    with tempfile.TemporaryDirectory() as td:
    model_saved = False
#        model_file = os.path.join(td, "model")
    for t in range(max_timesteps):

        # Take action and update exploration to the newest value
        action = act(np.array(obs)[None], update_eps=exploration.value(t))[0]
        new_obs, rew, done, _ = env.step(action)
        
        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs
        
        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            episode_rewards.append(0.0)

        if t > learning_starts and t % train_freq == 0:
            
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None
            
            td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:
            
            # Update target network periodically.
            update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
#        if done:
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))))
#            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
#                logger.record_tabular("steps", t)
#                logger.record_tabular("episodes", num_episodes)
#                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
#                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
#                logger.dump_tabular()
#        sess
            
    plt.plot(episode_rewards)
    plt.show()
       
    sess
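
Note: the exploration and prioritized-replay beta schedules above rely on a baselines-style LinearSchedule. A minimal sketch with the same value(t) interface (an approximation, not the library source):

class LinearScheduleSketch:
    """Linearly interpolate from initial_p to final_p over schedule_timesteps, then hold final_p."""

    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.02):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)
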
Exemple #17
0
def main():

    #    env = gym.make("CartPoleRob-v0")
    #    env = gym.make("CartPole-v0")
    #    env = gym.make("CartPole-v1")
    #    env = gym.make("Acrobot-v1")
    #    env = gym.make("MountainCarRob-v0")
    #    env = gym.make("FrozenLake-v0")
    #    env = gym.make("FrozenLake8x8-v0")
    #    env = gym.make("FrozenLake8x8rob-v0")
    #    env = gym.make("FrozenLake16x16rob-v0")
    env = gym.make("TestRob3-v0")

    #    robShape = (2,)
    #    robShape = (3,)
    #    robShape = (200,)
    #    robShape = (16,)
    #    robShape = (64,)
    robShape = (8, 8, 1)

    #    robShape = (16,16,1)
    def make_obs_ph(name):
        #        return U.BatchInput(env.observation_space.shape, name=name)
        return U.BatchInput(robShape, name=name)

#    # these params are specific to mountaincar
#    def getOneHotObs(obs):
#        obsFraction = (obs[0] + 1.2) / 1.8
#        idx1 = np.int32(np.trunc(obsFraction*100))
#        obsFraction = (obs[1] + 0.07) / 0.14
#        idx2 = np.int32(np.trunc(obsFraction*100))
#        ident = np.identity(100)
#        return np.r_[ident[idx1,:],ident[idx2,:]]

# these params are specific to frozenlake

    def getOneHotObs(obs):
        #        ident = np.identity(16)
        ident = np.identity(64)
        #        ident = np.identity(256)
        #        return ident[obs,:]
        return np.reshape(ident[obs, :], [8, 8, 1])
#        return np.reshape(ident[obs,:],[16,16,1])

#    model = models.mlp([32])
#    model = models.mlp([64])
#    model = models.mlp([64], layer_norm=True)
#    model = models.mlp([16, 16])

# conv model parameters: (num_outputs, kernel_size, stride)

    model = models.cnn_to_mlp(
        #        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], # used in pong
        #        hiddens=[256],  # used in pong
        convs=[(8, 4, 1)],
        hiddens=[16],
        dueling=True)

    # parameters
    q_func = model
    lr = 1e-3
    #    max_timesteps=100000
    #    max_timesteps=50000
    max_timesteps = 20000
    buffer_size = 50000
    #    exploration_fraction=0.1
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    #    exploration_final_eps=0.1
    train_freq = 1
    batch_size = 32
    print_freq = 10
    checkpoint_freq = 10000
    learning_starts = 1000
    gamma = 1.
    target_network_update_freq = 500
    prioritized_replay = False
    #    prioritized_replay=True
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    num_cpu = 16

    #    # try mountaincar w/ different input dimensions
    #    inputDims = [50,2]

    sess = U.make_session(num_cpu)
    sess.__enter__()

    act, train, update_target, debug = build_graph.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        double_q=False)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    #    obs = np.reshape(obs,[8,8,1])
    #    obs = getOneHotObs(obs)

    #    with tempfile.TemporaryDirectory() as td:
    model_saved = False
    #        model_file = os.path.join(td, "model")
    for t in range(max_timesteps):

        #        env.render()

        # Take action and update exploration to the newest value
        action = act(np.array(obs)[None], update_eps=exploration.value(t))[0]
        new_obs, rew, done, _ = env.step(action)
        #        new_obs = getOneHotObs(new_obs)

        #        env.render()

        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs

        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            #            obs = getOneHotObs(obs)
            episode_rewards.append(0.0)

        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                experience = replay_buffer.sample(batch_size,
                                                  beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights,
                 batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                    batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                              weights)

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:

            # Update target network periodically.
            update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)

        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            #        if done:
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))))


#            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
#                logger.record_tabular("steps", t)
#                logger.record_tabular("episodes", num_episodes)
#                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
#                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
#                logger.dump_tabular()
#        sess

    num2avg = 20
    rListAvg = np.convolve(episode_rewards, np.ones(num2avg)) / num2avg
    plt.plot(rListAvg)
    #    plt.plot(episode_rewards)
    plt.show()

    sess
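
Note: the smoothing at the end of the example above uses a full convolution (np.convolve with the default mode), which leaves ramp artifacts at both ends of the reward curve. A hedged alternative that keeps only fully covered windows:

import numpy as np

def moving_average(values, window=20):
    # 'valid' mode drops the partially filled windows at the edges.
    return np.convolve(values, np.ones(window), mode='valid') / window
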
def train_agent(args):
    """

    Args:
    """

    # a CNN encoder converts the [1, 3, 84, 84] image observation to a [1, 200] feature vector

    now = datetime.now()
    dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    pathname = str(args.locexp) + "/" + str(args.env_name) + '_agent_' + str(
        args.policy)
    pathname += "_batch_size_" + str(args.batch_size) + "_lr_act_" + str(
        args.lr_actor)
    pathname += "_lr_critc_" + str(args.lr_critic) + "_lr_decoder_"
    arg_text = str(args)
    write_into_file(pathname, arg_text)
    tensorboard_name = str(args.locexp) + '/runs/' + pathname
    writer = SummaryWriter(tensorboard_name)
    size = args.size
    env = gym.make(args.env_name, renderer='egl')
    state = env.reset()
    print("state ", state.shape)
    state_dim = 200
    print("State dim, ", state_dim)
    action_dim = 5
    print("action_dim ", action_dim)
    max_action = 1
    args.target_entropy = -np.prod(action_dim)
    args.max_episode_steps = 200
    file_name = str(args.locexp) + "/pytorch_models/{}".format(args.env_name)
    obs_shape = (args.history_length, size, size)
    action_shape = (action_dim, )
    print("obs", obs_shape)
    print("act", action_shape)
    policy = TQC(state_dim, action_dim, max_action, args)
    replay_buffer = ReplayBuffer(obs_shape, action_shape,
                                 int(args.buffer_size), args.image_pad,
                                 args.device)
    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True
    t0 = time.time()
    scores_window = deque(maxlen=100)
    episode_reward = 0
    evaluations = []
    tb_update_counter = 0
    # TODO: evaluate
    #evaluations.append(evaluate_policy(policy, writer, total_timesteps, args, env))
    #save_model = file_name + '-{}reward_{:.2f}-agent{}'.format(episode_num, evaluations[-1], args.policy)
    #policy.save(save_model)
    done_counter = deque(maxlen=100)
    while total_timesteps < args.max_timesteps:
        tb_update_counter += 1
        # If the episode is done
        if done:
            episode_num += 1
            #env.seed(random.randint(0, 100))
            scores_window.append(episode_reward)
            average_mean = np.mean(scores_window)
            if total_timesteps > args.start_timesteps and episode_num % args.update_beta_freq == 0:
                policy.update_beta(replay_buffer, writer, total_timesteps)
            if tb_update_counter > args.tensorboard_freq:
                print("Write tensorboard")
                tb_update_counter = 0
                writer.add_scalar('Reward', episode_reward, total_timesteps)
                writer.add_scalar('Reward mean ', average_mean,
                                  total_timesteps)
                writer.flush()
            # If we are not at the very beginning, we start the training process of the model
            if total_timesteps != 0:
                if episode_timesteps < 50:
                    done_counter.append(1)
                else:
                    done_counter.append(0)
                goals = sum(done_counter)
                text = "Total Timesteps: {} Episode Num: {} ".format(
                    total_timesteps, episode_num)
                text += "Episode steps {} ".format(episode_timesteps)
                text += "Goal last 100 ep : {} ".format(goals)
                text += "Reward: {:.2f}  Average Re: {:.2f} Time: {}".format(
                    episode_reward, np.mean(scores_window),
                    time_format(time.time() - t0))
                writer.add_scalar('Goal_freq', goals, total_timesteps)

                print(text)
                write_into_file(pathname, text)
                #policy.train(replay_buffer, writer, episode_timesteps)
            # We evaluate the episode and we save the policy
            if timesteps_since_eval >= args.eval_freq:
                timesteps_since_eval %= args.eval_freq
                evaluations.append(
                    evaluate_policy(policy, writer, total_timesteps, args,
                                    env))
                torch.manual_seed(args.seed)
                np.random.seed(args.seed)
                evaluations.append(
                    evaluate_policy(policy, writer, total_timesteps, args,
                                    env))
                save_model = file_name + '-{}reward_{:.2f}-agent{}'.format(
                    episode_num, evaluations[-1], args.policy)
                policy.save(save_model)
            # When the training step is done, we reset the state of the environment
            state = env.reset()
            obs, state_buffer = stacked_frames(state, size, args, policy)

            # Set the Done to False
            done = False
            # Set rewards and episode timesteps to zero
            episode_reward = 0
            episode_timesteps = 0
        # Before args.start_timesteps timesteps, we play random actions
        if total_timesteps < args.start_timesteps:
            action = env.action_space.sample()
        else:  # After that, we switch to the model
            action = policy.select_action(obs)
        # The agent performs the action in the environment, then reaches the next state and receives the reward
        new_obs, reward, done, _ = env.step(action)
        # print(reward)
        #frame = cv2.imshow("wi", np.array(new_obs))
        #cv2.waitKey(10)
        done = float(done)
        new_obs, state_buffer = create_next_obs(new_obs, size, args,
                                                state_buffer, policy)

        # We check if the episode is done
        #done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
        done_bool = 0 if episode_timesteps + 1 == args.max_episode_steps else float(
            done)
        if episode_timesteps + 1 == args.max_episode_steps:
            done = True
        # We increase the total reward
        reward = reward * args.reward_scalling
        episode_reward += reward
        # We store the new transition into the Experience Replay memory (ReplayBuffer)
        if args.debug:
            print("add to buffer obs ", obs.shape)
            print("add to buffer next_obs ", new_obs.shape)
        replay_buffer.add(obs, action, reward, new_obs, done, done_bool)
        # We update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy
        obs = new_obs
        if total_timesteps > args.start_timesteps:
            for i in range(args.repeat_update):
                policy.train(replay_buffer, writer, 1)
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

    # We add the last policy evaluation to our list of evaluations and we save our model
    evaluations.append(
        evaluate_policy(policy, writer, total_timesteps, args, env))
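
Note: stacked_frames and create_next_obs are not shown here. Judging by obs_shape = (args.history_length, size, size), they build a stack of the last history_length processed frames; a generic sketch under that assumption (names and preprocessing are illustrative, not the original helpers):

from collections import deque
import numpy as np
import cv2

def preprocess_frame(frame, size):
    # Convert an RGB frame to grayscale and resize it to (size, size).
    gray = cv2.cvtColor(np.asarray(frame, dtype=np.uint8), cv2.COLOR_RGB2GRAY)
    return cv2.resize(gray, (size, size), interpolation=cv2.INTER_AREA)

def stack_frames(frame, size, history_length, state_buffer=None):
    # Keep the last `history_length` frames as the observation, shape (history_length, size, size).
    processed = preprocess_frame(frame, size)
    if state_buffer is None:
        state_buffer = deque([processed] * history_length, maxlen=history_length)
    else:
        state_buffer.append(processed)
    return np.stack(state_buffer, axis=0), state_buffer
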
Exemple #19
0
TARGET_UPDATE_EPISODE = 10
EPS_START = 0.9
EPS_END = 0.1
EPS_DECAY = 10000
# we need two DQN networks

policy = DQN(POLICY_ARGS).to(DEVICE)
# print(policy)
target = DQN(POLICY_ARGS).to(DEVICE)

policy_weight = policy.state_dict()
target.load_state_dict(policy_weight)
target.eval()  # fix the target net; we don't want to train it
mse = nn.MSELoss()
optimizer = optim.RMSprop(policy.parameters())
replay_buffer = ReplayBuffer(BUFFER_SIZE)

# training phase
total_game_step = 0
for current_episode in range(EPISODE):
    state = env.reset() # get the initial observation
    game_step = 0
    total_reward = 0
    state = torch.tensor([state]).float().to(DEVICE)
    while True:
        game_step += 1
        total_game_step += 1
        action = policy.act(state, total_game_step, isTrain = True).to(DEVICE) # sample an action
        next_state, reward, done, _ = env.step(action.item()) # take action in environment
        total_reward += reward
        reward = torch.tensor([reward]).float().to(DEVICE)
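
Note: this fragment ends before the update step, but EPS_START, EPS_END, and EPS_DECAY suggest an exponentially decaying epsilon inside policy.act. A hedged sketch of that common schedule (a guess at the intent, not the original implementation):

import math
import random

def epsilon_by_step(step, eps_start=0.9, eps_end=0.1, eps_decay=10000):
    # Decay epsilon from eps_start toward eps_end with time constant eps_decay.
    return eps_end + (eps_start - eps_end) * math.exp(-step / eps_decay)

def epsilon_greedy(q_values, step, num_actions):
    # Explore with probability epsilon, otherwise act greedily on the Q-values.
    if random.random() < epsilon_by_step(step):
        return random.randrange(num_actions)
    return int(max(range(num_actions), key=lambda a: q_values[a]))
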
Exemple #20
0
        s = env.reset()
        reward = 0
        for _ in range(t_max):
            qvalues = agent.get_qvalues([s])
            action = qvalues.argmax(axis=-1)[0] if greedy else agent.sample_actions(qvalues)[0]
            s, r, done, _ = env.step(action)
            reward += r
            if done: break

        rewards.append(reward)
    return np.mean(rewards)
evaluate(env, agent, n_games=1)


from replay_buffer import ReplayBuffer
exp_replay = ReplayBuffer(10)

for _ in range(30):
    exp_replay.add(env.reset(), env.action_space.sample(), 1.0, env.reset(), done=False)

obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(5)

assert len(exp_replay) == 10, "experience replay size should be 10 because that's what maximum capacity is"

def play_and_record(agent, env, exp_replay, n_steps=1):
    """
    Play the game for exactly n steps, record every (s,a,r,s', done) to replay buffer.
    Whenever game ends, add record with done=True and reset the game.
    :returns: return sum of rewards over time

    Note: please do not env.reset() unless env is done.
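
Note: the snippet above is cut off before the function body. A sketch of play_and_record consistent with the docstring and the evaluate loop earlier, assuming the wrapped env exposes its current observation as env.framebuffer (adjust if the state is tracked differently):

def play_and_record(agent, env, exp_replay, n_steps=1):
    s = env.framebuffer  # assumed attribute of a frame-stacking wrapper
    sum_rewards = 0.0
    for _ in range(n_steps):
        qvalues = agent.get_qvalues([s])
        a = agent.sample_actions(qvalues)[0]
        next_s, r, done, _ = env.step(a)
        exp_replay.add(s, a, r, next_s, done=done)
        sum_rewards += r
        s = env.reset() if done else next_s
    return sum_rewards
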
Exemple #21
0
def train(sess, env, args, actor, critic, actor_noise):

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                 int(args['random_seed']))

    # Needed to enable BatchNorm.
    # This hurts the performance on Pendulum but could be useful
    # in other environments.
    # tflearn.is_training(True)

    for i in range(int(args['max_episodes'])):

        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(int(args['max_episode_len'])):

            if args['render_env']:
                env.render()

            # Added exploration noise
            #a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i))
            #print("actor.s_dim: ", actor.s_dim)
            #print("s: ", s)
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()

            s2, r, terminal, info = env.step(a[0])

            replay_buffer.add(np.reshape(s, (actor.s_dim, )),
                              np.reshape(a, (actor.a_dim, )), r, terminal,
                              np.reshape(s2, (actor.s_dim, )))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > int(args['minibatch_size']):
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(int(args['minibatch_size']))

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(int(args['minibatch_size'])):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch,
                    np.reshape(y_i, (int(args['minibatch_size']), 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r
            #print("s: ", s)
            #print("ep_reward: ", ep_reward)

            if terminal:

                summary_str = sess.run(
                    summary_ops,
                    feed_dict={
                        #summary_vars[0]: ep_reward,
                        summary_vars[0]: ep_reward,
                        summary_vars[1]: ep_ave_max_q / float(j)
                    })

                writer.add_summary(summary_str, i)
                writer.flush()

                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(int(ep_reward), \
                        i, (ep_ave_max_q / float(j))))
                break
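
Note: the per-sample loop that builds y_i above (and in several other examples here) can be written as one vectorized expression; an equivalent sketch:

import numpy as np

def ddpg_targets(r_batch, t_batch, target_q, gamma):
    # y = r for terminal transitions, y = r + gamma * Q_target(s', mu_target(s')) otherwise.
    r = np.asarray(r_batch, dtype=np.float32).reshape(-1, 1)
    q = np.asarray(target_q, dtype=np.float32).reshape(-1, 1)
    not_done = 1.0 - np.asarray(t_batch, dtype=np.float32).reshape(-1, 1)
    return r + gamma * not_done * q
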
Exemple #22
0
def train(sess, env, actor, critic):

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    for i in xrange(MAX_EPISODES):

        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0

        for j in xrange(MAX_EP_STEPS):

            # Added exploration noise
            a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i + j))

            s2, r, terminal, info = env.step(a[0])

            replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r, \
                terminal, np.reshape(s2, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in xrange(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            if terminal:

                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: ep_reward,
                    summary_vars[1]: ep_ave_max_q / float(j)
                })

                writer.add_summary(summary_str, i)
                writer.flush()

                print '| Reward: %.2i' % int(ep_reward), " | Episode", i, \
                    '| Qmax: %.4f' % (ep_ave_max_q / float(j))

                break
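
Note: the update_target_network() calls in these DDPG examples conventionally perform a soft (Polyak) update of the target weights by a factor tau; the underlying tflearn implementation is not shown, so this is only an illustrative sketch of the rule:

def soft_update(online_weights, target_weights, tau=0.001):
    # theta_target <- tau * theta_online + (1 - tau) * theta_target, per weight array.
    return [tau * w + (1.0 - tau) * w_t
            for w, w_t in zip(online_weights, target_weights)]
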
Exemple #23
0
def main(_):
    """Run td3/ddpg training."""
    contrib_eager_python_tfe.enable_eager_execution()

    if FLAGS.use_gpu:
        tf.device('/device:GPU:0').__enter__()

    tf.gfile.MakeDirs(FLAGS.log_dir)
    summary_writer = contrib_summary.create_file_writer(FLAGS.log_dir,
                                                        flush_millis=10000)

    tf.set_random_seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)
    random.seed(FLAGS.seed)

    env = gym.make(FLAGS.env)
    env.seed(FLAGS.seed)

    if FLAGS.env in ['HalfCheetah-v2', 'Ant-v1']:
        rand_actions = int(1e4)
    else:
        rand_actions = int(1e3)

    obs_shape = env.observation_space.shape
    act_shape = env.action_space.shape

    if FLAGS.algo == 'td3':
        model = ddpg_td3.DDPG(obs_shape[0],
                              act_shape[0],
                              use_td3=True,
                              policy_update_freq=2,
                              actor_lr=1e-3)
    else:
        model = ddpg_td3.DDPG(obs_shape[0],
                              act_shape[0],
                              use_td3=False,
                              policy_update_freq=1,
                              actor_lr=1e-4)

    replay_buffer_var = contrib_eager_python_tfe.Variable('',
                                                          name='replay_buffer')
    gym_random_state_var = contrib_eager_python_tfe.Variable(
        '', name='gym_random_state')
    np_random_state_var = contrib_eager_python_tfe.Variable(
        '', name='np_random_state')
    py_random_state_var = contrib_eager_python_tfe.Variable(
        '', name='py_random_state')

    saver = contrib_eager_python_tfe.Saver(
        model.variables + [replay_buffer_var] +
        [gym_random_state_var, np_random_state_var, py_random_state_var])
    tf.gfile.MakeDirs(FLAGS.save_dir)

    reward_scale = contrib_eager_python_tfe.Variable(1, name='reward_scale')
    eval_saver = contrib_eager_python_tfe.Saver(model.actor.variables +
                                                [reward_scale])
    tf.gfile.MakeDirs(FLAGS.eval_save_dir)

    last_checkpoint = tf.train.latest_checkpoint(FLAGS.save_dir)
    if last_checkpoint is None:
        replay_buffer = ReplayBuffer()
        total_numsteps = 0
        prev_save_timestep = 0
        prev_eval_save_timestep = 0
    else:
        saver.restore(last_checkpoint)
        replay_buffer = pickle.loads(zlib.decompress(
            replay_buffer_var.numpy()))
        total_numsteps = int(last_checkpoint.split('-')[-1])
        assert len(replay_buffer) == total_numsteps
        prev_save_timestep = total_numsteps
        prev_eval_save_timestep = total_numsteps
        env.unwrapped.np_random.set_state(
            pickle.loads(gym_random_state_var.numpy()))
        np.random.set_state(pickle.loads(np_random_state_var.numpy()))
        random.setstate(pickle.loads(py_random_state_var.numpy()))

    with summary_writer.as_default():
        while total_numsteps < FLAGS.training_steps:
            rollout_reward, rollout_timesteps = do_rollout(
                env,
                model.actor,
                replay_buffer,
                noise_scale=FLAGS.exploration_noise,
                rand_actions=rand_actions)
            total_numsteps += rollout_timesteps

            logging.info('Training: total timesteps %d, episode reward %f',
                         total_numsteps, rollout_reward)

            print('Training: total timesteps {}, episode reward {}'.format(
                total_numsteps, rollout_reward))

            with contrib_summary.always_record_summaries():
                contrib_summary.scalar('reward',
                                       rollout_reward,
                                       step=total_numsteps)
                contrib_summary.scalar('length',
                                       rollout_timesteps,
                                       step=total_numsteps)

            if len(replay_buffer) >= FLAGS.min_samples_to_start:
                for _ in range(rollout_timesteps):
                    time_step = replay_buffer.sample(
                        batch_size=FLAGS.batch_size)
                    batch = TimeStep(*zip(*time_step))
                    model.update(batch)

                if total_numsteps - prev_save_timestep >= FLAGS.save_interval:
                    replay_buffer_var.assign(
                        zlib.compress(pickle.dumps(replay_buffer)))
                    gym_random_state_var.assign(
                        pickle.dumps(env.unwrapped.np_random.get_state()))
                    np_random_state_var.assign(
                        pickle.dumps(np.random.get_state()))
                    py_random_state_var.assign(pickle.dumps(random.getstate()))

                    saver.save(os.path.join(FLAGS.save_dir, 'checkpoint'),
                               global_step=total_numsteps)
                    prev_save_timestep = total_numsteps

                if total_numsteps - prev_eval_save_timestep >= FLAGS.eval_save_interval:
                    eval_saver.save(os.path.join(FLAGS.eval_save_dir,
                                                 'checkpoint'),
                                    global_step=total_numsteps)
                    prev_eval_save_timestep = total_numsteps
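
Note: the checkpointing trick above stores the whole replay buffer inside a string variable by pickling and zlib-compressing it. The round trip, isolated:

import pickle
import zlib

def serialize_buffer(replay_buffer):
    # Compress the pickled buffer so it fits in a checkpointed string variable.
    return zlib.compress(pickle.dumps(replay_buffer))

def deserialize_buffer(blob):
    return pickle.loads(zlib.decompress(blob))
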
Exemple #24
0
def calc_po_best_response_PER(poacher, target_poacher, po_copy_op, po_good_copy_op, patrollers, pa_s, pa_type,
       iteration, sess, env, args, final_utility, starting_e, train_episode_num = None):
    '''
    Given a list of patrollers and their types (DQN, PARAM, RS),
    train a DQN poacher as the approximate best response.
    Args:
        poacher: DQN poacher
        target_poacher: target DQN poacher
        po_copy_op: tensorflow copy operations, copy the weights from the DQN to the target DQN
        po_good_copy_op: tensorflow copy operations, save the best poacher DQN trained so far
        patrollers: a list of patrollers
        pa_s: the patroller mixed strategy over the list of patrollers
        pa_type: a list specifying the type of each patroller, {'DQN', 'PARAM', 'RS'}
        iteration: the current DO iteration
        sess: tensorflow session
        env: the game environment
        args: training arguments
        final_utility: records the best response utility
        starting_e: the training episode to start from
    Return:
        Nothing is explicitly returned due to multithreading.
        The best response utility is returned in $final_utility$
        The best response DQN is copied through the $po_good_copy_op$
    '''

    #print('FIND_poacher_best_response iteration: ' + str(iteration))
    if train_episode_num is None:
        train_episode_num = args.po_episode_num

    decrease_time = 1.0 / args.epsilon_decrease
    epsilon_decrease_every = train_episode_num // decrease_time

    if not args.PER:
        replay_buffer = ReplayBuffer(args, args.po_replay_buffer_size)
    else:
        replay_buffer = PERMemory(args)
    pa_strategy = pa_s
    best_utility = -10000.0
    test_utility = []

    if starting_e == 0:
        log = open(args.save_path + 'po_log_train_iter_' + str(iteration) + '.dat', 'w')
        test_log = open(args.save_path + 'po_log_test_iter_' + str(iteration) +  '.dat', 'w')
    else:
        log = open(args.save_path + 'po_log_train_iter_' + str(iteration) + '.dat', 'a')
        test_log = open(args.save_path + 'po_log_test_iter_' + str(iteration) +  '.dat', 'a')

    epsilon = 1.0
    learning_rate = args.po_initial_lr
    global_step = 0
    action_id = {
        ('still', 1): 0,
        ('up', 0): 1,
        ('down', 0): 2,
        ('left', 0): 3,
        ('right', 0): 4
    }

    sess.run(po_copy_op)

    for e in range(starting_e, starting_e + train_episode_num):
        if e > 0 and e % epsilon_decrease_every == 0:
            epsilon = max(0.1, epsilon - args.epsilon_decrease)
        if e % args.mix_every_episode == 0 or e == starting_e:
            pa_chosen_strat = np.argmax(np.random.multinomial(1, pa_strategy))
            patroller = patrollers[pa_chosen_strat]
            type = pa_type[pa_chosen_strat]
        # if args.gui == 1 and e > 0 and e % args.gui_every_episode == 0:
        #     test_gui(poacher, patroller, sess, args, pah = heurestic_flag, poh = False)

        ### reset the environment
        poacher.reset_snare_num()
        pa_state, po_state = env.reset_game()
        episode_reward = 0.0
        pa_action = 'still'

        for t in range(args.max_time):
            global_step += 1
            transition = []

            ### transition adds current state
            transition.append(po_state)

            ### poacher chooses an action, if it has not been caught/returned home
            if not env.catch_flag and not env.home_flag: 
                po_state = np.array([po_state])
                snare_flag, po_action = poacher.infer_action(sess=sess, states=po_state, policy="epsilon_greedy", epsilon=epsilon, po_loc=env.po_loc, animal_density=env.animal_density)
            else:
                snare_flag = True
                po_action = 'still'
            
            transition.append(action_id[(po_action, snare_flag)])

            ### patroller chooses an action
            ### Note that heuristic and DQN agent has different APIs
            if type == 'DQN':
                pa_state = np.array([pa_state])  # Make it 2-D, i.e., [batch_size(1), state_size]
                pa_action = patroller.infer_action(sess=sess, states=pa_state, policy="greedy", pa_loc=env.pa_loc, animal_density=env.animal_density)
            elif type == 'PARAM':
                pa_loc = env.pa_loc
                pa_action = patroller.infer_action(pa_loc, env.get_local_po_trace(pa_loc), 1.5, -2.0, 8.0)
            elif type == 'RS':
                pa_loc = env.pa_loc
                footprints = []
                actions = ['up', 'down', 'left', 'right']
                for i in range(4,8):
                    if env.po_trace[pa_loc[0], pa_loc[1]][i] == 1:
                        footprints.append(actions[i - 4])
                pa_action = patroller.infer_action(pa_loc, pa_action, footprints)


            pa_state, _, po_state, po_reward, end_game = \
              env.step(pa_action, po_action, snare_flag)

           
            ### transition adds reward, and the new state
            transition.append(po_reward)
            transition.append(po_state)
           
            episode_reward += po_reward
            
            ### Add transition to replay buffer
            replay_buffer.add_transition(transition)

            ### Start training
            ### Sample a minibatch
            if replay_buffer.size >= args.batch_size:

                if not args.PER:
                    train_state, train_action, train_reward, train_new_state = \
                        replay_buffer.sample_batch(args.batch_size)
                else:
                    train_state, train_action, train_reward,train_new_state, \
                      idx_batch, weight_batch = replay_buffer.sample_batch(args.batch_size)

                ### Double DQN get target
                max_index = poacher.get_max_q_index(sess=sess, states=train_new_state)
                max_q = target_poacher.get_q_by_index(sess=sess, states=train_new_state, index=max_index)

                q_target = train_reward + args.reward_gamma * max_q

                if args.PER:
                    q_pred = sess.run(poacher.output, {poacher.input_state: train_state})
                    q_pred = q_pred[np.arange(args.batch_size), train_action]
                    TD_error_batch = np.abs(q_target - q_pred)
                    replay_buffer.update(idx_batch, TD_error_batch)

                if not args.PER:
                    weight = np.ones(args.batch_size) 
                else:
                    weight = weight_batch 

                ### Update parameter
                feed = {
                    poacher.input_state: train_state,
                    poacher.actions: train_action,
                    poacher.q_target: q_target,
                    poacher.learning_rate: learning_rate,
                    poacher.loss_weight: weight
                }
                sess.run(poacher.train_op, feed_dict=feed)

            ### Update target network
            if global_step > 0 and global_step % args.target_update_every == 0:
                sess.run(po_copy_op)

            ### game ends: 1) the patroller catches the poacher and removes all the snares; 
            ###            2) the maximum time step is achieved
            if end_game or (t == args.max_time - 1):
                info = str(e) + "\tepisode\t%s\tlength\t%s\ttotal_reward\t%s\taverage_reward\t%s" % \
                       (e, t + 1, episode_reward, 1. * episode_reward / (t + 1))
                if  e % args.print_every == 0:
                    log.write(info + '\n')
                    print('po ' + info)
                    #log.flush()
                break

        ### save model
        if  e > 0 and e % args.save_every_episode == 0 or e == train_episode_num - 1:
            save_name = args.save_path + 'iteration_' + str(iteration) +  '_epoch_'+ str(e) +  "_po_model.ckpt"
            poacher.save(sess=sess, filename=save_name)
            #print('Save model to ' + save_name)

        ### test 
        if e == train_episode_num - 1 or ( e > 0 and e % args.test_every_episode  == 0):
            po_utility = 0.0
            test_total_reward = np.zeros(len(pa_strategy))

            ### test against each patroller strategy in the current strategy set
            for pa_strat in range(len(pa_strategy)):
                if pa_strategy[pa_strat] > 1e-10:
                    _, test_total_reward[pa_strat], _ = test_(patrollers[pa_strat], poacher, \
                        env, sess,args, iteration, e, poacher_type = 'DQN', patroller_type = pa_type[pa_strat])
                    po_utility += pa_strategy[pa_strat] * test_total_reward[pa_strat]

            test_utility.append(po_utility)

            if po_utility > best_utility and (e > min(50000, train_episode_num / 2) or args.row_num == 3):
                best_utility = po_utility
                sess.run(po_good_copy_op)
                final_utility[1] = po_utility
            
            info = [str(po_utility)] + [str(x)  for x in test_total_reward]
            info = 'test   '  + str(e) + '   ' +  '\t'.join(info) + '\n'
            #print('reward is: ', info)
            print('po ' + info)
            test_log.write(info)
            test_log.flush()

    test_log.close()
    log.close()
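
Note: the training step above uses Double DQN targets: the online network picks the argmax action and the target network evaluates it. A compact NumPy sketch of that computation, taking Q-value arrays as inputs instead of the session calls used above:

import numpy as np

def double_dqn_targets(rewards, q_online_next, q_target_next, gamma):
    # a* = argmax_a Q_online(s', a); target = r + gamma * Q_target(s', a*).
    q_online_next = np.asarray(q_online_next)
    q_target_next = np.asarray(q_target_next)
    best_actions = np.argmax(q_online_next, axis=1)
    max_q = q_target_next[np.arange(len(best_actions)), best_actions]
    return np.asarray(rewards) + gamma * max_q
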
Exemple #25
0
def cbf(
        rank,
        env,
        sess,
        env_name,
        seed,
        debug,
        tensorboard,
        idf,
        replay_size,  # size of replay buffer
        batch_size,  # size of minibatch
        n_timesteps,  # number of timesteps
        len_rollouts,  # length of each rollout
        n_optimizations,  # number of optimization steps
        embedding_space_size,  # size of embeddings
        learning_rate,  # learning rate of forward dynamics
        joint_training=False,
        using_extrinsic_reward=False):
    # Initialize models
    emb = CnnEmbedding("embedding", env.observation_space, env.action_space,
                       embedding_space_size)
    fd = ForwardDynamics(
        "forward_dynamics", embedding_space_size,
        env.action_space) if not using_extrinsic_reward else None
    idf = InverseDynamics("inverse_dynamics", env.observation_space,
                          env.action_space, embedding_space_size,
                          emb) if idf else None
    policy = Policy("policy_new",
                    env.action_space,
                    joint_training,
                    emb_size=embedding_space_size,
                    emb_network=emb)
    ppo = PPO(
        env,
        policy,
        emb_network=emb,
        emb_size=embedding_space_size,
        max_timesteps=int(n_timesteps),
        clip_param=0.2,
        entcoeff=0.001,
        optim_epochs=8,
        optim_stepsize=1e-3,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
        joint_training=joint_training,
    )

    if tensorboard:
        merged_summary_op = tf.summary.merge_all()
        writer = tf.summary.FileWriter('tmp/tensorflow/', sess.graph)

    if not debug and rank == 0:
        cur_time = datetime.datetime.today().strftime('%Y_%m_%d_%H_%M_%S')
        directory = 'results/' + cur_time
        if not os.path.exists(directory):
            os.makedirs(directory)
        directory_m = 'model/' + cur_time
        if not os.path.exists(directory_m):
            os.makedirs(directory_m)

        txt = 'Running with env:%s, seed:%s, num timesteps:%s, joint-training:%s, using-extrinsic-reward:%s\n\n' \
               % (env_name, seed, n_timesteps, joint_training, using_extrinsic_reward)
        txt += 'Hyperparameters:\n - replay size:%s\n - batch size:%s\n - length of rollout:%s\n - number of optimization steps:%s\n - ' \
               % (replay_size, batch_size, len_rollouts, n_optimizations)
        txt += 'size of embedding:%s\n - learning rate of forward dynamics:%s\n\n' \
               % (embedding_space_size, learning_rate)
        txt += 'For inference on model, run:\n'
        txt += 'python3 cbf.py --env %s --seed %s --joint-training %s --using-extrinsic-reward %s ' \
               % (env_name, seed, joint_training, using_extrinsic_reward)
        txt += '--inference True --path-to-model %s' % (directory_m +
                                                        '/model.ckpt')
        with open(directory + '/info.txt', 'w+') as txt_file:
            txt_file.write(txt)

    replay_memory = ReplayBuffer(replay_size)
    sess = tf.get_default_session()
    # sess.run(tf.global_variables_initializer())

    saver = tf.train.Saver()

    t = 0
    i = 0

    # initialize optimization batch variables
    a = env.action_space.sample()  # not used, just so we have the datatype
    done = True  # marks if we're on first timestep of an episode
    s = env.reset()

    cur_ep_ret = 0  # return in current episode
    cur_ep_len = 0  # len of current episode
    ep_rets = []  # returns of completed episodes in this segment
    ep_lens = []  # lengths of ...

    # Initialize history arrays
    if joint_training:
        s_arr = np.array([np.zeros([84, 84, 4]) for _ in range(len_rollouts)])
    else:
        s_arr = np.array(
            [np.zeros(embedding_space_size) for _ in range(len_rollouts)])
    r_arr = np.zeros(len_rollouts, 'float32')
    vpreds = np.zeros(len_rollouts, 'float32')
    dones = np.zeros(len_rollouts, 'int32')
    a_arr = np.array([a for _ in range(len_rollouts)])

    # For graphing
    best_reward = -float("inf")
    cur_reward = 0
    graph_rewards = []
    graph_best_rewards = []
    graph_epi_lens = []
    graph_in_rewards = []
    graph_avg_rewards = []
    graph_avg_epi_lens = []
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths

    while True:
        for j in range(len_rollouts):
            if not debug and rank == 0 and t > 0 and t % int(1e3) == 0:
                # print('# frame: %i. Best reward so far: %i.' % (t, best_reward,))
                save_to_file(directory, env_name, graph_rewards,
                             graph_best_rewards, graph_epi_lens,
                             graph_in_rewards, graph_avg_rewards,
                             graph_avg_epi_lens)

                save_path = saver.save(sess, directory_m + '/model.ckpt')
                save_path = saver.save(sess, 'model/model.ckpt')
                #print("Model saved in file: %s" % save_path)

            if tensorboard and rank == 0 and t > 0 and t % int(1e3) == 0:
                summary = sess.run(merged_summary_op)
                writer.add_summary(summary, i)

            s = np.array(s)
            obs1 = emb.embed([s])
            if joint_training:
                a, vpred = policy.act([s])
            else:
                a, vpred = policy.act(obs1)

            # update optimization batch variables
            idx = t % len_rollouts
            s_arr[idx] = s if joint_training else obs1
            vpreds[idx] = vpred
            dones[idx] = done
            a_arr[idx] = a

            s_, ext_r, done, _ = env.step(a)

            cur_reward += ext_r

            s_ = np.array(s_)

            # compute intrinsic reward
            obs2 = emb.embed([s_])
            r = (fd.get_loss(obs1, obs2, np.eye(env.action_space.n)[a])
                 if not using_extrinsic_reward else ext_r)

            replay_memory.add(s, a, r, s_, done)
            if t > 0 and t % int(2e2) == 0:
                graph_in_rewards.append((r, i))

            # update optimization batch variables
            r_arr[idx] = r
            cur_ep_ret += r
            cur_ep_len += 1

            # Prepare for next step
            if done:
                rewbuffer.append(cur_reward)
                lenbuffer.append(cur_ep_len)
                graph_rewards.append((cur_reward, i))
                graph_epi_lens.append((cur_ep_len, i))
                ep_rets.append(cur_reward)
                ep_lens.append(cur_ep_len)
                cur_ep_ret = 0
                cur_ep_len = 0
                if cur_reward > best_reward:
                    best_reward = cur_reward
                graph_best_rewards.append((best_reward, i))
                graph_avg_rewards.append((sum(rewbuffer) / len(rewbuffer), i))
                graph_avg_epi_lens.append((sum(lenbuffer) / len(lenbuffer), i))
                cur_reward = 0
                s = env.reset()
            else:
                s = s_
            t += 1
            i += 1
        ppo.prepare({
            "ob": s_arr,
            "rew": r_arr,
            "vpred": vpreds,
            "new": dones,
            "ac": a_arr,
            "nextvpred": vpred * (1 - done),
            "ep_rets": ep_rets,
            "ep_lens": ep_lens
        })
        ep_rets = []
        ep_lens = []
        for j in range(n_optimizations):
            # optimize theta_pi (and optionally theta_phi) wrt PPO loss
            ppo.step()
            # sample minibatch M from replay buffer R
            states, actions, rewards, next_states, _ = replay_memory.sample(
                batch_size)
            obs1, obs2 = emb.embed(states), emb.embed(
                next_states)  # embedding of states
            actions_hot = np.squeeze(
                [np.eye(env.action_space.n)[action] for action in actions])
            # optimize theta_f wrt forward dynamics loss on minibatch
            if not using_extrinsic_reward:
                fd.train(obs1, obs2, actions_hot, learning_rate)
            # optionally optimize theta_phi, theta_g wrt auxiliary loss
            if idf: idf.train(states, next_states, actions_hot, learning_rate)
        ppo.log()
        if ppo.timesteps_so_far >= n_timesteps:
            break
        i = ppo.timesteps_so_far

    if not debug and rank == 0:
        save_to_file(directory, env_name, graph_rewards, graph_best_rewards,
                     graph_epi_lens, graph_in_rewards, graph_avg_rewards,
                     graph_avg_epi_lens)
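
In cbf above, the intrinsic reward is the forward-dynamics prediction error in embedding space (fd.get_loss). A rough, self-contained sketch of that idea with a toy linear model; the names ToyForwardDynamics and intrinsic_reward are illustrative and not taken from the example:

import numpy as np

class ToyForwardDynamics:
    """Toy linear forward model: predicts phi(s') from [phi(s), one_hot(a)].
    The squared prediction error doubles as a curiosity bonus."""

    def __init__(self, emb_size, n_actions, lr=1e-3):
        self.W = np.zeros((emb_size + n_actions, emb_size))
        self.lr = lr

    def _predict(self, obs_emb, action_onehot):
        return np.concatenate([obs_emb, action_onehot]) @ self.W

    def intrinsic_reward(self, obs_emb, next_obs_emb, action_onehot):
        err = self._predict(obs_emb, action_onehot) - next_obs_emb
        return float(np.mean(err ** 2))

    def train(self, obs_emb, next_obs_emb, action_onehot):
        # one SGD step on 0.5 * ||prediction - phi(s')||^2
        x = np.concatenate([obs_emb, action_onehot])
        err = self._predict(obs_emb, action_onehot) - next_obs_emb
        self.W -= self.lr * np.outer(x, err)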
Example #26
def calc_pa_best_response_PER(patroller, target_patroller, pa_copy_op, pa_good_copy_op, poachers, po_strategy, po_type,
        iteration, sess, env, args, final_utility, starting_e, train_episode_num = None, po_locations = None):
    
    '''
    po_locations: None in the purely global mode; otherwise (the local + global
        retrain mode), each entry of po_locations specifies the local mode of the
        corresponding poacher.
    Everything else is essentially the same as in 'calc_po_best_response_PER'.
    '''

    po_location = None

    #print('FIND_patroller_best_response iteration: ' + str(iteration))
    if train_episode_num is None:
        train_episode_num = args.pa_episode_num

    decrease_time = 1.0 / args.epsilon_decrease
    epsilon_decrease_every = train_episode_num // decrease_time

    if not args.PER:
        replay_buffer = ReplayBuffer(args, args.pa_replay_buffer_size)
    else:
        replay_buffer = PERMemory(args)
    best_utility = -10000.0
    test_utility = []

    if starting_e == 0:
        log = open(args.save_path + 'pa_log_train_iter_' + str(iteration) + '.dat', 'w')
        test_log = open(args.save_path + 'pa_log_test_iter_' + str(iteration) +  '.dat', 'w')
    else:
        log = open(args.save_path + 'pa_log_train_iter_' + str(iteration) + '.dat', 'a')
        test_log = open(args.save_path + 'pa_log_test_iter_' + str(iteration) +  '.dat', 'a')

    epsilon = 1.0
    learning_rate = args.po_initial_lr
    global_step = 0
    action_id = {
        'still': 0,
        'up': 1,
        'down': 2,
        'left': 3,
        'right': 4
    }

    sess.run(pa_copy_op)

    for e in range(starting_e, starting_e + train_episode_num):
        if e > 0 and e % epsilon_decrease_every == 0:
            epsilon = max(0.1, epsilon - args.epsilon_decrease)
        if e % args.mix_every_episode == 0 or e == starting_e:
            po_chosen_strat = np.argmax(np.random.multinomial(1, po_strategy))
            poacher = poachers[po_chosen_strat]
            type = po_type[po_chosen_strat]
            if po_locations is not None: # local + global mode, need to set the poacher's local mode
                po_location = po_locations[po_chosen_strat]


        ### reset the environment
        poacher.reset_snare_num()
        pa_state, po_state = env.reset_game(po_location)
        episode_reward = 0.0
        pa_action = 'still'

        for t in range(args.max_time):
            global_step += 1

            ### transition records the (s,a,r,s) tuples
            transition = []

            ### poacher chooses an action
            ### heuristic and DQN agents have different infer_action APIs, hence the branching below
            if type == 'DQN':
                if not env.catch_flag and not env.home_flag: # the poacher can still act while it is neither caught nor back home
                    po_state = np.array([po_state])
                    snare_flag, po_action = poacher.infer_action(sess=sess, states=po_state, policy="greedy", po_loc=env.po_loc, animal_density=env.animal_density)
                else: ### once caught (or back home), it simply stays still and does nothing
                    snare_flag = 0
                    po_action = 'still'
            elif type == 'PARAM':
                po_loc = env.po_loc
                if not env.catch_flag and not env.home_flag: 
                    snare_flag, po_action = poacher.infer_action(loc=po_loc,
                                                                local_trace=env.get_local_pa_trace(po_loc),
                                                                local_snare=env.get_local_snare(po_loc),
                                                                initial_loc=env.po_initial_loc)
                else:
                    snare_flag = 0
                    po_action = 'still'

            ### transition appends the current state
            transition.append(pa_state)

            ### patroller chooses an action
            pa_state = np.array([pa_state])
            pa_action = patroller.infer_action(sess=sess, states=pa_state, policy="epsilon_greedy", epsilon=epsilon, pa_loc=env.pa_loc, animal_density=env.animal_density)

            ### transition adds action
            transition.append(action_id[pa_action])

            ### the game moves on a step.
            pa_state, pa_reward, po_state, _, end_game = \
              env.step(pa_action, po_action, snare_flag)

            ### transition adds reward and the next state 
            episode_reward += pa_reward
            transition.append(pa_reward)
            transition.append(pa_state)

            ### Add transition to replay buffer
            replay_buffer.add_transition(transition)

            ### Start training
            ### Sample a minibatch once the replay buffer contains at least one full batch
            if replay_buffer.size >= args.batch_size:
                if not args.PER:
                    train_state, train_action, train_reward, train_new_state = \
                        replay_buffer.sample_batch(args.batch_size)
                else:
                    train_state, train_action, train_reward, train_new_state, \
                      idx_batch, weight_batch = replay_buffer.sample_batch(args.batch_size)

                ### Double DQN get target
                max_index = patroller.get_max_q_index(sess=sess, states=train_new_state)
                max_q = target_patroller.get_q_by_index(sess=sess, states=train_new_state, index=max_index)
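                ### (Double DQN: the online network picks the argmax action and the target
                ###  network evaluates it, which reduces overestimation of the Q targets)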

                q_target = train_reward + args.reward_gamma * max_q

                if args.PER:
                    q_pred = sess.run(patroller.output, {patroller.input_state: train_state})
                    q_pred = q_pred[np.arange(args.batch_size), train_action]
                    TD_error_batch = np.abs(q_target - q_pred)
                    replay_buffer.update(idx_batch, TD_error_batch)
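                    ### priorities are refreshed with the absolute TD error so poorly fit
                    ### transitions are replayed more often; weight_batch below holds the
                    ### importance-sampling weights that correct the induced sampling bias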

                if not args.PER:
                    weight = np.ones(args.batch_size)
                else:
                    weight = weight_batch 

                ### Update parameter
                feed = {
                    patroller.input_state: train_state,
                    patroller.actions: train_action,
                    patroller.q_target: q_target,
                    patroller.learning_rate: learning_rate,
                    patroller.weight_loss: weight
                }
                sess.run(patroller.train_op, feed_dict=feed)

            ### Update target network
            if global_step % args.target_update_every == 0:
                sess.run(pa_copy_op)

            ### game ends: 1) the patroller catches the poacher and removes all the snares; 
            ###            2) the maximum time step is achieved
            if end_game or (t == args.max_time - 1):
                info = str(e) + "\tepisode\t%s\tlength\t%s\ttotal_reward\t%s\taverage_reward\t%s" % \
                       (e, t + 1, episode_reward, 1. * episode_reward / (t + 1))
                if e % args.print_every == 0:
                    log.write(info + '\n')
                    print('pa ' + info)
                    # log.flush()
                break


        ### save the models, and test if they are good 
        if (e > 0 and e % args.save_every_episode == 0) or e == train_episode_num - 1:
            save_name = args.save_path + 'iteration_' + str(iteration) + '_epoch_' + str(e) + '_pa_model.ckpt'
            patroller.save(sess=sess, filename=save_name)

        ### test the agent
        if e == train_episode_num - 1 or (e > 0 and e % args.test_every_episode == 0):
            ### test against each strategy the poacher is using now, compute the expected utility
            pa_utility = 0.0
            test_total_reward = np.zeros(len(po_strategy))
            for po_strat in range(len(po_strategy)):
                if po_strategy[po_strat] > 1e-10:
                    if po_locations is None: ### indicates the purely global mode
                        tmp_po_location = None
                    else: ### indicates the local + global retrain mode, needs to set poacher mode
                        tmp_po_location = po_locations[po_strat]
                    test_total_reward[po_strat], _, _ = test_(patroller, poachers[po_strat], \
                            env, sess, args, iteration, e, patroller_type='DQN', poacher_type=po_type[po_strat],
                                po_location=tmp_po_location)
                    ### update the expected utility
                    pa_utility += po_strategy[po_strat] * test_total_reward[po_strat]

            test_utility.append(pa_utility)

            if pa_utility > best_utility and (e > min(50000, train_episode_num / 2) or args.row_num == 3):
                best_utility = pa_utility
                sess.run(pa_good_copy_op)
                final_utility[0] = pa_utility

            info = [str(pa_utility)] + [str(x) for x in test_total_reward]
            info = 'test  ' + str(e) + '   ' + '\t'.join(info) + '\n'
            #print('reward is: ', info)
            print('pa ' + info)
            test_log.write(info)
            test_log.flush()

    test_log.close()
    log.close()
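
Both best-response trainers above feed a per-sample weight into the DQN loss (patroller.weight_loss and the poacher equivalent): all ones without PER, importance-sampling weights with PER. A small sketch of what such a weighted TD loss presumably looks like (the function name weighted_td_loss is ours):

import numpy as np

def weighted_td_loss(q_pred, q_target, is_weights):
    """Importance-sampling-weighted squared TD error; with is_weights all ones
    this reduces to the ordinary mean squared TD loss used when PER is off."""
    q_pred = np.asarray(q_pred, dtype=float)
    q_target = np.asarray(q_target, dtype=float)
    is_weights = np.asarray(is_weights, dtype=float)
    return float(np.mean(is_weights * (q_target - q_pred) ** 2))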
Example #27
def train(env, model, max_steps, name, logdir, logger):
    target_model = create_model(env)
    replay = ReplayBuffer(REPLAY_BUFFER_SIZE)
    done = True
    episode = 0
    steps_after_logging = 0
    loss = 0.0
    for step in range(1, max_steps + 1):
        try:
            if step % SNAPSHOT_EVERY == 0:
                save_model(model, step, logdir, name)
            if done:
                if episode > 0:
                    if steps_after_logging >= LOG_EVERY:
                        steps_after_logging = 0
                        episode_end = time()
                        episode_seconds = episode_end - episode_start
                        episode_steps = step - episode_start_step
                        steps_per_second = episode_steps / episode_seconds
                        memory = psutil.virtual_memory()
                        to_gb = lambda in_bytes: in_bytes / 1024 / 1024 / 1024
                        print("episode {} "
                              "steps {}/{} "
                              "loss {:.7f} "
                              "return {} "
                              "in {:.2f}s "
                              "{:.1f} steps/s "
                              "{:.1f}/{:.1f} GB RAM".format(
                                  episode,
                                  episode_steps,
                                  step,
                                  loss,
                                  episode_return,
                                  episode_seconds,
                                  steps_per_second,
                                  to_gb(memory.used),
                                  to_gb(memory.total),
                              ))
                        logger.log_scalar('episode_return', episode_return,
                                          step)
                        logger.log_scalar('episode_steps', episode_steps, step)
                        logger.log_scalar('episode_seconds', episode_seconds,
                                          step)
                        logger.log_scalar('steps_per_second', steps_per_second,
                                          step)
                        logger.log_scalar('memory_used', to_gb(memory.used),
                                          step)
                        logger.log_scalar('loss', loss, step)
                episode_start = time()
                episode_start_step = step
                obs = env.reset()
                episode += 1
                episode_return = 0.0
            else:
                obs = next_obs

            action = epsilon_greedy_action(env,
                                           model,
                                           obs,
                                           epsilon=TRAIN_EPSILON)
            next_obs, reward, done, _ = env.step(action)
            episode_return += reward
            replay.add(obs, action, reward, next_obs, done)

            if step >= TRAIN_START:
                if step % TARGET_UPDATE_EVERY == 0:
                    target_model.set_weights(model.get_weights())
                batch = replay.sample(BATCH_SIZE)
                loss = fit_batch(env, model, target_model, batch)
            if step == Q_VALIDATION_SIZE:
                q_validation_observations, _, _, _, _ = replay.sample(
                    Q_VALIDATION_SIZE)
            if step >= TRAIN_START and step % EVAL_EVERY == 0:
                episode_return_avg = evaluate(env, model)
                q_values = predict(env, model, q_validation_observations)
                max_q_values = np.max(q_values, axis=1)
                avg_max_q_value = np.mean(max_q_values)
                print("episode {} "
                      "step {} "
                      "episode_return_avg {:.3f} "
                      "avg_max_q_value {:.3f}".format(
                          episode,
                          step,
                          episode_return_avg,
                          avg_max_q_value,
                      ))
                logger.log_scalar('episode_return_avg', episode_return_avg,
                                  step)
                logger.log_scalar('avg_max_q_value', avg_max_q_value, step)
            steps_after_logging += 1
        except KeyboardInterrupt:
            save_model(model, step, logdir, name)
            break
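
epsilon_greedy_action is called above but its body is not shown. A plausible minimal implementation, assuming a discrete Gym action space and a Keras-style model that maps a batch of observations to per-action Q-values (both are assumptions, not guaranteed by the example):

import numpy as np

def epsilon_greedy_action(env, model, obs, epsilon):
    """With probability epsilon take a random action, otherwise the greedy
    action under the model's predicted Q-values for this observation."""
    if np.random.rand() < epsilon:
        return env.action_space.sample()
    q_values = model.predict(np.array([obs]))[0]
    return int(np.argmax(q_values))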
Example #28
def main():

#    env = envstandalone.BallCatch()
    env = envstandalone.TestRob3Env()
    
    max_timesteps=40000
    learning_starts=1000
    buffer_size=50000
#    buffer_size=1000
    exploration_fraction=0.2
    exploration_final_eps=0.02
    print_freq=10
    gamma=.98
    target_network_update_freq=500
    learning_alpha = 0.2
    
    batch_size=32
    train_freq=1

    obsShape = (8,8,1)
    deicticShape = (3,3,1)
    num_deictic_patches=36

    num_actions = 4
    episode_rewards = [0.0]
    num_cpu=16

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # same as getDeictic except this one just calculates for the observation
    # input: n x n x channels
    # output: dn x dn x channels
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        deicticObs = []
        for i in range(np.shape(obs)[0] - windowLen + 1):
            for j in range(np.shape(obs)[1] - windowLen + 1):
                deicticObs.append(obs[i:i+windowLen,j:j+windowLen,:])
        return np.array(deicticObs)

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
#        convs=[(16,3,1)],
        convs=[(16,2,1)],
#        convs=[(32,3,1)],
        hiddens=[16],
#        hiddens=[64],
#        dueling=True
        dueling=False
    )

    q_func=model
#    lr=1e-3
    lr=0.001
    
    def make_obs_ph(name):
#        return U.BatchInput(deicticShape, name=name)
        return U.BatchInput(obsShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq, targetTrain = build_graph.build_train_nodouble(
        make_obs_ph=make_obs_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        grad_norm_clipping=10,
        double_q=False
    )

    # Initialize the parameters and copy them to the target network.
    U.initialize()

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        # Get current q-values: neural network version        
        qCurr = getq(np.array([obs]))
        
        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise,1)
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        
#        # debug
#        if t > 5000:
#            print("obs:\n" + str(np.squeeze(obs)))
#            print("qCurr:\n" + str(qCurr))
#            print("action: " + str(action) + ", patch: " + str(selPatch))
#            print("close:\n" + str(obsDeictic[selPatch,:,:,0] + obsDeictic[selPatch,:,:,1]))
#            print("far:\n" + str(obsDeictic[selPatch,:,:,2] + obsDeictic[selPatch,:,:,3]))
#            action
            
        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
            actions = np.int32(np.reshape(actions,[batch_size,]))
            
            # Get curr, next values: neural network version
            qNext = getq(obses_tp1)
            qCurr = getq(obses_t)

            # Get targets
            qNextmax = np.max(qNext,1)
            targets = rewards + (1-dones) * gamma * qNextmax

            qCurrTargets = np.zeros(np.shape(qCurr))
            for i in range(num_actions):
                myActions = actions == i
                qCurrTargets[:,i] = myActions * targets + (1 - myActions) * qCurr[:,i]
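                # only the column of the action actually taken is moved toward the TD target;
                # the other action columns keep their current predictions, so the regression
                # below leaves them effectively unchanged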
            
            # Update values: neural network version
            td_error_out, obses_out, targets_out = targetTrain(
                    obses_t,
                    qCurrTargets
                    )

            td_error_pre = qCurr[range(batch_size),actions] - targets
            
#            print("td error pre-update: " + str(np.linalg.norm(td_error_pre)))

            # neural network version
            qCurr = getq(obses_t)
            
            td_error_post = qCurr[range(batch_size),actions] - targets
#            print("td error post-update: " + str(np.linalg.norm(td_error_post)))

                
        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
#            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", max q at curr state: " + str(np.max(qCurr)))
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal
        
        obs = new_obs
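
The exploration schedule above comes from LinearSchedule; as used here, exploration.value(t) behaves like a linear interpolation that is held constant once the schedule ends. A standalone sketch of that behavior (not the library implementation):

def linear_epsilon(t, schedule_timesteps, initial_p=1.0, final_p=0.02):
    """Linearly anneal from initial_p to final_p over schedule_timesteps steps,
    then hold final_p."""
    fraction = min(float(t) / float(schedule_timesteps), 1.0)
    return initial_p + fraction * (final_p - initial_p)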
Example #29
def run_for_config(config, print_messages):
    # set the name of the model
    model_name = config['general']['name']
    now = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y_%m_%d_%H_%M_%S')
    model_name = now + '_' + model_name if model_name is not None else now

    # openrave_interface = OpenraveRLInterface(config, None)
    random_seed = config['general']['random_seed']
    np.random.seed(random_seed)
    random.seed(random_seed)
    tf.set_random_seed(random_seed)

    # where we save all the outputs (outputs will be saved according to the scenario)
    scenario = config['general']['scenario']
    working_dir = os.path.join(get_base_directory(), scenario)
    if not os.path.exists(working_dir):
        os.makedirs(working_dir)
    saver_dir = os.path.join(working_dir, 'models', model_name)
    if not os.path.exists(saver_dir):
        os.makedirs(saver_dir)
    best_model_path = None
    config_copy_path = os.path.join(working_dir, 'models', model_name,
                                    'config.yml')
    summaries_dir = os.path.join(working_dir, 'tensorboard', model_name)
    completed_trajectories_dir = os.path.join(working_dir, 'trajectories',
                                              model_name)

    # load images if required
    image_cache = None
    if _is_vision(scenario):
        image_cache = ImageCache(config['general']['params_file'],
                                 create_images=True)

    # load pretrained model if required
    pre_trained_reward = None
    if config['model']['use_reward_model']:
        reward_model_name = config['model']['reward_model_name']
        pre_trained_reward = PreTrainedReward(reward_model_name, config)

    # generate graph:
    network = Network(config,
                      is_rollout_agent=False,
                      pre_trained_reward=pre_trained_reward)

    def unpack_state_batch(state_batch):
        joints = [state[0] for state in state_batch]
        poses = {
            p.tuple: [state[1][p.tuple] for state in state_batch]
            for p in network.potential_points
        }
        jacobians = None
        return joints, poses, jacobians

    def score_for_hindsight(augmented_buffer):
        assert _is_vision(scenario)
        # unzip
        goal_pose_list, goal_joints_list, workspace_image_list, current_state_list, action_used_list, _, is_goal_list,\
        __ = zip(*augmented_buffer)
        # make one hot status vector:
        is_goal_one_hot_list = np.zeros((len(is_goal_list), 3),
                                        dtype=np.float32)
        for i in range(len(is_goal_list)):
            if is_goal_list[i]:
                is_goal_one_hot_list[i, 2] = 1.0  # mark as goal transition
            else:
                is_goal_one_hot_list[i, 0] = 1.0  # mark as free transition
        # unpack current and next state
        current_joints, _, __ = unpack_state_batch(current_state_list)

        fake_rewards, _ = pre_trained_reward.make_prediction(
            sess,
            current_joints,
            goal_joints_list,
            action_used_list,
            goal_pose_list,
            all_transition_labels=is_goal_one_hot_list)
        return list(fake_rewards)

    # initialize replay memory
    replay_buffer = ReplayBuffer(config)
    hindsight_policy = HindsightPolicy(config, replay_buffer,
                                       score_for_hindsight)

    # save model
    latest_saver = tf.train.Saver(max_to_keep=2, save_relative_paths=saver_dir)
    best_saver = tf.train.Saver(max_to_keep=2, save_relative_paths=saver_dir)
    yaml.dump(config, open(config_copy_path, 'w'))
    summaries_collector = SummariesCollector(summaries_dir, model_name)
    rollout_manager = FixedRolloutManager(config, image_cache=image_cache)
    trajectory_eval = TrajectoryEval(config, rollout_manager,
                                     completed_trajectories_dir)

    test_results = []

    def update_model(sess, global_step):
        batch_size = config['model']['batch_size']
        gamma = config['model']['gamma']
        replay_buffer_batch = replay_buffer.sample_batch(batch_size)

        goal_pose, goal_joints, workspace_id, current_state, action, reward, terminated, next_state = \
            replay_buffer_batch

        # get image from image cache
        workspace_image = None
        if image_cache is not None:
            workspace_image = [image_cache.get_image(k) for k in workspace_id]

        current_joints, _, __ = unpack_state_batch(current_state)
        next_joints, _, __ = unpack_state_batch(next_state)

        # get the predicted q value of the next state (action is taken from the target policy)
        next_state_action_target_q = network.predict_policy_q(
            next_joints,
            workspace_image,
            goal_pose,
            goal_joints,
            sess,
            use_online_network=False)

        # compute critic label
        q_label = np.expand_dims(
            np.squeeze(np.array(reward)) +
            np.multiply(np.multiply(1 - np.array(terminated), gamma),
                        np.squeeze(next_state_action_target_q)), 1)
        max_label = np.max(q_label)
        min_label = np.min(q_label)
        limit = 1.0 / (1.0 - gamma)
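        # assuming per-step rewards are bounded by 1 in magnitude (as this check implies),
        # the discounted return can never exceed 1 / (1 - gamma), the geometric-series
        # bound used for the out-of-range warnings below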
        if max_label > limit:
            print('out of range max label: {} limit: {}'.format(
                max_label, limit))
        if min_label < -limit:
            print('out of range min label: {} limit: {}'.format(
                min_label, limit))

        # # step to use for debug:
        # network.debug_all(current_joints, workspace_image, goal_pose, goal_joints, action, q_label, sess)

        # train critic given the targets
        critic_optimization_summaries, _ = network.train_critic(
            current_joints, workspace_image, goal_pose, goal_joints, action,
            q_label, sess)

        # train actor
        actor_optimization_summaries, _ = network.train_actor(
            current_joints, workspace_image, goal_pose, goal_joints, sess)

        # update target networks
        network.update_target_networks(sess)

        result = [
            critic_optimization_summaries,
            actor_optimization_summaries,
        ]
        return result

    def print_state(prefix, episodes, successful_episodes, collision_episodes,
                    max_len_episodes):
        if not print_messages:
            return
        print('{}: {}: finished: {}, successful: {} ({}), collision: {} ({}), max length: {} ({})'.format(
            datetime.datetime.fromtimestamp(
                time.time()).strftime('%Y-%m-%d %H:%M:%S'), prefix, episodes,
            successful_episodes,
            float(successful_episodes) / episodes, collision_episodes,
            float(collision_episodes) / episodes, max_len_episodes,
            float(max_len_episodes) / episodes))

    def process_example_trajectory(episode_example_trajectory,
                                   episode_agent_trajectory):
        # creates an episode from the motion-planner example trajectory by computing the actions;
        # each step is rewarded with the negative keep-alive penalty except the final (goal-reaching) step, which gets 1.0
        _, __, ___, ____, goal_pose, goal_joints, workspace_id = episode_agent_trajectory
        example_trajectory, example_trajectory_poses = episode_example_trajectory
        example_trajectory = [j[1:] for j in example_trajectory]
        # goal reached always
        status = 3
        # get the states (joints, poses, jacobians), for now, ignore the jacobians.
        states = [(example_trajectory[i], example_trajectory_poses[i], None)
                  for i in range(len(example_trajectory))]
        # compute the actions by normalized difference between steps
        actions = [
            np.array(example_trajectory[i + 1]) -
            np.array(example_trajectory[i])
            for i in range(len(example_trajectory) - 1)
        ]
        actions = [a / max(np.linalg.norm(a), 0.00001) for a in actions]

        rewards = [-config['openrave_rl']['keep_alive_penalty']
                   ] * (len(actions) - 1) + [1.0]
        return status, states, actions, rewards, goal_pose, goal_joints, workspace_id

    def do_test(sess, best_model_global_step, best_model_test_success_rate):
        rollout_manager.set_policy_weights(network.get_actor_weights(
            sess, is_online=False),
                                           is_online=False)
        eval_result = trajectory_eval.eval(
            global_step, config['test']['number_of_episodes'])
        test_episodes = eval_result[0]
        test_successful_episodes = eval_result[1]
        test_collision_episodes = eval_result[2]
        test_max_len_episodes = eval_result[3]
        test_mean_reward = eval_result[4]
        if print_messages:
            print_state('test', test_episodes, test_successful_episodes,
                        test_collision_episodes, test_max_len_episodes)
            print('test mean total reward {}'.format(test_mean_reward))
        summaries_collector.write_test_episode_summaries(
            sess, global_step, test_episodes, test_successful_episodes,
            test_collision_episodes, test_max_len_episodes)
        test_results.append(
            (global_step, episodes, test_successful_episodes,
             test_collision_episodes, test_max_len_episodes, test_mean_reward))
        # see if best
        rate = test_successful_episodes / float(test_episodes)
        if best_model_test_success_rate < rate:
            if print_messages:
                print('new best model found at step {}'.format(global_step))
                print('old success rate {} new success rate {}'.format(
                    best_model_test_success_rate, rate))
            is_best = True
            best_model_global_step = global_step
            best_model_test_success_rate = rate
        else:
            is_best = False
            if print_messages:
                print('best model still at step {}'.format(
                    best_model_global_step))
        return is_best, best_model_global_step, best_model_test_success_rate

    def do_end_of_run_validation(sess):
        # restores the model first
        best_saver.restore(sess, best_model_path)
        # set the weights
        rollout_manager.set_policy_weights(network.get_actor_weights(
            sess, is_online=False),
                                           is_online=False)
        eval_result = trajectory_eval.eval(
            -1, config['validation']['number_of_episodes'])
        test_episodes = eval_result[0]
        test_successful_episodes = eval_result[1]
        test_collision_episodes = eval_result[2]
        test_max_len_episodes = eval_result[3]
        test_mean_reward = eval_result[4]
        if print_messages:
            print_state('validation (best model)', test_episodes,
                        test_successful_episodes, test_collision_episodes,
                        test_max_len_episodes)
            print('validation (best model) mean total reward {}'.format(
                test_mean_reward))
        test_results.append(
            (-1, episodes, test_successful_episodes, test_collision_episodes,
             test_max_len_episodes, test_mean_reward))
        # see if best
        rate = test_successful_episodes / float(test_episodes)
        print('final success rate is {}'.format(rate))
        return rate

    allowed_batch_episode_editor = config['model']['batch_size'] if _is_vision(
        scenario) else None
    regular_episode_editor = EpisodeEditor(
        config['model']['alter_episode'],
        pre_trained_reward,
        image_cache=image_cache,
        allowed_batch=allowed_batch_episode_editor)
    motion_planner_episode_editor = EpisodeEditor(
        config['model']['alter_episode_expert'],
        pre_trained_reward,
        image_cache=image_cache,
        allowed_batch=allowed_batch_episode_editor)

    with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
            per_process_gpu_memory_fraction=config['general']
        ['gpu_usage']))) as sess:
        sess.run(tf.global_variables_initializer())
        if pre_trained_reward is not None:
            pre_trained_reward.load_weights(sess)
        network.update_target_networks(sess)

        global_step = 0
        episodes = successful_episodes = collision_episodes = max_len_episodes = 0
        best_model_global_step, best_model_test_success_rate = -1, -1.0
        for update_index in range(config['general']['updates_cycle_count']):
            # collect data
            a = datetime.datetime.now()
            rollout_manager.set_policy_weights(network.get_actor_weights(
                sess, is_online=True),
                                               is_online=True)
            episodes_per_update = config['general']['episodes_per_update']
            episode_results = rollout_manager.generate_episodes(
                episodes_per_update, True)
            episodes_agent_trajectory, episodes_times, episodes_example_trajectory = zip(
                *episode_results)

            # alter the episodes based on reward model
            altered_episodes = regular_episode_editor.process_episodes(
                episodes_agent_trajectory, sess)

            # process example episodes for failed interactions
            altered_motion_planner_episodes = []
            failed_motion_planner_trajectories = config['model'][
                'failed_motion_planner_trajectories']
            if failed_motion_planner_trajectories > 0:
                # take a small number of failed motion plans
                failed_episodes_indices = [
                    i for i in range(len(altered_episodes))
                    if altered_episodes[i][0] != 3
                ]
                failed_episodes_indices = failed_episodes_indices[:
                                                                  failed_motion_planner_trajectories]
                motion_planner_episodes = [
                    process_example_trajectory(episodes_example_trajectory[i],
                                               altered_episodes[i])
                    for i in failed_episodes_indices
                ]
                altered_motion_planner_episodes = motion_planner_episode_editor.process_episodes(
                    motion_planner_episodes, sess)

            # add to replay buffer
            hindsight_policy.append_to_replay_buffer(
                list(altered_episodes) + list(altered_motion_planner_episodes))
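            # (HindsightPolicy presumably relabels transitions with achieved goals in the
            #  HER style, using score_for_hindsight above to score the relabeled data)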

            # compute times
            total_find_trajectory_time = None
            total_rollout_time = None
            for episode_times in episodes_times:
                # update the times
                find_trajectory_time, rollout_time = episode_times
                if total_find_trajectory_time is None:
                    total_find_trajectory_time = find_trajectory_time
                else:
                    total_find_trajectory_time += find_trajectory_time
                if total_rollout_time is None:
                    total_rollout_time = rollout_time
                else:
                    total_rollout_time += rollout_time

            # compute counters
            for altered_episode in altered_episodes:
                status = altered_episode[0]
                episodes += 1
                if status == 1:
                    max_len_episodes += 1
                elif status == 2:
                    collision_episodes += 1
                elif status == 3:
                    successful_episodes += 1

            b = datetime.datetime.now()
            print('data collection took: {}'.format(b - a))
            print('find trajectory took: {}'.format(total_find_trajectory_time))
            print('rollout time took: {}'.format(total_rollout_time))
            print_state('train', episodes, successful_episodes,
                        collision_episodes, max_len_episodes)

            # do updates
            if replay_buffer.size() > config['model']['batch_size']:
                a = datetime.datetime.now()
                for _ in range(config['general']['model_updates_per_cycle']):
                    summaries = update_model(sess, global_step)
                    if global_step % config['general'][
                            'write_train_summaries'] == 0:
                        summaries_collector.write_train_episode_summaries(
                            sess, global_step, episodes, successful_episodes,
                            collision_episodes, max_len_episodes)
                        summaries_collector.write_train_optimization_summaries(
                            summaries, global_step)
                    global_step += 1
                b = datetime.datetime.now()
                print('update took: {}'.format(b - a))

            # test if needed
            if update_index % config['test']['test_every_cycles'] == 0:
                is_best, best_model_global_step, best_model_test_success_rate = do_test(
                    sess, best_model_global_step, best_model_test_success_rate)
                if is_best:
                    best_model_path = best_saver.save(sess,
                                                      os.path.join(
                                                          saver_dir, 'best'),
                                                      global_step=global_step)
            if update_index % config['general']['save_model_every_cycles'] == 0:
                latest_saver.save(sess,
                                  os.path.join(saver_dir, 'last_iteration'),
                                  global_step=global_step)
            # see if max score reached (even if validation is not 100%, there will no longer be any model updates...)
            if best_model_test_success_rate > 0.99999:
                print('stopping run: best test success rate reached {}'.format(
                    best_model_test_success_rate))
                break

        # final test at the end
        is_best, best_model_global_step, best_model_test_success_rate = do_test(
            sess, best_model_global_step, best_model_test_success_rate)
        if is_best:
            best_model_path = best_saver.save(sess,
                                              os.path.join(saver_dir, 'best'),
                                              global_step=global_step)

        # get a validation rate for the best recorded model
        validation_rate = do_end_of_run_validation(sess)

    last_message = 'best model stats at step {} has success rate of {} and validation success rate of {}'.format(
        best_model_global_step, best_model_test_success_rate, validation_rate)
    print(last_message)

    with open(os.path.join(completed_trajectories_dir, 'final_status.txt'),
              'w') as f:
        f.write(last_message)
        f.flush()

    test_results_file = os.path.join(completed_trajectories_dir,
                                     'test_results.test_results_pkl')
    with bz2.BZ2File(test_results_file, 'w') as compressed_file:
        pickle.dump(test_results, compressed_file)

    rollout_manager.end()
    return test_results
def main(_):
    """Run td3/ddpg training."""
    contrib_eager_python_tfe.enable_eager_execution()

    if FLAGS.use_gpu:
        tf.device('/device:GPU:0').__enter__()

    if FLAGS.expert_dir.find(FLAGS.env) == -1:
        raise ValueError('Expert directory must contain the environment name')

    tf.set_random_seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)
    random.seed(FLAGS.seed)

    env = gym.make(FLAGS.env)
    env.seed(FLAGS.seed)

    obs_shape = env.observation_space.shape
    act_shape = env.action_space.shape

    expert_replay_buffer_var = contrib_eager_python_tfe.Variable(
        '', name='expert_replay_buffer')

    saver = contrib_eager_python_tfe.Saver([expert_replay_buffer_var])
    tf.gfile.MakeDirs(FLAGS.save_dir)

    with tf.variable_scope('actor'):
        actor = Actor(obs_shape[0], act_shape[0])
    expert_saver = contrib_eager_python_tfe.Saver(actor.variables)

    best_checkpoint = None
    best_reward = float('-inf')

    checkpoint_state = tf.train.get_checkpoint_state(FLAGS.expert_dir)

    for checkpoint in checkpoint_state.all_model_checkpoint_paths:
        expert_saver.restore(checkpoint)
        expert_reward, _ = do_rollout(env,
                                      actor,
                                      replay_buffer=None,
                                      noise_scale=0.0,
                                      num_trajectories=10)

        if expert_reward > best_reward:
            best_reward = expert_reward
            best_checkpoint = checkpoint

    expert_saver.restore(best_checkpoint)

    expert_replay_buffer = ReplayBuffer()
    expert_reward, _ = do_rollout(
        env,
        actor,
        replay_buffer=expert_replay_buffer,
        noise_scale=0.0,
        num_trajectories=FLAGS.num_expert_trajectories)

    logging.info('Expert reward %f', expert_reward)
    print('Expert reward {}'.format(expert_reward))

    expert_replay_buffer_var.assign(pickle.dumps(expert_replay_buffer))
    saver.save(os.path.join(FLAGS.save_dir, 'expert_replay_buffer'))