Example #1
    def __init__(self):
        self.args = args = agent.parse_args()
        self.ep = EnvPool(args.env, self.args.env_size)
        self.eps = [
            MultiStageEpsilon([
                LinearAnnealEpsilon(1.0, 0.1, int(1e6)),
                LinearAnnealEpsilon(0.1, 0.05, int(1e7 - 1e6))
            ]), 0
        ]
        self.replay = ReplayBuffer(args.replay_buffer_size)
        main_logger.info("Replay Buffer Max Size: {}B".format(
            pretty_num(args.replay_buffer_size * (84 * 84 * 4 * 2 + 8), True)))
        self.sess = agent.make_session()
        self.sess.__enter__()
        agent.setup(self.ep.action_num, self.replay)
        self.train_epi = 0
        self.max_reward = agent.score
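
Every example on this page constructs some form of ReplayBuffer, but none of them shows the buffer itself, and the constructor signatures differ from repo to repo (capacity only, max_size=..., or action size plus batch size and seed). As a point of reference, here is a minimal sketch of the uniform-sampling buffer that Example #1 (and Example #16, which contains the full class) appears to assume; the class name and exact method signatures are illustrative, not taken from any of the quoted repositories.

import random
from collections import deque

import numpy as np


class UniformReplayBuffer:
    """Capacity-bounded FIFO store with uniform random sampling."""

    def __init__(self, capacity):
        self.storage = deque(maxlen=capacity)

    def __len__(self):
        return len(self.storage)

    def add(self, obs, action, reward, done, next_obs):
        # Matches the argument order used in Example #16; the oldest
        # transitions are evicted automatically once capacity is reached.
        self.storage.append((obs, action, reward, done, next_obs))

    def sample(self, batch_size):
        batch = random.sample(self.storage, batch_size)
        obs, actions, rewards, dones, next_obs = map(np.asarray, zip(*batch))
        return obs, actions, rewards, dones, next_obs
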
Example #2
    def __init__(
        self,
        env,
        learning_rate=1e-3,
        seed=1234,
        gamma=0.99,
        max_eps=1.0,
        min_eps=0.1,
        render=False,
        print_freq=1,
        load_path=None,
        save_path=None,
        batch_size=32,
        log_dir='logs/train',
        max_steps=100000,
        buffer_capacity=None,
        max_episode_len=None,
        eps_decay_rate=-1e-4,
        target_update_freq=1000,
    ):
        tf.random.set_seed(seed)
        np.random.seed(seed)
        self.gamma = gamma
        self.render = render
        self.batch_size = batch_size
        self.print_freq = print_freq
        self.q_lr = learning_rate
        self.max_eps = max_eps
        self.min_eps = min_eps
        self.eps_decay_rate = eps_decay_rate
        self.buffer = ReplayBuffer(buffer_capacity)
        self.max_steps = max_steps
        self.target_update = target_update_freq
        self.model = QNetwork(env.action_space.n, name='q_network')
        self.target = QNetwork(env.action_space.n, name='target_network')
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr)
        self.summary_writer = tf.summary.create_file_writer(log_dir)
        self.env = env
        self.max_episode_len = max_episode_len if max_episode_len else self.env.spec.max_episode_steps
        self.rewards = []
        self.save_path = save_path

        if load_path is not None:
            self.model.load_weights(load_path)
Example #3
    def __init__(self,
                 api,
                 network_class,
                 sess,
                 save_path,
                 history_size=15,
                 restore_path=None,
                 verbose=False,
                 train=False,
                 test=False):
        super(NeuralNetworkAgent, self).__init__(api, verbose=verbose)

        # currently 7500 w/ 1000

        # Network
        self.network = network_class(sess,
                                     save_path,
                                     restore_path=restore_path,
                                     hist_size=history_size)
        self.replay_buffer = ReplayBuffer(max_size=2500)
        self.train = train
        self.history_size = history_size

        # Internal
        self.launched = False
        self.placed_move = False
        self.ctr = 0
        self.restart_game = 1
        self.game_restarted = True
        self.show_board = False
        self.last_move = -2
        self.start_state = np.zeros((20, 10, 1))
        self.possible_moves = [-1, 0, 6, 7]
        self.training_begun = False if not test else True
        self.epsilon = 1. if not test else 0
        self.decay = 0.999
        self.test = test

        self.prev_states = [self.start_state] * self.history_size
Example #4
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example #5
plot_episode_rewards = []  # total reward per episode (sum of the individual rewards the agents received)
plot_episode_valid_steps = []  # per-episode count of steps in which at least one action request arrived
plot_episode_count_requested_agent = np.asarray(
    [0] * N_AGENTS)  # per-agent record of how many agents were requested
plot_episode_requested_agents = np.asarray([0] * N_AGENTS)
plot_count_per_actions = np.asarray([0] * N_ACTION)

args = get_common_args()
args = qmix_args(args)

policy = QMIX(args)
agents = Agents(args, policy)
env = elevator.ElevatorEnv(SCREEN_WIDTH, SCREEN_HEIGHT, False)

worker = RolloutWorker(env, agents, args)
buffer = ReplayBuffer(args)

plt.figure()
plt.axis([0, args.n_epoch, 0, 100])
win_rates = []
episode_rewards = []
train_steps = 0

save_path = args.result_dir + '/' + current
os.makedirs(save_path, exist_ok=True)

for epoch in range(args.n_epoch):
    episodes = []
    for e in range(args.n_episodes):
        episode, episode_reward, episode_count_per_actions, episode_episode_requested_agents, episode_episode_count_requested_agent = worker.generate_episode(
            e)
Example #6
def main(args):
    constraints = np.array([1,0])
    
    train_data = pickle.load(open("paths.5.half.pkl", "rb"))
    train_data2 = [RLPath2(path, compute_g) for path in tqdm(train_data)]
    dataset = ReplayBuffer(10000000)
    for path in tqdm(train_data2):
        dataset.store(path)
        
    init_states = pickle.load(open("init_states606.pkl", "rb"))
    
    args = {
        "env" : "LunarLanderContinuous-v2",
        "train" : True,
        "test" : False,
        "max_iter" : 2, 
        "test_episodes" : 1,
        "output_dir" : "output",
        "output_iters" : 10,
        "gpu" : "0",
        "visualize" : False
    }
    args = Namespace(**args)
    best_response_algorithm = BestResponse(args)
    
    lambda_bound = 30
    eta = 1
    starting_lambda = [1, 100]
    online_convex_algorithm = ExponentiatedGradient(
        lambda_bound, len(constraints),
        eta=eta, starting_lambda=starting_lambda)
    
    discount = 0.95
    state_size = 8
    action_size = 2
    lr = 0.001
    fqe_epochs = 100
    fqe_batches = 3
    fitted_off_policy_evaluation_algorithm = FittedQEvaluation(discount, state_size, action_size, 
                                                               lr, epochs=fqe_epochs, batches=fqe_batches)
    
    init_seed = 606
    num_paths = 2
    exact_policy_algorithm = ExactPolicyEvaluator(discount, init_seed, num_paths, compute_g)
    
    
    problem = OptProblem(constraints, 
                         dataset, 
                         init_states, 
                         best_response_algorithm, 
                         online_convex_algorithm, 
                         fitted_off_policy_evaluation_algorithm, 
                         exact_policy_algorithm, 
                         lambda_bound, 
                         max_iterations=10)

    lambdas = []
    policies = []

    iteration = 0
    while not problem.is_over():
        iteration += 1
        for i in range(1):

            print('*' * 20)
            print('Iteration %s, %s' % (iteration, i))
            if len(lambdas) == 0:
                # first iteration
                lambdas.append(online_convex_algorithm.get())
                print('lambda_{0}_{2} = {1}'.format(iteration, lambdas[-1], i))
            else:
                # all other iterations
                lambda_t = problem.online_algo()
                lambdas.append(lambda_t)
                print('lambda_{0}_{3} = online-algo(pi_{1}_{3}) = {2}'.format(iteration, iteration-1, lambdas[-1], i))

            lambda_t = lambdas[-1]
            pi_t = problem.best_response(lambda_t)
            values = []

            # policies.append(pi_t)
            problem.update(pi_t, values, iteration)  # Evaluate C(pi_t), G(pi_t) and save
Example #7
        tf.summary.scalar('agent' + str(i) + '_reward_l100_mean',
                          reward_100[i]) for i in range(3)
    ]

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run([
        agent1_actor_target_init, agent1_critic_target_init,
        agent2_actor_target_init, agent2_critic_target_init,
        agent3_actor_target_init, agent3_critic_target_init
    ])
    saver.restore(sess, './weight_single/210000.cptk')
    summary_writer = tf.summary.FileWriter('./test_three_summary',
                                           graph=tf.get_default_graph())

    agent1_memory = ReplayBuffer(100000)
    agent2_memory = ReplayBuffer(100000)
    agent3_memory = ReplayBuffer(100000)

    e = 1

    reward_100_list = [[], [], []]
    for i in range(1000000):
        if i % 1000 == 0:
            o_n = env.reset()

        agent1_action, agent2_action, agent3_action = get_agents_action(
            o_n, sess, noise_rate=0.1)

        env.render()
Example #8
    reward_1000 = [tf.Variable(0, dtype=tf.float32) for i in range(3)]
    reward_1000_op = [tf.summary.scalar('agent' + str(i) + '_reward_l1000_mean', reward_1000[i]) for i in range(3)]

    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = gpu_fraction

    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    sess.run([agent1_actor_target_init, agent1_critic_target_init,
              agent2_actor_target_init, agent2_critic_target_init,
              agent3_actor_target_init, agent3_critic_target_init])

    summary_writer = tf.summary.FileWriter('./three_summary', graph=tf.get_default_graph())

    agent1_memory = ReplayBuffer(100000)
    agent2_memory = ReplayBuffer(100000)
    agent3_memory = ReplayBuffer(100000)

    e = 1
    
    reward_100_list = [[], [], []]
    for i in range(1000000):
        if i % 1000 == 0:
            o_n = env.reset()
            for agent_index in range(3):
                summary_writer.add_summary(sess.run(reward_1000_op[agent_index], {reward_1000[agent_index]: np.mean(reward_100_list[agent_index])}), i // 1000)
        agent1_action, agent2_action, agent3_action = get_agents_action(o_n, sess, noise_rate=0.2)
        

        a = [[0, i[0][0], 0, i[0][1], 0] for i in [agent1_action, agent2_action, agent3_action]]
Example #9
def playGame():
    args = parse_args()
    args.initial_eps = 0.0001 if args.test else args.initial_eps
    if args.double:
        save_dir = "02DoubleDQN/" if not args.dueling else "02DoubleDuelingDQN/"
    else:
        save_dir = "01DQN/" if not args.dueling else "01DuelingDQN/"
    print("double:{}, dueling:{}, prioritized:{}\n".format(
        args.double, args.dueling, args.prioritized))

    sess = tf.InteractiveSession()
    # placeholders
    s = tf.placeholder("float", [None, 80, 80, 4], name="state")
    target = tf.placeholder("float", [None], name="target")
    action = tf.placeholder("float", [None, args.n_actions],
                            name="action")  # actions taken: [0, 1] or [1, 0]

    # -----dueling---------
    q_func = model(s, args.n_actions,
                   scope="q_func") if not args.dueling else dueling_model(
                       s, args.n_actions, scope="q_func")
    # -----dueling---------

    # -----double---------
    if args.double:
        q_func_vars = scope_vars("q_func")
        # target q network evaluation
        q_target = model(
            s, args.n_actions,
            scope="q_target") if not args.dueling else dueling_model(
                s, args.n_actions, scope="q_target")
        q_target_vars = scope_vars("q_target")
    # -----double---------

    # define the cost function
    readout_action = tf.reduce_sum(tf.multiply(q_func, action), axis=1)
    td_errors = target - readout_action
    cost = tf.reduce_mean(tf.square(td_errors))
    train_step = tf.train.AdamOptimizer(args.lr).minimize(cost)

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # -----prioritized replay---------
    # initialize replay memory
    if args.prioritized:
        replay_buffer = PrioritizedReplayBuffer(args.replay_buffer_size,
                                                alpha=args.prioritized_alpha)
        beta_schedule = LinearSchedule(args.prioritized_beta_iter,
                                       initial_p=args.prioritized_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(args.replay_buffer_size)
    # -----prioritized replay---------
    ''' printing
    a_file = open("logs_" + args.game + "/readout.txt", 'w')
    h_file = open("logs_" + args.game + "/hidden.txt", 'w')
    '''

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(args.n_actions)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)  # s_t : 80 * 80 * 4

    # load networks
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    checkpoint = tf.train.get_checkpoint_state("saved_networks/" + save_dir)
    already_trained = 0
    if checkpoint and checkpoint.model_checkpoint_path:
        already_trained = checkpoint.model_checkpoint_path
        already_trained = int(already_trained[already_trained.find('dqn-') +
                                              4:])
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    # start training
    EpsilonSchedule = LinearSchedule(args.explore, args.final_eps,
                                     args.initial_eps)
    t = already_trained
    epsilon = EpsilonSchedule.value(t)
    while "flappy bird" != "angry bird":
        #-----double---------
        # whether update q_target
        if args.double and t % args.target_update_freq == 0:
            sess.run(update_target(q_func_vars, q_target_vars))
        # -----double---------

        # choose an action epsilon greedily
        Q_t = q_func.eval(feed_dict={s: [s_t]})[0]
        a_t = np.zeros([args.n_actions])
        action_index = 0
        if t % args.frame_per_action == 0:
            action_index = random.randrange(
                args.n_actions) if random.random() < epsilon else np.argmax(
                    Q_t)
        a_t[action_index] = 1

        # run the selected action and observe next state and reward
        x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
        s_t1 = preprocess(s_t, x_t1_colored)

        # store the transition in D
        replay_buffer.add(s_t, a_t, r_t, s_t1, terminal)

        # only scale down epsilon if done observing
        if t > args.observe:
            epsilon = EpsilonSchedule.value(t - args.observe)

        # only train if done observing
        if t > args.observe + already_trained:
            # -----prioritized replay---------
            # sample a minibatch to train on
            if args.prioritized:
                experience = replay_buffer.sample(
                    args.batch_size,
                    beta=beta_schedule.value(t - args.observe -
                                             already_trained))
                (s_j_batch, a_batch, r_batch, s_j1_batch, done_batch, weights,
                 batch_idxes) = experience
            else:
                s_j_batch, a_batch, r_batch, s_j1_batch, done_batch = replay_buffer.sample(
                    args.batch_size)
            # -----prioritized replay---------

            target_batch = []
            # -----double---------
            Q_j1_batch = q_target.eval(
                feed_dict={s: s_j1_batch}) if args.double else q_func.eval(
                    feed_dict={s: s_j1_batch})
            # -----double---------

            for i in range(0, args.batch_size):
                terminal = done_batch[i]
                # if terminal, only equals reward
                if terminal:
                    target_batch.append(r_batch[i])
                else:
                    target_batch.append(r_batch[i] +
                                        args.gamma * np.max(Q_j1_batch[i]))

            # -----prioritized replay---------
            if args.prioritized:
                td_errs = td_errors.eval(feed_dict={
                    target: target_batch,
                    action: a_batch,
                    s: s_j_batch
                })
                new_priorities = np.abs(td_errs) + args.prioritized_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)
            # -----prioritized replay---------

            # perform gradient step
            train_step.run(feed_dict={
                target: target_batch,
                action: a_batch,
                s: s_j_batch
            })

        # update the old values
        s_t = s_t1
        t += 1

        # save
        if t % args.save_freq == 0:
            saver.save(sess,
                       "saved_networks/" + save_dir + args.game + '-dqn',
                       global_step=t)

        # display
        if t <= args.observe:
            state = "observe"
        elif t > args.observe and t <= args.observe + args.explore:
            state = "explore"
        else:
            state = "train"
        info_expr = 'TIMESTEP:{}, STATE:{}, EPSILON:{:6f}, ACTION:{}, REWARD:{}, Q_MAX:{}'
        print(
            info_expr.format(t, state, epsilon, action_index, r_t,
                             np.max(Q_t)))

        # write info to files
        '''
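
Example #9 (its tail is truncated above) leans on a `LinearSchedule` helper for both the epsilon schedule and the prioritized-replay beta schedule, which is not shown. A plausible sketch, modeled on the OpenAI Baselines schedule whose constructor signature `(schedule_timesteps, final_p, initial_p)` the example matches:

class LinearSchedule:
    """Linearly interpolates from initial_p to final_p over schedule_timesteps,
    then holds final_p (modeled on the OpenAI Baselines implementation)."""

    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # Fraction of the schedule completed, clipped to [0, 1].
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)
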
Example #10
        for i in range(num_agents)
    ]

    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = gpu_fraction

    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    sess.run(
        [agent_actor_target_init_list[:], agent_critic_target_init_list[:]])

    summary_writer = tf.summary.FileWriter('./VUE_summary',
                                           graph=tf.get_default_graph())

    for i in range(num_agents):
        mem = ReplayBuffer(10000)
        memory.append(mem)

    # for every 100 step, check the rewards
    reward_100_list = np.zeros([100, 1], dtype=float)
    sum_r = 0.
    for i in range(1, Episode + 1):
        print(str(i) + "번째 에피소드 시작..")
        if i % 100 == 0:
            print(str(i) + "번째 에피소드. 환경 리셋.(100 배수)")
            o_n = env.reset()
            for agent_index in range(num_agents):
                summary_writer.add_summary(
                    sess.run(
                        reward_100_op[agent_index], {
                            reward_100[agent_index]:
Example #11
class TrainDQN:
    def __init__(self,
                 env,
                 sess,
                 learning_rate=1e-3,
                 seed=1234,
                 gamma=0.99,
                 max_eps=1.0,
                 min_eps=0.1,
                 render=False,
                 print_freq=20,
                 load_path=None,
                 save_path=None,
                 batch_size=32,
                 log_dir='logs/train',
                 max_steps=100000,
                 buffer_capacity=None,
                 max_episode_len=2000,
                 eps_decay_rate=-0.0001,
                 target_update_freq=1000,
                 ):
        """Trains an openai gym-like environment with deep q learning.
        Args:
            env: gym.Env where our agent resides
            seed: Random seed for reproducibility
            gamma: Discount factor
            max_eps: Starting exploration factor
            min_eps: Exploration factor to decay towards
            render: True to render the environment, else False
            print_freq: Displays logging information every 'print_freq' episodes
            load_path: (str) Path to load existing model from
            save_path: (str) Path to save model during training
            max_steps: maximum number of times to sample the environment
            buffer_capacity: How many state, action, next state, reward tuples the replay buffer should store
            max_episode_len: Maximum number of timesteps in an episode
            eps_decay_rate: lambda parameter in exponential decay for epsilon
            target_update_freq: Number of gradient updates between target network synchronizations
        """
        np.random.seed(seed)
        self.sess = sess
        self.env = env
        self.input_dim = env.observation_space.shape[0]
        self.output_dim = env.action_space.n
        self.max_steps = max_steps
        self.max_eps = max_eps
        self.min_eps = min_eps
        self.eps_decay_rate = eps_decay_rate
        self.max_episode_len = max_episode_len
        self.render = render
        self.print_freq = print_freq
        self.rewards = []
        self.metrics = []
        self.save_path = save_path
        self.load_path = load_path
        self.batch_size = batch_size
        self.num_updates = 0
        self.gamma = gamma
        self.buffer = ReplayBuffer(capacity=max_steps // 2 if buffer_capacity is None else buffer_capacity)
        self.target_update_freq = target_update_freq
        self.learning_rate = learning_rate

        with tf.variable_scope('q_network'):
            self.q_network = QNetworkBuilder(self.input_dim, self.output_dim, (64,))
        with tf.variable_scope('target_network'):
            self.target_network = QNetworkBuilder(self.input_dim, self.output_dim, (64,))
        self.update_target_network = [old.assign(new) for (new, old) in
                                      zip(tf.trainable_variables('q_network'),
                                          tf.trainable_variables('target_network'))]
        if self.load_path is not None:
            self.load()

        self.add_summaries(log_dir)

    def add_summaries(self, log_dir):
        tf.summary.scalar('Loss', self.q_network.loss, )
        tf.summary.scalar('Mean Estimated Value', tf.reduce_mean(self.q_network.output_pred))
        # Merge all the summaries and write them out to log_dir
        self.merged = tf.summary.merge_all()
        self.train_writer = tf.summary.FileWriter(log_dir, self.sess.graph)

    def learn(self):
        """Learns via Deep-Q-Networks (DQN)"""
        obs = self.env.reset()
        mean_reward = None
        total_reward = 0
        ep = 0
        ep_len = 0
        rand_actions = 0
        for t in range(self.max_steps):
            # exponential epsilon decay from https://jaromiru.com/2016/10/03/lets-make-a-dqn-implementation/
            eps = self.min_eps + (self.max_eps - self.min_eps) * np.exp(
                self.eps_decay_rate * t)
            if self.render:
                self.env.render()

            # Take exploratory action with probability epsilon
            if np.random.uniform() < eps:
                action = self.env.action_space.sample()
                rand_actions += 1
            else:
                action = self.act(obs)

            # Execute action in emulator and observe reward and next state
            new_obs, reward, done, info = self.env.step(action)
            total_reward += reward

            # Store transition s_t, a_t, r_t, s_t+1 in replay buffer
            self.buffer.add((obs, action, reward, new_obs, done))

            # Perform learning step
            self.update()

            obs = new_obs
            ep_len += 1
            if done or ep_len >= self.max_episode_len:
                #         print("Episode Length:", ep_len)
                #         print(f"Episode {ep} Reward:{total_reward}")
                #         print(f"Random Action Percent: {rand_actions/ep_len}")
                ep += 1
                ep_len = 0
                rand_actions = 0
                self.rewards.append(total_reward)
                total_reward = 0
                obs = self.env.reset()

                if ep % self.print_freq == 0 and ep > 0:
                    new_mean_reward = np.mean(self.rewards[-self.print_freq - 1:])

                    print(f"-------------------------------------------------------")
                    print(f"Mean {self.print_freq} Episode Reward: {new_mean_reward}")
                    print(f"Exploration fraction: {eps}")
                    print(f"Total Episodes: {ep}")
                    print(f"Total timesteps: {t}")
                    print(f"-------------------------------------------------------")

                    # Add reward summary
                    summary = tf.Summary()
                    summary.value.add(tag=f'Mean {self.print_freq} Episode Reward',
                                      simple_value=new_mean_reward)
                    summary.value.add(tag=f'Epsilon', simple_value=eps)
                    self.train_writer.add_summary(summary, self.num_updates)

                    # Model saving inspired by Open AI Baseline implementation
                    if (mean_reward is None or new_mean_reward >= mean_reward) and self.save_path is not None:
                        print(f"Saving model due to mean reward increase:{mean_reward} -> {new_mean_reward}")
                        print(f'Location: {self.save_path}')
                        # save_path = f"{self.save_path}_model"
                        self.save()
                        mean_reward = new_mean_reward

    def act(self, observation):
        """Takes an action given the observation.
        Args:
            observation: observation from the environment
        Returns:
            integer index of the selected action
        """
        pred = self.sess.run([self.q_network.output_pred],
                             feed_dict={self.q_network.input_ph: np.reshape(observation, (1, self.input_dim))})
        return np.argmax(pred)

    def update(self):
        """Applies gradients to the Q network computed from a minibatch of self.batch_size."""
        if self.batch_size <= self.buffer.size():
            self.num_updates += 1

            # Update the target network with parameters from the Q network
            if self.num_updates % self.target_update_freq == 0:
                self.sess.run(self.update_target_network)
                print('Updated Target Network')

            # Sample random minibatch of transitions from the replay buffer
            sample = self.buffer.sample(self.batch_size)
            states, action, reward, next_states, done = sample

            # Calculate discounted predictions for the subsequent states using target network
            next_state_pred = self.gamma * self.sess.run(self.target_network.output_pred,
                                                         feed_dict={
                                                             self.target_network.input_ph: next_states}, )

            # Adjust the targets for non-terminal states
            reward = reward.reshape(len(reward), 1)
            targets = reward
            loc = np.argwhere(done != True).flatten()
            if len(loc) > 0:
                max_q = np.amax(next_state_pred, axis=1)
                targets[loc] = np.add(
                    targets[loc],
                    max_q[loc].reshape(max_q[loc].shape[0], 1),
                    casting='unsafe')

            # Update discount factor and train model on batch
            _, loss = self.sess.run([self.q_network.opt, self.q_network.loss],
                                    feed_dict={self.q_network.input_ph: states,
                                               self.q_network.target_ph: targets.flatten(),
                                               self.q_network.action_indices_ph: action})

    def save(self):
        """Saves the Q network."""
        self.q_network.saver.save(self.sess, self.save_path)

    def load(self):
        """Loads the Q network."""
        self.q_network.saver.restore(self.sess, self.save_path)

    def plot_rewards(self, path=None):
        """Plots rewards per episode.
        Args:
            path: Location to save the rewards plot. If None, image will be displayed with plt.show()
        """
        plt.plot(self.rewards)
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        if path is None:
            plt.show()
        else:
            plt.savefig(path)
            plt.close('all')
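
Example #11 assumes a `QNetworkBuilder` that exposes `input_ph`, `output_pred`, `target_ph`, `action_indices_ph`, `loss`, `opt`, and `saver`. A minimal TF1 sketch that would satisfy that interface; the hidden-layer construction and learning rate here are illustrative, not the repository's actual network:

import tensorflow as tf


class QNetworkBuilder:
    """Minimal TF1 MLP Q-network exposing the attributes Example #11 relies on."""

    def __init__(self, input_dim, output_dim, hidden_units, lr=1e-3):
        self.input_ph = tf.placeholder(tf.float32, [None, input_dim])
        self.target_ph = tf.placeholder(tf.float32, [None])
        self.action_indices_ph = tf.placeholder(tf.int32, [None])

        h = self.input_ph
        for units in hidden_units:
            h = tf.layers.dense(h, units, activation=tf.nn.relu)
        self.output_pred = tf.layers.dense(h, output_dim, activation=None)

        # Q-value of the action taken in each sampled transition.
        gather_idx = tf.stack(
            [tf.range(tf.shape(self.action_indices_ph)[0]),
             self.action_indices_ph], axis=1)
        q_taken = tf.gather_nd(self.output_pred, gather_idx)

        self.loss = tf.reduce_mean(tf.square(self.target_ph - q_taken))
        self.opt = tf.train.AdamOptimizer(lr).minimize(self.loss)
        self.saver = tf.train.Saver()
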
Example #12
    reward_1000_op = [tf.summary.scalar('agent' + str(i) + '_reward_l1000_mean', reward_1000[i]) for i in range(3)]

    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = gpu_fraction

    sess = tf.Session(config=config)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run([agent1_actor_target_init, agent1_critic_target_init,
              agent2_actor_target_init, agent2_critic_target_init,
              agent3_actor_target_init, agent3_critic_target_init])

    summary_writer = tf.summary.FileWriter('./three_ma_summary', graph=tf.get_default_graph())

    agent1_memory = ReplayBuffer(100000)
    agent2_memory = ReplayBuffer(100000)
    agent3_memory = ReplayBuffer(100000)

    # e = 1
    
    reward_100_list = [[], [], []]
    for i in range(1000000):
        if i % 1000 == 0:
            o_n = env.reset()
            for agent_index in range(3):
                summary_writer.add_summary(sess.run(reward_1000_op[agent_index], {reward_1000[agent_index]: np.mean(reward_100_list[agent_index])}), i // 1000)

        agent1_action, agent2_action, agent3_action = get_agents_action(o_n, sess, noise_rate=0.2)

        a = [[0, i[0][0], 0, i[0][1], 0] for i in [agent1_action, agent2_action, agent3_action]]
Example #13
class NeuralNetworkAgent(Agent):
    def __init__(self,
                 api,
                 network_class,
                 sess,
                 save_path,
                 history_size=15,
                 restore_path=None,
                 verbose=False,
                 train=False,
                 test=False):
        super(NeuralNetworkAgent, self).__init__(api, verbose=verbose)

        # currently 7500 w/ 1000

        # Network
        self.network = network_class(sess,
                                     save_path,
                                     restore_path=restore_path,
                                     hist_size=history_size)
        self.replay_buffer = ReplayBuffer(max_size=2500)
        self.train = train
        self.history_size = history_size

        # Internal
        self.launched = False
        self.placed_move = False
        self.ctr = 0
        self.restart_game = 1
        self.game_restarted = True
        self.show_board = False
        self.last_move = -2
        self.start_state = np.zeros((20, 10, 1))
        self.possible_moves = [-1, 0, 6, 7]
        self.training_begun = False if not test else True
        self.epsilon = 1. if not test else 0
        self.decay = 0.999
        self.test = test

        self.prev_states = [self.start_state] * self.history_size

    def _controller_listener(self):
        piece_id = self.api.peekCPU(0x0042)
        game_state = self.api.peekCPU(0x0048)

        if piece_id != 19 and game_state == 1:
            # Train
            if self.train and self.replay_buffer.size() > 250 and not self.test:
                batch = self.replay_buffer.sample(batch_sz=250)
                self.network.train(batch)
                self.training_begun = True

                self.epsilon *= self.decay
                if self.epsilon < 0.010:
                    self.epsilon = 0.010

        if not self.placed_move:  # and (random_move >= 0 or self.restart_game > 0):
            # os.system('clear')
            print('--------------')
            is_random = False
            move = None
            if np.random.random() < self.epsilon or not self.training_begun:
                move = np.random.choice(self.possible_moves)
                is_random = True
            else:
                tensor = np.dstack([self.grid] + self.prev_states)
                pred = self.network.predict(tensor)[0]
                move = self.possible_moves[pred]

            if self.restart_game > 0:
                self.api.writeGamepad(0, 3, True)
                self.restart_game -= 1
                move = -2
            else:
                if move >= 0:
                    self.api.writeGamepad(0, move, True)
            self.placed_move = True
            self.show_board = True

            if self.last_move != -2 and piece_id != 19:
                print('Random:', is_random)
                S = self.grid.copy()
                self._update_board(self.api.peekCPU(0x0042))
                board = self._simulate_piece_drop(self.api.peekCPU(0x0042))
                n_empty = self._count_empty(self.grid)
                n_holes = self._count_holes(self.grid)
                height = self._count_height(board)
                levelness = self._determine_levelness(board)
                A = self.last_move
                # R  = self._count_total() + self._get_score() - n_empty
                #R = (-50 * height) + (-20 * n_holes) + (self._get_score())
                if height <= 2:
                    R = 1000
                else:
                    R = -200 * height
                R += -20 * n_holes + 10 * levelness  # 10 * self._get_score()
                SP = self.grid.copy()

                self.prev_states.insert(0, S)

                print(np.dstack(self.prev_states).shape)

                self.replay_buffer.add(
                    np.dstack(self.prev_states), self.possible_moves.index(A),
                    R, np.dstack([SP] + self.prev_states[:self.history_size]))

                self.prev_states = self.prev_states[:self.history_size]

                print(self.epsilon)
                self._print_transition(S, A, board, R)

            self.last_move = move
        else:
            self.placed_move = False

    def _frame_render_finished(self):
        """
        Renders the board and the current piece
        TODO: do this lazily, so we aren't calling read too often O_o
        """

        # To make things easier, we're going to modify the next piece drop
        # Always drop a certain type of block (currently square).
        self.api.writeCPU(0x00bf, 0x0a)

        piece_id = self.api.peekCPU(0x0042)
        game_state = self.api.peekCPU(0x0048)

        # Restart the game
        if piece_id == 19 and (game_state == 10 or game_state == 0):
            self.prev_states = [self.start_state] * self.history_size
            self.game_restarted = True
            self.restart_game = 1
            return

        # Probably a line clear... Skip
        if piece_id == 19 and game_state != 1:
            return

    def _piece_update(self, access_type, address, value):
        """
        Can be used to control the piece being dropped
        """
        if self.api.readCPU(0x0048) == 1:
            return 0x0a
        return value

    def agent_name(self):
        return 'NeuralNetworkAgent'
Example #14
class DQN:
    def __init__(
        self,
        env,
        learning_rate=1e-3,
        seed=1234,
        gamma=0.99,
        max_eps=1.0,
        min_eps=0.1,
        render=False,
        print_freq=1,
        load_path=None,
        save_path=None,
        batch_size=32,
        log_dir='logs/train',
        max_steps=100000,
        buffer_capacity=None,
        max_episode_len=None,
        eps_decay_rate=-1e-4,
        target_update_freq=1000,
    ):
        tf.random.set_seed(seed)
        np.random.seed(seed)
        self.gamma = gamma
        self.render = render
        self.batch_size = batch_size
        self.print_freq = print_freq
        self.q_lr = learning_rate
        self.max_eps = max_eps
        self.min_eps = min_eps
        self.eps_decay_rate = eps_decay_rate
        self.buffer = ReplayBuffer(buffer_capacity)
        self.max_steps = max_steps
        self.target_update = target_update_freq
        self.model = QNetwork(env.action_space.n, name='q_network')
        self.target = QNetwork(env.action_space.n, name='target_network')
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr)
        self.summary_writer = tf.summary.create_file_writer(log_dir)
        self.env = env
        self.max_episode_len = max_episode_len if max_episode_len else self.env.spec.max_episode_steps
        self.rewards = []
        self.save_path = save_path

        if load_path is not None:
            self.model.load_weights(load_path)

    def act(self, state):
        return np.argmax(self.model(state))

    @tf.function
    def train_step(self, states, indices, targets):
        """
        Performs a single step of gradient descent on the Q network

        Args:
            states: numpy array of states with shape (batch size, state dim)
            indices: list indices of the selected actions
            targets: targets for computing the MSE loss

        """
        with tf.GradientTape() as tape:
            action_values = tf.gather_nd(self.model(states), indices)
            mse_loss = tf.keras.losses.MeanSquaredError()(action_values,
                                                          targets)

        gradients = tape.gradient(mse_loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(
            zip(gradients, self.model.trainable_variables))

        # Log training information
        with self.summary_writer.as_default():
            tf.summary.scalar('MSE Loss',
                              mse_loss,
                              step=self.optimizer.iterations)
            tf.summary.scalar('Estimated Q Value',
                              tf.reduce_mean(action_values),
                              step=self.optimizer.iterations)

    def update(self):
        """
        Computes the target for the MSE loss and calls the tf.function for gradient descent
        """
        if len(self.buffer) >= self.batch_size:
            # Sample random minibatch of N transitions
            states, actions, rewards, next_states, dones = self.buffer.sample(
                self.batch_size)

            # Adjust the targets for non-terminal states
            next_state_pred = self.target(next_states)
            targets = rewards + self.gamma * next_state_pred.numpy().max(
                axis=1) * (1 - dones)
            batch_range = tf.range(start=0, limit=actions.shape[0])
            indices = tf.stack((batch_range, actions), axis=1)

            # update critic by minimizing the MSE loss
            self.train_step(states, indices, targets)

    def learn(self):
        """Learns via Deep-Q-Networks (DQN)"""
        obs = self.env.reset()
        total_reward = 0
        ep = 0
        ep_len = 0
        rand_actions = 0
        mean_reward = None
        for t in range(self.max_steps):

            if t % self.target_update == 0:
                copy_weights(self.model.variables, self.target.variables)

            # exponential epsilon decay from https://jaromiru.com/2016/10/03/lets-make-a-dqn-implementation/
            eps = self.min_eps + (self.max_eps - self.min_eps) * np.exp(
                self.eps_decay_rate * t)
            if self.render:
                self.env.render()

            # Take exploratory action with probability epsilon
            if np.random.uniform() < eps:
                action = self.env.action_space.sample()
                rand_actions += 1
            else:
                action = self.act(np.expand_dims(obs, axis=0))

            # Execute action in emulator and observe reward and next state
            new_obs, reward, done, info = self.env.step(action)
            total_reward += reward

            # Store transition s_t, a_t, r_t, s_t+1 in replay buffer
            self.buffer.add((obs, action, reward, new_obs, done))

            # Perform learning step
            self.update()

            obs = new_obs
            ep_len += 1
            if done or ep_len >= self.max_episode_len:
                with self.summary_writer.as_default():
                    ep += 1
                    self.rewards.append(total_reward)
                    total_reward = 0
                    obs = self.env.reset()

                    if ep % self.print_freq == 0 and ep > 0:
                        new_mean_reward = np.mean(
                            self.rewards[-self.print_freq - 1:])

                        print(
                            f"-------------------------------------------------------"
                        )
                        print(
                            f"Mean {self.print_freq} Episode Reward: {new_mean_reward}"
                        )
                        print(f"Exploration fraction: {rand_actions / ep_len}")
                        print(f"Total Episodes: {ep}")
                        print(f"Total timesteps: {t}")
                        print(
                            f"-------------------------------------------------------"
                        )

                        tf.summary.scalar(
                            f'Mean {self.print_freq} Episode Reward',
                            new_mean_reward,
                            step=t)
                        tf.summary.scalar(f'Epsilon', eps, step=t)

                        # Model saving inspired by Open AI Baseline implementation
                        if (mean_reward is None or new_mean_reward >=
                                mean_reward) and self.save_path is not None:
                            print(
                                f"Saving model due to mean reward increase:{mean_reward} -> {new_mean_reward}"
                            )
                            print(f'Location: {self.save_path}')
                            mean_reward = new_mean_reward
                            self.model.save_weights(self.save_path)

                    ep_len = 0
                    rand_actions = 0
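
A minimal way to drive the `DQN` class from Example #14, assuming a discrete-action Gym environment (the environment id, hyperparameters, and checkpoint path below are illustrative, and `QNetwork`/`copy_weights` are helpers from that example's repository):

import gym

env = gym.make('CartPole-v1')
agent = DQN(env,
            print_freq=10,
            max_steps=50000,
            buffer_capacity=50000,
            save_path='checkpoints/cartpole')
agent.learn()  # samples the environment until max_steps, training as it goes
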
Example #15
def play(train_indicator):
    buffer_size = 100000
    batch_size = 32
    gamma = 0.99  # discount factor
    tau = 0.001  # Target Network HyperParameter
    lra = 0.0001  # Learning rate for Actor
    lrc = 0.001  # Learning rate for Critic
    ou_sigma = 0.3

    action_dim = 1  # Steering angle
    state_dim = 21  # num of sensors input

    episodes_num = 2000
    max_steps = 100000
    step = 0

    train_stat_file = "data/train_stat.txt"
    actor_weights_file = "data/actor.h5"
    critic_weights_file = "data/critic.h5"

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    tf_session = tf.Session(config=config)

    keras_backend.set_session(tf_session)

    actor = ActorNetwork(tf_session=tf_session,
                         state_size=state_dim,
                         action_size=action_dim,
                         hidden_units=(300, 600),
                         tau=tau,
                         lr=lra)
    critic = CriticNetwork(tf_session=tf_session,
                           state_size=state_dim,
                           action_size=action_dim,
                           hidden_units=(300, 600),
                           tau=tau,
                           lr=lrc)
    buffer = ReplayBuffer(buffer_size)

    # noise function for exploration
    ou = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim),
                                      sigma=ou_sigma * np.ones(action_dim))

    # Torcs environment - throttle and gear change controlled by client
    env = TorcsEnv(vision=False, throttle=False, gear_change=False)

    try:
        actor.model.load_weights(actor_weights_file)
        critic.model.load_weights(critic_weights_file)
        actor.target_model.load_weights(actor_weights_file)
        critic.target_model.load_weights(critic_weights_file)
        print("Weights loaded successfully")
    except:
        print("Cannot load weights")

    for i in range(episodes_num):
        print("Episode : %s Replay buffer %s" % (i, len(buffer)))

        if i % 3 == 0:
            ob = env.reset(
                relaunch=True
            )  # relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()

        # 21 len state dimensions - https://arxiv.org/abs/1304.1672
        state = np.hstack((ob.angle, ob.track, ob.trackPos))

        total_reward = 0.
        for j in range(max_steps):
            loss = 0

            action_predicted = actor.model.predict(
                state.reshape(1,
                              state.shape[0])) + ou()  # predict and add noise

            observation, reward, done, info = env.step(action_predicted[0])

            state1 = np.hstack(
                (observation.angle, observation.track, observation.trackPos))

            buffer.add((state, action_predicted[0], reward, state1,
                        done))  # add replay buffer

            # batch update
            batch = buffer.get_batch(batch_size)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + gamma * target_q_values[k]

            if train_indicator:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.get_gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.train_target_model()
                critic.train_target_model()

            total_reward += reward
            state = state1

            print("Episode %s - Step %s - Action %s - Reward %s" %
                  (i, step, action_predicted[0][0], reward))

            step += 1
            if done:
                break

        if i % 3 == 0 and train_indicator:
            print("Saving weights...")
            actor.model.save_weights(actor_weights_file, overwrite=True)
            critic.model.save_weights(critic_weights_file, overwrite=True)

        tm = time.strftime("%Y-%m-%d %H:%M:%S")
        episode_stat = "%s -th Episode. %s total steps. Total reward: %s. Time %s" % (
            i, step, total_reward, tm)
        print(episode_stat)
        with open(train_stat_file, "a") as outfile:
            outfile.write(episode_stat + "\n")

    env.end()
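
Example #15's exploration noise comes from `OrnsteinUhlenbeckActionNoise`, which is not shown. A common implementation of the discretized OU process looks like the sketch below; the `theta` and `dt` defaults are the usual choices, not necessarily the example's:

import numpy as np


class OrnsteinUhlenbeckActionNoise:
    def __init__(self, mu, sigma, theta=0.15, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x_prev = x0 if x0 is not None else np.zeros_like(mu)

    def __call__(self):
        # x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1)
        x = (self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt +
             self.sigma * np.sqrt(self.dt) *
             np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x
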
Example #16
class Game(object):
    def __init__(self):
        self.args = args = agent.parse_args()
        self.ep = EnvPool(args.env, self.args.env_size)
        self.eps = [
            MultiStageEpsilon([
                LinearAnnealEpsilon(1.0, 0.1, int(1e6)),
                LinearAnnealEpsilon(0.1, 0.05, int(1e7 - 1e6))
            ]), 0
        ]
        self.replay = ReplayBuffer(args.replay_buffer_size)
        main_logger.info("Replay Buffer Max Size: {}B".format(
            pretty_num(args.replay_buffer_size * (84 * 84 * 4 * 2 + 8), True)))
        self.sess = agent.make_session()
        self.sess.__enter__()
        agent.setup(self.ep.action_num, self.replay)
        self.train_epi = 0
        self.max_reward = agent.score

    def random(self):
        random_step = self.args.replay_buffer_size // 2
        obs = self.ep.reset()
        with tqdm(total=random_step, desc="random", ascii=True) as t:
            while t.n < random_step:
                action, (obs_, reward, done, info) = self.ep.random()
                [
                    self.replay.add(obs[i], action[i], reward[i],
                                    float(done[i]), obs_[i])
                    for i in range(self.ep.size)
                ]
                obs, info = self.ep.auto_reset()
                t.update(self.ep.size)
        total_epi = sum(len(info[i]['rewards']) for i in range(self.ep.size))
        mean_reward = np.mean([
            np.mean(info[i]['rewards']) for i in range(self.ep.size)
            if info[i]['rewards']
        ])
        record = Record()
        record.add_key_value('Phase', 'Random')
        record.add_key_value('Episodes', pretty_num(total_epi))
        record.add_key_value('Mean Reward', np.round(mean_reward, 2))
        main_logger.info("\n" + record.dumps())
        if not self.max_reward:
            self.max_reward = mean_reward

    def train(self):
        train_step = 250000
        self.ep.reset_state()
        obs = self.ep.reset()
        with tqdm(total=train_step, desc="Train", ascii=True) as t:
            while t.n < train_step:
                action = agent.take_action(
                    obs, self.eps[0].get(self.train_epi * train_step + t.n))
                obs_, reward, done, info = self.ep.step(action)
                [
                    self.replay.add(obs[i], action[i], reward[i],
                                    float(done[i]), obs_[i])
                    for i in range(self.ep.size)
                ]
                obs, info = self.ep.auto_reset()
                if t.n % self.args.target_update_freq == 0:
                    agent.update_target()
                if t.n % self.args.learning_freq == 0:
                    agent.train(self.ep.size)
                t.update(self.ep.size)
        self.train_epi += 1
        completion = np.round(self.train_epi / self.args.num_iters, 2)
        total_epi = sum(len(info[i]['rewards']) for i in range(self.ep.size))
        mean_reward = np.mean([
            np.mean(info[i]['rewards'][-100:]) for i in range(self.ep.size)
            if info[i]['rewards']
        ])
        record = Record()
        record.add_key_value('Phase', 'Train')
        record.add_key_value('% Completion', completion)
        record.add_key_value('Episodes', pretty_num(total_epi))
        record.add_key_value(
            '% Exploration',
            np.round(self.eps[0].get(self.train_epi * train_step) * 100, 2))
        record.add_key_value('Reward (100 epi mean)', np.round(mean_reward, 2))
        main_logger.info("\n" + record.dumps())

    def test(self):
        test_step = 200000
        self.ep.reset_state()
        obs = self.ep.reset()
        with tqdm(total=test_step, desc="Evaluation", ascii=True) as t:
            while t.n < test_step:
                action = agent.take_action(obs, self.eps[1])
                self.ep.step(action)
                obs, info = self.ep.auto_reset()
                t.update(self.ep.size)
        total_epi = sum(len(info[i]['rewards']) for i in range(self.ep.size))
        mean_reward = np.mean([
            np.mean(info[i]['rewards']) for i in range(self.ep.size)
            if info[i]['rewards']
        ])
        record = Record()
        record.add_key_value('Phase', 'Evaluation')
        record.add_key_value('Episodes', pretty_num(total_epi))
        record.add_key_value('Mean Reward', np.round(mean_reward, 2))
        main_logger.info("\n" + record.dumps())
        if self.max_reward < mean_reward:
            self.max_reward = mean_reward
            agent.score = mean_reward
            agent.save_model()

    def run(self):
        self.random()
        for i in range(self.args.num_iters):
            self.train()
            self.test()
        self.exit()

    def exit(self):
        self.ep.close()
Example #17
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)

        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # get targets
        self.qnetwork_target.eval()
        with torch.no_grad():
            Q_targets_next = torch.max(self.qnetwork_target.forward(next_states), dim=1, keepdim=True)[0]

        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # get outputs
        self.qnetwork_local.train()
        Q_expected = self.qnetwork_local.forward(states).gather(1, actions)

        # compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # clear gradients
        self.optimizer.zero_grad()

        # update weights local network
        loss.backward()

        # take one SGD step
        self.optimizer.step()
        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
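
The `Agent` in Example #17 leaves episode handling and epsilon decay to the caller. A minimal training loop, assuming a discrete-action Gym environment and the module-level constants (`BUFFER_SIZE`, `BATCH_SIZE`, `GAMMA`, `TAU`, `LR`, `UPDATE_EVERY`, `device`) defined alongside the class; the environment id and epsilon schedule are illustrative:

import gym

env = gym.make('LunarLander-v2')
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.n,
              seed=0)

eps, eps_min, eps_decay = 1.0, 0.01, 0.995
for episode in range(1000):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)  # store and (maybe) learn
        state = next_state
    eps = max(eps_min, eps * eps_decay)
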
Example #18
    def __init__(self,
                 env,
                 sess,
                 learning_rate=1e-3,
                 seed=1234,
                 gamma=0.99,
                 max_eps=1.0,
                 min_eps=0.1,
                 render=False,
                 print_freq=20,
                 load_path=None,
                 save_path=None,
                 batch_size=32,
                 log_dir='logs/train',
                 max_steps=100000,
                 buffer_capacity=None,
                 max_episode_len=2000,
                 eps_decay_rate=-0.0001,
                 target_update_freq=1000,
                 ):
        """Trains an openai gym-like environment with deep q learning.
        Args:
            env: gym.Env where our agent resides
            seed: Random seed for reproducibility
            gamma: Discount factor
            max_eps: Starting exploration factor
            min_eps: Exploration factor to decay towards
            render: True to render the environment, else False
            print_freq: Displays logging information every 'print_freq' episodes
            load_path: (str) Path to load existing model from
            save_path: (str) Path to save model during training
            max_steps: maximum number of times to sample the environment
            buffer_capacity: How many state, action, next state, reward tuples the replay buffer should store
            max_episode_len: Maximum number of timesteps in an episode
            eps_decay_rate: lambda parameter in exponential decay for epsilon
            target_update_freq: Number of gradient updates between target network synchronizations
        """
        np.random.seed(seed)
        self.sess = sess
        self.env = env
        self.input_dim = env.observation_space.shape[0]
        self.output_dim = env.action_space.n
        self.max_steps = max_steps
        self.max_eps = max_eps
        self.min_eps = min_eps
        self.eps_decay_rate = eps_decay_rate
        self.max_episode_len = max_episode_len
        self.render = render
        self.print_freq = print_freq
        self.rewards = []
        self.metrics = []
        self.save_path = save_path
        self.load_path = load_path
        self.batch_size = batch_size
        self.num_updates = 0
        self.gamma = gamma
        self.buffer = ReplayBuffer(capacity=max_steps // 2 if buffer_capacity is None else buffer_capacity)
        self.target_update_freq = target_update_freq
        self.learning_rate = learning_rate

        with tf.variable_scope('q_network'):
            self.q_network = QNetworkBuilder(self.input_dim, self.output_dim, (64,))
        with tf.variable_scope('target_network'):
            self.target_network = QNetworkBuilder(self.input_dim, self.output_dim, (64,))
        self.update_target_network = [old.assign(new) for (new, old) in
                                      zip(tf.trainable_variables('q_network'),
                                          tf.trainable_variables('target_network'))]
        if self.load_path is not None:
            self.load()

        self.add_summaries(log_dir)
Example #19
def train(conf,
          env,
          model,
          num_episodes=500,
          batch_size=100,
          buffer_size=10000):
    conf.buffer_size = buffer_size
    conf.batch_size = batch_size

    replay_buffer = ReplayBuffer(size=buffer_size)
    discount_rate = conf.discount_rate
    eps = conf.initial_eps
    decay_factor = conf.decay_factor
    for episode in range(num_episodes):
        print("Episode {}".format(episode))
        observation = env.reset()
        eps *= decay_factor
        done = False
        total_food = 0
        step = 0
        while not done:
            model_input = np.array([observation])
            prediction = model.predict(model_input)
            if np.random.random() < eps:
                action = np.random.randint(0, 4)
                was_random = True
            else:
                action = np.argmax(prediction)
                was_random = False

            debugger.print_step_before_move(step, observation, prediction,
                                            action, was_random)

            debugger.render_env_until_key_press(env)

            new_observation, reward, done, _ = env.step(action)

            replay_buffer.add(observation, action, reward, new_observation,
                              float(done))

            # target_action_score = reward + (0 if done else discount_rate * np.max(model.predict(
            #     np.array([new_observation]))))

            # label = prediction
            # label[0][action] = target_action_score
            # model.fit(model_input, label, epochs=1,
            #           verbose=0)

            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)
            labels = model.predict(obses_t)
            targets = discount_rate * np.max(model.predict(obses_tp1), axis=1)
            # print('targets', targets)
            # print('rewards', rewards)
            for i in range(len(dones)):
                if dones[i]:
                    targets[i] = 0
                targets[i] += rewards[i]
                labels[i][actions[i]] = targets[i]
            model.fit(obses_t, labels, epochs=1, verbose=0)

            weights, batch_idxes = np.ones_like(rewards), None

            # debugger.print_step_after_move(reward, target_action_score,
            #                       label, model.predict(model_input))

            if (reward > 0):
                total_food += 1
            step += 1

            observation = new_observation
        wandb.log({
            'episode': episode,
            'total_food': total_food,
            'eps': eps,
            'lifetime': step
        })
        print('Score: {}'.format(total_food))
        print()
    env.close()