Example #1
 def __init__(self):
     self.args = args = agent.parse_args()
     self.ep = EnvPool(args.env, self.args.env_size)
     self.eps = [
         MultiStageEpsilon([
             LinearAnnealEpsilon(1.0, 0.1, int(1e6)),
             LinearAnnealEpsilon(0.1, 0.05, int(1e7 - 1e6))
         ]), 0
     ]
     self.replay = ReplayBuffer(args.replay_buffer_size)
     main_logger.info("Replay Buffer Max Size: {}B".format(
         pretty_num(args.replay_buffer_size * (84 * 84 * 4 * 2 + 8), True)))
     self.sess = agent.make_session()
     self.sess.__enter__()
     agent.setup(self.ep.action_num, self.replay)
     self.train_epi = 0
     self.max_reward = agent.score
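
The log line in this snippet estimates the replay buffer's memory footprint: the formula assumes each transition stores two stacked 84x84x4 uint8 observations (state and next state) plus 8 bytes of metadata. A quick back-of-the-envelope check of that arithmetic, using a hypothetical buffer size of one million transitions (the variable names below are illustrative only):

bytes_per_transition = 84 * 84 * 4 * 2 + 8   # two stacked uint8 frames + 8 bytes of metadata = 56,456 bytes
replay_buffer_size = 1_000_000               # hypothetical value, for illustration
total_bytes = replay_buffer_size * bytes_per_transition
print("~{:.1f} GB".format(total_bytes / 1e9))  # ~56.5 GB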
Example #2
    def __init__(
        self,
        env,
        learning_rate=1e-3,
        seed=1234,
        gamma=0.99,
        max_eps=1.0,
        min_eps=0.1,
        render=False,
        print_freq=1,
        load_path=None,
        save_path=None,
        batch_size=32,
        log_dir='logs/train',
        max_steps=100000,
        buffer_capacity=None,
        max_episode_len=None,
        eps_decay_rate=-1e-4,
        target_update_freq=1000,
    ):
        tf.random.set_seed(seed)
        np.random.seed(seed)
        self.gamma = gamma
        self.render = render
        self.batch_size = batch_size
        self.print_freq = print_freq
        self.q_lr = learning_rate
        self.max_eps = max_eps
        self.min_eps = min_eps
        self.eps_decay_rate = eps_decay_rate
        self.buffer = ReplayBuffer(buffer_capacity)
        self.max_steps = max_steps
        self.target_update = target_update_freq
        self.model = QNetwork(env.action_space.n, name='q_network')
        self.target = QNetwork(env.action_space.n, name='target_network')
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr)
        self.summary_writer = tf.summary.create_file_writer(log_dir)
        self.env = env
        self.max_episode_len = max_episode_len if max_episode_len else self.env.spec.max_episode_steps
        self.rewards = []
        self.save_path = save_path

        if load_path is not None:
            self.model.load_weights(load_path)
Example #3
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
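
The t_step counter initialized above is typically advanced on every environment step so that learning happens only every UPDATE_EVERY steps and only once the buffer holds enough samples. A minimal sketch of such a step() method, assuming the module-level BATCH_SIZE and UPDATE_EVERY constants referenced above, a GAMMA constant, a learn() method defined elsewhere on this agent, and that memory.sample() draws a batch of the size configured at construction (these last pieces are assumptions not shown in the snippet):

    def step(self, state, action, reward, next_state, done):
        # Save the experience in replay memory.
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps, once enough samples are available.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)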
Example #4
    def __init__(self,
                 api,
                 network_class,
                 sess,
                 save_path,
                 history_size=15,
                 restore_path=None,
                 verbose=False,
                 train=False,
                 test=False):
        super(NeuralNetworkAgent, self).__init__(api, verbose=verbose)

        # currently 7500 w/ 1000

        # Network
        self.network = network_class(sess,
                                     save_path,
                                     restore_path=restore_path,
                                     hist_size=history_size)
        self.replay_buffer = ReplayBuffer(max_size=2500)
        self.train = train
        self.history_size = history_size

        # Internal
        self.launched = False
        self.placed_move = False
        self.ctr = 0
        self.restart_game = 1
        self.game_restarted = True
        self.show_board = False
        self.last_move = -2
        self.start_state = np.zeros((20, 10, 1))
        self.possible_moves = [-1, 0, 6, 7]
        self.training_begun = False if not test else True
        self.epsilon = 1. if not test else 0
        self.decay = 0.999
        self.test = test

        self.prev_states = [self.start_state] * self.history_size
Example #5
    def __init__(self,
                 env,
                 sess,
                 learning_rate=1e-3,
                 seed=1234,
                 gamma=0.99,
                 max_eps=1.0,
                 min_eps=0.1,
                 render=False,
                 print_freq=20,
                 load_path=None,
                 save_path=None,
                 batch_size=32,
                 log_dir='logs/train',
                 max_steps=100000,
                 buffer_capacity=None,
                 max_episode_len=2000,
                 eps_decay_rate=-0.0001,
                 target_update_freq=1000,
                 ):
        """Trains an openai gym-like environment with deep q learning.
        Args:
            env: gym.Env where our agent resides
            seed: Random seed for reproducibility
            gamma: Discount factor
            max_eps: Starting exploration factor
            min_eps: Exploration factor to decay towards
            max_episode_len: Maximum length of an individual episode
            render: True to render the environment, else False
            print_freq: Displays logging information every 'print_freq' episodes
            load_path: (str) Path to load existing model from
            save_path: (str) Path to save model during training
            max_steps: maximum number of times to sample the environment
            buffer_capacity: How many state, action, next state, reward tuples the replay buffer should store
            max_episode_len: Maximum number of timesteps in an episode
            eps_decay_rate: lambda parameter in exponential decay for epsilon
            target_update_fraction: Fraction of max_steps update the target network
        """
        np.random.seed(seed)
        self.sess = sess
        self.env = env
        self.input_dim = env.observation_space.shape[0]
        self.output_dim = env.action_space.n
        self.max_steps = max_steps
        self.max_eps = max_eps
        self.min_eps = min_eps
        self.eps_decay_rate = eps_decay_rate
        self.max_episode_len = max_episode_len
        self.render = render
        self.print_freq = print_freq
        self.rewards = []
        self.metrics = []
        self.save_path = save_path
        self.load_path = load_path
        self.batch_size = batch_size
        self.num_updates = 0
        self.gamma = gamma
        self.buffer = ReplayBuffer(capacity=max_steps // 2 if buffer_capacity is None else buffer_capacity)
        self.target_update_freq = target_update_freq
        self.learning_rate = learning_rate

        with tf.variable_scope('q_network'):
            self.q_network = QNetworkBuilder(self.input_dim, self.output_dim, (64,))
        with tf.variable_scope('target_network'):
            self.target_network = QNetworkBuilder(self.input_dim, self.output_dim, (64,))
        self.update_target_network = [old.assign(new) for (new, old) in
                                      zip(tf.trainable_variables('q_network'),
                                          tf.trainable_variables('target_network'))]
        if self.load_path is not None:
            self.load()

        self.add_summaries(log_dir)
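
The docstring above describes eps_decay_rate as the lambda parameter of an exponential decay. A minimal sketch of such a schedule, decaying from max_eps toward min_eps as a function of the current step (epsilon_at is a hypothetical helper; the trainer's exact formula may differ):

import numpy as np

def epsilon_at(step, max_eps=1.0, min_eps=0.1, eps_decay_rate=-1e-4):
    # Exponential decay: starts at max_eps and approaches min_eps as step grows.
    return min_eps + (max_eps - min_eps) * np.exp(eps_decay_rate * step)

# epsilon_at(0) == 1.0, epsilon_at(10_000) ~= 0.43, epsilon_at(100_000) ~= 0.10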
Example #6
plot_episode_rewards = []  # per-episode reward (sum of all individual rewards received by the agents during the episode)
plot_episode_valid_steps = []  # per-episode count of steps with at least one action request
plot_episode_count_requested_agent = np.asarray(
    [0] * N_AGENTS)  # per-agent record of how many agents were requested
plot_episode_requested_agents = np.asarray([0] * N_AGENTS)
plot_count_per_actions = np.asarray([0] * N_ACTION)

args = get_common_args()
args = qmix_args(args)

policy = QMIX(args)
agents = Agents(args, policy)
env = elevator.ElevatorEnv(SCREEN_WIDTH, SCREEN_HEIGHT, False)

worker = RolloutWorker(env, agents, args)
buffer = ReplayBuffer(args)

plt.figure()
plt.axis([0, args.n_epoch, 0, 100])
win_rates = []
episode_rewards = []
train_steps = 0

save_path = args.result_dir + '/' + current
os.makedirs(save_path, exist_ok=True)

for epoch in range(args.n_epoch):
    episodes = []
    for e in range(args.n_episodes):
        episode, episode_reward, episode_count_per_actions, episode_episode_requested_agents, episode_episode_count_requested_agent = worker.generate_episode(
            e)
Example #7
def main(args):
    constraints = np.array([1,0])
    
    train_data = pickle.load(open("paths.5.half.pkl", "rb"))
    train_data2 = [RLPath2(path, compute_g) for path in tqdm(train_data)]
    dataset = ReplayBuffer(10000000)
    for path in tqdm(train_data2):
        dataset.store(path)
        
    init_states = pickle.load(open("init_states606.pkl", "rb"))
    
    args = {
        "env" : "LunarLanderContinuous-v2",
        "train" : True,
        "test" : False,
        "max_iter" : 2, 
        "test_episodes" : 1,
        "output_dir" : "output",
        "output_iters" : 10,
        "gpu" : "0",
        "visualize" : False
    }
    args = Namespace(**args)
    best_response_algorithm = BestResponse(args)
    
    lambda_bound = 30
    eta = 1
    starting_lambda = [1, 100]
    online_convex_algorithm = ExponentiatedGradient(
        lambda_bound, len(constraints),
        eta=eta, starting_lambda=starting_lambda)
    
    discount = 0.95
    state_size = 8
    action_size = 2
    lr = 0.001
    fqe_epochs = 100
    fqe_batches = 3
    fitted_off_policy_evaluation_algorithm = FittedQEvaluation(discount, state_size, action_size, 
                                                               lr, epochs=fqe_epochs, batches=fqe_batches)
    
    init_seed = 606
    num_paths = 2
    exact_policy_algorithm = ExactPolicyEvaluator(discount, init_seed, num_paths, compute_g)
    
    
    problem = OptProblem(constraints, 
                         dataset, 
                         init_states, 
                         best_response_algorithm, 
                         online_convex_algorithm, 
                         fitted_off_policy_evaluation_algorithm, 
                         exact_policy_algorithm, 
                         lambda_bound, 
                         max_iterations=10)

    lambdas = []
    policies = []

    iteration = 0
    while not problem.is_over():
        iteration += 1
        for i in range(1):

            print('*' * 20)
            print('Iteration %s, %s' % (iteration, i))
            if len(lambdas) == 0:
                # first iteration
                lambdas.append(online_convex_algorithm.get())
                print('lambda_{0}_{2} = {1}'.format(iteration, lambdas[-1], i))
            else:
                # all other iterations
                lambda_t = problem.online_algo()
                lambdas.append(lambda_t)
                print('lambda_{0}_{3} = online-algo(pi_{1}_{3}) = {2}'.format(iteration, iteration-1, lambdas[-1], i))

            lambda_t = lambdas[-1]
            pi_t = problem.best_response(lambda_t)
            values = []

            # policies.append(pi_t)
            problem.update(pi_t, values, iteration)  # Evaluate C(pi_t), G(pi_t) and save
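
The online_convex_algorithm driving the lambda updates here is an exponentiated-gradient method. As a rough, generic sketch of one such update (not the exact ExponentiatedGradient implementation used above, whose sign convention and projection may differ), each weight is scaled multiplicatively by an exponential of its gradient and the result is rescaled to sum to lambda_bound:

import numpy as np

def exponentiated_gradient_step(lmbda, gradient, eta, lambda_bound):
    # Multiplicative-weights update, then rescale onto the simplex of radius lambda_bound.
    w = np.asarray(lmbda, dtype=float) * np.exp(-eta * np.asarray(gradient, dtype=float))
    return lambda_bound * w / w.sum()

# e.g. exponentiated_gradient_step([1.0, 100.0], gradient=[0.2, -0.1], eta=1, lambda_bound=30)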
Example #8
        tf.summary.scalar('agent' + str(i) + '_reward_l100_mean',
                          reward_100[i]) for i in range(3)
    ]

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run([
        agent1_actor_target_init, agent1_critic_target_init,
        agent2_actor_target_init, agent2_critic_target_init,
        agent3_actor_target_init, agent3_critic_target_init
    ])
    saver.restore(sess, './weight_single/210000.cptk')
    summary_writer = tf.summary.FileWriter('./test_three_summary',
                                           graph=tf.get_default_graph())

    agent1_memory = ReplayBuffer(100000)
    agent2_memory = ReplayBuffer(100000)
    agent3_memory = ReplayBuffer(100000)

    e = 1

    reward_100_list = [[], [], []]
    for i in range(1000000):
        if i % 1000 == 0:
            o_n = env.reset()

        agent1_action, agent2_action, agent3_action = get_agents_action(
            o_n, sess, noise_rate=0.1)

        env.render()
Example #9
def playGame():
    args = parse_args()
    args.initial_eps = 0.0001 if args.test else args.initial_eps
    if args.double:
        save_dir = "02DoubleDQN/" if not args.dueling else "02DoubleDuelingDQN/"
    else:
        save_dir = "01DQN/" if not args.dueling else "01DuelingDQN/"
    print("double:{}, dueling:{}, prioritized:{}\n".format(
        args.double, args.dueling, args.prioritized))

    sess = tf.InteractiveSession()
    # placeholders
    s = tf.placeholder("float", [None, 80, 80, 4], name="state")
    target = tf.placeholder("float", [None], name="target")
    action = tf.placeholder("float", [None, args.n_actions],
                            name="action")  # actions taken: [0, 1] or [1, 0]

    # -----dueling---------
    q_func = model(s, args.n_actions,
                   scope="q_func") if not args.dueling else dueling_model(
                       s, args.n_actions, scope="q_func")
    # -----dueling---------

    # -----double---------
    if args.double:
        q_func_vars = scope_vars("q_func")
        # target q network evaluation
        q_target = model(
            s, args.n_actions,
            scope="q_target") if not args.dueling else dueling_model(
                s, args.n_actions, scope="q_target")
        q_target_vars = scope_vars("q_target")
    # -----double---------

    # define the cost function
    readout_action = tf.reduce_sum(tf.multiply(q_func, action), axis=1)
    td_errors = target - readout_action
    cost = tf.reduce_mean(tf.square(td_errors))
    train_step = tf.train.AdamOptimizer(args.lr).minimize(cost)

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # -----prioritized replay---------
    # initialize replay memory
    if args.prioritized:
        replay_buffer = PrioritizedReplayBuffer(args.replay_buffer_size,
                                                alpha=args.prioritized_alpha)
        beta_schedule = LinearSchedule(args.prioritized_beta_iter,
                                       initial_p=args.prioritized_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(args.replay_buffer_size)
    # -----prioritized replay---------
    ''' printing
    a_file = open("logs_" + args.game + "/readout.txt", 'w')
    h_file = open("logs_" + args.game + "/hidden.txt", 'w')
    '''

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(args.n_actions)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)  # s_t : 80 * 80 * 4

    # load networks
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    checkpoint = tf.train.get_checkpoint_state("saved_networks/" + save_dir)
    already_trained = 0
    if checkpoint and checkpoint.model_checkpoint_path:
        already_trained = checkpoint.model_checkpoint_path
        already_trained = int(already_trained[already_trained.find('dqn-') +
                                              4:])
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    # start training
    EpsilonSchedule = LinearSchedule(args.explore, args.final_eps,
                                     args.initial_eps)
    t = already_trained
    epsilon = EpsilonSchedule.value(t)
    while "flappy bird" != "angry bird":
        # -----double---------
        # whether to update q_target
        if args.double and t % args.target_update_freq == 0:
            sess.run(update_target(q_func_vars, q_target_vars))
        # -----double---------

        # choose an action epsilon greedily
        Q_t = q_func.eval(feed_dict={s: [s_t]})[0]
        a_t = np.zeros([args.n_actions])
        action_index = 0
        if t % args.frame_per_action == 0:
            action_index = random.randrange(
                args.n_actions) if random.random() < epsilon else np.argmax(
                    Q_t)
        a_t[action_index] = 1

        # run the selected action and observe next state and reward
        x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
        s_t1 = preprocess(s_t, x_t1_colored)

        # store the transition in D
        replay_buffer.add(s_t, a_t, r_t, s_t1, terminal)

        # only scale down epsilon if done observing
        if t > args.observe:
            epsilon = EpsilonSchedule.value(t - args.observe)

        # only train if done observing
        if t > args.observe + already_trained:
            # -----prioritized replay---------
            # sample a minibatch to train on
            if args.prioritized:
                experience = replay_buffer.sample(
                    args.batch_size,
                    beta=beta_schedule.value(t - args.observe -
                                             already_trained))
                (s_j_batch, a_batch, r_batch, s_j1_batch, done_batch, weights,
                 batch_idxes) = experience
            else:
                s_j_batch, a_batch, r_batch, s_j1_batch, done_batch = replay_buffer.sample(
                    args.batch_size)
            # -----prioritized replay---------

            target_batch = []
            # -----double---------
            Q_j1_batch = q_target.eval(
                feed_dict={s: s_j1_batch}) if args.double else q_func.eval(
                    feed_dict={s: s_j1_batch})
            # -----double---------

            for i in range(0, args.batch_size):
                terminal = done_batch[i]
                # if terminal, the target is just the reward
                if terminal:
                    target_batch.append(r_batch[i])
                else:
                    target_batch.append(r_batch[i] +
                                        args.gamma * np.max(Q_j1_batch[i]))

            # -----prioritized replay---------
            if args.prioritized:
                td_errs = td_errors.eval(feed_dict={
                    target: target_batch,
                    action: a_batch,
                    s: s_j_batch
                })
                new_priorities = np.abs(td_errs) + args.prioritized_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)
            # -----prioritized replay---------

            # perform gradient step
            train_step.run(feed_dict={
                target: target_batch,
                action: a_batch,
                s: s_j_batch
            })

        # update the old values
        s_t = s_t1
        t += 1

        # save
        if t % args.save_freq == 0:
            saver.save(sess,
                       "saved_networks/" + save_dir + args.game + '-dqn',
                       global_step=t)

        # display
        if t <= args.observe:
            state = "observe"
        elif t > args.observe and t <= args.observe + args.explore:
            state = "explore"
        else:
            state = "train"
        info_expr = 'TIMESTEP:{}, STATE:{}, EPSILON:{:6f}, ACTION{}, REWARD:{}, Q_MAX:{}'
        print(
            info_expr.format(t, state, epsilon, action_index, r_t,
                             np.max(Q_t)))

        # write info to files
        '''
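
The exploration schedule in this example is built with LinearSchedule(args.explore, args.final_eps, args.initial_eps). In case that helper is unfamiliar, here is a minimal sketch of a linear schedule with the same call signature, interpolating from initial_p to final_p over schedule_timesteps and then holding final_p (named SimpleLinearSchedule to mark it as an illustration, not the actual class used above):

class SimpleLinearSchedule:
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # Linear interpolation for the first schedule_timesteps steps, constant afterwards.
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)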
Example #10
def play(train_indicator):
    buffer_size = 100000
    batch_size = 32
    gamma = 0.99  # discount factor
    tau = 0.001  # Target Network HyperParameter
    lra = 0.0001  # Learning rate for Actor
    lrc = 0.001  # Learning rate for Critic
    ou_sigma = 0.3

    action_dim = 1  # Steering angle
    state_dim = 21  # num of sensors input

    episodes_num = 2000
    max_steps = 100000
    step = 0

    train_stat_file = "data/train_stat.txt"
    actor_weights_file = "data/actor.h5"
    critic_weights_file = "data/critic.h5"

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    tf_session = tf.Session(config=config)

    keras_backend.set_session(tf_session)

    actor = ActorNetwork(tf_session=tf_session,
                         state_size=state_dim,
                         action_size=action_dim,
                         hidden_units=(300, 600),
                         tau=tau,
                         lr=lra)
    critic = CriticNetwork(tf_session=tf_session,
                           state_size=state_dim,
                           action_size=action_dim,
                           hidden_units=(300, 600),
                           tau=tau,
                           lr=lrc)
    buffer = ReplayBuffer(buffer_size)

    # noise function for exploration
    ou = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim),
                                      sigma=ou_sigma * np.ones(action_dim))

    # Torcs environment - throttle and gear change controlled by client
    env = TorcsEnv(vision=False, throttle=False, gear_change=False)

    try:
        actor.model.load_weights(actor_weights_file)
        critic.model.load_weights(critic_weights_file)
        actor.target_model.load_weights(actor_weights_file)
        critic.target_model.load_weights(critic_weights_file)
        print("Weights loaded successfully")
    except Exception:
        print("Cannot load weights")

    for i in range(episodes_num):
        print("Episode : %s Replay buffer %s" % (i, len(buffer)))

        if i % 3 == 0:
            # relaunch TORCS every 3 episodes because of a memory leak
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        # 21-dimensional state vector - https://arxiv.org/abs/1304.1672
        state = np.hstack((ob.angle, ob.track, ob.trackPos))

        total_reward = 0.
        for j in range(max_steps):
            loss = 0

            action_predicted = actor.model.predict(
                state.reshape(1,
                              state.shape[0])) + ou()  # predict and add noise

            observation, reward, done, info = env.step(action_predicted[0])

            state1 = np.hstack(
                (observation.angle, observation.track, observation.trackPos))

            buffer.add((state, action_predicted[0], reward, state1,
                        done))  # add to the replay buffer

            # batch update
            batch = buffer.get_batch(batch_size)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])  # placeholder with the same shape as the actions; filled in below

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + gamma * target_q_values[k]

            if train_indicator:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.get_gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.train_target_model()
                critic.train_target_model()

            total_reward += reward
            state = state1

            print("Episode %s - Step %s - Action %s - Reward %s" %
                  (i, step, action_predicted[0][0], reward))

            step += 1
            if done:
                break

        if i % 3 == 0 and train_indicator:
            print("Saving weights...")
            actor.model.save_weights(actor_weights_file, overwrite=True)
            critic.model.save_weights(critic_weights_file, overwrite=True)

        tm = time.strftime("%Y-%m-%d %H:%M:%S")
        episode_stat = "%s -th Episode. %s total steps. Total reward: %s. Time %s" % (
            i, step, total_reward, tm)
        print(episode_stat)
        with open(train_stat_file, "a") as outfile:
            outfile.write(episode_stat + "\n")

    env.end()
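
The OrnsteinUhlenbeckActionNoise used above adds temporally correlated noise to the actor's deterministic output for exploration. A minimal sketch of such a process (SimpleOUNoise, theta, and dt are illustrative and not necessarily the defaults of the class used here):

import numpy as np

class SimpleOUNoise:
    # Ornstein-Uhlenbeck process: a mean-reverting random walk around mu.
    def __init__(self, mu, sigma, theta=0.15, dt=1e-2):
        self.mu = np.asarray(mu, dtype=float)
        self.sigma = np.asarray(sigma, dtype=float)
        self.theta = theta
        self.dt = dt
        self.x = np.copy(self.mu)

    def __call__(self):
        self.x = (self.x
                  + self.theta * (self.mu - self.x) * self.dt
                  + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        return self.x

# ou = SimpleOUNoise(mu=np.zeros(1), sigma=0.3 * np.ones(1)); noisy_action = action + ou()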
Example #11
        for i in range(num_agents)
    ]

    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = gpu_fraction

    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    sess.run(
        [agent_actor_target_init_list[:], agent_critic_target_init_list[:]])

    summary_writer = tf.summary.FileWriter('./VUE_summary',
                                           graph=tf.get_default_graph())

    for i in range(num_agents):
        mem = ReplayBuffer(10000)
        memory.append(mem)

    # every 100 steps, check the rewards
    reward_100_list = np.zeros([100, 1], dtype=float)
    sum_r = 0.
    for i in range(1, Episode + 1):
        print(str(i) + "번째 에피소드 시작..")
        if i % 100 == 0:
            print(str(i) + "번째 에피소드. 환경 리셋.(100 배수)")
            o_n = env.reset()
            for agent_index in range(num_agents):
                summary_writer.add_summary(
                    sess.run(
                        reward_100_op[agent_index], {
                            reward_100[agent_index]:
Example #12
def train(conf,
          env,
          model,
          num_episodes=500,
          batch_size=100,
          buffer_size=10000):
    conf.buffer_size = buffer_size
    conf.batch_size = batch_size

    replay_buffer = ReplayBuffer(size=buffer_size)
    discount_rate = conf.discount_rate
    eps = conf.initial_eps
    decay_factor = conf.decay_factor
    for episode in range(num_episodes):
        print("Episode {}".format(episode))
        observation = env.reset()
        eps *= decay_factor
        done = False
        total_food = 0
        step = 0
        while not done:
            model_input = np.array([observation])
            prediction = model.predict(model_input)
            if np.random.random() < eps:
                action = np.random.randint(0, 4)
                was_random = True
            else:
                action = np.argmax(prediction)
                was_random = False

            debugger.print_step_before_move(step, observation, prediction,
                                            action, was_random)

            debugger.render_env_until_key_press(env)

            new_observation, reward, done, _ = env.step(action)

            replay_buffer.add(observation, action, reward, new_observation,
                              float(done))

            # target_action_score = reward + (0 if done else discount_rate * np.max(model.predict(
            #     np.array([new_observation]))))

            # label = prediction
            # label[0][action] = target_action_score
            # model.fit(model_input, label, epochs=1,
            #           verbose=0)

            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)
            labels = model.predict(obses_t)
            targets = discount_rate * np.max(model.predict(obses_tp1), axis=1)
            # print('targets', targets)
            # print('rewards', rewards)
            for i in range(len(dones)):
                if dones[i]:
                    targets[i] = 0
                targets[i] += rewards[i]
                labels[i][actions[i]] = targets[i]
            model.fit(obses_t, labels, epochs=1, verbose=0)

            weights, batch_idxes = np.ones_like(rewards), None

            # debugger.print_step_after_move(reward, target_action_score,
            #                       label, model.predict(model_input))

            if (reward > 0):
                total_food += 1
            step += 1

            observation = new_observation
        wandb.log({
            'episode': episode,
            'total_food': total_food,
            'eps': eps,
            'lifetime': step
        })
        print('Score: {}'.format(total_food))
        print()
    env.close()
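
The per-sample loop above that builds targets and labels can be expressed as a single vectorized computation. A sketch of an equivalent helper (compute_q_labels is a hypothetical name; it assumes dones is a float array of 0.0/1.0, as stored by replay_buffer.add, and that actions is an integer array):

import numpy as np

def compute_q_labels(model, obses_t, actions, rewards, obses_tp1, dones, discount_rate):
    # Bellman targets: r + gamma * max_a' Q(s', a'), with the bootstrap term
    # zeroed for terminal transitions.
    q_next = np.max(model.predict(obses_tp1), axis=1)
    targets = rewards + discount_rate * (1.0 - dones) * q_next
    labels = model.predict(obses_t)
    labels[np.arange(len(actions)), actions] = targets
    return labels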