Example No. 1
0
                            # Sample a minibatch of transitions from the replay
                            # buffer. Column layout appears to be
                            # [state, action, reward, next_state, done] — inferred
                            # from the indexing below; confirm against the
                            # buffer's add() call.
                            train_batch = exp_buffer.sample(batch_size)

                            # Double-DQN target: the MAIN network selects the
                            # greedy action for each next state...
                            pred_act, _ = main_qn.predict_act(
                                np.vstack(train_batch[:, 3]), batch_size, sess)
                            # ...while the TARGET network provides the Q-values
                            # used to evaluate that action.
                            _, q_vals = target_qn.predict_act(
                                np.vstack(train_batch[:, 3]), batch_size, sess)

                            # end_multiplier = 1 - done: zeroes the bootstrapped
                            # term for terminal transitions (assumes column 4 is
                            # a 0/1 done flag — TODO confirm).
                            end_multiplier = -(train_batch[:, 4] - 1)
                            # Per-sample Q-value of the main-net-selected action.
                            double_q = q_vals[range(batch_size), pred_act]
                            # Bellman target: r + gamma * Q_target(s', argmax_a Q_main(s', a))
                            target_q_val = train_batch[:,
                                                       2] + gamma * double_q * end_multiplier

                            # One training step on the main network toward the
                            # computed targets; summ_writer suggests summaries
                            # are logged inside update_nn.
                            in_frames = np.vstack(train_batch[:, 0])
                            acts = train_batch[:, 1]
                            main_qn.update_nn(in_frames, target_q_val, acts,
                                              batch_size, sess, summ_writer,
                                              step_value)
                            # Advance the graph's global training-step counter.
                            step_value = sess.run(inc_global_step)

                    # Carry the next state/frame into the following env step.
                    s = s1
                    s_frame = s1_frame

                ep_rewards.append(reward)
                total_step += 1

                # Periodically sync the target network with the main network
                # (update_qn_op presumably holds the assign ops — verify).
                if total_step % update_target_step == 0:
                    sess.run(update_qn_op)

                if done:
                    disc_r = discounted_reward(ep_rewards, gamma)
                    # gamma=1 yields the undiscounted episode return (score).
                    score = discounted_reward(ep_rewards, 1)