Beispiel #1
0
 def get_demonstration(self, fold):
     """Build a two-step expert demonstration episode for *fold*.

     The second action targets the instance's goal and must succeed
     (its reward is asserted to be 1).

     Returns:
         A list of two Experience tuples covering the episode.
     """
     start = self.get_instance(fold)
     first_action = (0, 2)
     mid, reward_a, _ = start.step(first_action)
     second_action = (0, start.target)
     end, reward_b, _ = mid.step(second_action)
     assert reward_b == 1
     return [
         Experience(start, None, first_action, mid, None, reward_a, False),
         Experience(mid, None, second_action, end, None, reward_b, True),
     ]
Beispiel #2
0
 def get_experience(self, key):
     """Look up the EXPERIENCE row with the given id and wrap it.

     Returns:
         An Experience built from the matching row's columns.
     """
     with dbapi2.connect(self.app.config['dsn']) as connection:
         cursor = connection.cursor()
         query = "SELECT TITLE, USERNAME, START,FINISH,PERIOD,LENGTH FROM EXPERIENCE WHERE (ID = %s)"
         cursor.execute(query, (key,))
         record = cursor.fetchone()
         (title, username, start,
          finish, period, length) = record
     return Experience(title, username, start, finish, period, length)
Beispiel #3
0
def _do_tr_rollout(code_agent, desc_agent, task, rollout_ph, model, desc_model,
                   desc_to_code, code_to_desc, session, config, h0, z0, fold,
                   mode):
    """Roll out a batch of episodes with two agents whose messages are
    cross-translated between a "code" channel and a "description" channel.

    Args:
        code_agent, desc_agent: Integer indices selecting which of the two
            players is driven by `model` vs. `desc_model`.
        task: Task object providing `get_instance(fold)` and a `lexicon`.
        rollout_ph: Placeholder bundle; `feed(...)` builds the feed dict.
        model, desc_model: Networks exposing `tt_rollout_{h,z,q}` tensors.
        desc_to_code, code_to_desc: Translation functions between the two
            message representations.
        session: TensorFlow session used to evaluate the rollout tensors.
        config: Experiment configuration (`trainer` / `evaluator` sections).
        h0, z0: Initial recurrent states and codes.
        fold: Data fold forwarded to `task.get_instance`.
        mode: Mode flag forwarded to the translation functions.

    Returns:
        Tuple of (mean total reward per episode,
        mean success of each episode's final state).
    """
    worlds = [
        task.get_instance(fold)
        for _ in range(config.trainer.n_rollout_episodes)
    ]
    done = [False] * config.trainer.n_rollout_episodes
    episodes = [[] for i in range(config.trainer.n_rollout_episodes)]
    hs, zs = h0, z0
    dhs = h0
    for t in range(config.trainer.n_timeout):
        # Advance both models one step from the SAME previous states to get
        # their new hidden states and Q-values over the episode batch.
        hs_, zs_, qs = session.run(
            [model.tt_rollout_h, model.tt_rollout_z, model.tt_rollout_q],
            rollout_ph.feed(hs, zs, dhs, worlds, task, config))
        dhs_, dqs = session.run(
            [desc_model.tt_rollout_h, desc_model.tt_rollout_q],
            rollout_ph.feed(hs, zs, dhs, worlds, task, config))
        for i in range(config.trainer.n_rollout_episodes):
            if done[i]:
                continue

            # Each agent acts greedily w.r.t. its own model's Q-values.
            actions = [None, None]
            actions[code_agent] = np.argmax(qs[code_agent][i, :])
            actions[desc_agent] = np.argmax(dqs[desc_agent][i, :])

            world_, reward, done_ = worlds[i].step(actions)

            # Translate the code agent's incoming message into a code and
            # overwrite the description agent's z for the next step.
            code = desc_to_code(world_.l_msg[code_agent], mode)[0]
            zs_[desc_agent][i, :] = code

            # Translate the code agent's z back into a normalized
            # bag-of-words message over the task lexicon (top-5 words).
            l_words = code_to_desc(zs_[code_agent][i, :], mode)[:5]
            l_msg = np.zeros(len(task.lexicon))
            for l_word in l_words:
                l_msg[task.lexicon.index(l_word)] += 1
            l_msg /= np.sum(l_msg)

            # world_.l_msg is a tuple; rebuild it with the translated
            # message substituted for the description agent's slot.
            world_.l_msg = list(world_.l_msg)
            world_.l_msg[desc_agent] = l_msg
            world_.l_msg = tuple(world_.l_msg)

            episodes[i].append(
                Experience(worlds[i], None, tuple(actions), world_, None,
                           reward, done_))
            worlds[i] = world_
            done[i] = done_

            # NOTE(review): simulate_l is not supported on this rollout path.
            if config.evaluator.simulate_l:
                assert False

        hs = hs_
        zs = zs_
        dhs = dhs_
        if all(done):
            break

    # Mean total reward and mean final-state success across the batch.
    return (sum(e.r for ep in episodes
                for e in ep) * 1. / config.trainer.n_rollout_episodes,
            sum(ep[-1].s2.success
                for ep in episodes) * 1. / config.trainer.n_rollout_episodes)
    def add_grains(self, grains: List[List[Image.Image]]):
        """Score a 2x2 grid of grains for novelty and store them in memory.

        Params:
            grains: List[List[Image.Image]]
                2D List of new grains

        Returns:
            2D List of novelty for new grains
        """
        assert len(grains) == 2  # Currently, we only allow 4 grains
        assert len(grains[0]) == 2  # Currently, we only allow 4 grains

        novelties = []
        for grain_row in grains:
            row_scores = []
            for grain in grain_row:
                tensor = self._grain_to_tensor(grain)
                # Prepend a batch axis: (H, W, C) -> (1, H, W, C).
                tensor = tf.reshape(
                    tensor,
                    (1, tensor.shape[0], tensor.shape[1], tensor.shape[2]))
                reconstruction = self._network(tensor)
                score = self.novelty_function(tensor, reconstruction).numpy()
                row_scores.append(score)
                self._memory.push(Experience(score, grain))
            novelties.append(row_scores)

        return novelties
Beispiel #5
0
    def run_episode(self, max_steps, train=True):
        """Play one multi-agent episode and return the best agent's score.

        Params
        ======
        max_steps (integer): Upper bound on time steps for the episode.
        train (Boolean): When True, run the environment and agent in
            training mode; otherwise run in evaluation mode.
        """
        info = self.env.reset(train_mode=train)[self.brain_name]
        obs = info.vector_observations
        totals = np.zeros(len(obs))
        for _ in range(max_steps):
            acts = self.agent.act(obs, noise=train)
            info = self.env.step(acts)[self.brain_name]
            next_obs = info.vector_observations
            rews = info.rewards
            terminals = info.local_done
            totals += rews
            self.agent.learn(
                Experience(obs, acts, rews, next_obs, terminals))
            # Episodes end for all agents together; the first flag suffices.
            if terminals[0]:
                break
            obs = next_obs
        self.agent.end_episode()
        return totals.max()
Beispiel #6
0
 def get_myexperiences(self,name):
     """Return (key, Experience) pairs for every row owned by *name*."""
     with dbapi2.connect(self.app.config['dsn']) as connection:
         cursor = connection.cursor()
         query = "SELECT * FROM EXPERIENCE where (username=%s)"
         cursor.execute(query, (name,))
         results = []
         # Trailing userid/date columns are fetched by SELECT * but unused.
         for key, title, username, start, finish, period, length, userid, date in cursor:
             results.append(
                 (key, Experience(title, username, start, finish, period, length)))
         return results
Beispiel #7
0
def run(env,
        num_episodes,
        num_time_steps,
        replay_batch_size,
        scores_filename=None):
    """Train a double-Q agent with experience replay and report scores.

    Args:
        env: Environment exposing `state_size` and `action_size`.
        num_episodes: Number of training episodes to run.
        num_time_steps: Maximum steps per episode.
        replay_batch_size: Minibatch size sampled from the replay buffer.
        scores_filename: Optional path; when given, per-episode scores are
            written there as CSV.
    """
    exploration = EpsilonGreedy(epsilon_start=1.0,
                                epsilon_min=0.01,
                                epsilon_decay=0.999)

    # [Mnih 2015] used:
    #  - replay over 2% of the total experience
    #  - batch size of 32
    #  - minimum replay start size of 0.1%
    experience_max_size = int(num_episodes * num_time_steps * 0.02)
    replay_start_size = int(num_episodes * num_time_steps * 0.001)
    experience_replay = Experience(max_size=experience_max_size,
                                   batch_size=replay_batch_size,
                                   replay_start_size=replay_start_size)
    # experience_replay = PrioritizedExperience(
    #     max_size=experience_max_size, batch_size=replay_batch_size, replay_start_size=replay_start_size,
    #     initial_td_error=10, alpha=0.4, beta=0.4, anneal_rate=0.95, epsilon=0.001)

    model = ExampleModel(state_size=env.state_size,
                         action_size=env.action_size,
                         learning_rate=0.001)
    model.build()
    target_model = ExampleModel(state_size=env.state_size,
                                action_size=env.action_size,
                                learning_rate=0.001)
    target_model.build()

    qmodel = QModel(model=model,
                    target_model=target_model,
                    experience_replay=experience_replay,
                    tau=0.1,
                    use_double_q=True)

    agent = QAgent(state_size=env.state_size,
                   action_size=env.action_size,
                   model=qmodel,
                   exploration=exploration,
                   discount_rate=0.95)

    scores = np.empty((num_episodes, ))
    time_start = time.time()

    for e in range(num_episodes):
        scores[e] = agent.train(env=env, episode_length=num_time_steps)
        print('episode: {}/{}, score: {}, e: {:.2}'.format(
            e + 1, num_episodes, scores[e], agent.exploration.epsilon))

    time_end = time.time()
    # BUG FIX: the report covers the LAST 10% of episodes, so the slice
    # must start at the 90% mark; previously it started at the 10% mark
    # and silently averaged the last 90% instead.
    print('Average score for last 10% of episodes:',
          np.mean(scores[int(np.floor(num_episodes * 0.9)):]))
    print('Time taken:', time_end - time_start, 'seconds')

    if scores_filename is not None:
        np.savetxt(scores_filename, scores, delimiter=',')
Beispiel #8
0
 def search_experience(self,keyword):
     """Search EXPERIENCE rows whose title/start/finish match *keyword*.

     Returns a list of (key, Experience) pairs ordered by row id.
     """
     with dbapi2.connect(self.app.config['dsn']) as connection:
         cursor = connection.cursor()
         query="SELECT * FROM EXPERIENCE WHERE (TITLE ILIKE %s OR START ILIKE%s OR FINISH ILIKE %s ) ORDER BY ID"
         # Wrap the keyword in wildcards for substring matching.
         pattern = '%' + keyword + '%'
         cursor.execute(query, (pattern, pattern, pattern))
         experiences = [
             (key, Experience(title, username, start, finish, period, length))
             for key, title, username, start, finish, period, length, userid, date in cursor
         ]
     return experiences
Beispiel #9
0
def read_experience():
    """Interactively prompt for one work-experience record.

    Returns:
        A populated Experience object.
    """
    exp = Experience()
    exp.employer = input("Enter name of employer: ")
    exp.title = input("Enter title: ")
    exp.responsibilities = input("Enter responsibilities: ")
    years = input("Enter number of years of experience: ")
    # May raise ValueError on non-numeric input, matching prior behavior.
    exp.duration_years = int(years)
    return exp
def extract_tensors(experiences):
    """Transpose a batch of Experience tuples into batched tensors.

    Returns:
        Tuple of concatenated (states, actions, rewards, next_states).
    """
    # zip(*...) turns N Experiences into one Experience of N-field batches.
    transposed = Experience(*zip(*experiences))

    states = torch.cat(transposed.state)
    actions = torch.cat(transposed.action)
    rewards = torch.cat(transposed.reward)
    next_states = torch.cat(transposed.next_state)

    return states, actions, rewards, next_states
Beispiel #11
0
 def to_experiences(self, states, actions, rewards, next_states, dones):
     """
     Turns vectors representing components of multiple experiences into a
     vector of Experience objects.
     """
     # Idiom fix: build the list with a comprehension instead of a
     # manual append loop (same order, same elements).
     return [
         Experience(state, action, reward, next_state, done)
         for state, action, reward, next_state, done in zip(
             states, actions, rewards, next_states, dones)
     ]
    def generate_experience(self, nr, serialize=False):
        """Generate *nr* random Experience records.

        Params:
            nr: Number of experience entries to create.
            serialize: When True, return each entry's serialized form
                instead of the Experience objects themselves.

        Returns:
            List of Experience objects, or their serialized forms.
        """
        result = []
        for _ in range(nr):
            # Idiom fix: random.choice replaces manual index arithmetic
            # (random.randint(0, len(...) - 1)); same uniform selection.
            domain = random.choice(self.__domains)
            projects = self.generate_projects(random.randint(1, 5), False)
            result.append(Experience(domain, random.randint(1, 15), projects))

        if serialize:
            return [exp.serialize() for exp in result]
        return result
Beispiel #13
0
    def __init__(self,
                 num_action,
                 frame_height,
                 frame_width,
                 rng,
                 network_type,
                 algorithm,
                 network_file=None,
                 num_ignore=0,
                 exp_file=None):
        """Set up replay buffers and the Q-network; optionally restore
        previously saved experience.

        Params:
            num_action: Size of the discrete action space.
            frame_height, frame_width: Frame dimensions for the buffers.
            rng: Random number generator shared across components.
            network_type: Architecture identifier passed to Network.
            algorithm: Learning algorithm identifier passed to Network.
            network_file: Optional weights file to restore the network from.
            num_ignore: Forwarded to Network (purpose defined there).
            exp_file: Optional .npz archive with saved replay experience.
        """
        self.rng = rng
        self.num_action = num_action
        self.mbsize = Agent.MINIBATCH_SIZE
        self.validate_size = Agent.VALID_SIZE
        self.num_train_obs = 0
        self.network_type = network_type
        # Linear epsilon schedule slope over the exploration period.
        self.eps_decay = (Agent.FINAL_EXPLORE - Agent.INIT_EXPLORE) \
            / Agent.EXPLORE_FRAMES

        self.validate_states = None
        self.exp_file = exp_file
        if exp_file is not None:
            # BUG FIX: the file was previously opened twice — an unused
            # `with open(exp_file, 'rb') as f:` handle alongside
            # np.load(exp_file). np.load manages its own handle for
            # .npz archives, so the extra open is dropped.
            npz = np.load(exp_file)
            self.num_train_obs = np.sum(npz['num_train_obs'])
            self.validate_states = npz['validate_states']
            self.exp_train = Experience(Agent.REPLAY_SIZE, frame_height,
                                        frame_width, Agent.HISTORY, rng,
                                        npz)
        else:
            self.exp_train = Experience(Agent.REPLAY_SIZE, frame_height,
                                        frame_width, Agent.HISTORY, rng)
        self.exp_eval = Experience(Agent.HISTORY + 1, frame_height,
                                   frame_width, Agent.HISTORY, rng)

        self.network = Network(num_action, self.mbsize, Agent.HISTORY,
                               frame_height, frame_width, Agent.DISCOUNT,
                               Agent.UPDATE_FREQ, rng, network_type, algorithm,
                               network_file, num_ignore)
Beispiel #14
0
  def __init__(self, env=None, agent=None, logdir=None, should_render=None, should_load=None):
    """Wire up the trainer with its environment, agent, and replay store."""
    self.env = env
    self.agent = agent
    self.config = self.agent.config
    self.logdir = logdir
    self.should_render = should_render
    self.experience = Experience(self.config)

    if should_load:
      self.load()
      return
    # Fresh run: start counters and exploration rate from scratch.
    self.step = 0
    self.epsilon = 0.3
    self.train_rewards = [0] * 100
    self.current_episode = 0
def main():
    """Collect per-class example counts for several agents and plot them."""
    from experience import Experience
    # Fix: hist_classes was imported here but never used.
    from visualization import scatter_classes

    agents = [
        'RainbowAgent', 'SimpleAgent', 'SecondAgent', 'ProbabilisticAgent'
    ]

    class_count_list = []
    for agent in agents:
        exp = Experience(agent, load=True)
        labels, _, examples, _ = exp.load()
        class_count, _ = divide_and_count(examples, labels)
        class_count_list.append(class_count)

    scatter_classes(class_count_list, agents)
Beispiel #16
0
 def run_episode(self):
     """Generate one episode, yielding an Experience per step.

     Fixes: the previous version also appended every Experience to a
     local list that was never read (unbounded growth on long episodes);
     that dead list, the unused value estimate, and the dead step
     counter have been removed. Yield order and contents are unchanged.

     Yields:
         Experience for each environment transition until terminal.
     """
     self.env.reset()
     self.env.random_start()

     while not self.env.terminal:
         # Only the action is needed; the value estimate is discarded.
         action, _ = self.predict(self.env.state)
         self.env.step(action)

         yield Experience(self.env.state, action, self.env.reward,
                          None,
                          self.env.terminal)
Beispiel #17
0
 def run_episode(self, train=True):
     """Run one episode to completion and return its cumulative reward.

     When *train* is True the agent explores and learns from every
     transition; otherwise it acts without learning.
     """
     env_info = self.env.reset(train_mode=train)[self.brain_name]
     state = env_info.vector_observations[0]
     score = 0
     done = False
     while not done:
         action = self.agent.act(state, explore=train)
         env_info = self.env.step(action)[self.brain_name]
         next_state = env_info.vector_observations[0]
         reward = env_info.rewards[0]
         done = env_info.local_done[0]
         score += reward
         if train:
             self.agent.learn(
                 Experience(state, action, reward, next_state, done))
         state = next_state
     return score
Beispiel #18
0
def main(args):
    """Build and run the HER/DDPG training (or playback) loop on TestEnv-v0.

    Args:
        args: Dict-like configuration holding device, random seed,
            learning rates, tau, clip value, batch/buffer sizes, noise
            type, restore path, and mode ('train' to train, anything
            else to play).
    """
    with tf.device(args['device']):

        # tf
        tf.set_random_seed(args['rand_seed'])
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

        # env
        env = gym.make('TestEnv-v0')
        env.seed(args['rand_seed'])
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.shape[0]
        # NOTE(review): concat_dim=2 presumably reflects HER's
        # state+goal concatenation along a trailing axis — confirm
        # against the rollout/replay code.
        concat_dim = 2
        batched_s_dim = [None, s_dim, concat_dim]
        batched_a_dim = [None, a_dim]

        # agents
        actor = Actor(sess, args['actor_lr'], args['tau'], args['batch_size'],
                      args['clip_val'], batched_s_dim, batched_a_dim)
        critic = Critic(sess, args['critic_lr'], args['tau'], args['clip_val'],
                        batched_s_dim, batched_a_dim)

        # experience
        exp = Experience(args['buffer_size'], args['batch_size'],
                         args['rand_seed'])

        # noise
        actor_noise = ActorNoise(actor.predict,
                                 a_dim,
                                 noise_type=args['noise_type'])

        # initialize variables, then restore any saved weights on top
        init = tf.global_variables_initializer()
        sess.run(init)
        saver = Model(sess, args['restore_path'])
        saver.restore_model()

        # training
        her = HER(saver, exp, env, actor, critic, actor_noise)
        if args['mode'] == 'train':
            her.train(args['gamma'], args['her_k'], args['max_episodes'],
                      args['max_episode_len'], args['replay_len'])
        else:
            her.play(args['max_episodes'], args['max_episode_len'])
Beispiel #19
0
    def run_episode(self):
        """Yield one Experience per step of a single episode.

        The episode opens with a burst of random actions (at least
        ``history_length`` of them) so the frame history is populated
        before the learned policy takes over.
        """
        self.env.reset()
        self.history.add(self.env.state)

        warmup_steps = max(self.config.history_length,
                           self.env.random_start_steps)
        for _ in range(warmup_steps):
            self.env.step(self.env.random_step())
            self.history.add(self.env.state)

        step_count = 0
        while not self.env.terminal:
            # Act from the stacked history, then record the transition.
            state_before = self.env.state
            chosen = self.predict(self.history.get())
            self.env.step(chosen)
            yield Experience(state_before, chosen, self.env.reward,
                             self.env.state, self.env.terminal)
            step_count += 1
Beispiel #20
0
    def __init__(self, model, env, action_size, args, state):
        """Hold per-worker rollout state for an actor-critic agent."""
        self.model = model
        self.env = env
        self.action_size = action_size
        self.state = state
        self.hx = None
        self.cx = None
        self.eps_len = 0
        self.args = args
        # Per-rollout trajectories collected during an episode segment.
        self.values = []
        self.log_probs = []
        self.rewards = []
        self.entropies = []
        self.done = True  # start "done" so a fresh state is set on first use
        self.info = None
        self.reward = 0
        self.gpu_id = -1

        self.memory = Experience(history_size=2000)
Beispiel #21
0
def index():
    """Render the landing page from the portfolio database tables."""
    app.logger.info("Connection from %s" % str(request.environ['REMOTE_ADDR']))
    database = databaseconnector.databaseObject(app)
    skills = dict(database.query("SELECT title, rating FROM skills"))
    tools = dict(database.query("SELECT title, rating FROM tools"))
    languages = dict(database.query("SELECT title, rating FROM languages"))
    rows = database.query(
        "SELECT title, dates, shortDescription, longDescription, image, id FROM experience"
    )
    # The id is prefixed with 'a' to form a valid HTML element id.
    experienceStructs = [
        Experience(row[0], row[1], row[2], row[3], row[4], 'a' + str(row[5]))
        for row in rows
    ]

    database.close()
    return render_template('index.html',
                           skills=skills,
                           tools=tools,
                           languages=languages,
                           experiences=experienceStructs)
Beispiel #22
0
def main():
    """Demo: build a candidate record, store it, then search for it."""
    candidate_database = CandidateDatabase()

    candidate = Candidate()
    candidate.name = "Alicia Toomtest"
    candidate.title = "Python Developer"
    candidate.address = "Gothenburg, Sweden"
    candidate.phone = "0722879879"
    candidate.email = "*****@*****.**"
    candidate.hobbies = "Gardening"

    candidate.education = [
        Education(name="Education", school="School", level="Level"),
        Education(name="Education2", school="School2", level="Level2"),
    ]

    candidate.experience = [
        Experience(employer="Volvo",
                   title="Python developer",
                   responsibilities="code",
                   duration_years="2018-present")
    ]

    note = Note()
    note.summary = "Gslf9ehdlsdfnjslsleofjfms,"
    note.comment = "dki9eufsklwodudndjskwoeifjdk"
    candidate.note = note

    candidate_database.add_candidate(candidate)
    find_result = candidate_database.find_candidates("toom")

    if find_result:
        print_candidates(find_result)
    else:
        print("No result found")

    print_candidate(candidate)
Beispiel #23
0
    def __init__(self, rank, args, shared_model, optimizer, lr):
        """Per-process UNREAL trainer: seeds, local model, replay buffer."""
        # CUDA / seeding: each worker gets its own deterministic seed.
        self.gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
        torch.manual_seed(args.seed + rank)
        if self.gpu_id >= 0:
            torch.cuda.manual_seed(args.seed + rank)

        self.replay_buffer = Experience(history_size=2000)
        # TODO: still forward-propagates once per single step
        self.cx = None
        self.hx = None
        self.episodic_score = 0
        self.rank = rank
        self.args = args
        self.shared_model = shared_model
        self.optimizer = optimizer
        self.local_t = 0

        print('Training Agent: {}'.format(self.rank))
        # TODO: the gym environment still needs pixel-control wrappers etc.

        # Local UNREAL model for this worker process.
        model = UNREAL(in_channels=3, action_size=6, enable_pixel_control=True)

        if self.gpu_id >= 0:
            with torch.cuda.device(self.gpu_id):
                model = model.cuda()

        model.train()

        # Learning-rate schedule bounds.
        self.initial_learning_rate = lr
        self.max_global_time_step = 10 * 10**7
        # For log output
        self.prev_local_t = 0

        self.model = model
        self.env = None
        self.reset()  # initializes cx / hx
Beispiel #24
0
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, env_type, env_name,
                 use_pixel_change, use_value_replay, use_reward_prediction,
                 pixel_change_lambda, entropy_beta, local_t_max, gamma,
                 gamma_pc, experience_history_size, max_global_time_step,
                 device):
        """One UNREAL trainer thread: local network, loss, sync op, replay.

        Args:
            thread_index: Index identifying this worker thread.
            global_network: Shared network this worker syncs weights from.
            initial_learning_rate: Starting learning rate for annealing.
            learning_rate_input: Placeholder fed with the current LR.
            grad_applier: Optimizer wrapper exposing minimize_local().
            env_type, env_name: Environment identification strings.
            use_pixel_change, use_value_replay, use_reward_prediction:
                Switches for the UNREAL auxiliary tasks.
            pixel_change_lambda, entropy_beta: Loss weights.
            local_t_max: Steps per local rollout segment.
            gamma, gamma_pc: Discount factors (base / pixel control).
            experience_history_size: Capacity of the replay buffer.
            max_global_time_step: Total training steps (for annealing).
            device: TensorFlow device string for the local network.
        """
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.env_type = env_type
        self.env_name = env_name
        self.use_pixel_change = use_pixel_change
        self.use_value_replay = use_value_replay
        self.use_reward_prediction = use_reward_prediction
        self.local_t_max = local_t_max
        self.gamma = gamma
        self.gamma_pc = gamma_pc
        self.experience_history_size = experience_history_size
        self.max_global_time_step = max_global_time_step
        self.action_size = Environment.get_action_size(env_type, env_name)

        # Worker-local copy of the model, with its training loss attached.
        self.local_network = UnrealModel(self.action_size, thread_index,
                                         use_pixel_change, use_value_replay,
                                         use_reward_prediction,
                                         pixel_change_lambda, entropy_beta,
                                         device)
        self.local_network.prepare_loss()

        # Gradients are computed on the local network but applied to the
        # global network's variables (A3C-style asynchronous updates).
        self.apply_gradients = grad_applier.minimize_local(
            self.local_network.total_loss, global_network.get_vars(),
            self.local_network.get_vars())

        self.sync = self.local_network.sync_from(global_network)
        self.experience = Experience(self.experience_history_size)
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0
        # For log output
        self.prev_local_t = 0
Beispiel #25
0
def qtest(model_eval, model_target,epsilon=0.05, n_vehicles=20, **opt):
    # global epsilon
    epoch = 0
    n_epoch = opt.get('n_epoch', 1500)
    max_memory = opt.get('max_memory', 1000)
    data_size = opt.get('data_size', 50)
    weights_file = opt.get('weights_file', "")
    start_time = datetime.datetime.now()
    n_vehicles = n_vehicles

    get_dest_count = 0
    failed_count = 0

    eval_train_time = 0
    # If you want to continue training from a previous model,
    # just supply the h5 file name to weights_file option
    if weights_file:
        print("loading weights from file: %s" % (weights_file,))
        model_eval.load_weights(weights_file)

    # Initialize experience replay object
    experience = Experience(model_eval, model_target, max_memory=max_memory)

    # records : 记录各种指标
    # reward部分:
    records_of_total_reward = []
    records_of_veh_reward = []
    records_of_se_reward = []
    # 这一项totalreward是veh与se_reward的单纯相加,不带权值。
    records_of_pure_total_reward = []
    # 需要输出的部分:
    records_of_veh_drive = []
    records_of_veh_drive_speed = []
    records_of_se_delay = []
    records_of_se_SR = []

    veh_actions_record = [ [] for veh in range(n_vehicles) ]
    se_actions_record = [ [] for veh in range(n_vehicles) ]

    for epoch in range(n_epoch):

        qmaze.reset()
        # print("训练一轮结束,重置qmaze")
        # print(qmaze.vehs_og_list)
        seenv.reset(qmaze)#SE相关环境信息的重置
        game_over = False

        # get initial envstate
        envstates_list = [qmaze.observe(veh) for veh in range(n_vehicles)]
        se_envstates_list = [seenv.SE_observe(veh, qmaze) for veh in range(n_vehicles)]

        #reward_list 是用来计算SE部分和Veh行车部分两个reward的累加值,即“总的reward”。
        reward_list = np.zeros((n_vehicles))
        veh_step_reward_list = np.zeros((n_vehicles))
        se_step_reward_list = np.zeros((n_vehicles))

        n_episodes = 0

        while not game_over:
            # 开始前都将需要记录这个tag置零
            qmaze.should_save = [0] * n_vehicles

            # 开始前把每一步经历的step_cost_list全部置零
            veh_step_cost_list = np.zeros((n_vehicles))
            se_step_cost_list = np.zeros((n_vehicles))
            pure_reward_list = np.zeros((n_vehicles))
            prev_envstates_list = envstates_list
            prev_se_envstates_list = se_envstates_list
            # 获取当前各个网格车辆的信息,汽车行动前的网格汽车数目
            # _2nd_channel = background_vehs[np.newaxis,:]
            # background_vehs = background_vehs[np.newaxis,:]
            vehsnum_before_act = qmaze.count_cells_vehsnum()
            sesnum_before_act = seenv.SE_count(qmaze)

            actions = -1 * np.ones((n_vehicles)) # 默认 -1,表示不采取行动
            se_actions = -1 * np.ones((n_vehicles))  # 默认 -1,表示不采取行

            nVehsCell_list = qmaze.count_cell_vehsnum_list()
            nSEsCell_list  = seenv.SE_count_list(qmaze)
            # print(nVehsCell_list)
            # print(veh_actions_record)
            # print(nSEsCell_list)
            # print(se_actions_record)
            # 每个时段的的开头部分
            # 在时段的开头先统计各个网格的车辆,
            # 从而得知,在这一个时段内,汽车们、SE们可能得到的拥挤开销
            # 这等于是在哥哥时刻开始之前计算拥挤开销被计算入reward之前的拥挤开销的积累
            qmaze.get_veh_cost(veh_step_cost_list)
            seenv.get_se_cost(se_step_cost_list, qmaze)
            # 计算一个评判行车指标drive_cost
            qmaze.get_drive_cost(background_vehs,nVehsCell_list)
            # 每个时段的的开头部分
            for veh in range(n_vehicles):
                if qmaze.status_flag[veh] != 0:
                    # 说明汽车已经1或者-1了,不用在对其进行动作。
                    continue
                else:
                    # 进入这一分支说明该车还没结束
                    if qmaze.last_act_done[veh] == 1:
                    # 汽车是刚开始的状态或者完成了上一个动作的执行,需要委派新的动作。
                        valid_actions = qmaze.valid_actions(qmaze.vehs_cur_list[veh])

                        qmaze.vehs_change_act[veh] = actions[veh] = np.argmax(
                            experience.predict_e(prev_envstates_list[veh], vehsnum_before_act,
                                                 prev_se_envstates_list[veh], sesnum_before_act))
                        seenv.SEs_next_mrg_list[veh] = se_actions[veh] = np.argmax(
                            experience.predict_e_se(prev_envstates_list[veh], vehsnum_before_act,
                                                    prev_se_envstates_list[veh], sesnum_before_act))
                        veh_actions_record[veh].append(qmaze.vehs_change_act[veh])
                        se_actions_record[veh].append(seenv.SEs_next_mrg_list[veh])
                        # 重新指定time_remain_list
                        # 获取reward_will_get
                        # 还得记录一下vehs的nexttogo
                        qmaze.get_some_will(actions[veh], veh)

                    if qmaze.last_act_done[veh] == 0:
                    # 未完成上一个迁移动作,SEs和Vehs的状态都是GOING
                        actions[veh] = GOING
                        se_actions[veh] = GOING
                        continue
            # 每个时段的结尾部分
            nVehsCell_list = qmaze.count_cell_vehsnum_list()
            nSEsCell_list = seenv.SE_count_list(qmaze)
            vehcurpos = qmaze.vehs_cur_list

            for veh in range(n_vehicles):
                # 查询汽车是否为“完成状态”,即到达终点或者累计reward过大;
                if qmaze.status_flag[veh] != 0:
                    # 说明该车状态为完成,即已到达终点或者累计reward过大
                    continue
                else:
                    # 说明汽车不是“完成”状态。
                    qmaze.time_remain_list[veh] -= 1
                    qmaze.action_times_list[veh] += 1
                    nVehsCell = nVehsCell_list[transfer_dict[vehcurpos[veh]]]
                    nSEsCell = nSEsCell_list[transfer_dict[seenv.SE_curpos_list[veh]]]
                    veh_pos = vehcurpos[veh]
                    se_pos = seenv.SE_curpos_list[veh]
                    bv = seenv.SE_data_list[veh]
                    cpuneed = seenv.CPUNeedList[veh]
                    if qmaze.time_remain_list[veh] != 0:
                        # 首先把last_act_done设置成0,表示上一个动作未做完
                        qmaze.last_act_done[veh] = 0
                        # 说明未发生迁移,所以得到的reward都为0,只是记录服务时延
                        veh_step_reward_list[veh] = 0
                        # 计算一下服务时延,
                        # 记录一下时延——后面看一下总的时延时间;
                        # 记录一下服务时延的成功率;
                        delay_time = seenv.sum_delay(nVehsCell, nSEsCell, veh_pos, se_pos, bv, cpuneed)
                        if delay_time < Delay_Threshold:
                            seenv.record_success_rate[veh].append(1)
                        else:
                            seenv.record_success_rate[veh].append(0)
                        seenv.record_delay_list[veh].append(delay_time)
                        seenv.SE_delay_list[veh] += delay_time
                        se_step_reward_list[veh] = 0
                        continue
                    if qmaze.time_remain_list[veh] == 0:
                        # 说明将要完成迁移
                        # 首先把last_act_done设置成1,表示上一个动作已经做完
                        qmaze.last_act_done[veh] = 1
                        # qmaze.should_change[veh] = 1
                        # veh的rewrd是直接读取will_get即可
                        # se部分计算时延、计算reward
                        veh_step_reward_list[veh] = qmaze.reward_will_get[veh] + veh_step_cost_list[veh]
                        qmaze.reward_will_get[veh] = 0
                        veh_step_cost_list[veh] = 0
                        # 计算一下服务时延,
                        # 记录一下时延——后面看一下总的时延时间;
                        # 记录一下服务时延的成功率;
                        delay_time = seenv.sum_delay(nVehsCell, nSEsCell, veh_pos, se_pos, bv, cpuneed)
                        if delay_time < Delay_Threshold:
                            seenv.record_success_rate[veh].append(1)
                        else:
                            seenv.record_success_rate[veh].append(0)
                        seenv.record_delay_list[veh].append(delay_time)
                        seenv.SE_delay_list[veh] += delay_time
                        avg_delay = seenv.SE_delay_list[veh] / qmaze.action_times_list[veh]
                        if avg_delay > Delay_Threshold:
                            delay_cost = - 0.20
                        else:
                            delay_cost = 0
                        seenv.SE_delay_list[veh] = 0  # 若计算了delaycost 要重新清零delay累加
                        se_step_reward_list[veh] = delay_cost + -1 * CostMrgS + se_step_cost_list[veh]
                        se_step_cost_list[veh] = 0 # 被计入reward过后的拥挤开销需要清零

                        # 计算累计的reward,加入了拥挤开销的reward
                        qmaze.game_acc_veh_reward[veh] += veh_step_reward_list[veh]
                        seenv.game_acc_se_reward[veh] += se_step_reward_list[veh]
                        pure_reward_list[veh] = veh_step_reward_list[veh] +  se_step_reward_list[veh]
                        reward_list[veh] = vr_weights * veh_step_reward_list[veh] + (1 - vr_weights) * se_step_reward_list[veh]
                        qmaze.game_acc_reward[veh] += reward_list[veh]
                        qmaze.game_acc_pure_reward[veh] += pure_reward_list[veh]
                        # 判断行车的accreward是否小于最低要求而游戏失败
                        if qmaze.game_acc_veh_reward[veh] < qmaze.min_reward:
                            # 行车失败
                            qmaze.status_flag[veh] = -1
                        else:

                            # 尚未达到失败的标准,继续行车:
                            # 根据车辆的will的得到的动作信息,更新veh与se状态。
                            # 判断一个是否到达终点的信息,即用来更新汽车状态,又用来在experience的判定因素。
                            qmaze.update_state_single__(veh)
                            seenv.update_se_pos_(veh)

                            # 维护一个should_save,表示有必要存储的经验过程数据
                            qmaze.should_save[veh] = 1

            print("E:",n_episodes)
            print("cur:",qmaze.vehs_cur_list)
            print("remain:",qmaze.time_remain_list)
            print("next:",qmaze.vehs_next_go_list)
            envstates_list = [qmaze.observe(veh) for veh in range(n_vehicles)]
            se_envstates_list = [seenv.SE_observe(veh, qmaze) for veh in range(n_vehicles)]
            # 获取当前各个网格车辆的信息,汽车行动后的网格汽车数目
            vehsnum_after_act = qmaze.count_cells_vehsnum()
            sesnum_after_act = seenv.SE_count(qmaze)

            # print(qmaze.count_cell_vehsnum_list())
            # print(qmaze.acc_drive_cost)
            for veh in range(n_vehicles):
                if qmaze.should_save[veh] == 1:
                    # 记录一个episode经验
                    if qmaze.status_flag[veh] == 1:
                        get_dest = True
                    else:
                        get_dest = False
                    episode = [prev_envstates_list[veh], prev_se_envstates_list[veh],
                               qmaze.vehs_change_act[veh], seenv.SEs_next_mrg_list[veh],
                               reward_list[veh], veh_step_reward_list[veh], se_step_reward_list[veh],
                               envstates_list[veh], se_envstates_list[veh],
                               get_dest, vehsnum_before_act, vehsnum_after_act,
                               sesnum_before_act, sesnum_after_act]
                    experience.remember(episode)
                else:
                    continue

            # 重置两个计数器
            get_dest_count = 0
            failed_count = 0
            #  todo 判断条件要改
            # 计算已经完成的 actor 数量,包括 1(成功),2(失败)
            for veh in range(n_vehicles):
                if qmaze.status_flag[veh] == 1:
                    get_dest_count += 1
                elif qmaze.status_flag[veh] == -1:
                    failed_count += 1

            # 当所有的 actor 都已达到终点(不一定是最优解)或者未能达终点(失败),该 episode 训练结束
            if get_dest_count + failed_count == n_vehicles:
                game_over = True
            else:
                game_over = False

            n_episodes += 1


        # 一系列reward信息的统计
        sum_pure_total_reward = 0
        sum_total_reward = 0
        sum_veh_reward = 0
        sum_se_reward = 0
        for veh in qmaze.done_list:
            sum_total_reward += qmaze.game_acc_reward[veh]
            sum_veh_reward += qmaze.game_acc_veh_reward[veh]
            sum_se_reward += seenv.game_acc_se_reward[veh]
            sum_pure_total_reward += qmaze.game_acc_pure_reward[veh]

        if len(qmaze.done_list) == 0:
            records_veh = 0
            records_total = 0
            records_se = 0
            records_total_pure = 0
        else:
            records_veh = sum_veh_reward / len(qmaze.done_list)
            records_total = sum_total_reward / len(qmaze.done_list)
            records_se = sum_se_reward / len(qmaze.done_list)
            records_total_pure = sum_pure_total_reward / len(qmaze.done_list)
        records_of_total_reward.append(records_total)
        records_of_veh_reward.append(records_veh)
        records_of_se_reward.append(records_se)
        records_of_pure_total_reward.append(records_total_pure)
        # 统计汽车的行车代价
        sum_veh_drive = 0
        sum_veh_drive_speed = 0
        # print(qmaze.acc_drive_cost)
        for veh in qmaze.done_list:
            qmaze.game_drive_cost[veh] = qmaze.acc_drive_cost[veh] / qmaze.action_times_list[veh]
            qmaze.game_drive_speed[veh] = 1/qmaze.game_drive_cost[veh]
            sum_veh_drive += qmaze.acc_drive_cost[veh] / qmaze.action_times_list[veh]
            sum_veh_drive_speed += qmaze.game_drive_speed[veh]
        if len(qmaze.done_list) == 0:
            records_drive = 0
            records_drive_speed = 0
        else:
            records_drive = sum_veh_drive / len(qmaze.done_list)
            records_drive_speed = sum_veh_drive_speed / len(qmaze.done_list)
        records_of_veh_drive.append(records_drive)
        records_of_veh_drive_speed.append(records_drive_speed)

        # 统计各个车辆的在每一局(每一个训练轮次)的平均delay与服务成功率
        se_delay_avg = [0]*n_vehicles
        se_success_rate = [0]*n_vehicles


        for veh in qmaze.done_list:
            se_delay_avg[veh] = sum(seenv.record_delay_list[veh])/ len(seenv.record_delay_list[veh])
            se_success_rate[veh]  = sum(seenv.record_success_rate[veh])/ len(seenv.record_success_rate[veh])

        # 在一个轮次中对于各个车辆的delay与服务成功率进行求平均
        AvgDelayforAll = 0
        SRforAll = 0
        SRcount = 0
        for veh in qmaze.done_list:
            AvgDelayforAll += se_delay_avg[veh]
            SRforAll += sum(seenv.record_success_rate[veh])
            SRcount += len(seenv.record_success_rate[veh])
        if len(qmaze.done_list) == 0:
            records_delay = 0
        else:
            records_delay = AvgDelayforAll / len(qmaze.done_list)
        if SRcount == 0:
            records_SR =0
        else:
            records_SR = SRforAll / SRcount
        records_of_se_delay.append(records_delay)
        records_of_se_SR.append(records_SR)

        dt = datetime.datetime.now() - start_time
        t = format_time(dt.total_seconds())
        template = "Epoch: {:03d}/{:d} | Episodes: {:d} | GetDestCount: {:d}/{:d} |FailedCount: {:d}/{:d}| time: {},| loss_weight:{}"
        print(template.format(epoch, n_epoch - 1,n_episodes ,get_dest_count, n_vehicles ,failed_count, n_vehicles, t, model_eval.loss_weights))
        print("Arrived vehs:", qmaze.done_list)
        print("SE_delay_avg:",se_delay_avg)
        print("SE_success_rate:",se_success_rate)
        print("veh_drive", qmaze.game_drive_speed)
        print("【AVG_Veh_drive】:", records_drive)
        print("【AVG_Veh_drive_speed】:", records_drive_speed)

    end_time = datetime.datetime.now()
    dt = datetime.datetime.now() - start_time
    seconds = dt.total_seconds()
    t = format_time(seconds)
    weight_rate = '_'+ str(vr_weights) + '_'+str(seenv.datasize_base)+'_'+str(n_vehicles) +'_'
    # 保存训练过程中的指标
    parent_path = 'save_res0108/'
    method_name = 'merge_test' + weight_rate
    print("Saving Reward:")
    with open(parent_path + method_name + "total_reward.pl", 'wb') as f:
        print("recording total_reward..")
        pickle.dump(records_of_total_reward, f)

    with open(parent_path + method_name +"pure_total_reward.pl", 'wb') as f:
        print("recording pure total_reward..")
        pickle.dump(records_of_pure_total_reward, f)

    with open(parent_path + method_name + "veh_reward.pl", 'wb') as f:
        print("recording veh_reward..")
        pickle.dump(records_of_veh_reward, f)

    with open(parent_path + method_name + "se_reward.pl", 'wb') as f:
        print("recording se_reward..")
        pickle.dump(records_of_se_reward, f)

    print("Saving Index:")
    with open(parent_path + method_name + "veh_drive.pl", 'wb') as f:
        print("recording veh_drive..")
        pickle.dump(records_of_veh_drive, f)

    with open(parent_path + method_name + "veh_drive_speed.pl", 'wb') as f:
        print("recording veh_drive_speed..")
        pickle.dump(records_of_veh_drive_speed, f)

    with open(parent_path + method_name + "se_delay.pl", 'wb') as f:
        print("recording se_delay..")
        pickle.dump(records_of_se_delay, f)

    with open(parent_path + method_name + "se_SR.pl", 'wb') as f:
        print("recording se_SR..")
        pickle.dump(records_of_se_SR, f)

    # print(veh_actions_record)
    # print(se_actions_record)
    print(seenv.SE_data_list)
Beispiel #26
0
def experiences_page():
    """Flask view backing the experiences listing page.

    GET: rebuilds the TOPMEMBERS table from the five members who posted
    the most experiences and renders the listing with the current time.

    POST: dispatches on the submit button --
      * 'Delete' removes the checked experiences and reloads the page,
      * 'Search' renders the listing filtered by the given keyword,
      * anything else is treated as a new-experience submission: the
        experience is stored, the author gains 10 score points, and the
        new row is linked to the author's member id.

    Guests (no 'username' in the session) are redirected to the guest
    page.

    All SQL that previously interpolated user-controlled values with ``%``
    string formatting now uses DB-API parameter binding (the interpolated
    literals in the original were also syntactically broken).
    """
    if 'username' not in session:
        return redirect(url_for('guest_page'))

    if request.method == 'GET':
        experiences = app.store.get_experiences()
        now = datetime.datetime.now()
        # Start from a clean leaderboard table.
        with dbapi2.connect(app.config['dsn']) as connection:
            cursor = connection.cursor()
            cursor.execute("DELETE FROM TOPMEMBERS")
            connection.commit()
        # ORDER BY must be applied before LIMIT so we really get the five
        # *top* contributors (the original limited first, which yields an
        # arbitrary five rows).
        with dbapi2.connect(app.config['dsn']) as connection:
            cursor = connection.cursor()
            cursor.execute(
                "SELECT userid, COUNT(userid) FROM members "
                "INNER JOIN experience ON (userid = memberid) "
                "GROUP BY userid ORDER BY COUNT(userid) DESC LIMIT 5")
            topmembers = [(row[0], row[1]) for row in cursor.fetchall()]
            connection.commit()
        with dbapi2.connect(app.config['dsn']) as connection:
            cursor = connection.cursor()
            for userid, count in topmembers:
                cursor.execute(
                    "INSERT INTO TOPMEMBERS (USERID, COUNT) VALUES (%s, %s)",
                    (userid, count))
            # Swap each userid for its username and scale the count for
            # display (10 points per posted experience).
            for index, (userid, count) in enumerate(topmembers):
                cursor.execute(
                    "SELECT username FROM members WHERE memberid = %s",
                    (userid,))
                user = cursor.fetchone()
                topmembers[index] = (user[0], count * 10)
            connection.commit()
        # NOTE: the original had an unreachable DROP TABLE TOPMEMBERS block
        # after this return; it has been removed.
        return render_template('experiences.html',
                               experiences=experiences,
                               topmembers=topmembers,
                               current_time=now.ctime())

    if 'experiences_to_delete' in request.form or 'search' in request.form:
        if request.form['submit'] == 'Delete':
            for key in request.form.getlist('experiences_to_delete'):
                app.store.delete_experience(int(key))
            return redirect(url_for('experiences_page'))
        if request.form['submit'] == 'Search':
            now = datetime.datetime.now()
            experiences = app.store.search_experience(request.form['search'])
            return render_template('experiences.html',
                                   experiences=experiences,
                                   current_time=now.ctime())
        # Neither Delete nor Search: fall through with no response, exactly
        # as the original did.
        return None

    # Fall-through POST: a new-experience submission.
    title = request.form['title']
    start = request.form['start']
    finish = request.form['finish']
    period = request.form['period']
    length = request.form['length']
    name = session['username']
    experience = Experience(title, name, start, finish, period, length)
    app.store.add_experience(experience)

    # Credit the author with 10 score points (parameterized; the original
    # interpolated the username straight into the SQL string).
    with dbapi2.connect(app.config['dsn']) as connection:
        cursor = connection.cursor()
        cursor.execute(
            "UPDATE MEMBERS SET SCORE = SCORE + 10 WHERE username = %s",
            (name,))
        connection.commit()

    # Look up the author's member id ...
    with dbapi2.connect(app.config['dsn']) as connection:
        cursor = connection.cursor()
        cursor.execute(
            "SELECT memberid FROM MEMBERS WHERE username = %s", (name,))
        member_row = cursor.fetchone()
        connection.commit()

    # ... and attach it to the freshly stored experience.
    with dbapi2.connect(app.config['dsn']) as connection:
        cursor = connection.cursor()
        cursor.execute(
            "UPDATE EXPERIENCE SET userid = %s WHERE (id = %s)",
            (member_row[0], app.store.exp_key))
        connection.commit()
    return redirect(url_for('experience_page', key=app.store.exp_key))
Beispiel #27
0
        from visdom import Visdom
        viz = Visdom()

    # Build Environment Template -> Lazy Evaluated Callable, for spawning environments
    env_template = build_env(args.env)

    # Build Distributed Environments
    envs = get_distributed_backend(env_template,
                                   args.num_processes,
                                   backend=args.distributed_backend)

    # Obtain Environment metadata
    metadata = envs.get_metadata()

    # Instantiate Policy
    policy = get_policy(args.policy, metadata)

    # Create agent, with the given training algorithm
    agent = get_algorithm(args.algorithm, policy, envs, args, visdom=viz)

    # Create Experience Buffer, with the environment metadata
    experience = Experience(metadata['max_episode_length'], args.num_processes,
                            metadata['obs_shape'], metadata['action_type'],
                            metadata['action_shape'])

    # Train agent
    agent.train(num_frames=args.num_frames)

    import IPython
    IPython.embed()
 def null_experience_list(self, count=100):
     """Return a list of *count* placeholder Experience records.

     Every field of every Experience is None.
     """
     blanks = []
     for _ in range(count):
         blanks.append(Experience(None, None, None, None, None))
     return blanks
Beispiel #29
0
    def __init__(self, env, task, visualise):
        """Build the distributed worker's TensorFlow graph.

        Creates the shared ('global') network and replay pool placed via a
        replica device setter, a per-worker ('local') copy used to compute
        gradients, loss/norm summaries, gradient clipping, a weight-sync op
        and the RMSProp train op.

        Args:
            env: environment instance the models are built against.
            task: integer worker index; selects '/job:worker/task:{task}'.
            visualise: accepted but not used in this constructor —
                presumably consumed elsewhere; TODO confirm.
        """
        self.env = env
        self.task = task
        # Observation shape; HEIGHT/WIDTH/CHANNEL are module-level constants.
        self.ob_shape = [HEIGHT, WIDTH, CHANNEL]
        self.action_n = Environment.get_action_size()
        # define the network stored in ps which is used to sync
        worker_device = '/job:worker/task:{}'.format(task)
        with tf.device(
                tf.train.replica_device_setter(1,
                                               worker_device=worker_device)):
            with tf.variable_scope('global'):
                self.experience = Experience(
                    EXPERIENCE_HISTORY_SIZE)  # exp replay pool
                self.network = UnrealModel(self.action_n, self.env,
                                           self.experience)
                # Shared step counter; incremented by batch size in train_op.
                self.global_step = tf.get_variable('global_step',
                                                   dtype=tf.int32,
                                                   initializer=tf.constant(
                                                       0, dtype=tf.int32),
                                                   trainable=False)
        # define the local network which is used to calculate the gradient
        with tf.device(worker_device):
            with tf.variable_scope('local'):
                self.local_network = net = UnrealModel(self.action_n, self.env,
                                                       self.experience)
                net.global_step = self.global_step

            # add summaries for losses and norms
            self.batch_size = tf.to_float(tf.shape(net.base_input)[0])
            # Total loss is the sum of the A3C base loss and the UNREAL
            # auxiliary losses (pixel control, reward prediction, value
            # replay).
            base_loss = self.local_network.base_loss
            pc_loss = self.local_network.pc_loss
            rp_loss = self.local_network.rp_loss
            vr_loss = self.local_network.vr_loss
            entropy = tf.reduce_sum(self.local_network.entropy)
            self.loss = base_loss + pc_loss + rp_loss + vr_loss
            grads = tf.gradients(self.loss, net.var_list)
            tf.summary.scalar('model/a3c_loss', base_loss / self.batch_size)
            tf.summary.scalar('model/pc_loss', pc_loss / self.batch_size)
            tf.summary.scalar('model/rp_loss', rp_loss / self.batch_size)
            tf.summary.scalar('model/vr_loss', vr_loss / self.batch_size)
            tf.summary.scalar('model/grad_global_norm', tf.global_norm(grads))
            tf.summary.scalar('model/var_global_norm',
                              tf.global_norm(net.var_list))
            tf.summary.scalar('model/entropy', entropy / self.batch_size)
            tf.summary.image('model/state', net.base_input)
            self.summary_op = tf.summary.merge_all()

            # clip the gradients to avoid gradient explosion
            grads, _ = tf.clip_by_global_norm(grads, GRAD_NORM_CLIP)

            # Sync op: copy the shared/global weights into the local network.
            self.sync = tf.group(*[
                v1.assign(v2)
                for v1, v2 in zip(net.var_list, self.network.var_list)
            ])
            # Local gradients are applied to the *global* variables.
            grads_and_vars = list(zip(grads, self.network.var_list))
            inc_step = self.global_step.assign_add(tf.to_int32(
                self.batch_size))
            # Learning rate sampled log-uniformly once per worker.
            lr = log_uniform(LR_LOW, LR_HIGH)
            opt = tf.train.RMSPropOptimizer(learning_rate=lr,
                                            decay=RMSP_ALPHA,
                                            momentum=0.0,
                                            epsilon=RMSP_EPSILON)
            self.train_op = tf.group(opt.apply_gradients(grads_and_vars),
                                     inc_step)
            self.summary_writer = None
            self.local_step = 0
Beispiel #30
0
    def train(self,
              name_scope,
              action_verbose=False,
              update_interval=20,
              graph_interval=50,
              monitor_interval=100,
              discount_rate=0.99,
              obs_verbose=False,
              reward_verbose=False):
        """Run the DQN training loop for ``self.num_games_train`` games.

        Builds online/target Q networks, a Bellman-target squared loss and
        an Adam train op, then alternately plays games (collecting
        frame-stacked transitions into the replay buffer) and trains on
        sampled replay batches, periodically copying online weights to the
        target network and plotting reward curves.

        Args:
            name_scope: accepted but not used in this body — TODO confirm.
            action_verbose: if True, print the mean loss per replay update.
            update_interval: games between online->target weight copies.
            graph_interval: games between matplotlib progress plots.
            monitor_interval: in-game steps between reward printouts.
            discount_rate: Bellman discount factor (gamma).
            obs_verbose, reward_verbose: accepted but unused in this body.
        """
        online_q_values, online_vars, online_input = self.q_network(
            name_scope="online")
        target_q_values, target_vars, target_input = self.q_network(
            name_scope="target")

        # Ops to copy every online variable into its target counterpart.
        copy_ops = [
            target_var.assign(online_vars[var_name])
            for var_name, target_var in target_vars.items()
        ]
        copy_online_to_target = tf.group(*copy_ops)

        with self.graph.as_default() as graph:
            with tf.variable_scope('train', reuse=tf.AUTO_REUSE):
                X_action = tf.placeholder(dtype=tf.int32,
                                          shape=[None, 1],
                                          name='X_action')
                max_reward = tf.placeholder(dtype=tf.float32,
                                            shape=[None, 1],
                                            name='max_reward')
                # Expected Bellman reward: one-hot-encode the taken action and
                # multiply by the online net's output to select the q-value of
                # the given state(s) for that action.
                # NOTE(review): `keep_dims` is the deprecated TF1 spelling of
                # `keepdims` — fine on old TF, breaks on newer versions.
                q = tf.reduce_sum(online_q_values *
                                  tf.one_hot(X_action, self.n_actions),
                                  axis=1,
                                  keep_dims=True)
                loss = tf.square((max_reward - q))
                optimizer = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate)
                train_opt = optimizer.minimize(
                    loss, var_list=tf.trainable_variables(
                        scope='online'))  #important to specify var_list
                tf.add_to_collection(tf.GraphKeys.TRAIN_OP, train_opt)
                self.saver = tf.train.Saver()
                init = tf.global_variables_initializer()

        # Some initialization steps
        train_sess = tf.InteractiveSession(graph=self.graph)
        train_sess.run(init)
        self.replay.clear()
        assert tf.get_default_graph() is self.graph

        # Monitor game by game progress
        agg_rewards = []
        step_agg_rewards = []
        game_steps = []

        # Outer loop for number of games to play
        for i in range(self.num_games_train):
            print('==========================\nGame #%d begun' % (i + 1))
            game_step = 0
            init_state = self.env.reset()
            init_state = preprocess_observation(init_state)
            next_state = None
            done = False
            agg_reward = 0
            frame_count = 0  #Total number of frames encounted in game
            last_frame = None  #variable used for pixelwise max operation
            maxop = True  #take framewise max of num_frames neighbors
            self.frame_cache = FrameCache(size=self.num_frames)

            # This while loop is **like** each step in the game (not really because we're using multiple frames)
            # Frames go like x1,x2,x3,x4 then x2,x3,x4,x5 ... end of game but skip between
            while done == False:
                game_step += 1
                # NOTE(review): `render_interval` is not defined in this
                # method — presumably a module-level global; confirm.
                if i % (render_interval) == 0:
                    self.env.render()
                frame_skip = False  #initialize skipping to False
                # q_values = online_q_values.eval(feed_dict={online_input: [next_state],target_input: None})
                # action = self.epsilon_greedy(q_values)
                # Add a leading batch dimension for the feed.
                init_state = np.reshape(init_state, [
                    1, init_state.shape[0], init_state.shape[1],
                    init_state.shape[2]
                ])

                # Dummy X_action/max_reward feeds: only the q tensor value is
                # wanted here, but the placeholders must still be fed.
                qs = q.eval(
                    feed_dict={
                        online_input: init_state,
                        target_input: np.zeros_like(init_state),
                        X_action: [[0]],
                        max_reward: [[0]]
                    })
                action = self.epsilon_greedy(qs,
                                             epsilon=0.6,
                                             iteration=i,
                                             k=1000,
                                             show_eps=True)
                # NOTE(review): this second call immediately overwrites the
                # action chosen above with default epsilon settings — looks
                # unintended; confirm which hyperparameters were meant.
                action = self.epsilon_greedy(qs)
                next_state, reward, done, info = self.env.step(action)
                next_state = preprocess_observation(next_state)

                # metrics for monitoring improvement
                agg_reward += reward
                step_agg_reward = agg_reward / game_step

                if game_step % monitor_interval == 0:
                    print('Aggregate reward at step %d: %f' %
                          (game_step, agg_reward))
                    print('Step Aggregate Reward: %f' % step_agg_reward)

                # experience replay addition
                if maxop == True and last_frame is not None:
                    next_state = np.maximum(
                        last_frame,
                        next_state)  #take element-wise max of two frames
                if not frame_skip:
                    self.frame_cache.add_base_frame(
                        next_state
                    )  #this is going to be the last index of frame stack

                    #maybe not the most efficient way to do this
                    for k in range(0, (self.frame_cache.len_stacks() - 1)):
                        self.frame_cache.add_to_stack(
                            next_state, k
                        )  #adds frame to every preceding stack if it's not full yet
                frame_count += 1

                #Flip frame_skip every num_frames interval (it's possible to change this interval but let's keep it simple)
                if frame_count % self.num_frames == 0:
                    if frame_skip == True:
                        frame_skip = False
                    else:
                        frame_skip = True

                last_frame = next_state  #now this frame is the last state
                init_state = next_state

                if self.frame_cache.len_stacks(
                ) > self.num_frames:  #at least num_frames actions must be taken before one stack frame is full
                    first_state_frames, second_state_frames = self.frame_cache.get_last_fulls(
                    )
                    if first_state_frames is not None and second_state_frames is not None:
                        exp_input = Experience(first_state_frames, action,
                                               reward, second_state_frames,
                                               done)
                        self.replay.add(exp_input)
            if i % render_interval == 0:
                self.env.close()

            agg_rewards.append(agg_reward)
            step_agg_rewards.append(step_agg_reward)
            game_steps.append(game_step)

            samples = self.replay.sample_batch(
                batch_size=1
            )  #try doing different batch sizes like Ryan said as parallelization, if batch_size>1 flatten this
            for exp in samples:
                # Experience Fields
                # self.first_state
                # self.action
                # self.reward
                # self.second_state
                # self.terminal

                fed = {target_input: exp.first_state}
                target_qs = target_q_values.eval(feed_dict=fed)

                max_next_q_values = np.max(target_qs, axis=1, keepdims=True)
                if exp.terminal:  #done == True
                    y_val = exp.reward
                else:
                    y_val = exp.reward + discount_rate * max_next_q_values

                print('\n\nGame ', i)
                print('Step ', game_step)
                print('y_val')
                print(y_val)
                print('\ntarget_qs')
                print(target_qs)
                print('\nmax_next_qs')
                print(max_next_q_values)
                print('\n\n')

                # NOTE(review): the Bellman target y_val computed above is
                # never fed — max_reward is fed max_next_q_values instead,
                # which omits exp.reward and the terminal case; confirm.
                outs, out_loss, _ = train_sess.run(
                    [q, loss, train_opt],
                    feed_dict={
                        max_reward: max_next_q_values,
                        X_action: [[exp.action]],
                        online_input: exp.first_state,
                        target_input: exp.first_state
                    })

                if action_verbose:
                    print('Mean output loss at terminal step %d: %f' %
                          (game_step, np.mean(out_loss)))

            # Periodically refresh the target network from the online one.
            if i % update_interval == 0:
                train_sess.run(copy_online_to_target)
                print('Online to target copy complete after game %d' % i)

            # Periodically plot reward/step curves to a PNG and screen.
            if i % graph_interval == 0 and i > 1:
                f, (ax1, ax2, ax3) = plt.subplots(3)
                xs = np.arange(0, len(agg_rewards))
                ax1.set_title('Aggregate Reward Update at Game %d' % i)
                ax2.set_title('Step Aggregate Reward Update at Game %d' % i)
                ax3.set_title('Game Steps Used Update at Game %d' % i)
                ax1.plot(xs, agg_rewards, 'r', label='Agg Rewards')
                ax2.plot(xs, step_agg_rewards, 'b', label='Step Agg Rewards')
                ax3.plot(xs, game_steps, 'y', label='Game Steps')
                ax1.set_xlabel('Game Played')
                ax1.set_ylabel('Agg Reward')
                ax2.set_xlabel('Game Played')
                ax2.set_ylabel('Step Agg Reward')
                ax3.set_xlabel('Game Played')
                ax3.set_ylabel('Game Steps Used')
                f.tight_layout()
                plt.savefig('Metrics Measurement.png')
                plt.show(block=False)
                time.sleep(15)
                plt.close()
        self.saver.save(train_sess, self.ckptdir)
        print('Model saved at %s' % self.ckptdir)