Example #1
def __init__(self, name, globalAC, config_a, config_c):
    self.name = name
    # self.globalAC = globalAC
    # self.globalAC.load_ckpt()
    # Build the local actor-critic network that shares parameters with the global net.
    self.AC = ACnet(name, globalAC, config_a, config_c)
    globalAC.load_ckpt()       # restore the global network from its checkpoint
    self.AC.pull_global()      # copy the restored global weights into this worker
    self.env = wrap()          # wrapped pysc2 environment for this worker
Example #2

def test():
    from config_a3c import config_a, config_c
    # Build the global network only; we just need its parameters for inference.
    ac = ACnet("Global_Net", None, config_a, config_c)
    ac.load_ckpt()
    env = wrap(game[0])
    state, _, done, info = env.reset()
    while True:
        a0, a1, a2 = ac.choose_action([state], [info])
        # a0 == 0 selects the no-op action (id 1); otherwise the two spatial
        # outputs are flattened into a single screen-click action id.
        action = 1 if a0 == 0 else int(2 + a1 * scr_pixels + a2)
        state, reward, done, info = env.step(action)
        if done:
            state, _, done, info = env.reset()
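
The flat action id built above packs the three policy outputs into one integer. Below is a minimal sketch of that packing and its inverse, assuming a1 and a2 are screen coordinates in [0, scr_pixels) and that ids 0 and 1 are reserved for non-spatial actions; the scr_pixels value is hypothetical.

scr_pixels = 64  # hypothetical screen resolution

def encode_action(a0, a1, a2):
    """Map the three policy heads to a single flat action id."""
    return 1 if a0 == 0 else int(2 + a1 * scr_pixels + a2)

def decode_action(action):
    """Inverse mapping: recover (a0, a1, a2) from a flat action id."""
    if action == 1:
        return 0, 0, 0
    offset = action - 2
    return 1, offset // scr_pixels, offset % scr_pixels

assert decode_action(encode_action(1, 10, 20)) == (1, 10, 20)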
Example #3
def main():
    global env

    env = wrap()
    state_shape = env.state_shape()
    action_shape = env.action_shape()
    from config import config

    dqn = Model(config, state_shape, action_shape, FLAGS.learning_rate,
                FLAGS.gamma, FLAGS.save_dir)

    mem = Memory(FLAGS.mem_size)
    explorer = Explorer(FLAGS.explore_step, 0.01, 1.0, action_shape[1][0],
                        action_shape[1][1], action_shape[0], 0.2)

    done = True  # forces an env.reset() on the first iteration
    step = dqn.get_step()

    while step < FLAGS.number_steps:
        if done:
            state, _, _, info = env.reset()
            # print(state.shape)

        # Infer Q-values and pick an action through the exploration policy.
        q_map, q_other, step = dqn.start_infer(state)
        action = explorer.make_action(step, q_map, q_other, FLAGS.test)

        # Step the environment and store the transition in replay memory.
        next_state, reward, done, info = env.step(action)
        mem.enqueue(state, action, reward, next_state, float(done))

        state = next_state

        # Skip training while evaluating, during warm-up, or between learn steps.
        if FLAGS.test or (step < FLAGS.training_start) or (
                step % FLAGS.learn_freq != 0):
            # print('action: %d, reward: %f, q_noop: %f, q_select: %f' % (action, reward, q_other[0], q_other[1]))
            continue

        # Train on a sampled mini-batch.
        inputs = mem.sample(FLAGS.batch_size)
        loss, step = dqn.start_train(inputs)
        print(q_other.shape)  # debug: shape of the non-spatial Q-values
        print('q_max: %f, q_min: %f, q_noop: %f, q_select: %f, action: %d, reward: %f' %
              (max(q_map.max(), q_other.max()), min(q_map.min(), q_other.min()),
               q_other[0][0], q_other[0][1], action, reward))
        print('step: %d, loss: %f' % (step, loss))

        if step % FLAGS.save_steps == 0:
            print('')
            print('model saved. step: %d' % step)
            dqn.save(step)
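
The Memory replay buffer is a project-local class that is not shown in these examples. The sketch below is an assumed minimal implementation matching only the two calls used above, enqueue and sample; it is not the project's actual code, and the Explorer helper is omitted because its constructor arguments are less clear from this snippet.

import random
from collections import deque

class Memory:
    """Assumed minimal replay buffer matching the enqueue/sample calls above."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def enqueue(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(list(self.buffer), batch_size)
        # Transpose the list of transitions into per-field lists.
        states, actions, rewards, next_states, dones = map(list, zip(*batch))
        return states, actions, rewards, next_states, dones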
Example #4
def main():
    global env

    env = wrap()
    state_shape = env.state_shape()
    action_shape = env.action_shape()
    from config import config

    dqn = Model(config, state_shape, action_shape,
                FLAGS.learning_rate, FLAGS.gamma, FLAGS.save_dir)
    explorer = Explorer(FLAGS.explore_step,
                        0.01, 1.0, action_shape[1][0], action_shape[1][1],
                        action_shape[0], 0.2)

    done = True  # forces an env.reset() on the first iteration
    step = dqn.get_step()

    while step < FLAGS.number_steps:
        if done:
            state, _, _, info = env.reset()
            act_rem = None  # no remembered next action at the start of an episode
            # print(state.shape)

        # Reuse the action already chosen for this state on the previous step
        # (SARSA-style); otherwise query the network and the exploration policy.
        q_map, q_other, step = dqn.start_infer(state)
        action = act_rem if act_rem is not None else explorer.make_action(
            step, q_map, q_other, FLAGS.test)

        next_state, reward, done, info = env.step(action)
        # Choose the action for the next state; it is both trained on and
        # remembered for the next loop iteration.
        next_q_map, next_q_other, step = dqn.predict_infer(next_state)
        act_rem = explorer.make_action(step, next_q_map, next_q_other, FLAGS.test)

        inputs = [[state], [action], [reward], [next_state], [act_rem], [done]]

        loss, step = dqn.start_train(inputs)
        state = next_state

        print(q_other.shape)  # debug: shape of the non-spatial Q-values
        print('q_max: %f, q_min: %f, q_noop: %f, q_select: %f, action: %d, reward: %f' %
              (max(q_map.max(), q_other.max()), min(q_map.min(), q_other.min()),
               q_other[0][0], q_other[0][1], action, reward))
        print('step: %d, loss: %f' % (step, loss))

        if step % FLAGS.save_steps == 0:
            print('')
            print('model saved. step: %d' % step)
            dqn.save(step)
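
Unlike Example #3, this loop trains on the action actually selected for the next state (act_rem), which makes the update SARSA-style rather than Q-learning with a max over next actions. Below is a hedged sketch of the two bootstrap targets with hypothetical Q-values; the Model class's real loss is not shown in these examples.

import numpy as np

def q_learning_target(reward, next_q, done, gamma=0.99):
    """Off-policy target used with replay (Example #3 style): bootstrap on max over next actions."""
    return reward + (1.0 - float(done)) * gamma * np.max(next_q)

def sarsa_target(reward, next_q, next_action, done, gamma=0.99):
    """On-policy target (Example #4 style): bootstrap on the action actually taken next."""
    return reward + (1.0 - float(done)) * gamma * next_q[next_action]

# Tiny worked example with hypothetical Q-values for the next state.
next_q = np.array([0.2, 1.5, 0.7])
print(q_learning_target(1.0, next_q, done=False))             # 1.0 + 0.99 * 1.5
print(sarsa_target(1.0, next_q, next_action=2, done=False))   # 1.0 + 0.99 * 0.7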
Example #5
def pre_train(self):
    global GLOBAL_RUNNING_R, GLOBAL_EP
    # self.AC.pull_global()
    total_step = 1
    buffer_s, buffer_a0, buffer_a1, buffer_a2 = [], [], [], []
    buffer_r, buffer_avail = [], []
    buffer_a0_exp, buffer_a1_exp, buffer_a2_exp = [], [], []
    while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
        # timestep[0] contains rewards, observations, etc.; see pysc2 for details.
        state, _, _, info = self.env.reset()
        ep_r = 0       # shaped episode reward
        lei_ji = 0     # cumulative raw reward for this episode
        while True:
            # Sample an action from the local policy and query the teacher policy.
            a0, a1, a2 = self.AC.choose_action([state], [info])
            a0_exp, a1_exp, a2_exp = teacher.action(state, info)
            # Execute the teacher's choice: no-op if a0 == 0, otherwise the
            # teacher's screen coordinates flattened into a single action id.
            action = 1 if a0 == 0 else int(2 + a1_exp * scr_pixels + a2_exp)

            buffer_s.append([state])
            buffer_avail.append([info])
            buffer_a0.append(a0)
            buffer_a1.append(a1)
            buffer_a2.append(a2)
            buffer_a0_exp.append(a0_exp)
            buffer_a1_exp.append(a1_exp)
            buffer_a2_exp.append(a2_exp)

            state, reward, done, info = self.env.step(action)
            lei_ji += reward
            if lei_ji >= 20:   # end the episode once enough raw reward is collected
                done = True
            if reward > 0:     # reward shaping: scale positive rewards by progress so far
                reward = reward * (1 + ep_r * weight)
            buffer_r.append(reward)
            ep_r += reward

            if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                # Bootstrap the value of the last state (0 if the episode ended).
                if done:
                    v_s_ = 0
                else:
                    v_s_ = sess.run(self.AC.value, {self.AC.s: [state]})[0, 0]

                # Compute discounted value targets by sweeping the rewards backwards.
                buffer_v_target = []
                for r in buffer_r[::-1]:
                    v_s_ = r + GAMMA * v_s_
                    buffer_v_target.append(v_s_)
                buffer_v_target.reverse()

                # Stack each buffer into a single array for the feed dict.
                buffer_s = np.vstack(buffer_s)
                buffer_a0 = np.vstack(buffer_a0)
                buffer_a1 = np.vstack(buffer_a1)
                buffer_a2 = np.vstack(buffer_a2)
                buffer_v_target = np.vstack(buffer_v_target)
                buffer_avail = np.vstack(buffer_avail)
                buffer_a0_exp = np.vstack(buffer_a0_exp)
                buffer_a1_exp = np.vstack(buffer_a1_exp)
                buffer_a2_exp = np.vstack(buffer_a2_exp)

                feed_dict = {
                    self.AC.s: buffer_s,
                    self.AC.a0: buffer_a0,
                    self.AC.a1: buffer_a1,
                    self.AC.a2: buffer_a2,
                    self.AC.a0_exp: buffer_a0_exp,
                    self.AC.a1_exp: buffer_a1_exp,
                    self.AC.a2_exp: buffer_a2_exp,
                    self.AC.v_target: buffer_v_target,
                    self.AC.available: buffer_avail,
                }
                # Push the accumulated update to the global network parameters.
                test = self.AC.update_global_high(feed_dict)
                # closs, aloss, exp_loss = sess.run([self.AC.c_loss, self.AC.a_loss, self.AC.exp_loss], feed_dict=feed_dict)
                # print("c_loss:", closs, "a_loss:", aloss, "exp_loss:", exp_loss)
                # sigma_1, sigma_2 = sess.run([self.AC.sigma_1, self.AC.sigma_2], feed_dict=feed_dict)
                entropy, aloss, td, exp_loss, prob_a = sess.run(
                    [self.AC.entropy, self.AC.a_loss, self.AC.td,
                     self.AC.exp_loss, self.AC.log_prob_a],
                    feed_dict=feed_dict)

                # Clear the buffers and pull the updated global weights.
                buffer_s, buffer_a0, buffer_a1, buffer_a2 = [], [], [], []
                buffer_r, buffer_avail = [], []
                buffer_a0_exp, buffer_a1_exp, buffer_a2_exp = [], [], []
                self.AC.pull_global()

            total_step += 1
            if done:
                # Record a smoothed running episode reward.
                if len(GLOBAL_RUNNING_R) == 0:
                    GLOBAL_RUNNING_R.append(ep_r)
                else:
                    GLOBAL_RUNNING_R.append(0.95 * GLOBAL_RUNNING_R[-1] + 0.05 * ep_r)
                print(
                    self.name,
                    "episode:", GLOBAL_EP,
                    '| reward: %.1f' % lei_ji,
                    "| running_reward: %.1f" % GLOBAL_RUNNING_R[-1],
                    # '| sigma:', test,  # debug
                )
                GLOBAL_EP += 1
                print("entropy", entropy[0][0], "td", td[0], "prob_a:", prob_a,
                      "prob_exp:", exp_loss, "aloss", aloss)
                # self.globalAC.save_ckpt()
                # with open("/summary.txt", 'w') as f:
                #     f.write('%.lf' % ep_r)

                # Curriculum: switch to a harder map after a high score, an easier one after a low score.
                if ep_r > score_high[self.hard] or ep_r < score_low[self.hard]:
                    self.env.close()
                    self.hard = self.hard + 1 if ep_r > score_high[self.hard] else self.hard - 1
                    self.env = wrap(game[self.hard])
                break
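
The backward sweep over buffer_r above is the usual n-step bootstrapped return. Pulled out as a standalone helper (the function name is illustrative, not from the project), it can be checked on a tiny example:

def discounted_returns(rewards, bootstrap_value, gamma):
    """n-step value targets: v_t = r_t + gamma * v_{t+1}, seeded with the bootstrap value."""
    targets = []
    v = bootstrap_value
    for r in reversed(rewards):
        v = r + gamma * v
        targets.append(v)
    targets.reverse()
    return targets

# With gamma = 0.9 and a bootstrap of 0, rewards [1, 0, 2] give
# approximately [2.62, 1.8, 2.0].
print(discounted_returns([1, 0, 2], 0.0, 0.9))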
Example #6
def __init__(self, name, globalAC, config_a, config_c):
    self.env = wrap(game)       # wrapped pysc2 environment for this worker
    self.globalAC = globalAC    # handle to the shared global network
    self.name = name
    self.AC = ACnet(name, globalAC, config_a, config_c)  # local actor-critic net
Example #7
def __init__(self):
    self.env = wrap()