Example #1
    for i in range(num_runs):
        vel = np.zeros(num_steps)  # per-step buffer (not used in this fragment)
        # logging.info("Iter #" + str(i))
        print('episode is:', i)
        ret = 0
        ret_list = []
        state = env.reset()
        aset = []
        for j in range(num_steps):

            # manager actions

            # one controller: query the agent for an action
            a = agent.choose_action([state])
            aset.append(a)
            next_state, reward, done, _ = env.step(a)
            # if j%300==0:
            #     print(state)
            #     print(a)

            ret += reward
            ret_list.append(reward)

            # start learning once the replay buffer holds enough transitions
            if agent.num_experience > 2000:
                ploss, qloss, reg_loss, Q = agent.learn(batch_size=32)

            if done:
                # terminal transition: store with done flag 1.0 and end the episode
                agent.remember(state, a, reward, next_state, 1.)
                break
            agent.remember(state, a, reward, next_state, 0.)
            state = next_state[:]
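
Both examples start in the middle of a training function, so names such as env, agent, num_runs and num_steps are defined by surrounding code that is not shown. As a rough orientation only, the sketch below shows one way the loop in Example #1 could be wired up; the ToyEnv and Agent stubs, the state size and the hyperparameter values are assumptions made for illustration, not part of the original code.

import numpy as np

class ToyEnv:
    """Stand-in for the gym-style environment the fragment expects."""
    def reset(self):
        self.t = 0
        return np.zeros(3)

    def step(self, action):
        self.t += 1
        next_state = np.random.randn(3)
        reward = -float(np.abs(action).sum())
        done = self.t >= 50
        return next_state, reward, done, {}

class Agent:
    """Stand-in exposing only the interface the fragment relies on."""
    def __init__(self):
        self.buffer = []

    @property
    def num_experience(self):
        return len(self.buffer)

    def choose_action(self, states):
        # a real agent would run its policy network here
        return np.random.uniform(-1.0, 1.0, size=1)

    def remember(self, state, action, reward, next_state, done_flag):
        self.buffer.append((state, action, reward, next_state, done_flag))

    def learn(self, batch_size=32):
        # a real agent would sample `batch_size` transitions and return its losses
        return 0.0, 0.0, 0.0, 0.0

env, agent = ToyEnv(), Agent()
num_runs, num_steps = 2, 50  # assumed values

With these stubs in place, the loop body from Example #1 runs unchanged.
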
Example #2
            action_dict = {}

            # evaluate the policy network once to get an action for every agent
            a = sess.run(agent.action,
                         feed_dict={
                             agent.adj: [adj],
                             agent.state_holder: state_,
                             agent.vecholder: np.asarray([vec])
                         })

            # map the k-th network output back to the k-th agent's key
            for k, key in enumerate(state):
                action_dict[key] = a[k]
            aset.append(a)

            next_state, reward, done, _ = env.step(action_dict)
            # flatten the per-agent next-state dict into a single row vector
            next_state_ = np.array(list(next_state.values())).reshape(1, -1).tolist()
            rewards = list(reward.values())
            # per-agent rewards (uncomment to inspect them individually)
            # for k in range(len(rewards)):
            #     print('agent', k, 'reward is:', rewards[k])

            # the running return averages the rewards across all agents
            ret += np.average(rewards)
            ret_list.append(rewards)

            agent.remember(state_, a, rewards, next_state_, 0., adj)
            # start learning once the replay buffer holds enough transitions
            if agent.num_experience > 200:
                ploss, qloss = agent.learn(batch_size=32)
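
Example #2 works with a multi-agent environment whose observations, rewards and actions are keyed by agent id, so the fragment repeatedly converts between per-agent dicts and the flat arrays fed to the network. The round trip looks roughly like this; the agent ids, shapes and values are hypothetical:

import numpy as np

# hypothetical per-agent observation dict, as returned by env.reset()/env.step()
state = {'agent_0': np.array([0.1, 0.2]), 'agent_1': np.array([0.3, 0.4])}

# flatten the dict into a single row vector for the network input
state_ = np.array(list(state.values())).reshape(1, -1).tolist()

# suppose the policy returns one action per agent, in the same key order
a = [0.5, -0.5]

# map each action back to the agent it belongs to, preserving dict order
action_dict = {key: a[k] for k, key in enumerate(state)}
print(action_dict)  # {'agent_0': 0.5, 'agent_1': -0.5}

This relies on dicts preserving insertion order (guaranteed since Python 3.7), so the k-th network output stays aligned with the k-th agent key.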