def test_vpg_agent(self):
        config = {
            'batch_size': 8,
            'max_episode_length': 4,
            'continuous': False,
            'state_shape': (2,),
            'actions': 2}
        tf.reset_default_graph()

        config = create_config(config)
        network_builder = NeuralNetwork.layered_network(layers=[{'type': 'dense', 'num_outputs': 32}])

        agent = VPGAgent(config=config, network_builder=network_builder)

        state = (1, 0)
        rewards = [0.0] * 100
        for n in range(10000):
            action = agent.get_action(state=state)
            if action == 0:
                state = (1, 0)
                reward = 0.0
                terminal = False
            else:
                state = (0, 1)
                reward = 1.0
                terminal = True
            agent.add_observation(state=state, action=action, reward=reward, terminal=terminal)
            rewards[n % 100] = reward
            if sum(rewards) == 100.0:
                return
        self.assertTrue(False)
    def test_baseline(self):
        config = Configuration(discount=0.75,
                               batch_size=8,
                               learning_rate=0.001,
                               states=dict(shape=(1, )),
                               actions=dict(continuous=True),
                               network=layered_network_builder(()))
        agent = VPGAgent(config=config)

        states = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
        rewards = [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0]
        terminals = [
            False, False, False, False, True, False, False, False, True
        ]
        discounted_rewards = np.array([
            0.75 + 0.75**4, 1.0 + 0.75**3, 0.75**2, 0.75, 1.0, 1.0 + 0.75**2,
            0.75, 1.0, 0.0
        ])
        baseline = np.array([0.25, 0.5, 0.0, 0.25, 0.5, 0.5, 0.25, 0.5, 0.0])
        agent.model.baseline = dict(state=Baseline())
        agent.model.baseline['state'].predict = lambda states: baseline

        result, _ = agent.model.reward_estimation(states=dict(state=states),
                                                  rewards=rewards,
                                                  terminals=terminals)
        expected = discounted_rewards - baseline
        print(result)
        print(expected)
        self.assertTrue((result == expected).all())
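
    # The expected values above follow the standard discounted-return recursion,
    # reset at episode terminals: with discount=0.75 it reproduces
    # `discounted_rewards`, and `expected` is just those returns minus `baseline`.
    # A minimal reference sketch (the helper name is illustrative, not part of
    # the Tensorforce API):
    @staticmethod
    def reference_discounted_returns(rewards, terminals, discount):
        returns, running = [], 0.0
        for reward, terminal in zip(reversed(rewards), reversed(terminals)):
            if terminal:
                running = 0.0  # a terminal step's return is just its own reward
            running = reward + discount * running
            returns.append(running)
        return list(reversed(returns))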
    def test_multithreaded(self):
        sys.stdout.write('\nVPGAgent (multithreaded):')
        sys.stdout.flush()

        environment = MinimalTest(specification={'int': ()})

        network = [dict(type='dense', size=32), dict(type='dense', size=32)]
        kwargs = dict(update_mode=dict(unit='episodes',
                                       batch_size=4,
                                       frequency=4),
                      memory=dict(type='latest',
                                  include_next_states=False,
                                  capacity=100),
                      optimizer=dict(type='adam', learning_rate=1e-2))
        agent = VPGAgent(states=environment.states,
                         actions=environment.actions,
                         network=network,
                         **kwargs)

        agents = clone_worker_agent(agent, 5, environment, network, kwargs)
        environments = [environment
                        ] + [copy.deepcopy(environment) for n in range(4)]

        runner = ThreadedRunner(agent=agents, environment=environments)

        runner.run(num_episodes=100)
        runner.close()

        sys.stdout.write(' ran\n')
        sys.stdout.flush()
    def test_beta(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(definition=True)
            actions = environment.actions
            actions['min_value'] = -0.5
            actions['max_value'] = 1.5

            config = Configuration(batch_size=8,
                                   learning_rate=0.01,
                                   states=environment.states,
                                   actions=actions,
                                   network=layered_network_builder([
                                       dict(type='dense', size=32),
                                       dict(type='dense', size=32)
                                   ]))
            agent = VPGAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x / l >= 0.9 for x, l in zip(
                    r.episode_rewards[-100:], r.episode_lengths[-100:]))

            runner.run(episodes=1500, episode_finished=episode_finished)
            print('VPG agent (beta): ' + str(runner.episode))
            if runner.episode < 1500:
                passed += 1

        print('VPG agent (beta) passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
    def test_continuous(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(definition=True)
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=layered_network_builder([
                                       dict(type='dense', size=32),
                                       dict(type='dense', size=32)
                                   ]))
            agent = VPGAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('VPG agent (continuous): ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('VPG agent (continuous) passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example #6
    def test_discrete_baseline(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(definition=False)
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   states=environment.states,
                                   actions=environment.actions,
                                   baseline=dict(type="mlp",
                                                 sizes=[32, 32],
                                                 epochs=5,
                                                 update_batch_size=8,
                                                 learning_rate=0.01),
                                   network=layered_network_builder([
                                       dict(type='dense', size=32),
                                       dict(type='dense', size=32)
                                   ]))
            agent = VPGAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x / l >= 0.9 for x, l in zip(
                    r.episode_rewards[-100:], r.episode_lengths[-100:]))

            runner.run(episodes=1500, episode_finished=episode_finished)
            print('VPG agent (discrete): ' + str(runner.episode))

            if runner.episode < 1500:
                passed += 1

        print('VPG agent (discrete) passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
    def test_multi(self):
        passed = 0

        def network_builder(inputs):
            layer = layers['dense']
            state0 = layer(x=layer(x=inputs['state0'], size=32), size=32)
            state1 = layer(x=layer(x=inputs['state1'], size=32), size=32)
            state2 = layer(x=layer(x=inputs['state2'], size=32), size=32)
            state3 = layer(x=layer(x=inputs['state3'], size=32), size=32)
            return state0 * state1 * state2 * state3

        for _ in xrange(5):
            environment = MinimalTest(definition=[
                False, (False, 2), (False, (1, 2)), (True, (1, 2))
            ])
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=network_builder)
            agent = VPGAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 50 or not all(
                    x >= 1.0 for x in r.episode_rewards[-50:])

            runner.run(episodes=2000, episode_finished=episode_finished)
            print('VPG agent (multi-state/action): ' + str(runner.episode))
            if runner.episode < 2000:
                passed += 1

        print('VPG agent (multi-state/action) passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example #8
def get_agent(agentType):

    if agentType == "dqn":
        agent = DQNAgent(
            states={
                "type": 'float',
                "shape": (
                    int(args.population),
                    1,
                    int(args.resources),
                )
            },
            actions={
                "type": 'int',
                "shape": (int(args.resources), ),
                "num_values": 3
            },
            memory=1000,
            network="auto",
        )
    elif agentType == "vpg":
        agent = VPGAgent(
            states={
                "type": 'float',
                "shape": (
                    int(args.population),
                    1,
                    int(args.resources),
                )
            },
            actions={
                "type": 'int',
                "shape": (int(args.resources), ),
                "num_values": 3
            },
            network="auto",
            memory=1000,
        )
    elif agentType == "trpo":
        agent = TRPOAgent(
            states={
                "type": 'float',
                "shape": (
                    int(args.population),
                    1,
                    int(args.resources),
                )
            },
            actions={
                "type": 'int',
                "shape": (int(args.resources), ),
                "num_values": 3
            },
            network="auto",
            memory=1000,
        )
    else:
        raise ValueError("Unknown agent type: {}".format(agentType))

    return agent
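
# The three branches above differ only in the agent class. An equivalent, more
# compact variant (a sketch; `build_agent` and `AGENT_CLASSES` are illustrative
# names, not part of the original script):
AGENT_CLASSES = {"dqn": DQNAgent, "vpg": VPGAgent, "trpo": TRPOAgent}


def build_agent(agentType):
    # Shared agent spec, identical across the three branches above.
    spec = dict(
        states={
            "type": 'float',
            "shape": (int(args.population), 1, int(args.resources)),
        },
        actions={
            "type": 'int',
            "shape": (int(args.resources),),
            "num_values": 3,
        },
        memory=1000,
        network="auto",
    )
    return AGENT_CLASSES[agentType](**spec)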
Example #9
    def test_multi_baseline(self):
        passed = 0

        def network_builder(inputs, **kwargs):
            layer = layers['dense']
            state0 = layer(x=layer(x=inputs['state0'],
                                   size=32,
                                   scope='state0-1'),
                           size=32,
                           scope='state0-2')
            state1 = layer(x=layer(x=inputs['state1'],
                                   size=32,
                                   scope='state1-1'),
                           size=32,
                           scope='state1-2')
            state2 = layer(x=layer(x=inputs['state2'],
                                   size=32,
                                   scope='state2-1'),
                           size=32,
                           scope='state2-2')
            return state0 * state1 * state2

        for _ in xrange(5):
            environment = MinimalTest(
                definition=[False, (False, 2), (True, 2)])
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   baseline=dict(type="mlp",
                                                 sizes=[32, 32],
                                                 epochs=5,
                                                 update_batch_size=8,
                                                 learning_rate=0.01),
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=network_builder)
            agent = VPGAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x / l >= reward_threshold for x, l in zip(
                        r.episode_rewards[-100:], r.episode_lengths[-100:]))

            runner.run(episodes=4000, episode_finished=episode_finished)
            print('VPG agent (multi-state/action): ' + str(runner.episode))
            if runner.episode < 4000:
                passed += 1

        print('VPG agent (multi-state/action) passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example #10
    def __init__(self, state_size, env=None, is_eval=False):
        self.state_size = state_size
        self.action_size = 3
        self.memory_size = 1000
        self._memory = deque(maxlen=1000)
        self.inventory = pd.DataFrame(columns=['Price', 'POS', 'Order'])
        self.is_eval = is_eval
        self.learning_rate = env.learning_rate
        self.gamma = env.gamma
        self.env = env

        self.up = dict(batch_size=self.env.batch_size,
                       frequency=self.env.batch_size)

        VPGAgent.__init__(self,
                           states=dict(type='float', shape=self.state_size.shape),
                           actions=dict(type='int', num_actions=self.action_size),
                           network=self.get_network(),
                           update_mode=self.up,
                           batching_capacity=self.memory_size,
                           learning_rate=self.learning_rate,
                           discount=self.gamma)

        self._load_model()
    def test_gae(self):
        config = Configuration(discount=0.75,
                               batch_size=8,
                               learning_rate=0.001,
                               gae_rewards=True,
                               gae_lambda=0.5,
                               states=dict(shape=(1, )),
                               actions=dict(continuous=True),
                               network=layered_network_builder(()))
        agent = VPGAgent(config=config)

        states = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
        rewards = [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0]
        terminals = [
            False, False, False, False, True, False, False, False, True
        ]
        baseline = np.array([0.25, 0.5, 0.0, 0.25, 0.5, 0.5, 0.25, 0.5, 0.0])
        agent.model.baseline = dict(state=Baseline())
        agent.model.baseline['state'].predict = lambda states: baseline
        td_residuals = np.array([
            0.75 * 0.5 - 0.25, 1.0 - 0.5, 0.75 * 0.25, 0.75 * 0.5 - 0.25, 1.0,
            1.0 + 0.75 * 0.25 - 0.5, 0.75 * 0.5 - 0.25, 1.0 - 0.5, 0.0
        ])

        result, _ = agent.model.reward_estimation(states=dict(state=states),
                                                  rewards=rewards,
                                                  terminals=terminals)
        expected = np.array([
            np.sum(
                ((0.5 * 0.75)**np.array([0, 1, 2, 3, 4])) * td_residuals[:5]),
            np.sum(((0.5 * 0.75)**np.array([0, 1, 2, 3])) * td_residuals[1:5]),
            np.sum(((0.5 * 0.75)**np.array([0, 1, 2])) * td_residuals[2:5]),
            np.sum(((0.5 * 0.75)**np.array([0, 1])) * td_residuals[3:5]),
            np.sum(((0.5 * 0.75)**np.array([0])) * td_residuals[4:5]),
            np.sum(((0.5 * 0.75)**np.array([0, 1, 2, 3])) * td_residuals[5:]),
            np.sum(((0.5 * 0.75)**np.array([0, 1, 2])) * td_residuals[6:]),
            np.sum(((0.5 * 0.75)**np.array([0, 1])) * td_residuals[7:]),
            np.sum(((0.5 * 0.75)**np.array([0])) * td_residuals[8:])
        ])
        self.assertTrue((result == expected).all())
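
    # The expected values above are GAE advantages: within each episode, the
    # advantage at step t sums the TD residuals delta_{t+k} weighted by
    # (gae_lambda * discount) ** k. Non-terminal residuals are
    # r_t + discount * V(s_{t+1}) - V(s_t); at a terminal step this test takes
    # the raw reward as the residual. A minimal reference sketch reproducing
    # `expected` (the helper name is illustrative, not part of the Tensorforce API):
    @staticmethod
    def reference_gae_advantages(rewards, terminals, values, discount, gae_lambda):
        advantages, running = [0.0] * len(rewards), 0.0
        for t in reversed(range(len(rewards))):
            if terminals[t]:
                # Episode boundary: residual is the terminal reward itself.
                running = rewards[t]
            else:
                delta = rewards[t] + discount * values[t + 1] - values[t]
                running = delta + discount * gae_lambda * running
            advantages[t] = running
        return advantages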
Example #12
        for i in range(int(len(infrastructure.peers) * .1))
    },
    network=[
        dict(type='flatten'),
        dict(type='dense', size=32, activation='relu'),
    ],
)

# Create a Vanilla Policy Gradient agent
agent_vpg = VPGAgent(
    states={
        "type": 'float',
        "shape": infrastructure.get_state().shape
    },
    actions={
        str(i): dict(type="int", num_actions=len(infrastructure.peers))
        for i in range(int(len(infrastructure.peers) * .1))
    },
    network=[
        dict(type='flatten'),
        dict(type='dense', size=32, activation='relu'),
    ],
)

#agent_ppo.restore_model("results/client-server")

print("agents made")

monkey = []
rl_ppo = []
rl_dqn = []
rl_vpg = []
Example #13
    # DistributionModel
    distributions_spec=None,
    entropy_regularization=0.01,
    # PGModel
    baseline_mode=None,
    baseline=None,
    baseline_optimizer=None,
    gae_lambda=None,
    # PGLRModel
    likelihood_ratio_clipping=0.2,
    summary_spec=None,
    distributed_spec=None
)'''

agent = VPGAgent(states_spec=env.states,
                 actions_spec=env.actions,
                 network_spec=network_spec,
                 batch_size=10)

# Create the runner
runner = Runner(agent=agent, environment=env)


# Callback function printing episode statistics
def episode_finished(r):
    print(
        "Finished episode {ep} after {ts} timesteps (reward: {reward})".format(
            ep=r.episode, ts=r.episode_timestep, reward=r.episode_rewards[-1]))
    return True


# Start learning
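# A plausible completion, consistent with the Runner usage in the other
# examples here (the episode count is illustrative, not from this snippet):
runner.run(episodes=300, episode_finished=episode_finished)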
Example #14
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', '--relation', help="Relation name")
    parser.add_argument('-e',
                        '--episodes',
                        type=int,
                        default=500,
                        help="Number of episodes")
    parser.add_argument('-a',
                        '--agent',
                        type=str,
                        default='vpg',
                        help="VPG or DQN Agent")
    parser.add_argument('-D',
                        '--debug',
                        action='store_true',
                        default=False,
                        help="Show debug outputs")

    args = parser.parse_args()
    print("Running DeepPath-TensorForce")

    if args.relation:  # relation is defined
        relation = args.relation
        logger.info('Relation set to %s', relation)
    else:
        logger.error("Error : No Relation name provided!")
        return

    graphPath = dataPath + 'tasks/' + relation + '/' + 'graph.txt'
    relationPath = dataPath + 'tasks/' + relation + '/' + 'train_pos'
    if not os.path.exists(relationPath):
        logger.error('Incorrect relation specified: %s', relation)
        print('Incorrect relation specified:', relation)
        return
    with open(relationPath) as f:
        data = f.readlines()

    # Initialize the DeePath Environment class
    environment = DPEnv(graphPath, relationPath, task=data)

    network_spec = [
        dict(type='dense', size=512, activation='relu'),
        dict(type='dense', size=1024, activation='relu')
    ]

    step_optimizer = dict(type='adam', learning_rate=1e-3)
    agent = None

    if args.agent == 'vpg':
        logger.info('Initializing VPGAgent')
        agent = VPGAgent(states_spec=dict(shape=state_dim, type='float'),
                         actions_spec=dict(num_actions=action_space,
                                           type='int'),
                         network_spec=network_spec,
                         optimizer=step_optimizer,
                         discount=0.99,
                         batch_size=1000)
    elif args.agent == 'dqn':
        logger.info('Initializing DQNAgent')
        agent = DQNAgent(states_spec=dict(shape=state_dim, type='float'),
                         actions_spec=dict(num_actions=action_space,
                                           type='int'),
                         network_spec=network_spec,
                         optimizer=step_optimizer,
                         discount=0.99,
                         batch_size=1000)

    logger.info('Initializing Runner')
    runner = Runner(agent=agent, environment=environment)

    report_episodes = args.episodes / 50  # default episodes = 500

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            logger.info(
                "Finished episode {ep} after {ts} timesteps".format(
                    ep=r.episode, ts=r.timestep))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 50 rewards: {}".format(
                sum(r.episode_rewards[-50:]) / 50))
            logger.info("Average of last 100 rewards: {}".format(
                sum(r.episode_rewards[-100:]) / 100))
        return True

    logger.info("Starting {agent} for Environment '{env}'".format(
        agent=agent, env=environment))
    print("Starting {agent} for Environment".format(agent=agent))
    runner.run(episodes=args.episodes,
               max_episode_timesteps=1,
               episode_finished=episode_finished)
    logger.info(
        "Learning finished. Total episodes: {ep}".format(ep=runner.episode))
    print("Learning finished. Total episodes: {ep}".format(ep=runner.episode))
    environment.close()
    if timestep % save_step == 0:
        plt.savefig('data/%s_%f_%d_%d_%d.jpg' %
                    (model_name, learning_rate, MAXSTEPS, LAYER_1, LAYER_2))
        plt.show()


if __name__ == '__main__':
    env = OpenAIGym('Acrobot-v1', visualize=False)
    LAYER_1 = 128
    LAYER_2 = 64
    observation = env.reset()
    VPG_agent = VPGAgent(
        states=dict(type='float', shape=env.states['shape']),
        actions=dict(type='int', num_actions=env.actions['num_actions']),
        # discrete action space but continuous state space
        network=[
            dict(type='dense', size=LAYER_1,
                 activation='relu'),  # changed to tanh for best
            dict(type='dense', size=LAYER_2, activation='relu')
        ],
        optimizer=dict(type='adam', learning_rate=learning_rate))

    # Create a Proximal Policy Optimization agent
    PPO_agent = PPOAgent(
        states=dict(type='float', shape=env.states['shape']),
        actions=dict(type='int', num_actions=env.actions['num_actions']),
        # discrete action space but continuous state space
        network=[
            dict(type='dense', size=LAYER_1,
                 activation='relu'),  # changed to tanh for best
            dict(type='dense', size=LAYER_2, activation='relu')
        ],
        self.gym = gym
        self.visualize = False


env = ConcatStates(env)
environment = TFOpenAIGymCust('CryptoPortfolioEIIE-v0', env)

env.seed(0)
state = environment.reset()
state, done, reward = environment.execute(env.action_space.sample())

network_spec = [dict(type='dense', size=16), dict(type='dense', size=10)]

agent = VPGAgent(states_spec=environment.states,
                 actions_spec=environment.actions,
                 batch_size=20,
                 network_spec=network_spec,
                 discount=0.8,
                 optimizer=dict(type='adam', learning_rate=1e-4))

runner = Runner(agent=agent, environment=environment, repeat_actions=1)

report_episodes = 100

print("Starting {agent} for Environment '{env}'".format(agent=agent,
                                                        env=environment))

pvs = []


def episode_finished(r):
    if r.episode % report_episodes == 0:
def get_agent(game, agentType):
    count = 1

    base_path = '.'
    checkpointPath = base_path + "/games/agents/" + game + "/" + agentType + "/"

    if agentType == "vpg":
        agent = VPGAgent(
            states=config[game]["states"],
            actions=config[game]["actions"],
            memory=1000,
            network="auto",
        )
    elif agentType == "ppo":
        agent = PPOAgent(
            states=config[game]["states"],
            actions=config[game]["actions"],
            memory=1000,
            network="auto",
        )
    elif agentType == "dqn":
        agent = DQNAgent(
            states=config[game]["states"],
            actions=config[game]["actions"],
            memory=1000,
            network="auto",
        )
    else:
        raise ValueError("Unknown agent type: {}".format(agentType))

    if game == "3pd":
        try:
            agent.restore(directory=checkpointPath, filename=None)
            print("restoration successful")
        except Exception as e:
            agent.initialize()
            for x in tqdm(range(1000001)):
                testState = np.full(config[game]["states"]["shape"], None)

                for i in range(10):
                    moveA = agent.act(testState)
                    moveB = agent.act(testState)
                    moveC = agent.act(testState)
                    rewards = payoffs(game, [moveA, moveB, moveC])
                    if i < 9:
                        agent.observe(reward=rewards[0], terminal=False)
                        agent.observe(reward=rewards[1], terminal=False)
                        agent.observe(reward=rewards[2], terminal=False)
                    else:
                        agent.observe(reward=rewards[0], terminal=False)
                        agent.observe(reward=rewards[1], terminal=False)
                        agent.observe(reward=rewards[2], terminal=True)
                    testState[i] = [[moveA], [moveB], [moveC]]
                if x % 1000 == 0:
                    # checkpointPath = "../games/agents/" + game + "/" + agentType + "/"
                    agent.save(directory=checkpointPath, filename=None)
                    # print("saving successful")
    else:
        try:
            agent.restore(directory=checkpointPath, filename=None)
            print("restoration successful")
        except Exception as e:
            # try:
            # 	checkpointPath = base_path + "/agents/" + game + "/" + agentType + "/"
            # 	agent.restore(directory=checkpointPath, filename=None)
            # 	print("restoration successful after second attempt")
            # except Exception as e:
            # 	a = subprocess.check_output("ls games/", shell=True)
            # 	print(a)
            # 	print(os.getcwd(), "vs", subprocess.check_output("pwd", shell=True))
            # 	checkpointPath = "./games/agents/" + game + "/" + agentType + "/"
            # 	print(checkpointPath)
            # 	agent.restore(directory=checkpointPath, filename=None)
            # 	print("restoration successful after third attempt")
            agent.initialize()

            for x in tqdm(range(count)):

                testState = np.full(config[game]["states"]["shape"], 0)

                for i in range(10):
                    moveA = agent.act(testState)
                    moveB = agent.act(testState)
                    rewards = payoffs(game, [moveA, moveB])
                    if i < 9:  # only the final step of the episode is terminal
                        agent.observe(reward=rewards[0], terminal=False)
                        agent.observe(reward=rewards[1], terminal=False)
                    else:
                        agent.observe(reward=rewards[0], terminal=False)
                        agent.observe(reward=rewards[1], terminal=True)

                    testState[i] = [[moveA], [moveB]]
            checkpointPath = "./games/agents/" + game + "/" + agentType + "/"
            agent.save(directory=checkpointPath, filename=None)
            print("saving successful")

    return agent
    network=[dict(type='dense', size=64),
             dict(type='dense', size=64)],
    batching_capacity=1000,
    step_optimizer=dict(type='adam', learning_rate=1e-4))

# Create a Trust Region Policy Optimization agent
agentTRPO = TRPOAgent(
    states=dict(type='float', shape=env.observation_space.shape),
    actions=dict(type='int', num_actions=env.action_space.n),
    network=[dict(type='dense', size=64),
             dict(type='dense', size=64)])

# Create a Vanilla Policy Gradient agent
agentVPG = VPGAgent(
    states=dict(type='float', shape=env.observation_space.shape),
    actions=dict(type='int', num_actions=env.action_space.n),
    network=[dict(type='dense', size=64),
             dict(type='dense', size=64)])

# Add 3 random agents
agents = []
for agent_id in range(3):
    agents.append(SimpleAgent(config["agent"](agent_id, config["game_type"])))

# Add TensorforceAgent
agent_id += 1
agents.append(TensorforceAgent(config["agent"](agent_id, config["game_type"])))
env.set_agents(agents)
env.set_training_agent(agents[-1].agent_id)
env.set_init_game_state(None)
            "down": dict(type="float", min_value=0.0, max_value=1.0),
            "left": dict(type="float", min_value=0.0, max_value=1.0),
            "right": dict(type="float", min_value=0.0, max_value=1.0),
        },
        network='auto',
        memory=10000,
    )

elif args.agent == "vpg":  # Vanilla Policy Gradient
    agent = VPGAgent(
        states={
            "type": 'float',
            "shape": (1, 610)
        },
        actions={
            "up": dict(type="float", min_value=0.0, max_value=1.0),
            "down": dict(type="float", min_value=0.0, max_value=1.0),
            "left": dict(type="float", min_value=0.0, max_value=1.0),
            "right": dict(type="float", min_value=0.0, max_value=1.0),
        },
        network='auto',
        memory=10000,
    )

else:
    print("Available agents: vpg, ppo, dqn")
    exit()

print("agent ready", agent)
agent.initialize()  # Set up base of agent

try:  # Looks to see if a saved model is available and loads it
        network=[
            dict(type='flatten'),
            dict(type="dense", size=32),
        ],
        memory=10000,
    )

elif args.agent == "vpg":
    agent = VPGAgent(
        states={
            "type": 'float',
            "shape": G.graph.shape
        },
        actions={
            "user": dict(type="int", num_values=G.graph.shape[0]),
            "item": dict(type="int", num_values=G.graph.shape[1])
        },
        network=[
            dict(type='flatten'),
            dict(type="dense", size=32),
        ],
        memory=10000,
    )
elif args.agent == "trpo":
    agent = TRPOAgent(
        states={
            "type": 'float',
            "shape": G.graph.shape
        },
        actions={
            "user": dict(type="int", num_values=G.graph.shape[0]),
Example #21
            str(i): dict(type="int", num_actions=infrastructure.servers)
            for i in range(int(infrastructure.servers * .1))
        },
        network=[
            dict(type='flatten'),
            dict(type="dense", size=32),
            dict(type="dense", size=32),
            dict(type="dense", size=32)
        ],
    )
elif args.monkey == "vpg":
    monkey = VPGAgent(
        states={"type": 'float', "shape": infrastructure.graph.shape},
        actions={
            str(i): dict(type="int", num_actions=infrastructure.servers)
            for i in range(int(infrastructure.servers * .1))
        },
        network=[
            dict(type='flatten'),
            dict(type="dense", size=32),
            dict(type="dense", size=32),
            dict(type="dense", size=32)
        ],
    )

if args.manager == "ppo":
    manager = PPOAgent(
        states={"type": 'float', "shape": infrastructure.graph.shape},
        actions={
            str(i): dict(type="int", num_actions=infrastructure.servers)
            for i in range(infrastructure.clients)
        },
        network=[
            dict(type='flatten'),
            dict(type="dense", size=32),