def test_vpg_agent(self):
    config = {
        'batch_size': 8,
        'max_episode_length': 4,
        'continuous': False,
        'state_shape': (2,),
        'actions': 2
    }
    tf.reset_default_graph()
    config = create_config(config)
    network_builder = NeuralNetwork.layered_network(layers=[{'type': 'dense', 'num_outputs': 32}])
    agent = VPGAgent(config=config, network_builder=network_builder)

    state = (1, 0)
    rewards = [0.0] * 100
    for n in range(10000):
        action = agent.get_action(state=state)
        if action == 0:
            state = (1, 0)
            reward = 0.0
            terminal = False
        else:
            state = (0, 1)
            reward = 1.0
            terminal = True
        agent.add_observation(state=state, action=action, reward=reward, terminal=terminal)
        rewards[n % 100] = reward
        if sum(rewards) == 100.0:
            return
    self.assertTrue(False)
def test_baseline(self):
    config = Configuration(
        discount=0.75,
        batch_size=8,
        learning_rate=0.001,
        states=dict(shape=(1,)),
        actions=dict(continuous=True),
        network=layered_network_builder(())
    )
    agent = VPGAgent(config=config)

    states = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    rewards = [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0]
    terminals = [False, False, False, False, True, False, False, False, True]
    discounted_rewards = np.array([
        0.75 + 0.75 ** 4, 1.0 + 0.75 ** 3, 0.75 ** 2, 0.75, 1.0,
        1.0 + 0.75 ** 2, 0.75, 1.0, 0.0
    ])
    baseline = np.array([0.25, 0.5, 0.0, 0.25, 0.5, 0.5, 0.25, 0.5, 0.0])

    agent.model.baseline = dict(state=Baseline())
    agent.model.baseline['state'].predict = lambda states: baseline

    result, _ = agent.model.reward_estimation(states=dict(state=states), rewards=rewards, terminals=terminals)
    expected = discounted_rewards - baseline
    print(result)
    print(expected)
    self.assertTrue((result == expected).all())
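# A minimal standalone numpy sketch of the discounted-return computation that the
# `discounted_rewards` array above encodes: gamma-discounted rewards are summed
# backwards within each episode, restarting at terminal steps. The helper name
# `discounted_returns` is illustrative only, not a Tensorforce API.
import numpy as np

def discounted_returns(rewards, terminals, discount):
    """Accumulate gamma-discounted returns, resetting at episode boundaries."""
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        if terminals[t]:
            running = 0.0
        running = rewards[t] + discount * running
        returns[t] = running
    return returns

# With the rewards/terminals used in test_baseline and discount=0.75, this
# reproduces the `discounted_rewards` array, e.g. returns[0] == 0.75 + 0.75 ** 4.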
def test_multithreaded(self):
    sys.stdout.write('\nVPGAgent (multithreaded):')
    sys.stdout.flush()

    environment = MinimalTest(specification={'int': ()})
    network = [dict(type='dense', size=32), dict(type='dense', size=32)]
    kwargs = dict(
        update_mode=dict(unit='episodes', batch_size=4, frequency=4),
        memory=dict(type='latest', include_next_states=False, capacity=100),
        optimizer=dict(type='adam', learning_rate=1e-2)
    )

    agent = VPGAgent(states=environment.states, actions=environment.actions, network=network, **kwargs)
    agents = clone_worker_agent(agent, 5, environment, network, kwargs)
    environments = [environment] + [copy.deepcopy(environment) for n in range(4)]

    runner = ThreadedRunner(agent=agents, environment=environments)
    runner.run(num_episodes=100)
    runner.close()

    sys.stdout.write(' ran\n')
    sys.stdout.flush()
def test_beta(self):
    passed = 0

    for _ in xrange(5):
        environment = MinimalTest(definition=True)
        actions = environment.actions
        actions['min_value'] = -0.5
        actions['max_value'] = 1.5
        config = Configuration(
            batch_size=8,
            learning_rate=0.01,
            states=environment.states,
            actions=actions,
            network=layered_network_builder([
                dict(type='dense', size=32),
                dict(type='dense', size=32)
            ])
        )
        agent = VPGAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x / l >= 0.9 for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
            )

        runner.run(episodes=1500, episode_finished=episode_finished)
        print('VPG agent (beta): ' + str(runner.episode))
        if runner.episode < 1500:
            passed += 1

    print('VPG agent (beta) passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_continuous(self):
    passed = 0

    for _ in xrange(5):
        environment = MinimalTest(definition=True)
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([
                dict(type='dense', size=32),
                dict(type='dense', size=32)
            ])
        )
        agent = VPGAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('VPG agent (continuous): ' + str(runner.episode))
        if runner.episode < 1000:
            passed += 1

    print('VPG agent (continuous) passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_discrete_baseline(self):
    passed = 0

    for _ in xrange(5):
        environment = MinimalTest(definition=False)
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            states=environment.states,
            actions=environment.actions,
            baseline=dict(
                type="mlp",
                sizes=[32, 32],
                epochs=5,
                update_batch_size=8,
                learning_rate=0.01
            ),
            network=layered_network_builder([
                dict(type='dense', size=32),
                dict(type='dense', size=32)
            ])
        )
        agent = VPGAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x / l >= 0.9 for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
            )

        runner.run(episodes=1500, episode_finished=episode_finished)
        print('VPG agent (discrete): ' + str(runner.episode))
        if runner.episode < 1500:
            passed += 1

    print('VPG agent (discrete) passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_multi(self):
    passed = 0

    def network_builder(inputs):
        layer = layers['dense']
        state0 = layer(x=layer(x=inputs['state0'], size=32), size=32)
        state1 = layer(x=layer(x=inputs['state1'], size=32), size=32)
        state2 = layer(x=layer(x=inputs['state2'], size=32), size=32)
        state3 = layer(x=layer(x=inputs['state3'], size=32), size=32)
        return state0 * state1 * state2 * state3

    for _ in xrange(5):
        environment = MinimalTest(definition=[False, (False, 2), (False, (1, 2)), (True, (1, 2))])
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            states=environment.states,
            actions=environment.actions,
            network=network_builder
        )
        agent = VPGAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 50 or not all(x >= 1.0 for x in r.episode_rewards[-50:])

        runner.run(episodes=2000, episode_finished=episode_finished)
        print('VPG agent (multi-state/action): ' + str(runner.episode))
        if runner.episode < 2000:
            passed += 1

    print('VPG agent (multi-state/action) passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def get_agent(agentType):
    if agentType == "dqn":
        agent = DQNAgent(
            states={
                "type": 'float',
                "shape": (int(args.population), 1, int(args.resources))
            },
            actions={
                "type": 'int',
                "shape": (int(args.resources),),
                "num_values": 3
            },
            memory=1000,
            network="auto",
        )
    elif agentType == "vpg":
        agent = VPGAgent(
            states={
                "type": 'float',
                "shape": (int(args.population), 1, int(args.resources))
            },
            actions={
                "type": 'int',
                "shape": (int(args.resources),),
                "num_values": 3
            },
            network="auto",
            memory=1000,
        )
    elif agentType == "trpo":
        agent = TRPOAgent(
            states={
                "type": 'float',
                "shape": (int(args.population), 1, int(args.resources))
            },
            actions={
                "type": 'int',
                "shape": (int(args.resources),),
                "num_values": 3
            },
            network="auto",
            memory=1000,
        )
    return agent
def test_multi_baseline(self):
    passed = 0

    def network_builder(inputs, **kwargs):
        layer = layers['dense']
        state0 = layer(x=layer(x=inputs['state0'], size=32, scope='state0-1'), size=32, scope='state0-2')
        state1 = layer(x=layer(x=inputs['state1'], size=32, scope='state1-1'), size=32, scope='state1-2')
        state2 = layer(x=layer(x=inputs['state2'], size=32, scope='state2-1'), size=32, scope='state2-2')
        return state0 * state1 * state2

    for _ in xrange(5):
        environment = MinimalTest(definition=[False, (False, 2), (True, 2)])
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            baseline=dict(
                type="mlp",
                sizes=[32, 32],
                epochs=5,
                update_batch_size=8,
                learning_rate=0.01
            ),
            states=environment.states,
            actions=environment.actions,
            network=network_builder
        )
        agent = VPGAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x / l >= reward_threshold for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
            )

        runner.run(episodes=4000, episode_finished=episode_finished)
        print('VPG agent (multi-state/action): ' + str(runner.episode))
        if runner.episode < 4000:
            passed += 1

    print('VPG agent (multi-state/action) passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def __init__(self, state_size, env=None, is_eval=False):
    self.state_size = state_size
    self.action_size = 3
    self.memory_size = 1000
    self._memory = deque(maxlen=1000)
    self.inventory = pd.DataFrame(columns=['Price', 'POS', 'Order'])
    self.is_eval = is_eval
    self.learning_rate = env.learning_rate
    self.gamma = env.gamma
    self.env = env
    self.up = dict(batch_size=self.env.batch_size, frequency=self.env.batch_size)

    VPGAgent.__init__(
        self,
        states=dict(type='float', shape=self.state_size.shape),
        actions=dict(type='int', num_actions=self.action_size),
        network=self.get_network(),
        update_mode=self.up,
        batching_capacity=self.memory_size,
        learning_rate=self.learning_rate,
        discount=self.gamma
    )
    self._load_model()
def test_gae(self):
    config = Configuration(
        discount=0.75,
        batch_size=8,
        learning_rate=0.001,
        gae_rewards=True,
        gae_lambda=0.5,
        states=dict(shape=(1,)),
        actions=dict(continuous=True),
        network=layered_network_builder(())
    )
    agent = VPGAgent(config=config)

    states = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    rewards = [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0]
    terminals = [False, False, False, False, True, False, False, False, True]
    baseline = np.array([0.25, 0.5, 0.0, 0.25, 0.5, 0.5, 0.25, 0.5, 0.0])

    agent.model.baseline = dict(state=Baseline())
    agent.model.baseline['state'].predict = lambda states: baseline

    td_residuals = np.array([
        0.75 * 0.5 - 0.25, 1.0 - 0.5, 0.75 * 0.25, 0.75 * 0.5 - 0.25, 1.0,
        1.0 + 0.75 * 0.25 - 0.5, 0.75 * 0.5 - 0.25, 1.0 - 0.5, 0.0
    ])

    result, _ = agent.model.reward_estimation(states=dict(state=states), rewards=rewards, terminals=terminals)
    expected = np.array([
        np.sum(((0.5 * 0.75) ** np.array([0, 1, 2, 3, 4])) * td_residuals[:5]),
        np.sum(((0.5 * 0.75) ** np.array([0, 1, 2, 3])) * td_residuals[1:5]),
        np.sum(((0.5 * 0.75) ** np.array([0, 1, 2])) * td_residuals[2:5]),
        np.sum(((0.5 * 0.75) ** np.array([0, 1])) * td_residuals[3:5]),
        np.sum(((0.5 * 0.75) ** np.array([0])) * td_residuals[4:5]),
        np.sum(((0.5 * 0.75) ** np.array([0, 1, 2, 3])) * td_residuals[5:]),
        np.sum(((0.5 * 0.75) ** np.array([0, 1, 2])) * td_residuals[6:]),
        np.sum(((0.5 * 0.75) ** np.array([0, 1])) * td_residuals[7:]),
        np.sum(((0.5 * 0.75) ** np.array([0])) * td_residuals[8:])
    ])
    self.assertTrue((result == expected).all())
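# test_gae checks generalized advantage estimation (GAE): a TD residual
# delta[t] = r[t] + gamma * V(s[t+1]) - V(s[t]) per step, then an exponentially
# weighted sum with factor gamma * lambda over the rest of the episode. Note that
# in this test the residual at a terminal step is simply the reward. A minimal
# standalone numpy sketch under those conventions; the helper name
# `gae_advantages` is illustrative only, not a Tensorforce API.
import numpy as np

def gae_advantages(rewards, values, terminals, discount, gae_lambda):
    """Backward GAE recursion, restarting the accumulator at episode ends.

    Assumes the last step of the batch is terminal, so values[t + 1] is never
    read past the end of the array.
    """
    advantages = np.zeros(len(rewards))
    gae = 0.0
    for t in reversed(range(len(rewards))):
        if terminals[t]:
            # Episode boundary: no bootstrap; use the test's convention of
            # taking the final reward as the residual.
            gae = rewards[t]
        else:
            delta = rewards[t] + discount * values[t + 1] - values[t]
            gae = delta + discount * gae_lambda * gae
        advantages[t] = gae
    return advantages

# With the rewards, terminals and baseline of test_gae, discount=0.75 and
# gae_lambda=0.5, this reproduces the `expected` array (up to floating-point
# association order).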
        for i in range(int(len(infrastructure.peers) * .1))
    },
    network=[
        dict(type='flatten'),
        dict(type='dense', size=32, activation='relu'),
    ],
)

# Create a Vanilla Policy Gradient agent
agent_vpg = VPGAgent(
    states={
        "type": 'float',
        "shape": infrastructure.get_state().shape
    },
    actions={
        str(i): dict(type="int", num_actions=len(infrastructure.peers))
        for i in range(int(len(infrastructure.peers) * .1))
    },
    network=[
        dict(type='flatten'),
        dict(type='dense', size=32, activation='relu'),
    ],
)
# agent_ppo.restore_model("results/client-server")

print("agents made")

monkey = []
rl_ppo = []
rl_dqn = []
rl_vpg = []
    # DistributionModel
    distributions_spec=None,
    entropy_regularization=0.01,
    # PGModel
    baseline_mode=None,
    baseline=None,
    baseline_optimizer=None,
    gae_lambda=None,
    # PGLRModel
    likelihood_ratio_clipping=0.2,
    summary_spec=None,
    distributed_spec=None
)'''

agent = VPGAgent(
    states_spec=env.states,
    actions_spec=env.actions,
    network_spec=network_spec,
    batch_size=10
)

# Create the runner
runner = Runner(agent=agent, environment=env)

# Callback function printing episode statistics
def episode_finished(r):
    print("Finished episode {ep} after {ts} timesteps (reward: {reward})".format(
        ep=r.episode, ts=r.episode_timestep, reward=r.episode_rewards[-1]))
    return True

# Start learning
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', '--relation', help="Relation name")
    parser.add_argument('-e', '--episodes', type=int, default=500, help="Number of episodes")
    parser.add_argument('-a', '--agent', type=str, default='vpg', help="VPG or DQN Agent")
    parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs")
    args = parser.parse_args()

    print("Running DeepPath-TensorForce")

    if args.relation:
        relation = args.relation
        logger.info('Relation set to %s', relation)
    else:
        logger.error("Error: no relation name provided!")
        return

    graphPath = dataPath + 'tasks/' + relation + '/' + 'graph.txt'
    relationPath = dataPath + 'tasks/' + relation + '/' + 'train_pos'
    if not os.path.exists(relationPath):
        logger.info('Incorrect relation specified %s', relation)
        print('Incorrect relation specified ', relation)

    f = open(relationPath)
    data = f.readlines()
    f.close()

    # Initialize the DeepPath environment
    environment = DPEnv(graphPath, relationPath, task=data)

    network_spec = [
        dict(type='dense', size=512, activation='relu'),
        dict(type='dense', size=1024, activation='relu')
    ]
    step_optimizer = dict(type='adam', learning_rate=1e-3)

    agent = None
    if args.agent == 'vpg':
        logger.info('Initializing VPGAgent')
        agent = VPGAgent(
            states_spec=dict(shape=state_dim, type='float'),
            actions_spec=dict(num_actions=action_space, type='int'),
            network_spec=network_spec,
            optimizer=step_optimizer,
            discount=0.99,
            batch_size=1000
        )
    elif args.agent == 'dqn':
        logger.info('Initializing DQNAgent')
        agent = DQNAgent(
            states_spec=dict(shape=state_dim, type='float'),
            actions_spec=dict(num_actions=action_space, type='int'),
            network_spec=network_spec,
            optimizer=step_optimizer,
            discount=0.99,
            batch_size=1000
        )

    logger.info('Initializing Runner')
    runner = Runner(agent=agent, environment=environment)
    report_episodes = args.episodes / 50  # default episodes = 500

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            logger.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 50 rewards: {}".format(sum(r.episode_rewards[-50:]) / 50))
            logger.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100))
        return True

    logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment))
    print("Starting {agent} for Environment".format(agent=agent))
    runner.run(episodes=args.episodes, max_episode_timesteps=1, episode_finished=episode_finished)
    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode))
    print("Learning finished. Total episodes: {ep}".format(ep=runner.episode))
    environment.close()
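# Example invocation of main() from the command line. The script name and the
# relation are placeholders; the relation must match a task directory that
# exists under dataPath + 'tasks/':
#   python <script>.py --relation <relation_name> --episodes 500 --agent vpg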
    if timestep % save_step == 0:
        plt.savefig('data/%s_%f_%d_%d_%d.jpg' % (model_name, learning_rate, MAXSTEPS, LAYER_1, LAYER_2))
        plt.show()


if __name__ == '__main__':
    env = OpenAIGym('Acrobot-v1', visualize=False)
    LAYER_1 = 128
    LAYER_2 = 64
    observation = env.reset()

    VPG_agent = VPGAgent(
        states=dict(type='float', shape=env.states['shape']),
        # discrete action space but continuous state space
        actions=dict(type='int', num_actions=env.actions['num_actions']),
        network=[
            dict(type='dense', size=LAYER_1, activation='relu'),  # changed to tanh for best
            dict(type='dense', size=LAYER_2, activation='relu')
        ],
        optimizer=dict(type='adam', learning_rate=learning_rate))

    # Create a Proximal Policy Optimization agent
    PPO_agent = PPOAgent(
        states=dict(type='float', shape=env.states['shape']),
        # discrete action space but continuous state space
        actions=dict(type='int', num_actions=env.actions['num_actions']),
        network=[
            dict(type='dense', size=LAYER_1, activation='relu'),  # changed to tanh for best
            dict(type='dense', size=LAYER_2, activation='relu')
        ],
        self.gym = gym
        self.visualize = False


env = ConcatStates(env)
environment = TFOpenAIGymCust('CryptoPortfolioEIIE-v0', env)
env.seed(0)
state = environment.reset()
state, done, reward = environment.execute(env.action_space.sample())

network_spec = [dict(type='dense', size=16), dict(type='dense', size=10)]

agent = VPGAgent(
    states_spec=environment.states,
    actions_spec=environment.actions,
    batch_size=20,
    network_spec=network_spec,
    discount=0.8,
    optimizer=dict(type='adam', learning_rate=1e-4)
)

runner = Runner(agent=agent, environment=environment, repeat_actions=1)
report_episodes = 100

print("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment))
pvs = []

def episode_finished(r):
    if r.episode % report_episodes == 0:
def get_agent(game, agentType):
    count = 1
    base_path = '.'
    checkpointPath = base_path + "/games/agents/" + game + "/" + agentType + "/"

    if agentType == "vpg":
        agent = VPGAgent(
            states=config[game]["states"],
            actions=config[game]["actions"],
            memory=1000,
            network="auto",
        )
    elif agentType == "ppo":
        agent = PPOAgent(
            states=config[game]["states"],
            actions=config[game]["actions"],
            memory=1000,
            network="auto",
        )
    elif agentType == "dqn":
        agent = DQNAgent(
            states=config[game]["states"],
            actions=config[game]["actions"],
            memory=1000,
            network="auto",
        )

    if game == "3pd":
        try:
            agent.restore(directory=checkpointPath, filename=None)
            print("restoration successful")
        except Exception as e:
            # No checkpoint found: train from scratch.
            agent.initialize()
            for x in tqdm(range(1000001)):
                testState = np.full(config[game]["states"]["shape"], None)
                for i in range(10):
                    moveA = agent.act(testState)
                    moveB = agent.act(testState)
                    moveC = agent.act(testState)
                    rewards = payoffs(game, [moveA, moveB, moveC])
                    if i < 9:
                        agent.observe(reward=rewards[0], terminal=False)
                        agent.observe(reward=rewards[1], terminal=False)
                        agent.observe(reward=rewards[2], terminal=False)
                    else:
                        agent.observe(reward=rewards[0], terminal=False)
                        agent.observe(reward=rewards[1], terminal=False)
                        agent.observe(reward=rewards[2], terminal=True)
                    testState[i] = [[moveA], [moveB], [moveC]]
                if x % 1000 == 0:
                    # checkpointPath = "../games/agents/" + game + "/" + agentType + "/"
                    agent.save(directory=checkpointPath, filename=None)
                    # print("saving successful")
    else:
        try:
            agent.restore(directory=checkpointPath, filename=None)
            print("restoration successful")
        except Exception as e:
            # try:
            #     checkpointPath = base_path + "/agents/" + game + "/" + agentType + "/"
            #     agent.restore(directory=checkpointPath, filename=None)
            #     print("restoration successful after second attempt")
            # except Exception as e:
            #     a = subprocess.check_output("ls games/", shell=True)
            #     print(a)
            #     print(os.getcwd(), "vs", subprocess.check_output("pwd", shell=True))
            #     checkpointPath = "./games/agents/" + game + "/" + agentType + "/"
            #     print(checkpointPath)
            #     agent.restore(directory=checkpointPath, filename=None)
            #     print("restoration successful after third attempt")
            agent.initialize()
            for x in tqdm(range(count)):
                testState = np.full(config[game]["states"]["shape"], 0)
                for i in range(10):
                    moveA = agent.act(testState)
                    moveB = agent.act(testState)
                    rewards = payoffs(game, [moveA, moveB])
                    # Note: with i ranging over range(10), `i < 10` is always
                    # true, so the terminal branch below is never reached.
                    if i < 10:
                        agent.observe(reward=rewards[0], terminal=False)
                        agent.observe(reward=rewards[1], terminal=False)
                    else:
                        agent.observe(reward=rewards[0], terminal=False)
                        agent.observe(reward=rewards[1], terminal=True)
                    testState[i] = [[moveA], [moveB]]
            checkpointPath = "./games/agents/" + game + "/" + agentType + "/"
            agent.save(directory=checkpointPath, filename=None)
            print("saving successful")
    return agent
    network=[dict(type='dense', size=64), dict(type='dense', size=64)],
    batching_capacity=1000,
    step_optimizer=dict(type='adam', learning_rate=1e-4))

# Create a Trust Region Policy Optimization agent
agentTRPO = TRPOAgent(
    states=dict(type='float', shape=env.observation_space.shape),
    actions=dict(type='int', num_actions=env.action_space.n),
    network=[dict(type='dense', size=64), dict(type='dense', size=64)])

# Create a Vanilla Policy Gradient agent
agentVPG = VPGAgent(
    states=dict(type='float', shape=env.observation_space.shape),
    actions=dict(type='int', num_actions=env.action_space.n),
    network=[dict(type='dense', size=64), dict(type='dense', size=64)])

# Add 3 random agents
agents = []
for agent_id in range(3):
    agents.append(SimpleAgent(config["agent"](agent_id, config["game_type"])))

# Add TensorforceAgent
agent_id += 1
agents.append(TensorforceAgent(config["agent"](agent_id, config["game_type"])))

env.set_agents(agents)
env.set_training_agent(agents[-1].agent_id)
env.set_init_game_state(None)
"down": dict(type="float", min_value=0.0, max_value=1.0), "left": dict(type="float", min_value=0.0, max_value=1.0), "right": dict(type="float", min_value=0.0, max_value=1.0), }, network='auto', memory=10000, ) elif args.agent == "vpg": # Vanilla Policy Gradient agent = VPGAgent( states={ "type": 'float', "shape": (1, 610) }, actions={ "up": dict(type="float", min_value=0.0, max_value=1.0), "down": dict(type="float", min_value=0.0, max_value=1.0), "left": dict(type="float", min_value=0.0, max_value=1.0), "right": dict(type="float", min_value=0.0, max_value=1.0), }, network='auto', memory=10000, ) else: print("Available agents: vpg, ppo, dqn") exit() print("agent ready", agent) agent.initialize() # Set up base of agent try: # Looks to see if a saved model is available and loads it
        network=[
            dict(type='flatten'),
            dict(type="dense", size=32),
        ],
        memory=10000,
    )
elif args.agent == "vpg":
    agent = VPGAgent(
        states={
            "type": 'float',
            "shape": G.graph.shape
        },
        actions={
            "user": dict(type="int", num_values=G.graph.shape[0]),
            "item": dict(type="int", num_values=G.graph.shape[1])
        },
        network=[
            dict(type='flatten'),
            dict(type="dense", size=32),
        ],
        memory=10000,
    )
elif args.agent == "trpo":
    agent = TRPOAgent(
        states={
            "type": 'float',
            "shape": G.graph.shape
        },
        actions={
            "user": dict(type="int", num_values=G.graph.shape[0]),
            str(i): dict(type="int", num_actions=infrastructure.servers)
            for i in range(int(infrastructure.servers * .1))
        },
        network=[
            dict(type='flatten'),
            dict(type="dense", size=32),
            dict(type="dense", size=32),
            dict(type="dense", size=32)
        ],
    )
elif args.monkey == "vpg":
    monkey = VPGAgent(
        states={
            "type": 'float',
            "shape": infrastructure.graph.shape
        },
        actions={
            str(i): dict(type="int", num_actions=infrastructure.servers)
            for i in range(int(infrastructure.servers * .1))
        },
        network=[
            dict(type='flatten'),
            dict(type="dense", size=32),
            dict(type="dense", size=32),
            dict(type="dense", size=32)
        ],
    )

if args.manager == "ppo":
    manager = PPOAgent(
        states={
            "type": 'float',
            "shape": infrastructure.graph.shape
        },
        actions={
            str(i): dict(type="int", num_actions=infrastructure.servers)
            for i in range(infrastructure.clients)
        },
        network=[
            dict(type='flatten'),
            dict(type="dense", size=32),