def test_dqn_agent(self):
    config = {
        'seed': 10,
        'batch_size': 16,
        'state_shape': (2,),
        'actions': 2,
        'action_shape': (),
        'update_rate': 1,
        'update_repeat': 4,
        'min_replay_size': 50,
        'memory_capacity': 50,
        'exploration': 'epsilon_decay',
        'exploration_param': {
            'epsilon': 1,
            'epsilon_final': 0,
            'epsilon_states': 50
        },
        'target_network_update_rate': 1.0,
        'use_target_network': True,
        'alpha': 0.0005,
        'gamma': 0.99,
        'tau': 1.0
    }
    tf.reset_default_graph()
    tf.set_random_seed(10)

    config = create_config(config)
    network_builder = NeuralNetwork.layered_network(layers=[
        {'type': 'dense', 'num_outputs': 16},
        {'type': 'linear', 'num_outputs': 2}
    ])
    agent = DQNAgent(config=config, network_builder=network_builder)

    state = (1, 0)
    rewards = [0.0] * 100
    for n in range(10000):
        action = agent.get_action(state=state)
        if action == 0:
            state = (1, 0)
            reward = 0.0
        else:
            state = (0, 1)
            reward = 1.0
        terminal = False
        agent.add_observation(state=state, action=action, reward=reward, terminal=terminal)
        rewards[n % 100] = reward
        if sum(rewards) == 100.0:
            return
    assert sum(rewards) == 100.0
def main():
    env = OpenAIGym("P3DX-v0")

    agent = DQNAgent(
        states=dict(type='float', shape=(80, 80, 4)),
        actions=dict(type='int', num_actions=7),
        network=[
            dict(type='conv2d', size=16, window=[8, 8], stride=4, activation='relu'),
            dict(type='conv2d', size=32, window=[4, 4], stride=2, activation='relu'),
            dict(type='flatten'),
            dict(type='dense', size=256)
        ],
        actions_exploration=dict(type='epsilon_decay', initial_epsilon=1.0, final_epsilon=0.1, timesteps=1000),
        memory=dict(type='replay', capacity=1000, include_next_states=True),
        update_mode=dict(unit='timesteps', batch_size=16, frequency=4),
        discount=0.99,
        entropy_regularization=None,
        double_q_model=True,
        optimizer=dict(type='adam', learning_rate=1e-4)
    )

    try:
        agent.restore_model(directory="modelo/", file="data-129235")
        print("Found data!")
    except Exception as e:
        print(e)
        print("Can't load data")

    print("Starting execution")
    state = env.reset()
    agent.reset()
    try:
        while True:
            # Get action - no exploration and no observing
            action = agent.act(state, deterministic=True, independent=True)
            print(action)
            # Execute action in the environment
            state, terminal_state, reward = env.execute(action)
            if terminal_state:
                raise KeyboardInterrupt
    except KeyboardInterrupt:
        print("Terminal state", terminal_state)
        state = env.reset()
        agent.reset()
def test_lstm(self):
    passed = 0

    for _ in range(5):
        environment = MinimalTest(definition=False)
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            memory_capacity=800,
            first_update=80,
            target_update_frequency=20,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([
                dict(type='dense', size=32),
                dict(type='dense', size=32),
                dict(type='lstm')
            ])
        )
        agent = DQNAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x / l >= 0.9
                for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
            )

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('DQN agent (LSTM): ' + str(runner.episode))
        if runner.episode < 1000:
            passed += 1

    print('DQN agent (LSTM) passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_multi(self):
    passed = 0

    def network_builder(inputs, **kwargs):
        layer = layers['dense']
        state0 = layer(x=layer(x=inputs['state0'], size=32), size=32)
        state1 = layer(x=layer(x=inputs['state1'], size=32), size=32)
        return state0 * state1

    for _ in range(5):
        environment = MinimalTest(definition=[False, (False, 2)])
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            memory_capacity=800,
            first_update=80,
            target_update_frequency=20,
            states=environment.states,
            actions=environment.actions,
            network=network_builder
        )
        agent = DQNAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 15 or not all(
                x / l >= 0.9
                for x, l in zip(r.episode_rewards[-15:], r.episode_lengths[-15:])
            )

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('DQN agent (multi-state/action): ' + str(runner.episode))
        if runner.episode < 1000:
            passed += 1

    print('DQN agent (multi-state/action) passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_replay(self):
    environment = MinimalTest(definition=[(False, (1, 2))])
    config = Configuration(
        batch_size=8,
        learning_rate=0.001,
        memory_capacity=50,
        memory=dict(type='replay', random_sampling=True),
        first_update=20,
        target_update_frequency=10,
        states=environment.states,
        actions=environment.actions,
        network=layered_network_builder([
            dict(type='dense', size=32),
            dict(type='dense', size=32)
        ])
    )
    agent = DQNAgent(config=config)
    runner = Runner(agent=agent, environment=environment)

    def episode_finished(r):
        # reward_threshold is assumed to be defined at module level.
        return r.episode < 100 or not all(
            x / l >= reward_threshold
            for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
        )

    runner.run(episodes=1000, episode_finished=episode_finished)
    print('Replay memory DQN: ' + str(runner.episode))
def test_discrete(self):
    passed = 0

    for _ in range(5):
        environment = MinimalTest(continuous=False)
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            memory_capacity=800,
            first_update=80,
            repeat_update=4,
            target_update_frequency=20,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([dict(type='dense', size=32)])
        )
        agent = DQNAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=5000, episode_finished=episode_finished)
        print('DQN Agent: ' + str(runner.episode))
        if runner.episode < 5000:
            passed += 1
            print('passed')
        else:
            print('failed')

    print('DQN Agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def initialize(self, env):
    from gym import spaces
    from tensorforce.agents import DQNAgent

    if self.algorithm == "dqn":
        if type(env.action_space) == spaces.Tuple:
            actions = {
                str(num): {
                    'type': 'int',  # type must be the string 'int', not the builtin
                    'num_actions': space.n
                }
                for num, space in enumerate(env.action_space.spaces)
            }
        else:
            actions = dict(type='int', num_actions=env.action_space.n)
        return DQNAgent(
            states=dict(type='float', shape=env.observation_space.shape),
            actions=actions,
            network=[
                dict(type='dense', size=128),
                dict(type='dense', size=128)
            ],
            batching_capacity=100
        )
    return None
def main():
    gym_id = 'CartPole-v0'
    max_episodes = 10000
    max_timesteps = 1000

    env = OpenAIGym(gym_id)

    network_spec = [
        dict(type='dense', size=32, activation='tanh'),
        dict(type='dense', size=32, activation='tanh')
    ]

    agent = DQNAgent(
        states_spec=env.states,
        actions_spec=env.actions,
        network_spec=network_spec,
        batch_size=64
    )

    runner = Runner(agent, env)

    report_episodes = 10

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            logging.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep))
            logging.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logging.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100))
        return True

    print("Starting {agent} for Environment '{env}'".format(agent=agent, env=env))
    runner.run(max_episodes, max_timesteps, episode_finished=episode_finished)
    print("Learning finished. Total episodes: {ep}".format(ep=runner.episode))
def test_introduction_dqnagent(self):
    from tensorforce import Configuration
    from tensorforce.agents import DQNAgent
    from tensorforce.core.networks import layered_network_builder

    # Define a network builder from an ordered list of layers
    layers = [
        dict(type='dense', size=32),
        dict(type='dense', size=32)
    ]
    network = layered_network_builder(layers_config=layers)

    # Define a state
    states = dict(shape=(10,), type='float')

    # Define an action (models internally assert whether
    # they support continuous and/or discrete control)
    actions = dict(continuous=False, num_actions=5)

    # The agent is configured with a single configuration object
    agent_config = Configuration(
        batch_size=8,
        learning_rate=0.001,
        memory_capacity=800,
        first_update=80,
        repeat_update=4,
        target_update_frequency=20,
        states=states,
        actions=actions,
        network=network
    )
    agent = DQNAgent(config=agent_config)
def load_agent(config, env, network_spec):
    if isfile(join(config.agent_dir, config.agent_name + ".json")):
        return TensorforceAgent.load(config.agent_dir, config.agent_name, "checkpoint", env)
    return DQNAgent(
        states=env.states(),
        actions=env.actions(),
        network=network_spec,
        **config.agent_specs
    )
def get_dqn_agent():
    return DQNAgent(
        states=dict(type='float', shape=(5,)),
        actions=dict(type='int', num_actions=2),
        network=[
            dict(type='dense', size=20),
            dict(type='dense', size=20)
        ],
        batched_observe=False,
    )
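# Hedged usage sketch (not from the original source): one act/observe step
# for the factory above, following the interface the Runner-era examples in
# this collection use. The state values are placeholders matching shape=(5,).
agent = get_dqn_agent()
action = agent.act(states=[0.0, 0.0, 0.0, 0.0, 0.0])  # returns 0 or 1
agent.observe(reward=0.0, terminal=False)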
def get_agent(agentType):
    if agentType == "dqn":
        agent = DQNAgent(
            states={
                "type": 'float',
                "shape": (int(args.population), 1, int(args.resources)),
            },
            actions={
                "type": 'int',
                "shape": (int(args.resources),),
                "num_values": 3
            },
            memory=1000,
            network="auto",
        )
    elif agentType == "vpg":
        agent = VPGAgent(
            states={
                "type": 'float',
                "shape": (int(args.population), 1, int(args.resources)),
            },
            actions={
                "type": 'int',
                "shape": (int(args.resources),),
                "num_values": 3
            },
            network="auto",
            memory=1000,
        )
    elif agentType == "trpo":
        agent = TRPOAgent(
            states={
                "type": 'float',
                "shape": (int(args.population), 1, int(args.resources)),
            },
            actions={
                "type": 'int',
                "shape": (int(args.resources),),
                "num_values": 3
            },
            network="auto",
            memory=1000,
        )
    return agent
def overwrite_agent(env, network_spec, config):
    onlyfiles_agent = [
        f for f in listdir(config.agent_dir)
        if isfile(join(config.agent_dir, f)) and f.startswith(config.agent_name)
    ]
    for f in onlyfiles_agent:
        remove(join(config.agent_dir, f))
    return DQNAgent(
        states=env.states(),
        actions=env.actions(),
        network=network_spec,
        **config.agent_specs
    )
def __init__(self, state_size, env=None, is_eval=False):
    self.state_size = state_size
    self.action_size = 3
    self._memory_size = 1000
    self._memory = deque(maxlen=1000)
    self.is_eval = is_eval
    self.env = env

    DQNAgent.__init__(
        self,
        states=dict(type='float', shape=self.state_size.shape),
        actions=dict(type='int', num_actions=self.action_size),
        network=env.get_network(),
        discount=env.hyperparameters['gamma'],
        batching_capacity=10000,
        double_q_model=True,
        actions_exploration=env.exploration
    )
    self._load_model()
def test_trpo_agent(self):
    config = {
        'batch_size': 8,
        'max_episode_length': 4,
        'continuous': False,
        'state_shape': (2,),
        'actions': 2,
        'action_shape': ()
    }
    tf.reset_default_graph()

    config = create_config(config)
    network_builder = NeuralNetwork.layered_network(layers=[
        {'type': 'dense', 'num_outputs': 32},
        {'type': 'linear', 'num_outputs': 2}
    ])
    agent = DQNAgent(config=config, network_builder=network_builder)

    state = (1, 0)
    rewards = [0.0] * 100
    for n in range(1000):
        action = agent.get_action(state=state)
        if action == 0:
            state = (1, 0)
            reward = 0.0
            terminal = False
        else:
            state = (0, 1)
            reward = 1.0
            terminal = True
        agent.add_observation(state=state, action=action, reward=reward, terminal=terminal)
        rewards[n % 100] = reward
        if sum(rewards) == 100.0:
            return
def agent(self):
    nb_actions = self.env.action_space.n
    obs_dim = len(self.env.observation_space.spaces)
    obs_dim_2 = self.env.observation_space.spaces[0].shape
    return DQNAgent(
        states=dict(type='float32', shape=(obs_dim, obs_dim_2)),
        actions=dict(type='int', num_actions=nb_actions),
        network=[
            dict(type=Permute, dims=(0, 2, 3, 1)),
            dict(type='conv2d', size=5, window=(obs_dim, 1), stride=(obs_dim, 1),
                 padding='SAME', l2_regularization=1e-4),
            dict(type=BatchNormalization),
            dict(type=Permute, dims=(0, 3, 2, 1)),
            dict(type='conv2d', size=5, window=(5, 1), stride=(5, 1),
                 padding='SAME', l2_regularization=1e-4),
            dict(type=BatchNormalization),
            dict(type=Permute, dims=(0, 3, 2, 1)),
            dict(type='conv2d', size=5, window=(5, 1), stride=(5, 1),
                 padding='SAME', l2_regularization=1e-4),
            dict(type=BatchNormalization),
            dict(type=Permute, dims=(0, 3, 2, 1)),
            dict(type='conv2d', size=5, window=(5, 1), stride=(5, 1),
                 padding='SAME', l2_regularization=1e-4),
            dict(type=BatchNormalization),
            dict(type='flatten'),
            dict(type='dense', size=nb_actions, activation='softmax')
        ],
        states_preprocessing=[dict(type=Conv2DPreprocessor)]
    )
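# Permute, BatchNormalization and Conv2DPreprocessor above are project-specific
# classes not included in this collection (a BatchNormalization Layer of the same
# shape appears in the blog-post test further below). As a hedged illustration
# only - not the original code - a Permute layer could follow the same
# Layer-subclass pattern, with tf_apply transforming the input tensor:
import tensorflow as tf
from tensorforce.core.networks import Layer

class Permute(Layer):

    def __init__(self, dims, scope='permute', summary_labels=None):
        super(Permute, self).__init__(scope=scope, summary_labels=summary_labels)
        self.dims = dims  # e.g. (0, 2, 3, 1) moves the channel axis last

    def tf_apply(self, x, update):
        # Reorder the tensor axes according to the configured permutation.
        return tf.transpose(x, perm=self.dims)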
def test_multi(self):
    """
    This is relatively unstable and highly depends on initialisation -
    either passes quickly or fails no matter what.
    """
    passed = 0

    def network_builder(inputs, **kwargs):
        layer = layers['dense']
        state0 = layer(x=layer(x=inputs['state0'], size=32, scope='state0-1'), size=32, scope='state0-2')
        state1 = layer(x=layer(x=inputs['state1'], size=32, scope='state1-1'), size=32, scope='state1-2')
        return state0 * state1

    for _ in range(5):
        environment = MinimalTest(definition=[False, (False, 2)])
        config = Configuration(
            batch_size=8,
            learning_rate=0.0001,
            memory_capacity=800,
            first_update=80,
            target_update_frequency=20,
            repeat_update=4,
            memory=dict(type='prioritized_replay'),
            states=environment.states,
            actions=environment.actions,
            network=network_builder
        )
        agent = DQNAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 15 or not all(
                x / l >= reward_threshold
                for x, l in zip(r.episode_rewards[-15:], r.episode_lengths[-15:])
            )

        runner.run(episodes=2000, episode_finished=episode_finished)
        print('DQN agent (multi-state/action): ' + str(runner.episode))
        if runner.episode < 2000:
            passed += 1

    print('DQN agent (multi-state/action) passed = {}'.format(passed))
    self.assertTrue(passed >= 3)
def create_agent(memory, double_model, environment):
    # create the agent
    agent = DQNAgent(
        states=environment.states,
        actions=environment.actions,
        network=network_spec,
        double_q_model=double_model,
        memory=memory,
        update_mode=None,
        optimizer=dict(type='adam', learning_rate=1e-3),
        states_preprocessing=[
            dict(type='image_resize', width=128, height=128),
            dict(type='grayscale'),
            dict(type='divide', scale=255)
        ],
        target_sync_frequency=1000,
        # Comment in to test exploration types
        actions_exploration=dict(
            type='epsilon_decay',
            initial_epsilon=1.0,
            final_epsilon=0.05,
            timesteps=250000
        )
    )
    return agent
def create_agent(memory, double_model, environment):
    # create the agent
    agent = DQNAgent(
        states=environment.states,
        actions=environment.actions,
        network=network_spec,
        double_q_model=double_model,
        memory=memory,
        update_mode=None,
        optimizer=dict(type='adam', learning_rate=1e-3),
        states_preprocessing=[
            dict(type='running_standardize'),
            dict(type='sequence')
        ],
        target_sync_frequency=1000,
        # Comment in to test exploration types
        actions_exploration=dict(
            type='epsilon_decay',
            initial_epsilon=1.0,
            final_epsilon=0.1,
            timesteps=3500000
        )
    )
    return agent
def test_prioritized_replay(self):
    environment = MinimalTest(definition=[(False, (1, 2))])
    config = Configuration(
        batch_size=8,
        learning_rate=0.001,
        memory_capacity=50,
        memory='prioritized_replay',
        first_update=20,
        target_update_frequency=10,
        states=environment.states,
        actions=environment.actions,
        network=layered_network_builder([
            dict(type='dense', size=32),
            dict(type='dense', size=32)
        ])
    )
    agent = DQNAgent(config=config)
    runner = Runner(agent=agent, environment=environment)

    def episode_finished(r):
        return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

    runner.run(episodes=1000, episode_finished=episode_finished)
    print('Prioritized replay memory DQN: ' + str(runner.episode))
def __init__(self, states, actions):
    self.q_agent = DQNAgent(
        states,
        actions,
        network=[
            dict(type='conv2d', size=3),
            dict(type='flatten'),
            dict(type='dense', size=64),
            dict(type='dense', size=64)
        ],
        update_mode=dict(unit='timesteps', batch_size=8, frequency=8),
        memory=dict(type='replay', include_next_states=True, capacity=200),
        optimizer=dict(type='adam', learning_rate=1e-2),
        states_preprocessing=[
            dict(type='normalize'),
            # dict(type='running_standardize'),
            # dict(type='sequence')
        ],
        target_sync_frequency=10,
        actions_exploration=dict(
            type='epsilon_decay',
            initial_epsilon=1.0,
            final_epsilon=0.1,
            timesteps=10
        )
    )
# ---------------------------------------------------------------------
from tensorforce.agents import DQNAgent

# Network is an ordered list of layers.
network_spec = [
    dict(type='dense', size=32, activation='tanh'),
    dict(type='dense', size=32, activation='tanh')
]

# Define a state.
states = dict(shape=(10,), type='float')
# states = dict(
#     image=dict(shape=(64, 64, 3), type='float'),
#     caption=dict(shape=(20,), type='int')
# )

# Define an action.
actions = dict(type='int', num_actions=5)

# The agent is configured with a single configuration object.
config = dict(
    memory=dict(type='replay', capacity=1000),
    batch_size=8,
    first_update=100,
    target_sync_frequency=10
)

agent = DQNAgent(
    states_spec=states,
    actions_spec=actions,
    network_spec=network_spec,
    **config
)
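# Hedged usage sketch (not part of the original snippet): once configured, the
# agent is driven with the same act/observe loop that the manual Runner example
# later in this collection uses. `environment` is a hypothetical stand-in for
# any object exposing reset()/execute().
state = environment.reset()
agent.reset()
while True:
    action = agent.act(states=state)
    state, terminal, reward = environment.execute(actions=action)
    agent.observe(terminal=terminal, reward=reward)
    if terminal:
        break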
    baseline=dict(
        type="cnn",
        conv_sizes=[32, 32],
        dense_sizes=[32]
    ),
    baseline_optimizer=dict(
        type="multi_step",
        optimizer=dict(type="adam", learning_rate=1e-3),
        num_steps=5
    )
)

agent = DQNAgent(**dqn)
# agent = PPOAgent(**ppo)

statistics = {}
actions = [0 for x in range(16)]
s = time.time()
skip_steps = 8

g.flip_player()
for i in range(100000):
    state = g.reset()
    while not g.is_terminal():
        state = cv2.resize(state, (80, 80))

        # Perform Action
        action = agent.act(state)
        actions[action] += 1
        _, r, t, _ = g.step(action)
def test_blogpost_introduction_runner(self):
    from tensorforce.environments.minimal_test import MinimalTest
    from tensorforce.agents import DQNAgent
    from tensorforce.execution import Runner

    environment = MinimalTest(specification=[('int', ())])

    network_spec = [
        dict(type='dense', size=32)
    ]

    agent = DQNAgent(
        states_spec=environment.states,
        actions_spec=environment.actions,
        network_spec=network_spec,
        memory=dict(type='replay', capacity=1000),
        batch_size=8,
        first_update=100,
        target_sync_frequency=50
    )
    runner = Runner(agent=agent, environment=environment)

    def episode_finished(runner):
        if runner.episode % 100 == 0:
            print(sum(runner.episode_rewards[-100:]) / 100)
        return runner.episode < 100 \
            or not all(reward >= 1.0 for reward in runner.episode_rewards[-100:])

    # runner.run(episodes=1000, episode_finished=episode_finished)
    runner.run(episodes=10, episode_finished=episode_finished)  # Only 10 episodes for this test

    ### Code block: next
    agent = DQNAgent(
        states_spec=environment.states,
        actions_spec=environment.actions,
        network_spec=network_spec,
        memory=dict(type='replay', capacity=1000),
        batch_size=8,
        first_update=100,
        target_sync_frequency=50
    )

    # max_episodes = 1000
    max_episodes = 10  # Only 10 episodes for this test
    max_timesteps = 2000

    episode = 0
    episode_rewards = list()

    while True:
        state = environment.reset()
        agent.reset()

        timestep = 0
        episode_reward = 0
        while True:
            action = agent.act(states=state)
            state, terminal, reward = environment.execute(actions=action)
            agent.observe(terminal=terminal, reward=reward)

            timestep += 1
            episode_reward += reward

            if terminal or timestep == max_timesteps:
                break

        episode += 1
        episode_rewards.append(episode_reward)

        if all(reward >= 1.0 for reward in episode_rewards[-100:]) or episode == max_episodes:
            break

    agent.close()
    environment.close()
def test_blogpost_introduction(self):
    """
    Test of introduction blog post examples.
    """
    import tensorflow as tf
    import numpy as np

    ### DQN agent example
    from tensorforce.agents import DQNAgent

    # Network is an ordered list of layers
    network_spec = [
        dict(type='dense', size=32),
        dict(type='dense', size=32)
    ]

    # Define a state
    states = dict(shape=(10,), type='float')

    # Define an action
    actions = dict(type='int', num_actions=5)

    agent = DQNAgent(
        states_spec=states,
        actions_spec=actions,
        network_spec=network_spec,
        memory=dict(type='replay', capacity=1000),
        batch_size=8,
        first_update=100,
        target_sync_frequency=10
    )
    agent.close()

    ### Code block: multiple states
    states = dict(
        image=dict(shape=(64, 64, 3), type='float'),
        caption=dict(shape=(20,), type='int')
    )

    # DQN does not support multiple states. Omit test for now.
    # agent = DQNAgent(config=config)

    ### Code block: DQN observer function
    def observe(self, reward, terminal):
        super(DQNAgent, self).observe(reward, terminal)
        if self.timestep >= self.first_update \
                and self.timestep % self.target_update_frequency == 0:
            self.model.update_target()

    ### Code block: Network config JSON
    network_json = """
    [
        {
            "type": "conv2d",
            "size": 32,
            "window": 8,
            "stride": 4
        },
        {
            "type": "conv2d",
            "size": 64,
            "window": 4,
            "stride": 2
        },
        {
            "type": "flatten"
        },
        {
            "type": "dense",
            "size": 512
        }
    ]
    """

    ### Test json
    import json
    network_spec = json.loads(network_json)

    ### Code block: Modified dense layer
    modified_dense = """
    [
        {
            "type": "dense",
            "size": 64,
            "bias": false,
            "activation": "selu",
            "l2_regularization": 0.001
        }
    ]
    """

    ### Test json
    network_spec = json.loads(modified_dense)

    ### Code block: Own layer type
    from tensorforce.core.networks import Layer

    class BatchNormalization(Layer):

        def __init__(self, variance_epsilon=1e-6, scope='batchnorm', summary_labels=None):
            super(BatchNormalization, self).__init__(scope=scope, summary_labels=summary_labels)
            self.variance_epsilon = variance_epsilon

        def tf_apply(self, x, update):
            mean, variance = tf.nn.moments(x, axes=tuple(range(x.shape.ndims - 1)))
            return tf.nn.batch_normalization(
                x=x,
                mean=mean,
                variance=variance,
                offset=None,
                scale=None,
                variance_epsilon=self.variance_epsilon
            )

    ### Test own layer
    states = dict(shape=(10,), type='float')
    network_spec = [
        {'type': 'dense', 'size': 32},
        {'type': BatchNormalization, 'variance_epsilon': 1e-9}
    ]

    agent = DQNAgent(
        states_spec=states,
        actions_spec=actions,
        network_spec=network_spec,
        memory=dict(type='replay', capacity=1000),
        batch_size=8
    )
    agent.close()

    ### Code block: Own network builder
    from tensorforce.core.networks import Network

    class CustomNetwork(Network):

        def tf_apply(self, x, internals, update, return_internals=False):
            image = x['image']  # 64x64x3-dim, float
            caption = x['caption']  # 20-dim, int

            initializer = tf.random_normal_initializer(mean=0.0, stddev=0.01, dtype=tf.float32)

            # CNN
            weights = tf.get_variable(name='W1', shape=(3, 3, 3, 16), initializer=initializer)
            image = tf.nn.conv2d(image, filter=weights, strides=(1, 1, 1, 1), padding='SAME')
            image = tf.nn.relu(image)
            image = tf.nn.max_pool(image, ksize=(1, 2, 2, 1), strides=(1, 2, 2, 1), padding='SAME')

            weights = tf.get_variable(name='W2', shape=(3, 3, 16, 32), initializer=initializer)
            image = tf.nn.conv2d(image, filter=weights, strides=(1, 1, 1, 1), padding='SAME')
            image = tf.nn.relu(image)
            image = tf.nn.max_pool(image, ksize=(1, 2, 2, 1), strides=(1, 2, 2, 1), padding='SAME')

            image = tf.reshape(image, shape=(-1, 16 * 16, 32))
            image = tf.reduce_mean(image, axis=1)

            # LSTM
            weights = tf.get_variable(name='W3', shape=(30, 32), initializer=initializer)
            caption = tf.nn.embedding_lookup(params=weights, ids=caption)
            lstm = tf.contrib.rnn.LSTMCell(num_units=32)
            caption, _ = tf.nn.dynamic_rnn(cell=lstm, inputs=caption, dtype=tf.float32)
            caption = tf.reduce_mean(caption, axis=1)

            # Combination
            if return_internals:
                return tf.multiply(image, caption), list()
            else:
                return tf.multiply(image, caption)

    ### Test own network builder
    states = dict(
        image=dict(shape=(64, 64, 3), type='float'),
        caption=dict(shape=(20,), type='int')
    )

    agent = DQNAgent(
        states_spec=states,
        actions_spec=actions,
        network_spec=CustomNetwork,
        memory=dict(type='replay', capacity=1000),
        batch_size=8
    )
    agent.close()

    ### Code block: LSTM function
    from tensorforce.core.networks import Layer

    class Lstm(Layer):

        def __init__(self, size, scope='lstm', summary_labels=()):
            self.size = size
            super(Lstm, self).__init__(num_internals=1, scope=scope, summary_labels=summary_labels)

        def tf_apply(self, x, update, state):
            state = tf.contrib.rnn.LSTMStateTuple(c=state[:, 0, :], h=state[:, 1, :])
            self.lstm_cell = tf.contrib.rnn.LSTMCell(num_units=self.size)

            x, state = self.lstm_cell(inputs=x, state=state)

            internal_output = tf.stack(values=(state.c, state.h), axis=1)
            return x, (internal_output,)

        def internal_inputs(self):
            return super(Lstm, self).internal_inputs() + [
                tf.placeholder(dtype=tf.float32, shape=(None, 2, self.size))
            ]

        def internal_inits(self):
            return super(Lstm, self).internal_inits() + [
                np.zeros(shape=(2, self.size))
            ]

    ### Test LSTM
    states = dict(shape=(10,), type='float')
    network_spec = [
        {'type': 'flatten'},
        {'type': Lstm, 'size': 10}
    ]

    agent = DQNAgent(
        states_spec=states,
        actions_spec=actions,
        network_spec=network_spec,
        memory=dict(type='replay', capacity=1000),
        batch_size=8
    )
    agent.close()

    ### Preprocessing configuration
    states = dict(shape=(84, 84, 3), type='float')

    preprocessing = [
        dict(type='image_resize', width=84, height=84),
        dict(type='grayscale'),
        dict(type='normalize'),
        dict(type='sequence', length=4)
    ]

    ### Test preprocessing configuration
    agent = DQNAgent(
        states_spec=states,
        actions_spec=actions,
        network_spec=network_spec,
        memory=dict(type='replay', capacity=1000),
        batch_size=8,
        first_update=100,
        target_sync_frequency=50,
        preprocessing=preprocessing
    )
    agent.close()

    ### Code block: Continuous action exploration
    exploration = dict(
        type='ornstein_uhlenbeck',
        sigma=0.1,
        mu=0,
        theta=0.1
    )

    ### Test continuous action exploration
    agent = DQNAgent(
        states_spec=states,
        actions_spec=actions,
        network_spec=network_spec,
        memory=dict(type='replay', capacity=1000),
        batch_size=8,
        exploration=exploration
    )
    agent.close()

    ### Code block: Discrete action exploration
    exploration = dict(
        type='epsilon_decay',
        initial_epsilon=1.0,
        final_epsilon=0.01,
        timesteps=1e6
    )

    ### Test discrete action exploration
    agent = DQNAgent(
        states_spec=states,
        actions_spec=actions,
        network_spec=network_spec,
        memory=dict(type='replay', capacity=1000),
        batch_size=8,
        exploration=exploration
    )
    agent.close()
def get_agent(game, agentType):
    count = 1
    base_path = '.'
    checkpointPath = base_path + "/games/agents/" + game + "/" + agentType + "/"

    if agentType == "vpg":
        agent = VPGAgent(
            states=config[game]["states"],
            actions=config[game]["actions"],
            memory=1000,
            network="auto",
        )
    elif agentType == "ppo":
        agent = PPOAgent(
            states=config[game]["states"],
            actions=config[game]["actions"],
            memory=1000,
            network="auto",
        )
    elif agentType == "dqn":
        agent = DQNAgent(
            states=config[game]["states"],
            actions=config[game]["actions"],
            memory=1000,
            network="auto",
        )

    if game == "3pd":
        try:
            agent.restore(directory=checkpointPath, filename=None)
            print("restoration successful")
        except Exception as e:
            agent.initialize()
            for x in tqdm(range(1000001)):
                testState = np.full(config[game]["states"]["shape"], None)
                for i in range(10):
                    moveA = agent.act(testState)
                    moveB = agent.act(testState)
                    moveC = agent.act(testState)
                    rewards = payoffs(game, [moveA, moveB, moveC])
                    if i < 9:
                        agent.observe(reward=rewards[0], terminal=False)
                        agent.observe(reward=rewards[1], terminal=False)
                        agent.observe(reward=rewards[2], terminal=False)
                    else:
                        agent.observe(reward=rewards[0], terminal=False)
                        agent.observe(reward=rewards[1], terminal=False)
                        agent.observe(reward=rewards[2], terminal=True)
                    testState[i] = [[moveA], [moveB], [moveC]]
                if x % 1000 == 0:
                    # checkpointPath = "../games/agents/" + game + "/" + agentType + "/"
                    agent.save(directory=checkpointPath, filename=None)
                    # print("saving successful")
    else:
        try:
            agent.restore(directory=checkpointPath, filename=None)
            print("restoration successful")
        except Exception as e:
            # try:
            #     checkpointPath = base_path + "/agents/" + game + "/" + agentType + "/"
            #     agent.restore(directory=checkpointPath, filename=None)
            #     print("restoration successful after second attempt")
            # except Exception as e:
            #     a = subprocess.check_output("ls games/", shell=True)
            #     print(a)
            #     print(os.getcwd(), "vs", subprocess.check_output("pwd", shell=True))
            #     checkpointPath = "./games/agents/" + game + "/" + agentType + "/"
            #     print(checkpointPath)
            #     agent.restore(directory=checkpointPath, filename=None)
            #     print("restoration successful after third attempt")
            agent.initialize()
            for x in tqdm(range(count)):
                testState = np.full(config[game]["states"]["shape"], 0)
                for i in range(10):
                    moveA = agent.act(testState)
                    moveB = agent.act(testState)
                    rewards = payoffs(game, [moveA, moveB])
                    if i < 9:  # the final round (i == 9) is the terminal observation
                        agent.observe(reward=rewards[0], terminal=False)
                        agent.observe(reward=rewards[1], terminal=False)
                    else:
                        agent.observe(reward=rewards[0], terminal=False)
                        agent.observe(reward=rewards[1], terminal=True)
                    testState[i] = [[moveA], [moveB]]
            checkpointPath = "./games/agents/" + game + "/" + agentType + "/"
            agent.save(directory=checkpointPath, filename=None)
            print("saving successful")

    return agent
def initialize(self, env):
    from gym import spaces
    from tensorforce.agents import PPOAgent
    from tensorforce.agents import DQNAgent

    if self.algorithm == "ppo":
        if type(env.action_space) == spaces.Tuple:
            actions = {
                str(num): {
                    'type': 'int',  # type must be the string 'int', not the builtin
                    'num_actions': space.n
                }
                for num, space in enumerate(env.action_space.spaces)
            }
        else:
            actions = dict(type='int', num_actions=env.action_space.n)
        return PPOAgent(
            # states=dict(type='float', shape=env.observation_space.shape),
            states=dict(type='float', shape=[11, 11, 18]),
            actions=actions,
            # network=[
            #     dict(type='dense', size=64),
            #     dict(type='dense', size=64)
            # ],
            network=[
                dict(type='conv2d', size=32),
                dict(type='conv2d', size=32),
                dict(type='conv2d', size=32),
                dict(type='conv2d', size=32),
                dict(type='flatten')
            ],
            batching_capacity=1000,
            step_optimizer=dict(type='adam', learning_rate=1e-4)
        )
        # return PPOAgent(
        #     states=dict(type='float', shape=env.observation_space.shape),
        #     actions=actions,
        #     network=[
        #         dict(type='dense', size=64),
        #         dict(type='dense', size=64)
        #     ],
        #     batching_capacity=1000,
        #     step_optimizer=dict(type='adam', learning_rate=1e-4))
    elif self.algorithm == "dqn":
        if type(env.action_space) == spaces.Tuple:
            actions = {
                str(num): {
                    'type': 'int',
                    'num_actions': space.n
                }
                for num, space in enumerate(env.action_space.spaces)
            }
        else:
            actions = dict(type='int', num_actions=env.action_space.n)
        return DQNAgent(
            states=dict(type='float', shape=[11, 11, 18]),
            actions=actions,
            discount=0.9,
            double_q_model=False,
            network=[
                dict(type='conv2d', size=32),
                dict(type='conv2d', size=32),
                dict(type='conv2d', size=32),
                dict(type='conv2d', size=32),
                dict(type='flatten')
            ],
            batching_capacity=1000,
            optimizer=dict(type='adam', learning_rate=1e-4)
        )
    return None
def main(
    mode,  # 'train' or 'test'
    episode=2000,
    window_size=30,  # number of previous timesteps the agent "brain" looks back on
    init_invest=20000,
    model_path=None,
    addition_train=False,
    selected_learn='dqn',  # 'dqn' or 'ppo'
    selected_trading=[],
    selected_subject=[],
    ui_windows=None,  # the currently displayed UI object
):
    global gl_ui_window
    gl_ui_window = ui_windows

    set_model_path(model_path if model_path is not None else os.path.join(os.getcwd(), 'model'))
    if 'model' not in os.listdir(os.getcwd()):
        os.makedirs('model')

    # create environment for train and test
    DATA_PATH = '../daily_data'
    environment = create_gold_env(
        window_size=window_size,
        path=DATA_PATH,
        train=True if mode == 'train' else False,
        selected_trading=selected_trading,
        selected_subject=selected_subject,
        init_invest=init_invest
    )

    network_spec = create_network_spec()
    baseline_spec = create_baseline_spec()

    if selected_learn == 'ppo':
        agent = PPOAgent(
            discount=0.9999,
            states=environment.states,
            actions=environment.actions,
            network=network_spec,
            # Agent
            states_preprocessing=None,
            actions_exploration=None,
            reward_preprocessing=None,
            # MemoryModel
            update_mode=dict(
                unit='timesteps',  # 'episodes'
                batch_size=32,  # 10 episodes per update
                frequency=10  # every 10 episodes
            ),
            memory=dict(type='latest', include_next_states=False, capacity=50000),
            # DistributionModel
            distributions=None,
            entropy_regularization=0.0,  # None
            # PGModel
            baseline_mode='states',
            baseline=dict(type='custom', network=baseline_spec),
            baseline_optimizer=dict(
                type='multi_step',
                optimizer=dict(type='adam', learning_rate=1e-4),  # 3e-4
                num_steps=5
            ),
            gae_lambda=0,  # 0
            # PGLRModel
            likelihood_ratio_clipping=0.2,
            # PPOAgent
            step_optimizer=dict(type='adam', learning_rate=1e-4),  # 1e-4
            subsampling_fraction=0.2,  # 0.1
            optimization_steps=10,
            execution=dict(type='single', session_config=None, distributed_spec=None)
        )
    else:  # selected_learn == 'dqn' or anything else
        agent = DQNAgent(
            states=environment.states,
            actions=environment.actions,
            network=[
                dict(type='flatten'),
                dict(type='dense', size=32, activation='relu'),
                dict(type='dense', size=32, activation='relu'),
            ],
        )

    if mode == 'test' or addition_train == True:
        if len([elem for elem in os.listdir(LOAD_DIR) if 'trading_model' in elem]) >= 3:
            agent.restore_model(LOAD_DIR)
            print('loaded')
        elif mode == 'test':
            ui_windows.setInfo(msg="No trading model to load appears to exist.")
            return

    runner = Runner(agent=agent, environment=environment)

    if mode == 'train':
        kwargs = dict(
            episodes=episode,
            max_episode_timesteps=16000,
            episode_finished=episode_finished
        )
    else:  # mode == 'test'
        kwargs = dict(
            num_episodes=episode,
            deterministic=True,
            testing=True,
            episode_finished=print_simple_log
        )
    runner.run(**kwargs)

    # TODO: store per-episode portfolio results in TFTraderEnv and push the data to the UI each step.
    # setResult(????)

    msg = "{mode} finished. Total episodes: {ep}. \nAverage reward of last 100 episodes: {ar}.".format(
        mode="Training" if mode == 'train' else "Testing",
        ep=runner.episode,
        ar=np.mean(runner.episode_rewards[-100:])
    )
    print(msg)
    ui_windows.setInfo(msg=msg)
def main(max_timesteps, learning_rate):
    max_episodes = None
    # max_timesteps = 86400000000 * days

    network_spec = [
        # dict(type='flatten'),
        dict(type='dense', size=11, activation='tanh'),
        # dict(type='dense', size=20, activation='tanh'),
        # dict(type='dense', size=32, activation='tanh'),
    ]

    exploration = dict(type='epsilon_decay', timesteps=max_timesteps)

    summarizer = dict(
        directory="./models/" + str(datetime.now()).replace(' ', ''),
        steps=10000,
        seconds=None,
        labels=[
            # 'rewards',
            # 'actions',
            'inputs',
            'gradients',
            'configuration',
        ],
        meta_dict=dict(
            description='July 2: Trying 11 node hidden layer.',
            layers=str(network_spec),
            timesteps=max_timesteps,
            exploration=exploration,
        ),
    )

    agent = DQNAgent(
        states=env.states,
        actions=env.actions,
        network=network_spec,
        actions_exploration=exploration,
        optimizer=dict(type='adam', learning_rate=learning_rate)
        # summarizer=summarizer,
        # batch_size=64
    )

    runner = Runner(agent, env)

    report_episodes = 1

    global prev
    prev = 0

    def episode_finished(r):
        global prev
        if r.episode % report_episodes == 0:
            # print("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep - prev))
            # print("Episode reward: {}".format(r.episode_rewards[-1]))
            print(r.episode_rewards[-1])
            prev = r.timestep
            # print("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100))
        return True

    print("Starting {agent} for Environment '{env}'".format(agent=agent, env=env))
    runner.run(
        num_episodes=max_episodes,
        num_timesteps=max_timesteps,
        max_episode_timesteps=None,
        episode_finished=episode_finished
    )
    agent.save_model(directory='./results/DeepQ/' + str(datetime.now()).replace(' ', '') + '/model')
    runner.close()
    print("Learning finished. Total episodes: {ep}".format(ep=runner.episode))
"up": dict(type="float", min_value=0.0, max_value=1.0), "down": dict(type="float", min_value=0.0, max_value=1.0), "left": dict(type="float", min_value=0.0, max_value=1.0), "right": dict(type="float", min_value=0.0, max_value=1.0), }, network='auto', memory=25000, ) elif args.agent == "dqn": # Deep Q-Learning agent = DQNAgent( states={ "type": 'float', "shape": (1, 613) }, actions={ "up": dict(type="float", min_value=0.0, max_value=1.0), "down": dict(type="float", min_value=0.0, max_value=1.0), "left": dict(type="float", min_value=0.0, max_value=1.0), "right": dict(type="float", min_value=0.0, max_value=1.0), }, network='auto', memory=10000, ) elif args.agent == "vpg": # Vanilla Policy Gradient agent = VPGAgent( states={ "type": 'float', "shape": (1, 610) }, actions={ "up": dict(type="float", min_value=0.0, max_value=1.0),