class TensorforceAgent:
    def __init__(self, actions):
        preprocessing_config = [{"type": "grayscale"}]
        exploration_config = dict(
            type="epsilon_anneal",
            initial_epsilon=0.25,
            final_epsilon=0.01,
            timesteps=1000000
        )
        network_spec = [
            dict(type='conv2d', size=16, window=8, stride=4, activation='lrelu'),
            dict(type='conv2d', size=32, window=4, stride=2, activation='lrelu'),
            dict(type='flatten'),
            dict(type='dense', size=256, activation='lrelu')
        ]
        self.network_path = "network/"
        self.agent = PPOAgent(
            actions=dict(type='int', num_actions=len(actions)),
            states=dict(type='float', shape=(35, 150, 3)),
            network=network_spec,
            actions_exploration=exploration_config,
            states_preprocessing=preprocessing_config
        )

    def act(self, obs):
        # Cut out only the part needed
        partly = np.delete(obs, np.s_[96:], 0)
        partly = np.delete(partly, np.s_[0:26], 0)
        partly = np.delete(partly, np.s_[35:45], 0)
        partly = np.delete(partly, np.s_[38:53], 0)
        partly = np.delete(partly, np.s_[31:35], 0)
        partly = np.delete(partly, np.s_[10:16], 0)
        frame = np.delete(partly, np.s_[150:], 1)
        # scipy.misc.imsave('outfile.jpg', frame)
        return self.agent.act(frame)

    def load(self):
        import os
        if os.path.isdir(self.network_path):
            try:
                self.agent.restore_model(self.network_path)
            except:
                print("Failed to load model")

    def observe(self, terminal=False, reward=0):
        return self.agent.observe(terminal, reward)

    def save_model(self):
        import os
        if not os.path.isdir(self.network_path):
            os.makedirs(self.network_path)
        self.agent.save_model(self.network_path)
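# A minimal usage sketch (not part of the original source) showing how the wrapper above
# could be driven. `env` and `ACTIONS` are hypothetical stand-ins for the game environment
# and its discrete action list; only methods defined on TensorforceAgent are used.
agent = TensorforceAgent(ACTIONS)
agent.load()  # restore weights from "network/" if a saved model exists
for episode in range(10):
    obs, done = env.reset(), False
    while not done:
        action = agent.act(obs)  # crops the frame internally before acting
        obs, reward, done, _ = env.step(action)
        agent.observe(terminal=done, reward=reward)
agent.save_model()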
def main():
    env = gym.make('CartPole-v0')
    # (4,)
    print(env.observation_space.shape)
    # [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
    print(env.observation_space.high)
    # [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
    print(env.observation_space.low)
    # 2
    print(env.action_space.n)

    agent = PPOAgent(
        states=dict(type='float', shape=env.observation_space.shape),
        network=[
            dict(type='dense', size=32, activation='relu'),
            dict(type='dense', size=32, activation='relu'),
        ],
        actions=dict(type='int', num_actions=env.action_space.n),
        step_optimizer=dict(type='adam', learning_rate=1e-4)
    )

    model_dir = 'models/cartpole'
    if os.path.exists(f'{model_dir}/checkpoint'):
        agent.restore_model(directory=model_dir)

    try:
        for ep in range(2000):
            observation = env.reset()
            done = False
            ep_reward = 0
            while not done:
                # env.render()
                states = observation / 4
                action = agent.act(states=states)
                observation, reward, done, info = env.step(action)
                agent.observe(reward=reward, terminal=done)
                ep_reward += reward
                if done:
                    print(f'ep = {ep}, ep_reward = {ep_reward}')
    except Exception as e:
        raise e
    finally:
        agent.save_model(directory=f'{model_dir}/agent')
class ForwardActor:
    def __init__(self):
        actions = {}
        for i in range(12):
            actions[str(i)] = {'type': 'float'}  # 'num_actions': 10

        network_spec = [
            dict(type='dense', size=100, activation='relu'),
            dict(type='dense', size=100, activation='relu')
        ]

        self.agent = PPOAgent(
            states=dict(type='float', shape=(12,)),
            actions=actions,
            batching_capacity=2000,
            network=network_spec,
            step_optimizer=dict(type='adam', learning_rate=1e-4),
        )

    def act(self, state):
        jp = np.expand_dims(np.nan_to_num(np.array(state["JointPosition"])), axis=0)
        jv = np.expand_dims(np.array(state["JointVelocity"]), axis=0)
        # actiondict = self.agent.act(np.concatenate([jp, jv], axis=1))
        actiondict = self.agent.act(jp)
        action = np.zeros(12)
        for i in range(12):
            action[i] = actiondict[str(i)][0]
        action = np.nan_to_num(action)
        # print(action)
        return np.clip(action, -1.0, 1.0)

    def observe(self, reward, terminal):
        self.agent.observe(reward=reward, terminal=terminal)

    def save(self, directory):
        self.agent.save_model(directory=directory)

    def restore(self, directory):
        self.agent.restore_model(directory=directory)
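# A minimal usage sketch (assumed, not part of the original source): `sim` is a hypothetical
# simulator that returns a state dict containing "JointPosition" and "JointVelocity" arrays,
# matching what ForwardActor.act() expects, and applies the 12 joint commands it returns.
actor = ForwardActor()
for step in range(1000):
    state = sim.get_state()
    action = actor.act(state)             # 12 joint commands clipped to [-1, 1]
    reward, terminal = sim.apply(action)  # hypothetical simulator API
    actor.observe(reward=reward, terminal=terminal)
actor.save(directory="models/forward_actor")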
def test_readme(self):
    environment = UnittestEnvironment(
        states=dict(type='float', shape=(10,)),
        actions=dict(type='int', num_values=5)
    )

    def get_current_state():
        return environment.reset()

    def execute_decision(x):
        return environment.execute(actions=x)[2]

    # Instantiate a Tensorforce agent
    agent = PPOAgent(
        states=dict(type='float', shape=(10,)),
        actions=dict(type='int', num_values=5),
        memory=10000,
        network='auto',
        update_mode=dict(unit='episodes', batch_size=10),
        step_optimizer=dict(type='adam', learning_rate=1e-4)
    )

    # Initialize the agent
    agent.initialize()

    # Retrieve the latest (observable) environment state
    state = get_current_state()  # (float array of shape [10])

    # Query the agent for its action decision
    action = agent.act(states=state)  # (scalar between 0 and 4)

    # Execute the decision and retrieve the current performance score
    reward = execute_decision(action)  # (any scalar float)

    # Pass feedback about performance (and termination) to the agent
    agent.observe(reward=reward, terminal=False)

    agent.close()
    environment.close()

    self.assertTrue(expr=True)
class PPOAgent(Agent):
    def __init__(self, name, game_inputs=None, callbacks=None, input_shape=None,
                 input_type=None, use_tensorboard=True, tensorforce_kwargs=None):
        super().__init__(name, game_inputs=game_inputs, callbacks=callbacks)

        if input_shape is None or not isinstance(input_shape, tuple):
            raise SerpentError("'input_shape' should be a tuple...")

        if input_type is None or input_type not in ["bool", "int", "float"]:
            raise SerpentError("'input_type' should be one of bool|int|float...")

        states_spec = {"type": input_type, "shape": input_shape}

        # TODO: Support multiple actions
        # TODO: Support continuous action spaces
        actions_spec = {"type": "int", "num_actions": len(self.game_inputs)}

        summary_spec = None
        if use_tensorboard:
            summary_spec = {
                "directory": "./tensorboard/",
                "steps": 50,
                "labels": [
                    "configuration", "gradients_scalar", "regularization",
                    "inputs", "losses", "variables"
                ]
            }

        default_network_spec = [
            {"type": "conv2d", "size": 32, "window": 8, "stride": 4},
            {"type": "conv2d", "size": 64, "window": 4, "stride": 2},
            {"type": "conv2d", "size": 64, "window": 3, "stride": 1},
            {"type": "flatten"},
            {"type": "dense", "size": 1024}
        ]

        agent_kwargs = dict(
            batch_size=1024,
            batched_observe=1024,
            network_spec=default_network_spec,
            device=None,
            session_config=None,
            saver_spec=None,
            distributed_spec=None,
            discount=0.99,
            variable_noise=None,
            states_preprocessing_spec=None,
            explorations_spec=None,
            reward_preprocessing_spec=None,
            distributions_spec=None,
            entropy_regularization=0.01,
            keep_last_timestep=True,
            baseline_mode=None,
            baseline=None,
            baseline_optimizer=None,
            gae_lambda=None,
            likelihood_ratio_clipping=None,
            step_optimizer=None,
            optimization_steps=10
        )

        if isinstance(tensorforce_kwargs, dict):
            for key, value in tensorforce_kwargs.items():
                if key in agent_kwargs:
                    agent_kwargs[key] = value

        self.agent = TFPPOAgent(
            states_spec=states_spec,
            actions_spec=actions_spec,
            summary_spec=summary_spec,
            scope="ppo",
            **agent_kwargs
        )

        try:
            self.restore_model()
        except Exception:
            pass

    def generate_action(self, state, **kwargs):
        if isinstance(state, GameFrame):
            self.current_state = state.frame
        elif isinstance(state, GameFrameBuffer):
            self.current_state = np.stack(
                [game_frame.frame for game_frame in state.frames], axis=2)
        else:
            self.current_state = state

        action = self.agent.act(self.current_state)
        label = self.game_inputs_mapping[action]

        return label, self.game_inputs[label]

    def observe(self, reward=0, terminal=False, **kwargs):
        if self.current_state is None:
            return None

        if self.callbacks.get("before_observe") is not None:
            self.callbacks["before_observe"]()

        will_update = self.agent.batch_count == self.agent.batch_size - 1

        if will_update:
            if self.callbacks.get("before_update") is not None:
                self.callbacks["before_update"]()

            self.agent.observe(reward=reward, terminal=terminal)
            self.save_model()

            if self.callbacks.get("after_update") is not None:
                self.callbacks["after_update"]()
        else:
            self.agent.observe(reward=reward, terminal=terminal)

        self.current_state = None
        self.current_reward = reward
        self.cumulative_reward += reward

        if self.callbacks.get("after_observe") is not None:
            self.callbacks["after_observe"]()

    def save_model(self):
        self.agent.save_model(
            directory=os.path.join(os.getcwd(), "datasets", self.name, self.name),
            append_timestep=False)

    def restore_model(self):
        self.agent.restore_model(
            directory=os.path.join(os.getcwd(), "datasets", self.name))
print("Finished episode {ep} after {ts} timesteps".format(ep=r.episode + 1, ts=r.timestep + 1)) print("Episode reward: {}".format(r.episode_rewards[-1])) print("Average of last 10 rewards: {}".format(np.mean(r.episode_rewards[-10:]))) return True runner = Runner(agent, environment) runner.run(num_timesteps=3600, num_episodes=3, episode_finished= episode_finished) # Poll new state from client #for outsideTemperature in loganOutsideTemperatures: for i in range(2): outsideTemperature = 1.1 # iterate through one hour with the same temperature for i in range(3600): state = hvacBuilding.get_state(outsideTemperature) action = agent.act(state, True) reward = hvacBuilding.Act(action) agent.observe(reward=reward, terminal=False) hvacBuilding.step(outsideTemperature) #currently the only state is to turn on cooling or turn off # if not hvac.HeatingIsShuttingDown and hvac.HeatingIsOn and hvacBuilding.current_temperature > 18.8889:#21: # #print("Turning the Heater Off") # hvac.TurnHeatingOff() # if hvac.HeatingIsOn == False and hvacBuilding.current_temperature < 17.7778:#17: # #print("Turning the Heater On") # numberOfHeatingOn = numberOfHeatingOn + 1 # hvac.TurnHeatingOn()
print("agents made") monkey = [] rl_ppo = [] rl_dqn = [] rl_vpg = [] #training for i in tqdm(range(5000)): infrastructure.initializeGraph() while infrastructure.attempts < len(infrastructure.peers): #agent_ppo actions state = infrastructure.get_state() action = agent_ppo.act(state) action = action.values() #print("ai", action) reward = infrastructure.shutdown(action) if infrastructure.attempts < infrastructure.peers: agent_ppo.observe(reward=reward, terminal=False) else: agent_ppo.observe(reward=reward, terminal=True) rl_ppo.append(reward) #dqn agent action = agent_dqn.act(state) action = action.values()
success = False

while True:
    latent_vector = vae.get_vector(observation.reshape(1, 48, 64, 3))
    latent_vector = list(itertools.chain(*latent_vector))  # [[ ]] -> [ ]
    relative_pos = GazeboMaze.p
    previous_act = GazeboMaze.vel_cmd
    print(previous_act)
    # state = latent_vector + relative_pos + previous_act
    state = dict(latent_vector=latent_vector, previous_act=previous_act, relative_pos=relative_pos)
    # print(state)

    # Query the agent for its action decision
    action = agent.act(state, deterministic=deterministic)

    # Execute the decision and retrieve the current information
    observation, terminal, reward = GazeboMaze.execute(action)
    observation = observation / 255.0  # normalize
    # print(reward)

    # Pass feedback about performance (and termination) to the agent
    agent.observe(terminal=terminal, reward=reward)

    timestep += 1
    episode_reward += reward

    if terminal or timestep == max_timesteps:
        success = GazeboMaze.success
        break

episode += 1
total_timestep += timestep
# avg_reward = float(episode_reward)/timestep
def main():
    env = gym.make('Breakout-v0')
    # (210, 160, 3)
    print(env.observation_space.shape)
    # [[[255...]]]
    print(env.observation_space.high)
    # [[[0...]]]
    print(env.observation_space.low)
    # 4
    print(env.action_space.n)

    agent = PPOAgent(
        # (210, 160, 3)
        states=dict(type='float', shape=env.observation_space.shape),
        network=[
            # (51, 29, 32)
            dict(type='conv2d', size=32, window=8, stride=4, activation='relu'),
            # (24, 18, 64)
            dict(type='conv2d', size=64, window=4, stride=2, activation='relu'),
            # (22, 16, 64)
            dict(type='conv2d', size=64, window=3, stride=1, activation='relu'),
            # 22528
            dict(type='flatten'),
            dict(type='dense', size=512, activation='relu'),
            dict(type='dense', size=32, activation='relu'),
        ],
        # batching_capacity=10,
        memory=dict(
            type='latest',
            include_next_states=False,
            capacity=1000,
        ),
        # update=dict(unit='timesteps', batch_size=64),
        actions=dict(type='int', num_actions=env.action_space.n),
        step_optimizer=dict(type='adam', learning_rate=1e-4))

    model_dir = 'models/breakout'
    # load model
    if os.path.exists(f'{model_dir}/checkpoint'):
        agent.restore_model(directory=model_dir)

    try:
        for step in range(100000):
            observation = env.reset()
            done = False
            step_reward = 0
            while not done:
                # env.render()
                # from PIL import Image
                # pil_img = Image.fromarray(observation)
                # pil_img.save('./observation.png')
                states = observation / 256
                action = agent.act(states=states)
                observation, reward, done, info = env.step(action)
                reward = reward / 10
                agent.observe(reward=reward, terminal=done)
                step_reward += reward
                if done:
                    print(f'step = {step}, reward = {step_reward}')
    except Exception as e:
        raise e
    finally:
        agent.save_model(directory=f'{model_dir}/agent')
success = False

if GazeboMaze.goal not in config.test_space[maze_id]:  # train
    while True:
        relative_pos = GazeboMaze.p
        previous_act = GazeboMaze.vel_cmd
        previous_reward = GazeboMaze.reward
        print(previous_act)
        state = dict(image=observation, relative_pos=relative_pos,
                     previous_act=previous_act, previous_reward=[previous_reward])
        # state = dict(image=observation, previous_act=GazeboMaze.vel_cmd, relative_pos=GazeboMaze.p)

        # Query the agent for its action decision
        action = agent.act(state)

        # Execute the decision and retrieve the current information
        observation, terminal, reward = GazeboMaze.execute(action)
        observation = observation / 255.0  # normalize
        # print(reward)

        # Pass feedback about performance (and termination) to the agent
        agent.observe(terminal=terminal, reward=reward)

        timestep += 1
        episode_reward += reward

        if terminal or timestep == max_timesteps:
            success = GazeboMaze.success
            break

episode += 1
total_timestep += timestep
# avg_reward = float(episode_reward)/timestep
done = False
agent.reset()

while simulation.gameOver() == False and turns < 100 and bad_move_count < 150:
    # print(player)
    counter += 1
    # if player done, continue
    state = simulation.get_state()
    state.append(float(turns))
    state = [state]
    moved = False
    # print(len(state))
    # exit()
    # print(simulation.players)
    # print(simulation.item_watch())
    action = agent.act(np.asarray(state))
    # print(action)

    if simulation.move_check(player, action):
        # print('GOOD MOVE')
        turns += 1
        moved = True
        # print(player)
        old_pos, new_pos = simulation.movePlayer(player, action)
        # print(old_reward)
        reward = simulation.reward_2(old_pos, new_pos, player, saved_pos)
        # print('factor streak in player' + str(factor_streak))
        # print(map_num)
        print(str(old_pos) + " " + str(new_pos) + " " + str(reward))

        # Update Items
        simulation.item_update(new_pos)

        # Update Statuses
states={"type":'float', "shape": infrastructure.graph.shape }, actions={ str(i): dict(type="int", num_actions=infrastructure.servers) for i in range(infrastructure.clients) }, network=[ dict(type='flatten'), dict(type="dense", size=32), dict(type="dense", size=32), dict(type="dense", size=32) ], ) for i in tqdm(range(100000)): state = infrastructure.graph action_monkey = monkey.act(state).values() action_manager = manager.act(state) action_manager_matrix = np.full((infrastructure.servers, infrastructure.clients), 0) for item in action_manager.items(): clientID = int(item[0]) serverID = item[1] action_manager_matrix[serverID][clientID] = 1 for x in range(infrastructure.servers): for y in range(infrastructure.clients): if x == y: infrastructure.graph[x][y] = 1 reward = infrastructure.reward(action_monkey, action_manager_matrix) monkey.observe(reward=reward, terminal=False)
    latent_vector = vae.get_vector(observation.reshape(1, 48, 64, 3))
    latent_vector = list(itertools.chain(*latent_vector))  # [[ ]] -> [ ]
    relative_pos = GazeboMaze.p
    previous_act = GazeboMaze.vel_cmd
    previous_reward = GazeboMaze.reward
    print(previous_act)
    # state = latent_vector + relative_pos + previous_act + [previous_reward]
    state = dict(latent_vector=latent_vector, relative_pos=relative_pos,
                 previous_act=previous_act, previous_reward=[previous_reward])
    # print(state)

    # Query the agent for its action decision
    action = agent.act(state, deterministic=deterministic)

    # Execute the decision and retrieve the current information
    observation, terminal, reward = GazeboMaze.execute(action)
    observation = observation / 255.0  # normalize
    # print(reward)

    # Pass feedback about performance (and termination) to the agent
    agent.observe(terminal=terminal, reward=reward)

    timestep += 1
    episode_reward += reward

    if terminal or timestep == max_timesteps:
        success = GazeboMaze.success
        break

episode += 1
total_timestep += timestep
# avg_reward = float(episode_reward)/timestep
class Controller:
    def __init__(self, apikey, agent_id, frames_per_state=1, host=None):
        # PPO agent seems to learn that it needs to speed around the environment to collect rewards
        self._agent = PPOAgent(
            states_spec=dict(type='float', shape=(frames_per_state * 25,)),
            actions_spec=dict(type='float', shape=(3,),
                              min_value=np.float32(-1.0),
                              max_value=np.float32(1.0)),
            network_spec=[
                dict(type='dense', activation='relu', size=128),
                dict(type='dense', activation='relu', size=128),
            ],
            optimization_steps=5,
            # Model
            scope='ppo',
            discount=0.99,
            # DistributionModel
            distributions_spec=None,
            entropy_regularization=0.01,
            # PGModel
            baseline_mode=None,
            baseline=None,
            baseline_optimizer=None,
            gae_lambda=None,
            # PGLRModel
            likelihood_ratio_clipping=0.2,
            summary_spec=None,
            distributed_spec=None,
            batch_size=2048,
            step_optimizer=dict(type='adam', learning_rate=1e-4))

        self._logger = setup_custom_logger("Controller")
        self._frame_count_per_episode = 0
        self._total_frames = 1
        self._frames_per_state = frames_per_state
        self._client = AsyncClient(apikey, agent_id, self._train_state_callback, host)
        self._state_stack = StateStack(self._frames_per_state)

    async def _train_state_callback(self, state, reward, error):
        terminal = False
        # We are controlling the episode to be terminal if either
        # 1. the agent gets a reward in the environment
        # 2. the agent has not had a reward for _frame_count_per_episode states from the environment
        if reward != 0.0:
            reward = reward * 20.0
            terminal = True
            self._frame_count_per_episode = 0
            print("terminal, got reward - %.2f" % reward)
        elif self._frame_count_per_episode == self._max_frame_count_per_episode:
            reward = -100.0
            terminal = True
            self._frame_count_per_episode = 0
            print("terminal, killing")

        self._state_stack.add_state(state[11:])

        if self._total_frames > self._frames_per_state:
            combined_state = self._state_stack.get_combined_state()
            # Currently ignoring the first 11 states as they are sensors for other agents in the environment
            action = self._agent.act(combined_state)
            self._agent.observe(reward=reward, terminal=terminal)

            # Only let the mbot travel forwards
            action[0] = (action[0] + 1.0) / 3.0

            self.total_rewards[self._total_frames] = reward
            await self._client.send_agent_action(action)

            if self._total_frames % 100 == 0:
                self._logger.info(
                    "%d iterations: Running AVG reward per last %d states: %.2f" %
                    (self._total_frames, self._max_frame_count_per_episode,
                     self.total_rewards[max(0, self._total_frames - 10000):self._total_frames].mean()))

        self._total_frames += 1
        self._frame_count_per_episode += 1

        if self._total_frames >= self.max_iterations:
            self._client.stop()

    def train(self, max_iterations, max_frame_count_per_episode=1000):
        """
        :param max_iterations: the maximum iterations across all episodes
        :param max_frame_count_per_episode: we control how the episodes are handled
        :return:
        """
        self._max_frame_count_per_episode = max_frame_count_per_episode
        self.max_iterations = max_iterations
        self.total_rewards = np.zeros(max_iterations)
        self.total_costs = np.zeros(max_iterations)
        self._client.start()
class SerpentPPO:
    def __init__(self, frame_shape=None, game_inputs=None):
        if frame_shape is None:
            raise SerpentError("A 'frame_shape' tuple kwarg is required...")

        states_spec = {"type": "float", "shape": frame_shape}

        if game_inputs is None:
            raise SerpentError("A 'game_inputs' dict kwarg is required...")

        self.game_inputs = game_inputs
        self.game_inputs_mapping = self._generate_game_inputs_mapping()
        print('game inputs mapping:')
        print(self.game_inputs_mapping)

        actions_spec = {"type": "int", "num_values": len(self.game_inputs)}

        summary_spec = {
            "directory": "./board/",
            "steps": 50,
            "labels": [
                "configuration", "gradients_scalar", "regularization",
                "inputs", "losses", "variables"
            ]
        }

        network_spec = [
            {"type": "conv2d", "size": 16, "window": 8, "stride": 4},
            {"type": "conv2d", "size": 32, "window": 4, "stride": 2},
            {"type": "conv2d", "size": 32, "window": 3, "stride": 1},
            {"type": "flatten"},
            {"type": "dense", "size": 64}
        ]

        baseline_spec = {
            "type": "cnn",
            "conv_sizes": [32, 32],
            "dense_sizes": [32]
        }

        saver_spec = {
            "directory": os.path.join(os.getcwd(), "datasets", "t4androidmodel"),
            "seconds": 120
        }

        # memory_spec = {'type': 'latest', 'include_next_states': False, 'capacity': 1000*1000}

        self.agent = PPOAgent(
            states=states_spec,
            actions=actions_spec,
            network=network_spec,
            # baseline_mode='states',
            # baseline=baseline_spec,
            summarizer=summary_spec,
            memory=10,
            update_mode=dict(unit='timesteps', batch_size=2),
            discount=0.97,
            saver=saver_spec)

        self.agent.initialize()

        # batched_observe=2560,
        # scope="ppo",
        # summarizer=summary_spec,
        # network=network_spec,
        # device=None,
        # session_config=None,
        # saver_spec=None,
        # distributed_spec=None,
        # discount=0.97,
        # variable_noise=None,
        # states_preprocessing_spec=None,
        # explorations_spec=None,
        # reward_preprocessing_spec=None,
        # distributions_spec=None,
        # entropy_regularization=0.01,
        # batch_size=2560,
        # keep_last_timestep=True,
        # baseline_mode=None,
        # baseline=None,
        # baseline_optimizer=None,
        # gae_lambda=None,
        # likelihood_ratio_clipping=None,
        # step_optimizer=None,
        # optimization_steps=10

    def generate_action(self, game_frame_buffer):
        states = np.stack(game_frame_buffer, axis=2)
        action = self.agent.act(states)
        label = self.game_inputs_mapping[action]
        return action, label, self.game_inputs[label]

    def observe(self, reward=0, terminal=False):
        self.agent.observe(reward=reward, terminal=terminal)

    def _generate_game_inputs_mapping(self):
        mapping = dict()
        for index, key in enumerate(self.game_inputs):
            mapping[index] = key
        return mapping
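# A minimal usage sketch (assumed, not part of the original source): `game_inputs` is the
# input mapping expected by the constructor and `frame_buffer` is a hypothetical list of
# preprocessed frames whose stacked shape matches `frame_shape`.
ppo = SerpentPPO(frame_shape=(100, 100, 4), game_inputs=game_inputs)
action, label, inputs = ppo.generate_action(frame_buffer)
# ... execute `inputs` in the game, measure `reward` and `terminal` ...
ppo.observe(reward=reward, terminal=terminal)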
pbar = tqdm.tqdm(total=nprocs * batch_allocation)

# Run this single worker (episode loop) as long as the episode threshold has not been reached.
while not should_stop:
    state = env.reset()
    # print('Calling reset')
    agent.reset()
    # print('Reset resolved')
    episode_reward = 0

    # Time step (within episode) loop
    time_step = 0
    time_start = time.time()
    while True:
        # print('Calling act')
        action, internals, states = agent.act(states=state, deterministic=deterministic,
                                              buffered=False, independent=True)
        # print('Act resolved')
        reward = 0
        for repeat in xrange(repeat_actions):
            state, terminal, step_reward = env.execute(action=action)
            reward += step_reward
            if terminal:
                break

        time_step += 1
        episode_reward += reward
        data_buffer.append((state, action, internals, reward, terminal))

        if terminal or time_step == max_episode_timesteps:
            break
def main(xml_name):
    with open(xml_name) as xmlf:
        xml_str = xmlf.read()

    gen = DummyGen()
    gen.override_from_xml(xml_str)

    _DEFAULT_TIME_LIMIT = 10
    _CONTROL_TIMESTEP = .04
    display_stride = 1 / .04 // 24

    genesis_physics = Physics.from_xml_string(
        common.read_model(os.path.join(os.getcwd(), xml_name)), common.ASSETS)
    genesis_physics.set_genesis(gen)

    genesis_task = FindTarget()
    genesis_env = control.Environment(genesis_physics, genesis_task,
                                      control_timestep=_CONTROL_TIMESTEP,
                                      time_limit=_DEFAULT_TIME_LIMIT)

    action_spec = genesis_env.action_spec()
    observation_spec = genesis_env.observation_spec()

    observation_shape = np.array([0])
    for (name, row) in observation_spec.items():
        print(name, observation_shape, row.shape)
        if row.shape == ():
            observation_shape[0] += 1
            continue
        print(row.shape)
        observation_shape[0] += row.shape[0]
    observation_shape = (observation_shape[0],)

    print(action_spec)
    print(action_spec.minimum)

    agent = PPOAgent(
        states=dict(type='float', min_value=action_spec.minimum,
                    max_value=action_spec.maximum, shape=observation_shape),
        actions=dict(type='float', min_value=action_spec.minimum,
                     max_value=action_spec.maximum, shape=action_spec.shape),
        network=[
            dict(type='dense', size=128, activation='relu'),
            dict(type='dense', size=64, activation='relu'),
            dict(type='dense', size=16, activation='tanh')
        ],
        step_optimizer={
            "type": "adam",
            "learning_rate": 1e-4
        },
        entropy_regularization=0.01,
        batching_capacity=64,
        subsampling_fraction=0.1,
        optimization_steps=50,
        discount=0.99,
        likelihood_ratio_clipping=0.2,
        baseline_mode="states",
        baseline={
            "type": "mlp",
            "sizes": [32, 32]
        },
        baseline_optimizer={
            "type": "multi_step",
            "optimizer": {
                "type": "adam",
                "learning_rate": 1e-4
            },
            "num_steps": 5
        },
        update_mode={
            "unit": "episodes",
            "batch_size": 128,
            "frequency": 10
        },
        memory={
            "type": "latest",
            "include_next_states": False,
            "capacity": 2000
        }
    )

    time_step = genesis_env.reset()
    curtime = 0.0
    top_view = genesis_env.physics.render(480, 480, camera_id='tracking_top')
    side_view = genesis_env.physics.render(480, 480, camera_id='arm_eye')

    did_except = False

    NUM_EPISODES = 10000
    N_INPROG_VIDS = 4
    VID_EVERY = NUM_EPISODES // N_INPROG_VIDS

    for i in tqdm.tqdm(range(NUM_EPISODES)):
        time_step = genesis_env.reset()
        j = 0
        tot = 0
        reward = []
        while not time_step.last():
            state = observation2state(time_step.observation)
            action = agent.act(state)
            time_step = genesis_env.step(action)
            tot += time_step.reward
            reward.append(time_step.reward)
            agent.observe(reward=time_step.reward, terminal=time_step.last())
            if j % 50 == 0 and i % 25 == 1:
                pass
                # clear_output()
                # img = plt.imshow(np.array(env.physics.render(480, 640)).reshape(480, 640, 3))
                # plt.pause(0.5)
            j += 1

        if i % 100 == 0:
            # tot /= j
            tqdm.tqdm.write("for episode " + str(i) + " : " + str(tot))

        if (i % VID_EVERY) == 0 or i == NUM_EPISODES - 1:
            agent.save_model('./models/starfish_model_target')
            time_step = genesis_env.reset()

            vid_suffix = str(i)
            if i == NUM_EPISODES - 1:
                vid_suffix = 'final'
            vid_name = 'videos/starfish_{}.mp4'.format(vid_suffix)

            imnames = set()
            picidx = 0
            curtime = 0.0
            while not time_step.last():
                try:
                    state = observation2state(time_step.observation)
                    action = agent.act(state)
                    time_step = genesis_env.step(action)

                    savename = "/tmp/starfish_{0:04}.jpg".format(picidx)
                    picidx += 1
                    imnames.add(savename)
                    curtime += _CONTROL_TIMESTEP

                    top_view = genesis_env.physics.render(480, 480, camera_id='tracking_top')
                    side_view = genesis_env.physics.render(480, 480, camera_id='arm_eye')
                    # plt.imshow(np.concatenate((top_view, side_view), axis=1))
                    # plt.pause(0.5)
                    io.imsave(savename, np.concatenate((top_view, side_view), axis=1))
                except PhysicsError:
                    print('except')
                    did_except = True
                    break

            if os.path.isfile(vid_name):
                os.remove(vid_name)
            if not did_except:
                os.system('ffmpeg -nostats -loglevel 0 -f image2 -pattern_type sequence -i "/tmp/starfish_%4d.jpg" -qscale:v 0 {}'.format(vid_name))
                for name in imnames:
                    os.remove(name)
                print("recorded video")
try:
    agent.restore(directory="saved/" + args.agent + "/" + args.contrarian)
    print("restored")
except:
    lastEpoch = 0

epochs = 100000
cluster_vals = []

for epoch in tqdm(range(lastEpoch, epochs)):
    G = Audience(20, 15)  # 20 recommendations for every user
    training_size = G.graph.shape[0] * 20
    changes = []
    for step in range(training_size):
        action = agent.act(G.graph)
        reward = G.recommendation(action["user"], action["item"])
        # reward = weight * reward + weight * change

        # if contrarian, get this
        if args.contrarian == "on":
            cluster_val = G.clustering() + 0.01
            cluster_vals.append(cluster_val)
            # print(reward, cluster_val, reward / cluster_val)
            reward = reward / cluster_val

        # change
        if (len(cluster_vals) % 10) == 0:
            if len(cluster_vals) > 0:
def main():
    '''
    Train an agent. Note that I've created a custom OpenAI Gym environment to allow for
    quick plug and play in comparing performance across different RL models.
    '''
    env = gym.make(
        'Trade-v0',
        window=50,
        datadir='stocks/s_coinbaseUSD_1_min_data_2014-12-01_to_2018-11-11.csv',
        preprocesses=['MinMax'])

    network_spec = [
        dict(type='flatten'),
        dict(type='dense', size=32, activation='tanh'),
        dict(type='dense', size=32, activation='tanh')
    ]

    agent = PPOAgent(
        states=env.observation_space,
        actions=env.action_space,
        network=network_spec,
        step_optimizer=dict(type='adam', learning_rate=1e-3),
        optimization_steps=10,
        scope='ppo',
        discount=0.99,
        entropy_regularization=0.01,
        baseline_mode=None,
        baseline=None,
        baseline_optimizer=None,
        gae_lambda=None,
        likelihood_ratio_clipping=0.2,
    )

    runner = Runner(agent=agent, environment=env)

    def episode_finished(r):
        print("Finished episode {ep} after {ts} timesteps (reward: {reward})".format(
            ep=r.episode, ts=r.episode_timestep, reward=r.episode_rewards[-1]))
        return True

    runner.run(episodes=10, episode_finished=episode_finished)
    print("Learning finished. Total episodes: {ep}. Average reward of last 10 episodes (of 10): {ar}.".format(
        ep=runner.episode, ar=np.mean(runner.episode_rewards[-5:])))

    print('Testing for an episode...')
    s = env.reset()
    collectables = []
    while True:
        action = agent.act(s)
        s, r, d, i = env.step(action)
        agent.observe(reward=r, terminal=d)
        collectables.append((s[0][0], action))  # to be replaced by env.render() when i get it fixed
        if d:
            break

    plot(collectables, 0.001)  # plot only .1% of one episode
class ForwardActorSimple:
    def __init__(self):
        actions = {}
        actions_exp = {}
        for i in range(12):
            actions[str(i)] = {'type': 'float'}  # 'num_actions': 10
            actions_exp[str(i)] = dict(type='ornstein_uhlenbeck', sigma=0.1, mu=0.0, theta=0.1)

        preprocessing_config = [{"type": "standardize"}]
        preprocessing_config = None

        customnet = dict(type=CustomNetwork)
        layerSize = 300

        network_spec = [
            dict(type='dense', size=100),
            dict(type='lstm', size=100)
        ]
        '''
        network_spec = [
            dict(type='dense', size=100),
            dict(type='internal_lstm', size=100)
        ]
        '''
        network_spec = [
            dict(type='dense', size=layerSize, activation='selu'),
            dict(type='dense', size=layerSize, activation='selu'),
            dict(type='dense', size=layerSize, activation='selu')
        ]

        self.agent = PPOAgent(
            states=dict(type='float', shape=(12 + 9,)),
            actions=actions,
            batching_capacity=1000,
            network=network_spec,
            states_preprocessing=preprocessing_config,
            actions_exploration=actions_exp,
            step_optimizer=dict(type='adam', learning_rate=1e-5),
        )

    def act(self, state):
        jp = np.expand_dims(np.nan_to_num(np.array(state["JointPosition"])), axis=0)
        # jv = np.expand_dims(np.array(state["JointVelocity"]), axis=0)
        orient = np.expand_dims(np.array(state["bodyRot"]), axis=0)
        actiondict = self.agent.act(np.nan_to_num(np.concatenate([jp, orient], axis=1)) / 5.0)
        # actiondict = self.agent.act(jp)
        action = np.zeros(12)
        for i in range(12):
            action[i] = actiondict[str(i)][0]
        action = np.nan_to_num(action)
        # print(action)
        return np.clip(action, -1.0, 1.0)

    def observe(self, reward, terminal):
        self.agent.observe(reward=reward, terminal=terminal)

    def save(self, directory):
        self.agent.save_model(directory=directory)

    def restore(self, directory):
        self.agent.restore_model(directory=directory)
class SerpentPPO:
    def __init__(self, frame_shape=None, game_inputs=None):
        if frame_shape is None:
            raise SerpentError("A 'frame_shape' tuple kwarg is required...")

        states_spec = {"type": "float", "shape": frame_shape}

        if game_inputs is None:
            raise SerpentError("A 'game_inputs' dict kwarg is required...")

        self.game_inputs = game_inputs
        self.game_inputs_mapping = self._generate_game_inputs_mapping()

        actions_spec = {"type": "int", "num_actions": len(self.game_inputs)}

        network_spec = [
            {"type": "conv2d", "size": 1, "window": 2, "stride": 1},
            {"type": "flatten"},
            # {"type": "dense", "size": 64},
            {"type": "dense", "size": 6}
        ]

        self.agent = PPOAgent(
            states=states_spec,
            actions=actions_spec,
            network=network_spec,
            batched_observe=256,
            batching_capacity=1000,
            # BatchAgent
            # keep_last_timestep=True,
            # PPOAgent
            step_optimizer=dict(
                type='adam',
                learning_rate=1e-4
            ),
            optimization_steps=10,
            # Model
            scope='ppo'
            # discount=0.97,
            # DistributionModel
            # distributions=None,
            # entropy_regularization=0.01,
            # PGModel
            # baseline_mode=None,
            # baseline=None,
            # baseline_optimizer=None,
            # gae_lambda=None,
            # PGLRModel
            # likelihood_ratio_clipping=None,
            # summary_spec=summary_spec,
            # distributed_spec=None,
            # More info
            # device=None,
            # session_config=None,
            # saver=None,
            # variable_noise=None,
            # states_preprocessing_spec=None,
            # explorations_spec=None,
            # reward_preprocessing_spec=None,
            # execution=None,
            # actions_exploration=None,
            # update_mode=None,
            # memory=None,
            # subsampling_fraction=0.1
        )

    def generate_action(self, game_frame_buffer):
        states = np.stack(game_frame_buffer, axis=2)
        # Get prediction from agent, execute
        action = self.agent.act(states)
        label = self.game_inputs_mapping[action]
        return action, label, self.game_inputs[label]

    def observe(self, reward=0, terminal=False):
        self.agent.observe(reward=reward, terminal=terminal)

    def _generate_game_inputs_mapping(self):
        mapping = dict()
        for index, key in enumerate(self.game_inputs):
            mapping[index] = key
        return mapping

    def save_model(self):
        self.agent.save_model(directory=os.path.join(os.getcwd(), "datasets", "bomberman", "ppo_model"),
                              append_timestep=False)

    def restore_model(self):
        self.agent.restore_model(directory=os.path.join(os.getcwd(), "datasets", "bomberman"))
    optimization_steps=20)
'''
batching_capacity=200,
step_optimizer=dict(
    type='adadelta',
    learning_rate=1e-3)
'''

allRewards = np.zeros(shape=(1, 1))

for game in range(NUM_GAMES_TO_PLAY):
    obs = env.reset()
    gameTotalReward = 0
    for step in range(1000):
        env.render()
        a = agent.act(obs)
        # print("ACTION ->", a)
        if CLIP_ACTION:
            for i in range(np.alen(a)):
                if a[i] < -1:
                    a[i] = -0.99999999999
                if a[i] > 1:
                    a[i] = 0.99999999999
        obs, reward, done, info = env.step(a)
        # reward = reward/100
        gameTotalReward = gameTotalReward + reward
        allRewards = np.vstack((allRewards, np.array([reward])))
        if done:
            agent.observe(reward=reward, terminal=True)
        else:
            agent.observe(reward=reward, terminal=False)
        # print("Action: {} Observations Size:{} score: {}".format(a, obs.shape, reward))
)

reward_list = []

args_episodes = 1000
args_episode_max_steps = 200

episode = 0
agent.reset()

while True:
    agent.reset()
    state = env.reset()
    episode += 1
    episode_step = 0
    episode_reward = 0

    while True:
        action = agent.act(state)
        state, terminal, reward = env.execute(action)
        reward = np.abs(state[1]) - 0.05
        episode_reward += reward
        episode_step += 1
        if args_episode_max_steps is not None and episode_step >= args_episode_max_steps:
            terminal = True
        agent.observe(terminal, reward)
        if terminal:
            break

    print('episode {0} steps {1} reward {2}'.format(episode, episode_step, episode_reward))
    reward_list.append(episode_reward)

    if episode >= args_episodes:
        break

    # if len(reward_list) > 100 and np.mean(reward_list[-100:]) > 199:
    if state_downscaled is not None:
        ax.imshow(state_downscaled)

anim = animation.FuncAnimation(fig, animate, interval=100)
plt.show()

threading.Thread(target=anim_thread).start()

for step in range(500000):
    if done:
        state = env.reset()

    # state.shape = 240, 256, 3
    state_cutted = state[:, 85:215]
    state_downscaled = state_cutted[6::12, 6::12]

    action = agent.act(state_downscaled)
    state, reward, done, info = env.step(action)

    # Train the agent model
    agent.observe(reward=reward, terminal=False)

    if step % 100 == 0:
        log.debug('state {}: %s'.format(type(state)), state.shape)
        log.debug('reward {}: %s'.format(type(reward)), reward)
        log.debug('done {}: %s'.format(type(done)), done)
        log.debug('info {}: %s'.format(type(info)), info)
        log.debug('_y_pos {}: %s'.format(type(_env._y_position)), _env._y_position)

    env.render()
class SerpentPPO:
    def __init__(self, frame_shape=None, game_inputs=None):
        if frame_shape is None:
            raise SerpentError("A 'frame_shape' tuple kwarg is required...")

        states_spec = {"type": "float", "shape": frame_shape}

        if game_inputs is None:
            raise SerpentError("A 'game_inputs' dict kwarg is required...")

        self.game_inputs = game_inputs
        self.game_inputs_mapping = self._generate_game_inputs_mapping()

        actions_spec = {"type": "int", "num_actions": len(self.game_inputs)}

        network_spec = [
            {"type": "conv2d", "size": 32, "window": 8, "stride": 4},
            {"type": "conv2d", "size": 64, "window": 4, "stride": 2},
            {"type": "conv2d", "size": 64, "window": 3, "stride": 1},
            {"type": "flatten"},
            {"type": "dense", "size": 512}
        ]

        self.agent = PPOAgent(
            states_spec=states_spec,
            actions_spec=actions_spec,
            batched_observe=128,
            scope="ppo",
            summary_spec=None,
            network_spec=network_spec,
            device=None,
            session_config=None,
            saver_spec=None,
            distributed_spec=None,
            discount=0.99,
            variable_noise=None,
            states_preprocessing_spec=None,
            explorations_spec=None,
            reward_preprocessing_spec=None,
            distributions_spec=None,
            entropy_regularization=1e-2,
            batch_size=128,
            keep_last_timestep=True,
            baseline_mode=None,
            baseline=None,
            baseline_optimizer=None,
            gae_lambda=None,
            likelihood_ratio_clipping=None,
            step_optimizer=None,
            # optimization_steps=10
        )

    def generate_action(self, game_frame_buffer):
        states = np.stack(
            [game_frame.frame for game_frame in game_frame_buffer.frames], axis=2)
        action = self.agent.act(states)
        label = self.game_inputs_mapping[action]
        return action, label, self.game_inputs[label]

    def observe(self, reward=0, terminal=False):
        self.agent.observe(reward=reward, terminal=terminal)

    def _generate_game_inputs_mapping(self):
        mapping = dict()
        for index, key in enumerate(self.game_inputs):
            mapping[index] = key
        return mapping
class Product:
    def __init__(self, name, light, price, quantity, avg_cost_estimate):
        # initialize product
        self.name = name

        # initialize state
        self.light = light
        self.quantity = quantity
        self.avg_cost_estimate = avg_cost_estimate  # the approximated cost of each item sold
        self.price = price  # what the price is being set at
        self.history_log = []  # history of product over time

        # initialize agent
        self.agent = PPOAgent(
            states=dict(type='float', shape=(4,)),
            actions=dict(type='int', num_actions=len(PRICE_CHANGES)),
            network=[dict(type='dense', size=4), dict(type='dense', size=4)],
            step_optimizer=dict(type='adam', learning_rate=0.01))
        self.agent.initialize_model()

    def get_history_log(self):
        return self.history_log

    def get_quantity(self):
        return self.quantity

    def get_avg_cost_estimate(self):
        return self.avg_cost_estimate

    def get_light(self):
        return self.light

    def get_price(self):
        return self.price

    def get_recommended_price(self):
        return max(
            0,
            self.price + PRICE_CHANGES[self.agent.act(
                states=(self.light, self.price, self.quantity, self.avg_cost_estimate),
                deterministic=True,
                independent=True)])

    def set_light(self, new_light):
        self.history_log.append("light=" + str(new_light))
        self.light = new_light

    def set_price(self, new_price):
        self.history_log.append("price=" + str(new_price))
        self.price = new_price

    def update_price(self):
        # Compute the new price first so it can be both logged and applied
        new_price = max(
            0,
            self.price + PRICE_CHANGES[self.agent.act(
                states=(self.light, self.price, self.quantity, self.avg_cost_estimate),
                deterministic=True,
                independent=True)])
        self.history_log.append("price=" + str(new_price))
        self.price = new_price

    def record_delivery(self, delivery_quantity, delivery_cost_per_item):
        self.history_log.append("delivery," + str(delivery_quantity) + str(delivery_cost_per_item))
        # increase quantity as per size of delivery
        self.quantity += delivery_quantity
        # update cost
        # TODO improve algorithm
        self.cost = delivery_cost_per_item

    def record_sale(self, sale_quantity):
        self.history_log.append("sale," + str(sale_quantity))
        # decrease quantity as per size of sale
        self.quantity -= sale_quantity
        # calculate approximate profit per item of sale
        avg_profit_estimate = sale_quantity * (self.price - self.avg_cost_estimate)
        self.agent.act(
            states=(self.light, self.price, self.quantity, self.avg_cost_estimate),
            deterministic=False,
            independent=False)
        self.agent.observe(reward=avg_profit_estimate, terminal=False)
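# A minimal usage sketch (assumed, not part of the original source) exercising the Product
# class above; PRICE_CHANGES is whatever action-to-price-delta list the surrounding code
# defines, and the numbers here are placeholders.
product = Product(name="widget", light=1.0, price=9.99, quantity=100, avg_cost_estimate=4.50)
product.record_delivery(delivery_quantity=50, delivery_cost_per_item=4.25)
product.record_sale(sale_quantity=3)  # act() + observe() with estimated profit as the reward
product.set_price(product.get_recommended_price())
print(product.get_history_log())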