class TestRandomAgent(unittest.TestCase):
    """Smoke test for ``RandomAgent`` built over a two-action discrete space."""

    def setUp(self):
        """Create a fresh ``RandomAgent`` before every test method."""
        number_of_actions = 2
        # Size the space from the named constant instead of repeating the literal.
        action_space = spaces.Discrete(number_of_actions)
        self.agent = RandomAgent(action_space)

    def testAction(self):
        """Call ``act`` with placeholder inputs and check the returned action."""
        action = self.agent.act(state=None, reward=None, done=None)
        # NOTE(review): the expected value 0 is kept from the original test. If
        # RandomAgent samples uniformly from the space this check is flaky --
        # confirm against the agent implementation (or seed it).
        # assertEqual is used instead of a bare assert so the check survives
        # `python -O` and reports both values on failure.
        self.assertEqual(action, 0)
def run_with_params(num_dcs, num_customers, dcs_per_customer, demand_mean, demand_var, num_commodities,
                    orders_per_day, num_steps):
    """Run a single episode of the shipping environment driven by a RandomAgent.

    A ``PhysicalNetwork`` is built from the network arguments and combined with
    an ``ActualOrderGenerator`` and a ``DirichletInventoryGenerator`` into the
    ``EnvironmentParameters`` of a ``ShippingFacilityEnvironment``. The
    environment is rendered once after reset and once after every step, and is
    stepped until it reports ``done``.

    Returns:
        A ``(actions, episode_rewards)`` tuple: the actions taken and the
        rewards received, both in step order.
    """
    network = PhysicalNetwork(
        num_dcs,
        num_customers,
        dcs_per_customer,
        demand_mean,
        demand_var,
        num_commodities,
    )
    orders = ActualOrderGenerator(network, orders_per_day)
    inventory = DirichletInventoryGenerator(network)
    env = ShippingFacilityEnvironment(
        EnvironmentParameters(network, orders, inventory, num_steps)
    )
    agent = RandomAgent(env)

    # Values handed to the agent on its very first call.
    obs, reward, done = env.reset(), 0, False

    print("=========== starting episode loop ===========")
    print("Initial environment: ")
    env.render()

    actions, episode_rewards = [], []
    # TODO: eventually record per-commodity demand and inventory for each step.
    while True:
        # The agent picks an action from the latest observation; the
        # environment then advances and reports the new state and reward.
        action = agent.act(obs, reward, done)
        obs, reward, done, _info = env.step(action)

        # Show the state reached after this step.
        env.render()
        actions.append(action)
        episode_rewards.append(reward)

        if done:
            break

    print("===========Environment says we are DONE ===========")
    return actions, episode_rewards
return processed_observation current_path = [] path_length = 0 path_return = 0. num_episodes = 1 for episode in range(1, num_episodes + 1): current_ob = env.reset() rewards = [] while True: action = agent.act(current_ob, reward, done) ob, reward, done, info = env.step(action) processed_sample = process_sample(observation=current_ob, action=action, reward=reward, terminal=done, next_observation=ob, info=info) current_path.append(processed_sample) env.render_rollouts(current_path) path_length += 1 if done or path_length > 50:
def main(args):
    """Run the selected agent on a Gym environment and log per-episode returns.

    Args:
        args: namespace providing ``env_id``, ``monitoring``, ``seed``,
            ``agent_id``, ``hidden_size``, ``learning_rate``, ``gamma``,
            ``max_episodes`` and ``log_interval``.

    Returns:
        True once every episode has run and the environment has been closed.

    Raises:
        ValueError: if ``args.agent_id`` is not ``'random'``, ``'reinforce'``
            or ``'ac'``.
    """
    # Make the environment.
    env = gym.make(args.env_id)

    # Everything after creation runs under try/finally so the environment (and
    # the monitor wrapper, when enabled) is closed even if a step fails.
    try:
        # logging
        outdir = 'logs/secret_breakout'
        if args.monitoring:
            env = wrappers.Monitor(env, directory=outdir, video_callable=False, force=True)

        env.seed(args.seed)
        state = env.reset()

        # Get the action and observation space from the environment.
        logger.debug('Action space vector length: {}'.format(env.action_space.n))
        # TODO: fix this so that it can be directly read from env.observation_space
        logger.debug('Observation space vector length: {}'.format(len(state)))
        logger.debug('Max episode steps: {}'.format(env.spec.max_episode_steps))

        # Build the agent.
        if args.agent_id == 'random':
            agent = RandomAgent(env.action_space)
        elif args.agent_id == 'reinforce':
            agent = ReinforceAgent(input_size=len(state),
                                   hidden_size=args.hidden_size,
                                   output_size=env.action_space.n,
                                   learning_rate=args.learning_rate,
                                   gamma=args.gamma)
        elif args.agent_id == 'ac':
            # NOTE(review): the learning rate is fixed at 5e-3 here rather than
            # taken from args.learning_rate -- confirm this is intentional.
            agent = ActorCriticAgent(input_size=len(state),
                                     hidden_size=args.hidden_size,
                                     output_size=env.action_space.n,
                                     learning_rate=5e-3,
                                     gamma=args.gamma)
        else:
            # Fail here with a clear message instead of a NameError on the
            # first agent.act() call further down.
            raise ValueError('Unknown agent_id: {!r}'.format(args.agent_id))

        # reward/done are deliberately not reset between episodes: the first
        # act() call of an episode receives the previous episode's final values.
        reward = 0
        done = False
        ep_rewards = []
        ep_start_time = time.time()

        # NOTE(review): this runs max_episodes + 1 episodes (indices
        # 0..max_episodes) -- confirm the extra episode is intended.
        for i_episode in range(args.max_episodes + 1):
            state = env.reset()

            # keep track of the performance over the episode
            single_ep_cumulative_reward = 0

            # Bound the episode by the environment's step limit so it cannot
            # loop forever.
            for step in range(env.spec.max_episode_steps):
                # get the next action from the agent
                action = agent.act(state, reward, done)

                # perform the action in the environment
                state, reward, done, info = env.step(action)

                # track the episode performance
                single_ep_cumulative_reward += reward

                if done:
                    break

            # add the accumulated reward to list of episode returns
            ep_rewards.append(single_ep_cumulative_reward)

            # update reporting times
            ep_report_time = round(time.time() - ep_start_time, 2)
            ep_start_time = time.time()

            if i_episode % args.log_interval == 0:
                # Reports: wall time of this episode, episode index, its return,
                # the mean of the last five returns, and the last step's info.
                logger.info(
                    't(s): {}, ep: {}, R: {:.2f}, R_av_5: {:.2f}, i: {}'.format(
                        ep_report_time, i_episode, ep_rewards[-1],
                        np.mean(ep_rewards[-5:]), info))
    finally:
        env.close()

    return True