コード例 #1
0
ファイル: Trainer.py プロジェクト: RL4Inventory/RL4Inventory
    def eval(self, iter, eval_on_trainingset=False):
        self.switch_mode(eval=True)

        print(f"  == eval iteration {iter} == ")

        obss = self.env.reset(eval=True,
                              eval_on_trainingset=eval_on_trainingset)
        _, infos = self.env.state_calculator.world_to_state(self.env.world)
        rnn_states = {}
        rewards_all = {}
        episode_reward_all = {}
        episode_reward = {}
        episode_steps = []
        episode_step = 0

        tracker = SimulationTracker(self.env.done_step, 1,
                                    self.env.agent_ids())

        for agent_id in obss.keys():
            # policies[agent_id] = load_policy(agent_id)
            rnn_states[agent_id] = self.policies[agent_id].get_initial_state()
            rewards_all[agent_id] = []
            episode_reward_all[agent_id] = []
            episode_reward[agent_id] = 0

        for i in range(100000):
            episode_step += 1
            actions = {}
            # print("timestep : ", self.step)
            # print("Start calculate action ....")
            for agent_id, obs in obss.items():
                policy = self.policies[agent_id]
                action, new_state, _ = policy.compute_single_action(
                    obs,
                    state=rnn_states[agent_id],
                    info=infos[agent_id],
                    explore=False)
                actions[agent_id] = action
                # print(agent_id, " :", policy.__class__, " : ", action)
            next_obss, rewards, dones, infos = self.env.step(actions)

            for agent_id, reward in rewards.items():
                rewards_all[agent_id].append(reward)
                episode_reward[agent_id] += reward

            step_balances = {}
            for agent_id in rewards.keys():
                step_balances[agent_id] = self.env.world.facilities[
                    Utils.agentid_to_fid(
                        agent_id)].economy.step_balance.total()
            # print(env.world.economy.global_balance().total(), step_balances, rewards)
            tracker.add_sample(0, episode_step - 1,
                               self.env.world.economy.global_balance().total(),
                               step_balances, rewards)

            done = any(dones.values())

            if done:
                obss = self.env.reset(eval=True)
                episode_steps.append(episode_step)
                episode_step = 0
                for agent_id, reward in episode_reward.items():
                    episode_reward_all[agent_id].append(reward)
                    episode_reward[agent_id] = 0
                break
            else:
                obss = next_obss
        infos = {
            "rewards_all": rewards_all,
            "episode_reward_all": episode_reward_all,
            "epsilon": self.policies[self.policies_to_train[0]].epsilon,
            "all_step": self.step,
            "episode_step": sum(episode_steps) / len(episode_steps),
            "profit": tracker.get_retailer_profit(),
        }
        return infos
コード例 #2
0
def visualization(env, policies, iteration, policy_mode, basestock=False):

    policy_mode = policy_mode  # + f'_{iteration}'

    renderer = AsciiWorldRenderer()
    frame_seq = []

    evaluation_epoch_len = env.env_config['evaluation_len']
    starter_step = env.env_config['episod_duration']+env.env_config['tail_timesteps']
    env.set_iteration(1, 1)
    # env.env_config.update({'episod_duration': evaluation_epoch_len, 'downsampling_rate': 1})
    print(
        f"Environment: Producer action space {env.action_space_producer}, Consumer action space {env.action_space_consumer}, Observation space {env.observation_space}"
        , flush=True)
    obss = env.reset()
    if basestock:
        from scheduler.inventory_base_stock_policy import ConsumerBaseStockPolicy
        ConsumerBaseStockPolicy.facilities = env.world.facilities    

    if Utils.get_demand_sampler()=='ONLINE':
        env.set_retailer_step(starter_step)
    _, infos = env.state_calculator.world_to_state(env.world)


    # policies = {}
    rnn_states = {}
    rewards = {}
    for agent_id in obss.keys():
        # policies[agent_id] = load_policy(agent_id)
        rnn_states[agent_id] = policies[agent_id].get_initial_state()
        rewards[agent_id] = 0

    # Simulation loop
    tracker = SimulationTracker(evaluation_epoch_len, 1, env.agent_ids())
    print(f"  === evaluation length {evaluation_epoch_len}, it will take about 1 min ....", flush=True)

    for epoch in range(evaluation_epoch_len):
        action_dict = {}
        for agent_id, obs in obss.items():
            policy = policies[agent_id]
            action, new_state, _ = policy.compute_single_action(obs, state=rnn_states[agent_id], info=infos[agent_id],
                                                                explore=False)
            action_dict[agent_id] = action
            # if agent_id.startswith('SKUStoreUnit') and Utils.is_consumer_agent(agent_id):
            #     print(agent_id, action, rewards[agent_id])
            #     print(obs.tolist())
        obss, rewards, dones, infos = env.step(action_dict)
        step_balances = {}
        for agent_id in rewards.keys():
            step_balances[agent_id] = env.world.facilities[Utils.agentid_to_fid(agent_id)].economy.step_balance.total()
        # print(env.world.economy.global_balance().total(), step_balances, rewards)
        tracker.add_sample(0, epoch, env.world.economy.global_balance().total(), step_balances, rewards)
        # some stats
        stock_status = env.get_stock_status()
        order_in_transit_status = env.get_order_in_transit_status()
        demand_status = env.get_demand_status()

        tracker.add_sku_status(0, epoch, stock_status, order_in_transit_status, demand_status)

        frame = renderer.render(env.world)
        frame_seq.append(np.asarray(frame))

    print(tracker.get_retailer_profit())

    if not os.path.exists('output'):
        os.mkdir('output')

    if not os.path.exists('output/%s' % policy_mode):
        os.mkdir('output/%s' % policy_mode)

    if not os.path.exists(f'output/{policy_mode}/iter_{iteration}'):
        os.mkdir(f'output/{policy_mode}/iter_{iteration}')

    # tracker.render("output/%s/plot.png" % policy_mode)
    tracker.render(f'output/{policy_mode}/iter_{iteration}/plot.png')
    tracker.render_sku(policy_mode, iteration)
    print(f"  === evaluation length end ", flush=True)