def create_random(elevators, floors):
    """Run 100 random-policy episodes and pickle the episode summaries."""
    summaries = []
    for _ in range(100):
        house = produce_house(20, elevators, floors)
        env = ElevatorEnv(house)
        random_policy = RandomPolicy()
        step = 0
        while not env.is_end_of_day():
            random_action = random_policy.get_action(env)
            env.step(
                ElevatorEnvAction(env.next_elevator,
                                  ElevatorActionEnum(random_action)))
            step += 1
        summaries.append(env.get_summary())
    # The file name encodes timestamp, MCTS sample count (0: random policy),
    # number of floors and number of elevators.
    with open(
            f'{datetime.now().strftime("%Y-%m-%d_%H:%M:%S")}'
            f"_mcts0"
            f"_floors{env.house.number_of_floors}"
            f"_elevs{len(env.house.elevators)}",
            "wb",
    ) as handle:
        pickle.dump(summaries, handle, protocol=pickle.HIGHEST_PROTOCOL)
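For reference, a summaries file written by create_random can be read back with pickle; the sketch below assumes a placeholder file name, not a file produced above.

# Hedged sketch: reload a pickled list of episode summaries for offline analysis.
# The file name is a placeholder following the pattern used in create_random.
import pickle

with open("2021-01-01_12:00:00_mcts0_floors3_elevs1", "rb") as handle:
    loaded_summaries = pickle.load(handle)
print(len(loaded_summaries))  # 100 summaries, one per random-policy episode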
def main(render: bool):
    run_name = "rp_2elev_3floor"

    logger = Logger(SummaryWriter(os.path.join("../../../runs", run_name)))
    yparams = YParams("../config.yaml", "default")
    config = yparams.hparams
    batch_count = (config["train"]["samples_per_iteration"] //
                   config["train"]["batch_size"])
    for i in range(config["train"]["iterations"]):
        summaries = []
        for episode in range(config["train"]["episodes"]):
            print(f"iteration {i}, episode {episode}")
            house = produce_house(
                elevator_capacity=10,
                number_of_elevators=2,
                number_of_floors=3,
            )
            # house = get_simple_house()

            env = ElevatorEnv(house)
            random_policy = RandomPolicy()
            step = 0
            while not env.is_end_of_day():
                random_action = random_policy.get_action(env)
                prev_time = env.house.time
                action = ElevatorEnvAction(env.next_elevator,
                                           ElevatorActionEnum(random_action))
                env.step(action)
                step += 1
                if render:
                    # Write one render file per environment step into the per-iteration plot directory.
                    root_dir = os.path.dirname(os.path.abspath(__file__))
                    plot_path = os.path.join(
                        root_dir, "../plots/run_{}/iteration{}".format(run_name, i))
                    Path(plot_path).mkdir(parents=True, exist_ok=True)
                    env.render(method="file",
                               path=plot_path,
                               prev_time=prev_time,
                               action=action)

            # print("Total reward at the end of day: {}".format(env.reward_acc))
            print(env.get_summary())
            summaries.append(env.get_summary())
        logger.write_episode_summaries(summaries, i * batch_count)
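A possible command-line entry point for the random-policy run above; the --render flag name is an assumption and simply forwards to main(render).

# Hedged sketch of an entry point for main(render); the flag name is assumed.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--render", action="store_true",
                        help="write a render file for every environment step")
    args = parser.parse_args()
    main(render=args.render)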
def main():
    """Benchmark the random policy and MCTS with a uniform model across house sizes."""
    for floors in range(3, 11):
        print_double(f"test houses with {floors} floors")
        for elevators in [1, 2]:
            print_double(f"test houses with {elevators} elevators")
            create_random(elevators, floors)
            for MCTS_SAMPLES in [10, 20, 50, 100, 200]:
                print_double(f"use {MCTS_SAMPLES} mcts samples")
                logging.basicConfig(level=logging.ERROR)
                house = produce_house(20, elevators, floors)

                env = ElevatorEnv(house)
                # env.render(method="matplotlib", step=0)
                generator = Generator(env, ranked_reward_buffer=None)

                factory = MultiProcessEpisodeFactory(generator)
                model = UniformModel()
                episodes = factory.create_episodes(
                    EPISODES,
                    PROCESSES,
                    MCTS_SAMPLES,
                    MCTS_TEMP,
                    MCTS_CPUCT,
                    MCTS_OBSERVATION_WEIGHT,
                    model,
                )
                # Each episode tuple is (observations, pis, total_reward, summary); keep only the summaries.
                summaries = [e[3] for e in episodes]

                print_double("")
                with open(
                        f'{datetime.now().strftime("%Y-%m-%d_%H:%M:%S")}'
                        f"_mcts{MCTS_SAMPLES}"
                        f"_floors{env.house.number_of_floors}"
                        f"_elevs{len(env.house.elevators)}",
                        "wb",
                ) as handle:
                    pickle.dump(summaries,
                                handle,
                                protocol=pickle.HIGHEST_PROTOCOL)
                avg, stddev = combine_summaries(summaries)
                print_double(str(avg))
                print_double(str(stddev))
                print_double(
                    f"{MCTS_SAMPLES} mcts samples, mcts_temp: {MCTS_TEMP}, "
                    f"floors{env.house.number_of_floors}, elevs{len(env.house.elevators)}"
                )
                print_double("\n\n")
def main():
    """Sanity check: query an untrained NNModel for a policy and value estimate."""
    house = produce_house(elevator_capacity=10,
                          number_of_elevators=3,
                          number_of_floors=7)
    env = ElevatorEnv(house)
    sample_observation = env.get_observation().as_array()
    house_obs_dims = sample_observation[0].shape[0]
    elevator_obs_dims = sample_observation[1].shape[0]
    model = NNModel(
        house_observation_dims=house_obs_dims,
        elevator_observation_dims=elevator_obs_dims,
        policy_dims=3,
    )
    model.eval()
    policy, value = model.get_policy_and_value(env)
    print(policy)
    print(value)
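As a follow-up to the sanity check above, the model size can be reported as well; this assumes NNModel is a torch.nn.Module, which model.eval() and the torch checkpoints used elsewhere suggest.

# Hedged sketch: count the parameters of an NNModel instance (assumes torch.nn.Module).
def report_model_size(model) -> int:
    n_params = sum(p.numel() for p in model.parameters())
    print(f"NNModel has {n_params} parameters")
    return n_params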
def learning_loop(
        config: Dict,
        run_name: str,
        yparams: YParams,
        time_out: timedelta = timedelta(hours=24),
):
    """Main training loop: sample episodes with MCTS, fill the replay buffer, and train the model."""
    best_waiting_time = None

    start_time = datetime.now()

    logger = Logger(SummaryWriter(path.join(config["path"], run_name)))
    logger.write_hparams(yparams=yparams)
    eval_logging_process = EvaluationLoggingProcess(config, run_name)

    batch_count = (config["train"]["samples_per_iteration"] //
                   config["train"]["batch_size"])
    house = produce_house(
        elevator_capacity=config["house"]["elevator_capacity"],
        number_of_elevators=config["house"]["number_of_elevators"],
        number_of_floors=config["house"]["number_of_floors"],
    )

    env = ElevatorEnv(house)
    # env.render(method="matplotlib")

    replay_buffer = ReplayBuffer(capacity=config["replay_buffer"]["size"])
    ranked_reward_buffer = RankedRewardBuffer(
        capacity=config["ranked_reward"]["size"],
        threshold=config["ranked_reward"]["threshold"],
    )

    generator = Generator(env, ranked_reward_buffer)
    factory: EpisodeFactory
    if config["train"]["n_processes"] > 1:
        factory = MultiProcessEpisodeFactory(generator)
    else:
        factory = SingleProcessEpisodeFactory(generator)

    if config["pure_mcts"]:
        model = UniformModel()
    else:
        model = NNModel(
            house_observation_dims=env.get_observation().as_array()
            [0].shape[0],
            elevator_observation_dims=env.get_observation().as_array()
            [1].shape[0],
            policy_dims=ElevatorActionEnum.count(),
        )

    if config["pretrained_path"]:
        checkpoint = torch.load(config["pretrained_path"])
        env = checkpoint["environment"]
        model.load_state_dict(checkpoint["model_state_dict"])
        iteration_start = checkpoint["iteration_start"]
        replay_buffer = checkpoint["replay_buffer"]
        ranked_reward_buffer = checkpoint["ranked_reward_buffer"]
    else:
        iteration_start = 0

    for i in range(iteration_start, config["train"]["iterations"]):
        # stop after timeout
        if datetime.now() - start_time > time_out:
            print(f"stopping because of timeout after {time_out}")
            break

        print(f"\niteration {i}")
        if not config["offline_training"]:
            print(f"\niteration {i}: sampling started")
            episodes = factory.create_episodes(
                n_episodes=config["train"]["episodes"],
                n_processes=config["train"]["n_processes"],
                mcts_samples=config["mcts"]["samples"],
                mcts_temp=config["mcts"]["temp"],
                mcts_cpuct=config["mcts"]["cpuct"],
                mcts_observation_weight=config["mcts"]["observation_weight"],
                model=model,
            )

            summaries = []
            for episode_index, e in enumerate(episodes):
                observations, pis, total_reward, summary = e
                # Store one (observation, MCTS policy, total episode reward) training sample per step.
                for j, pi in enumerate(pis):
                    sample = (observations[j], pi, total_reward)
                    replay_buffer.push(sample)
                summaries.append(summary)

            logger.write_episode_summaries(summaries, i * batch_count)
            # Average the episode summaries and track the mean waiting time per person.
            waiting_time = accumulate_summaries(
                summaries,
                lambda x: sum(x) / len(x)).avg_waiting_time_per_person

            if best_waiting_time is None or waiting_time < best_waiting_time:
                best_waiting_time = waiting_time
                save_model(env, i, model, replay_buffer, ranked_reward_buffer,
                           config, run_name)

            if i > 0 and i % 3 == 0:
                logger.plot_summaries(False, i)

            if i > 0 and i % 10 == 0 and False:  # FIXME: evaluation run currently disabled via `and False`
                p = Process(
                    target=evaluation_process,
                    args=(generator, config, model, i, run_name,
                          eval_logging_process),
                )
                p.start()

        # TRAIN model
        if not config["pure_mcts"]:
            logs = train(model, replay_buffer, ranked_reward_buffer,
                         i * batch_count, config)
            logger.log_train(logs)

            if config["save_iterations"]:
                save_model(env, i, model, replay_buffer, ranked_reward_buffer,
                           config, run_name)
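A minimal sketch of how learning_loop might be invoked; the config path, section name, run name and timeout are assumptions mirroring the snippets above, not verified values.

# Hedged sketch: launch the training loop from a YAML config (paths and names assumed).
if __name__ == "__main__":
    yparams = YParams("../config.yaml", "default")
    learning_loop(
        config=yparams.hparams,
        run_name="elevator_training_run",
        yparams=yparams,
        time_out=timedelta(hours=24),
    )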