from utils import serialize_floats, get_store_id  # get_store_id assumed to live in utils; it is called below without an import
from stable_baselines.deepq.replay_buffer import ReplayBuffer
from envs.allocation_env import AllocationEnv
from envs.prior import Prior  # assumed module path; Prior is used below but was not imported
from envs.state import State
import config.config as cfg

store_id = get_store_id(cfg.vals["train_data"])
TIME_STEPS = cfg.vals["episode_len"]

prior = Prior(config=cfg.vals)
env = AllocationEnv(config=cfg.vals, prior=prior, load_model=True)

results = {'rewards': [0.0]}
buffer = ReplayBuffer(size=50000)

obs = env.reset()
for i in range(TIME_STEPS):
    # sample a random action and map it to one feasible for the current board
    action = env.action_space.sample()
    proposed_action = AllocationEnv.check_action(obs['board_config'], action)
    new_obs, rew, dones, info = env.step(proposed_action)
    if rew == -1:
        action = 0

    print("Timestep: {}".format(i))
    print("action: {} - reward: {}".format(action, rew))
    print(obs['day_vec'])
    print(obs['board_config'])

    # running cumulative reward
    results['rewards'].append(rew + results['rewards'][-1])

    # add (s, a, r, s') to buffer; the original line was truncated after
    # reward=rew, so the remaining arguments are completed to match the
    # RL test script below
    buffer.add(obs_t=State.get_vec_observation(obs),
               action=action,
               reward=rew,
               obs_tp1=State.get_vec_observation(new_obs),
               done=float(dones))
    obs = new_obs
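# Sketch, not in the original script: once the rollout has filled the buffer,
# stable_baselines' ReplayBuffer can be sampled for off-policy training.
# sample() returns parallel arrays of transitions; the batch size of 32 is an
# arbitrary choice for illustration.
obses_t, actions, rewards, obses_tp1, dones = buffer.sample(32)
print("sampled batch of {} transitions".format(len(actions)))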
import json
from stable_baselines.deepq.replay_buffer import ReplayBuffer
from envs.allocation_env import AllocationEnv
from envs.state import State
from utils import serialize_floats
import config.config as cfg

# `model`, `env`, `TEST_T`, and `n_actions` are assumed to be defined earlier
# in this script: a trained policy, the AllocationEnv instance, the number of
# evaluation steps per episode, and the size of the action space.

results = {'rewards': [0.0]}
buffer = ReplayBuffer(size=50000)

for j in range(100):
    obs = env.reset()
    for i in range(TEST_T):
        # mask out infeasible actions before querying the policy
        feasible_actions = AllocationEnv.get_feasible_actions(obs["board_config"])
        action_mask = AllocationEnv.get_action_mask(feasible_actions, n_actions)
        action, _states = model.predict(obs, mask=action_mask)
        action = AllocationEnv.check_action(obs['board_config'], action)
        new_obs, r, dones, info = env.step([action])

        # running cumulative reward across all test episodes
        results['rewards'].append(r[0] + results['rewards'][-1])

        # add (s, a, r, s') to buffer
        buffer.add(obs_t=State.get_vec_observation(obs),
                   action=action,
                   reward=r[0],
                   obs_tp1=State.get_vec_observation(new_obs),
                   done=float(dones))
        obs = new_obs

# rewards come back as numpy floats, which json cannot serialize directly;
# serialize_floats (from utils) presumably handles this conversion - applying
# it here, with an assumed list-in/list-out signature, is an assumption
results['rewards'] = serialize_floats(results['rewards'])

with open("output/rl-test-{}.json".format(cfg.vals['prj_name']), 'w') as f:
    json.dump(results, f)
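# Quick diagnostic, not in the original script: results['rewards'] accumulates
# across all 100 test episodes without resetting, so the final cumulative value
# divided by the episode count gives the mean per-episode return.
print("mean episode return: {}".format(results['rewards'][-1] / 100.0))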
from envs.allocation_env import AllocationEnv
from envs.prior import Prior  # assumed module path; Prior is used below but was not imported
import config.config as cfg
import matplotlib.pyplot as plt
import numpy as np
from utils import serialize_floats
import json

TIME_STEPS = cfg.vals["episode_len"]

prior = Prior(config=cfg.vals)
env = AllocationEnv(config=cfg.vals, prior=prior, load_model=True)

results = {'rewards': [0.0]}
obs = env.reset()

for i in range(TIME_STEPS):
    # naive policy: always propose the null action (0), mapped to a feasible action
    action = AllocationEnv.check_action(obs['board_config'], 0)
    obs, rew, dones, info = env.step(action)

    print("Timestep: {}".format(i))
    print("action: {} - reward: {}".format(action, rew))
    print(obs['day_vec'])
    print(obs['board_config'])

    # running cumulative reward
    results['rewards'].append(rew + results['rewards'][-1])

print(results['rewards'])

# plot cumulative reward over the episode
x = np.arange(TIME_STEPS + 1)
plt.plot(x, results['rewards'])
plt.xlabel("Timestep (t)")
plt.ylabel("Cumulative Reward")
plt.savefig("figs/naive-policy-{}.png".format(cfg.vals['prj_name']))
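# Assumed ending: serialize_floats and json are imported above but unused in
# the flattened source, which suggests this script also dumped the reward
# curve to JSON, mirroring the RL test script; the output path is an assumption.
results['rewards'] = serialize_floats(results['rewards'])
with open("output/naive-policy-{}.json".format(cfg.vals['prj_name']), 'w') as f:
    json.dump(results, f)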