import json

import config.config as cfg
from envs.allocation_env import AllocationEnv
from envs.prior import Prior  # assumed module path for the Prior class used below
from envs.state import State
from stable_baselines.deepq.replay_buffer import ReplayBuffer
from utils import serialize_floats, get_store_id  # get_store_id is assumed to live in utils

store_id = get_store_id(cfg.vals["train_data"])
TIME_STEPS = cfg.vals["episode_len"]
prior = Prior(config=cfg.vals)
env = AllocationEnv(config=cfg.vals, prior=prior, load_model=True)
results = {'rewards': [0.0]}
buffer = ReplayBuffer(size=50000)

obs = env.reset()
# roll out one episode with a random policy, storing each transition in the replay buffer
for i in range(TIME_STEPS):
    action = env.action_space.sample()
    proposed_action = AllocationEnv.check_action(obs['board_config'], action)
    new_obs, rew, dones, info = env.step(proposed_action)

    # a reward of -1 means the proposed action was rejected; record it as the null action (0)
    if rew == -1:
        action = 0

    print("Timestep: {}".format(i))
    print("action: {} - reward: {}".format(action, rew))
    print(obs['day_vec'])
    print(obs['board_config'])

    results['rewards'].append(rew + results['rewards'][-1])

    # add (s, a, r, s') to buffer
    buffer.add(obs_t=State.get_vec_observation(obs),
               action=action,
               reward=rew,
               obs_tp1=State.get_vec_observation(new_obs),
               done=float(dones))

    obs = new_obs
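
# A minimal sketch (not part of the original script; the batch size of 32 is an
# arbitrary choice) of how the transitions collected above could feed an
# off-policy update: ReplayBuffer.sample() returns numpy arrays of (s, a, r, s', done).
if len(buffer) >= 32:
    obses_t, actions, rewards, obses_tp1, dones = buffer.sample(32)
    print("sampled batch: obs {}, rewards {}".format(obses_t.shape, rewards.shape))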

# Evaluate a trained agent with action masking.
# `model` (an agent whose predict() accepts an action mask), `TEST_T` (steps per
# evaluation episode) and `n_actions` (size of the action space) are assumed to be
# defined elsewhere in the surrounding script.
results = {'rewards': [0.0]}
buffer = ReplayBuffer(size=50000)

# run 100 evaluation episodes
for j in range(100):

    obs = env.reset()

    for i in range(TEST_T):
        feasible_actions = AllocationEnv.get_feasible_actions(
            obs["board_config"])
        action_mask = AllocationEnv.get_action_mask(feasible_actions,
                                                    n_actions)
        action, _states = model.predict(obs, mask=action_mask)

        action = AllocationEnv.check_action(obs['board_config'], action)
        # step takes a batched action here and returns batched rewards, hence r[0] below
        new_obs, r, dones, info = env.step([action])

        results['rewards'].append(r[0] + results['rewards'][-1])

        # add (s, a, r, s') to buffer
        buffer.add(obs_t=State.get_vec_observation(obs),
                   action=action,
                   reward=r[0],
                   obs_tp1=State.get_vec_observation(new_obs),
                   done=float(dones))

        obs = new_obs

with open("output/rl-test-{}.json".format(cfg.vals['prj_name']), 'w') as f:
    json.dump(results, f)

# Naive baseline policy: always take the null action and track the cumulative reward.
import matplotlib.pyplot as plt
import numpy as np

TIME_STEPS = cfg.vals["episode_len"]
prior = Prior(config=cfg.vals)
env = AllocationEnv(config=cfg.vals, prior=prior, load_model=True)
results = {'rewards': [0.0]}

obs = env.reset()
for i in range(TIME_STEPS):
    # naive policy: always propose the null action (0) and let the env validate it
    action = 0
    action = AllocationEnv.check_action(obs['board_config'], action)
    obs, rew, dones, info = env.step(action)
    print("Timestep: {}".format(i))
    print("action: {} - reward: {}".format(action, rew))
    print(obs['day_vec'])
    print(obs['board_config'])

    results['rewards'].append(rew + results['rewards'][-1])

print(results['rewards'])


x = np.arange(TIME_STEPS+1)
plt.plot(x, results['rewards'])
plt.xlabel("Timestep (t)")
plt.ylabel("Cumulative Reward")
plt.savefig("figs/naive-policy-{}.png".format(cfg.vals['prj_name']))