def run_random(stochastic=False, noisy=False, problem_id=0):
    # Build the environment; problem_id only applies to the deterministic setting
    if stochastic:
        env = virl.Epidemic(stochastic=True, noisy=noisy)
    else:
        env = virl.Epidemic(stochastic=stochastic, noisy=noisy, problem_id=problem_id)
    states = []
    rewards = []
    done = False
    s = env.reset()
    states.append(s)
    while not done:
        # s, r, done, i = env.step(action=0)  # deterministic agent
        s, r, done, i = env.step(action=np.random.choice(env.action_space.n))  # random agent
        states.append(s)
        rewards.append(r)
    if stochastic:
        print('Stochastic=True, with Noisy=' + str(noisy) + ', rewards=' + str(sum(rewards)))
    else:
        print('Problem ' + str(problem_id) + ' reward sum: ' + str(sum(rewards)))
    # Generate pictures
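# Usage sketch (an illustration, not part of the original script): assuming numpy as np
# and virl are imported at module level as in the snippets below, the random baseline can
# be run over every deterministic problem and then once on the stochastic environment.
for pid in range(10):
    run_random(stochastic=False, noisy=False, problem_id=pid)
run_random(stochastic=True, noisy=True)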
def run_qtable(stochastic=False, noisy=False, id=0, num_episodes=20):
    global rw
    global sss
    rw = []
    # Build the Q-table
    table = build_q_table(ACTIONS)
    print("Q-Learning training")
    # Initialise the environment
    if stochastic:
        env = virl.Epidemic(stochastic=True, noisy=noisy)
    else:
        env = virl.Epidemic(stochastic=stochastic, noisy=noisy, problem_id=id)
    rewards = []
    for episode in range(num_episodes):
        states = []
        rewards = []
        done = False
        s = env.reset()
        states.append(s)
        while not done:
            a = choose_action(s, table)
            s_, R_, done, i = env.step(action=a)
            # Training update
            qlearning_train(s, a, R_, s_, done, table)
            s = s_
            states.append(s)
            rewards.append(R_)
        sss = states
    env.close()
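# Illustrative sketch only: build_q_table, choose_action and qlearning_train are defined
# elsewhere in the project. The versions below (suffixed _sketch) show one way a tabular
# Q-learner over a discretised epidemic state could look, consistent with the DataFrame
# Q-table loaded via pd.read_csv in QLtest further down. The hyperparameters, the
# bucketing scheme and the helper names are assumptions, not the project's actual code;
# numpy as np and pandas as pd are assumed to be imported, as elsewhere in this file.
EPSILON_Q, LR_Q, GAMMA_Q, N_BUCKETS = 0.1, 0.1, 0.95, 10

def discretise_state_sketch(state):
    # bucket the infectious fraction of the population into N_BUCKETS bins
    frac_infected = state[1] / max(np.sum(state), 1.0)
    return min(int(frac_infected * N_BUCKETS), N_BUCKETS - 1)

def build_q_table_sketch(actions):
    # one row per discretised state, one column per action id, initialised to zero
    return pd.DataFrame(np.zeros((N_BUCKETS, len(actions))), columns=list(range(len(actions))))

def choose_action_sketch(state, table):
    row = table.iloc[discretise_state_sketch(state)]
    if np.random.uniform() < EPSILON_Q or (row == 0).all():
        return np.random.randint(len(row))   # explore
    return int(row.values.argmax())          # exploit

def qlearning_train_sketch(s, a, r, s_, done, table):
    # standard one-step Q-learning update: Q(s, a) += lr * (target - Q(s, a))
    i, i_ = discretise_state_sketch(s), discretise_state_sketch(s_)
    q_predict = table.iloc[i, a]
    q_target = r if done else r + GAMMA_Q * table.iloc[i_].max()
    table.iloc[i, a] = q_predict + LR_Q * (q_target - q_predict)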
def run_pg(stochastic=True, noisy=True, problem_id=0, episodes=20):
    # Build the environment and a label used in the output filenames
    if stochastic:
        env = virl.Epidemic(stochastic=True, noisy=noisy)
        nl = 'nt' if noisy else 'nf'
        label = 'stochastic_{}'.format(nl)
    else:
        env = virl.Epidemic(stochastic=stochastic, noisy=noisy, problem_id=problem_id)
        nl = 'nt' if noisy else 'nf'
        label = 'problem{}_{}'.format(problem_id, nl)
    # get the agent, which is controlled by the policy gradient
    state = env.reset()
    agent = policyGradient(actions=env.actions, stateSize=len(state), seed=1)
    rewards = []
    states = []
    for i in range(episodes):
        # reset the environment for a new episode
        state = env.reset()
        while True:
            # RL chooses an action based on the observation
            action = agent.getAction(state)
            # RL takes the action and receives the next observation and reward
            state_, reward, done, info = env.step(action)
            if i == episodes - 1:
                # only record the final (fully trained) episode
                rewards.append(reward)
                states.append(state)
            # store the (s, a, r) tuple for training
            agent.storeTransition(state, action, reward)
            # update the current state using the observation
            state = state_
            # the episode is over; begin a new one
            if done:
                break
        # train on the completed episode
        agent.train()
    print(sum(rewards))
    # return rewards, states
    draw_pic(agent.lossList, 'iter', 'loss', 'PG_loss{}_{}.png'.format(episodes, label))
    draw_pic(rewards, 'iter', 'reward', 'PG_reward_{}_{}.png'.format(episodes, label))
    draw_state(states, 'PG_state_{}_{}.png'.format(episodes, label))
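# Illustrative sketch only: the policyGradient class used above is defined elsewhere. In
# a typical episodic policy-gradient (REINFORCE-style) implementation, agent.train()
# turns the stored per-step rewards into discounted, normalised returns before computing
# the loss; the helper below shows that step under an assumed discount factor gamma,
# which is not taken from the project's code.
def discounted_returns_sketch(rewards, gamma=0.99):
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    # normalising the returns reduces the variance of the gradient estimate
    return (returns - returns.mean()) / (returns.std() + 1e-8)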
def get_all_problems_fig():
    pic_dir = './results/Deterministic'
    for noisy in [True, False]:
        for act in range(4):
            filename = os.path.join(
                pic_dir, 'all_problems_noisy={} action={}.png'.format(noisy, act))
            if os.path.exists(filename):
                continue
            fig, ax = plt.subplots(figsize=(8, 6))
            for i in range(10):
                env = virl.Epidemic(problem_id=i, noisy=noisy)
                states = []
                rewards = []
                done = False
                s = env.reset()
                states.append(s)
                while not done:
                    s, r, done, info = env.step(action=act)  # deterministic agent
                    states.append(s)
                    rewards.append(r)
                ax.plot(np.array(states)[:, 1], label=f'problem_id={i}')
            ax.set_xlabel('weeks since start of epidemic')
            ax.set_ylabel('Number of Infectious persons')
            ax.set_title('Simulation of problem_ids with action {}'.format(act))
            ax.legend()
            plt.savefig(dpi=300, fname=filename)
def run_qlnn(stochastic=False, noisy=False, id=0):
    # Set environment parameters
    if stochastic:
        env = virl.Epidemic(stochastic=True, noisy=noisy)
        print('env stochastic=' + str(stochastic) + '/noisy=' + str(noisy))
    else:
        env = virl.Epidemic(stochastic=stochastic, noisy=noisy, problem_id=id)
        print('env stochastic=' + str(stochastic) + '/noisy=' + str(noisy) + '/problem_id=' + str(id))
    d_states = env.observation_space.shape[0]
    n_actions = env.action_space.n
    print('initialise function approximator with learning rate: ' + str(alpha))
    # Init the two networks (online and target)
    nn_func_approximator = NNFunctionApproximatorJointKeras(
        alpha, d_states, n_actions, nn_config)
    nn_func_approximator_target = NNFunctionApproximatorJointKeras(
        alpha, d_states, n_actions, nn_config)
    # Training
    print('Training>>>')
    stats = q_learning_nn(env, nn_func_approximator, nn_func_approximator_target, 20,
                          max_steps_per_episode=52, epsilon_init=0.1, epsilon_decay=0.995,
                          epsilon_min=0.001, fn_model_in=None, fn_model_out="temp.h5")
    print('Training done!')
    print('Testing>>>')
    nn_func_approximator.alpha = 0.0
    epsilon_fixed = 0.1
    stats_show = q_learning_nn(env, nn_func_approximator, nn_func_approximator_target, 1,
                               max_steps_per_episode=52, epsilon_init=epsilon_fixed,
                               epsilon_decay=1.0, epsilon_min=epsilon_fixed, show=True,
                               fn_model_in="temp.h5")
    print('Test done!')
def get_env():
    # Yield every deterministic problem in both the noisy and noise-free variants,
    # together with a 'True'/'False' label for the noise setting.
    for i in range(10):
        for j in range(2):
            if j:
                noisy = 'True'
            else:
                noisy = 'False'
            problem_id = i
            yield problem_id, noisy, virl.Epidemic(problem_id=i, noisy=j)
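# Usage sketch (illustration only): iterate over every (problem_id, noisy) pair yielded
# by the generator above, e.g. to benchmark the same agent across all deterministic
# environments.
for problem_id, noisy, env in get_env():
    print('problem_id={}, noisy={}, n_actions={}'.format(problem_id, noisy, env.action_space.n))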
def evaluate(policy, problem_id=0, full_eval=False, verbose=True, noisy=False):
    """
    Evaluate a policy.

    :param policy: a callable that, given a state as input, returns an action
    :param full_eval: whether to evaluate the policy on all the problems or on the selected problem only
    :param verbose: whether to print verbose output
    :param noisy: whether to simulate a noisy environment
    """
    # trained_policy = create_policy(approximator_dl, 0, 4)
    if not full_eval:
        limit = 1
        envs = [virl.Epidemic(problem_id=problem_id, noisy=noisy)]
    else:
        limit = 10
        envs = [virl.Epidemic(problem_id=i, noisy=noisy) for i in range(limit)]
    fig, axes = plt.subplots(limit, 2, figsize=(20, 8 * limit))
    total_rewards = []
    for i, env in enumerate(envs, start=problem_id):
        states, rewards, action_taken = execute_policy(policy, env)
        if verbose:
            print(i, action_taken)
        # small hack: with a single problem, plt.subplots returns a 1-D axes array
        if limit == 1:
            axes_wrapper = [axes[0], axes[1]]
        else:
            axes_wrapper = axes[i]
        plot(states, rewards, action_taken, axes=axes_wrapper, problem_id=i)
        total_rewards.append(sum(rewards))
    if limit > 1:
        _, ax = plt.subplots(1, 1, figsize=(10, 4))
        ax.bar(np.arange(limit), total_rewards)
        ax.set_xticks(np.arange(limit))
    return total_rewards
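# Illustrative sketch only: execute_policy and plot are defined elsewhere in the project.
# The rollout below shows what execute_policy is assumed to do; it takes a policy that
# returns a probability distribution over the actions (as policy_greedy below does) and
# picks the most probable one, whereas the real helper may sample instead.
def execute_policy_sketch(policy, env):
    states, rewards, actions_taken = [], [], []
    done = False
    state = env.reset()
    states.append(state)
    while not done:
        action = int(np.argmax(policy(state)))   # greedy choice over the action probabilities
        state, reward, done, _ = env.step(action=action)
        states.append(state)
        rewards.append(reward)
        actions_taken.append(action)
    return states, rewards, actions_taken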
def train(lr=0.01, n_episodes=50):
    print('qlearning nn training...')
    env = virl.Epidemic()
    n_actions = env.action_space.n
    n_states = env.observation_space.shape[0]
    policy_estimator = NeuralNetwork(env, n_states, n_actions, lr=lr)
    stats = implement(env, policy_estimator, n_episodes, discount_factor=0.95)
    results_dir = './results/qlearning_nn'
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    pkl_file = os.path.join(
        results_dir, 'qlearning_nn_lr={}_episodes={}.pkl'.format(lr, n_episodes))
    with open(pkl_file, 'wb') as f:
        pickle.dump(policy_estimator, f)
    return policy_estimator
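# Usage sketch (illustration only): train with the defaults above, then reload the
# pickled estimator later; the path simply mirrors the format string used in train().
policy_estimator = train(lr=0.01, n_episodes=50)
with open('./results/qlearning_nn/qlearning_nn_lr=0.01_episodes=50.pkl', 'rb') as f:
    restored_estimator = pickle.load(f)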
def policy_greedy(state) -> np.array:
    # One-step greedy policy: score each action with a hand-crafted reward estimate and
    # put all the probability mass on the best one.
    def eval_reward(state, action):
        policy_severity_factor = 1e11
        a = state[1] + state[2]          # infectious + quarantined
        b = (1 - action)
        expected_a = a * (1 + action - 0.1)
        val = (-expected_a - expected_a**2 - policy_severity_factor * b -
               policy_severity_factor * b**2) / policy_severity_factor
        return val

    env = virl.Epidemic()
    greedy_rewards = np.array([eval_reward(state, a) for a in env.actions])
    action_id = np.argmax(greedy_rewards)
    action_proba = [0.0] * 4
    action_proba[action_id] = 1.0
    return action_proba
def evaluate_stochastic(policy, num_tries=10, noisy=True):
    """
    Evaluate a policy in a stochastic environment.

    Adapted (rather crudely) from generate_readme_plots.ipynb.

    :param policy: a callable that returns a probability distribution over the actions
    :param num_tries: the number of episodes to run
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    for i in range(num_tries):
        env = virl.Epidemic(stochastic=True, noisy=noisy)
        states, rewards, actions_taken = execute_policy(policy, env)
        ax.plot(np.array(states)[:, 1], label=f'draw {i}')
    ax.set_xlabel('weeks since start of epidemic')
    ax.set_ylabel('Number of Infectious persons')
    ax.set_title(f'Simulation of {num_tries} stochastic episodes without intervention')
    ax.legend()
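# Usage sketch (illustration only): the greedy policy above returns a probability
# distribution over the actions, which is the interface evaluate_stochastic documents;
# it is assumed here that execute_policy (and hence evaluate) accepts the same interface.
total_rewards = evaluate(policy_greedy, full_eval=True, verbose=False)
evaluate_stochastic(policy_greedy, num_tries=10, noisy=True)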
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import os
os.chdir('..')

from matplotlib import pyplot as plt
import numpy as np
import virl

# set up the environment with a randomly chosen problem
env = virl.Epidemic(stochastic=False, noisy=False, problem_id=(np.random.choice(10)))
states = []
rewards = []
done = False
s = env.reset()
states.append(s)
# run a random-action agent until the episode ends
while not done:
    s, r, done, i = env.step(action=np.random.choice(env.action_space.n))
    states.append(s)
    rewards.append(r)
print(rewards)
import virl
import numpy as np
from agents import DeterministicAgent, RandomAgent
from matplotlib import pyplot as plt

env = virl.Epidemic(stochastic=False, noisy=False)
agent = DeterministicAgent(env)
states = []
rewards = []
done = False
s = env.reset()
states.append(s)
while not done:
    s, r, done, i = env.step(action=agent.get_action())
    states.append(s)
    rewards.append(r)
states = np.array(states)
rewards = np.array(rewards)
plt.plot(rewards)
print(states[:, 0][0:4])
print(states[:, 1][0:4])
print(states[:, 2][0:4])
print(states[:, 3][0:4])
print("rewards " + str(rewards[:4]))
def QLtest(stochastic=False, noisy=False, problem_id=0, num_episodes=20):
    k = problem_id
    print(stochastic, noisy)
    time_start = time.time()
    run_qtable(stochastic=stochastic, noisy=noisy, id=problem_id, num_episodes=num_episodes)
    time_end = time.time()
    print('QL totally cost', time_end - time_start)
    # Test Q-learning
    # Set the parameters
    if stochastic:
        env = virl.Epidemic(stochastic=True, noisy=noisy)
    else:
        env = virl.Epidemic(stochastic=stochastic, noisy=noisy, problem_id=problem_id)
    states = []
    rewards = [0]
    done = False
    s = env.reset()
    states.append(s)
    ac = []
    # Load the trained Q-table
    if stochastic:
        qtable = pd.read_csv(r'Qtable_stochastic.csv')
    else:
        qtable = pd.read_csv(r'Qtable' + str(k) + '.csv')
    # Testing
    while not done:
        a = choose_action(s, qtable)
        ac.append(a)
        s_, R_, done, i = env.step(action=a)
        s = s_
        states.append(s)
        rewards.append(R_)
    if stochastic:
        # table.to_csv(r'Qtable_stochastic.csv', index=0)
        print('Stochastic=True, with Noisy=' + str(noisy) + ', rewards=' + str(sum(rewards)))
    else:
        # table.to_csv(r'Qtable' + str(problem_id) + '.csv', index=0)
        print('Problem ' + str(problem_id) + ' reward sum: ' + str(sum(rewards)))
    # Generate pictures
    plt.figure(1)
    states = np.array(states)
    labels = ['susceptibles', 'infectious', 'quarantined', 'recovereds']
    x = np.arange(0, len(states[:, 1]))
    for i in range(0, 4):
        plt.plot(x, states[:, i], label=labels[i])
    path = 'QL(Noisy)_problem_' + str(k) + '.svg'
    plt.xlabel('Weeks')
    plt.ylabel('States')
    print(ac)
    plt.legend()
    # plt.savefig(path)
    plt.figure(2)
    plt.plot(x, rewards)
    plt.xlabel('Weeks (reward sum: ' + str(float(sum(rewards))) + ')')
    plt.ylabel('Reward')
    # plt.savefig(r'QL_reward_' + str(k) + '.svg')
    plt.show()
INITIAL_EPSILON = 1      # starting value of epsilon
FINAL_EPSILON = 0.1      # final value of epsilon
REPLAY_SIZE = 10000      # experience replay buffer size
BATCH_SIZE = 32          # size of minibatch
s = np.array([0., 0, 0, 0])  # epidemic state
c = 1.                   # infection rate damping

for i in range(1):
    try:
        episodes = int(sys.argv[1])
    except Exception as e:
        episodes = int(500)  # default number of training episodes
    # random.seed(1)
    env = virl.Epidemic(noisy=False, problem_id=i)  # create the environment
    action_dim = env.action_space.n
    # state_dim = env.observation_space.n
    agent = DQN()  # the DQN agent is built independently of the environment instance
    all_reward = []  # per-episode reward record
    for episode in range(episodes + 1):  # loop over the episodes
        state = env.reset()  # restart the environment
        agent.epsilon -= 0.7 / episodes
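# Note on the schedule above, plus an illustrative helper: if agent.epsilon starts at
# INITIAL_EPSILON, subtracting 0.7 / episodes each episode only brings it down to about
# 0.3 rather than FINAL_EPSILON = 0.1. A linear annealing helper that reaches and then
# holds FINAL_EPSILON could look like the sketch below (an assumption, not the project's
# DQN code).
def annealed_epsilon_sketch(episode, episodes):
    frac = min(episode / max(episodes, 1), 1.0)
    return max(INITIAL_EPSILON + frac * (FINAL_EPSILON - INITIAL_EPSILON), FINAL_EPSILON)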
    def update(self, s, a, td_target):
        """
        Updates the approximator's parameters (i.e. the weights) for a given state and
        action towards the target y (which is the TD target).
        """
        features = self.featurize_state(s)
        # recall that we keep a separate function approximator for each action
        self.models[a].partial_fit([features], [td_target])


from utils import (q_learning, exec_policy, get_fig, plt)

if __name__ == '__main__':
    env = virl.Epidemic(stochastic=False, noisy=False)
    rbf_file = './rbf.pkl'
    if os.path.exists(rbf_file):
        with open(rbf_file, 'rb') as f:
            rbf_func = pickle.load(f)
        print('Loaded the RBF approximator from file.')
    else:
        rbf_func = RbfFunctionApproximator(env)
        # training
        states = q_learning(env, rbf_func, 1500, epsilon=0.05)
        # save the approximate function
        with open(rbf_file, 'wb') as f:
            pickle.dump(rbf_func, f)
    # make the results directory
    if not os.path.exists('./results/RBF'):
        os.makedirs('./results/RBF')
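# Illustrative sketch only: RbfFunctionApproximator's constructor and featurize_state are
# defined elsewhere; the partial_fit call in update() matches the common scikit-learn
# pattern of one incremental SGDRegressor per action on top of RBFSampler features. The
# class below shows that pattern under the assumption that env.observation_space is a
# gym Box; the kernel widths, component counts and sample size are illustrative choices,
# not the project's actual configuration.
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler

class RbfFunctionApproximatorSketch:
    def __init__(self, env, n_samples=1000):
        # fit the scaler and the RBF feature map on states sampled from the observation space
        samples = np.array([env.observation_space.sample() for _ in range(n_samples)])
        self.scaler = StandardScaler().fit(samples)
        self.featurizer = FeatureUnion([
            ('rbf1', RBFSampler(gamma=5.0, n_components=100)),
            ('rbf2', RBFSampler(gamma=1.0, n_components=100)),
        ]).fit(self.scaler.transform(samples))
        # one incremental linear model per action, primed with a single dummy update
        self.models = []
        for _ in range(env.action_space.n):
            model = SGDRegressor(learning_rate='constant')
            model.partial_fit([self.featurize_state(env.reset())], [0.0])
            self.models.append(model)

    def featurize_state(self, s):
        return self.featurizer.transform(self.scaler.transform([s]))[0]

    def predict(self, s):
        # Q-value estimate for every action in state s
        features = self.featurize_state(s)
        return np.array([m.predict([features])[0] for m in self.models])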