def double_q_learning(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
    # Off-policy TD: find the optimal greedy policy while following an epsilon-greedy policy
    Q_A = defaultdict(lambda: np.zeros(env.action_space.n))
    Q_B = defaultdict(lambda: np.zeros(env.action_space.n))
    Total_Q = defaultdict(lambda: np.zeros(env.action_space.n))

    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # state = 0
    # actions_init = 0
    # Total_Q[state][actions_init] = Q_A[state][actions_init] + Q_B[state][actions_init]

    # choose actions based on Q_A + Q_B
    policy = make_epsilon_greedy_policy(Total_Q, epsilon, env.action_space.n)

    for i_episode in range(num_episodes):
        state = env.reset()

        for t in itertools.count():
            # choose a from the policy derived from Q1 + Q2 (epsilon-greedy here)
            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)

            # with the taken action, observe the reward and the next state
            next_state, reward, done, _ = env.step(action)

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # randomly choose whether to update A or to update B
            random_number = random.randint(1, 2)

            if random_number == 1:
                best_action_Q_A = np.argmax(Q_A[next_state])
                TD_Target_A = reward + discount_factor * Q_B[next_state][best_action_Q_A]
                TD_Delta_A = TD_Target_A - Q_A[state][action]
                Q_A[state][action] += alpha * TD_Delta_A
            elif random_number == 2:
                best_action_Q_B = np.argmax(Q_B[next_state])
                TD_Target_B = reward + discount_factor * Q_A[next_state][best_action_Q_B]
                TD_Delta_B = TD_Target_B - Q_B[state][action]
                Q_B[state][action] += alpha * TD_Delta_B

            # keep the combined table in sync with Q_A + Q_B so the epsilon-greedy policy
            # sees the updated values (previously this ran after state was advanced, so
            # the wrong entry was written and terminal transitions were never reflected)
            Total_Q[state][action] = Q_A[state][action] + Q_B[state][action]

            if done:
                break

            state = next_state

    return Total_Q, stats
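# The tabular functions in this collection call a make_epsilon_greedy_policy helper that is
# not shown here. The sketch below is a minimal reference implementation, assuming Q is a
# defaultdict of per-action value arrays and the returned policy maps a state to action
# probabilities; some functions further down use variants with extra arguments (e.g. the
# gridworld q_learning passes the episode index), so treat this as an assumption, not the
# authors' exact helper.
import numpy as np


def make_epsilon_greedy_policy(Q, epsilon, nA):
    """Return a function state -> action probabilities, epsilon-greedy w.r.t. Q."""
    def policy_fn(state):
        # spread epsilon uniformly over all actions
        action_probs = np.ones(nA, dtype=float) * epsilon / nA
        # put the remaining probability mass on the greedy action
        best_action = np.argmax(Q[state])
        action_probs[best_action] += 1.0 - epsilon
        return action_probs
    return policy_fn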
def two_step_tree_backup(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in range(num_episodes):
        print("Number of Episodes, Two Step Tree Backup", i_episode)
        state = env.reset()

        # steps within each episode
        for t in itertools.count():
            # choose A from S using the policy derived from Q (epsilon-greedy)
            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)

            # reward and next state for the action chosen by the epsilon-greedy policy
            next_state, reward, done, _ = env.step(action)

            # reward obtained by taking the action under the policy pi
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            next_action_probs = policy(next_state)
            next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)

            # V = sum_a pi(a | s_{t+1}) Q(s_{t+1}, a)
            V = np.sum(next_action_probs * Q[next_state])

            # note: this second env.step advances the environment to peek one more step ahead
            next_next_state, next_reward, _, _ = env.step(next_action)
            next_next_action_probs = policy(next_next_state)
            next_next_action = np.random.choice(np.arange(len(next_next_action_probs)),
                                                p=next_next_action_probs)

            next_V = np.sum(next_next_action_probs * Q[next_next_state])
            Delta = next_reward + discount_factor * next_V - Q[next_state][next_action]

            # print("Delta :", Delta)
            # print("Next Action Prob ", np.max(next_action_probs))

            next_action_selection_probability = np.max(next_action_probs)
            td_target = reward + discount_factor * V \
                + discount_factor * next_action_selection_probability * Delta

            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta

            if done:
                break

            state = next_state

    return stats
def Q_Sigma_Off_Policy(env, num_episodes, discount_factor=1.0, alpha=0.1, epsilon=0.1):
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))
    # policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    tau = 1
    tau_decay = 0.999
    sigma = 1
    sigma_decay = 0.995

    for i_episode in range(num_episodes):
        print("Number of Episodes, Q(sigma) Off Policy", i_episode)

        policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)
        off_policy = behaviour_policy_epsilon_greedy(Q, tau, env.action_space.n)

        tau = tau * tau_decay
        if tau < 0.0001:
            tau = 0.0001

        state = env.reset()

        for t in itertools.count():
            action_probs = off_policy(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)

            state_t_1, reward, done, _ = env.step(action)

            if done:
                sigma = sigma * sigma_decay
                if sigma < 0.0001:
                    sigma = 0.0001
                break

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # # select sigma value
            # probability = 0.5
            # sigma_t_1 = binomial_sigma(probability)
            sigma_t_1 = sigma

            # select the next action based on the behaviour policy at the next state
            next_action_probs = off_policy(state_t_1)
            action_t_1 = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)

            on_policy_next_action_probs = policy(state_t_1)
            on_policy_a_t_1 = np.random.choice(np.arange(len(on_policy_next_action_probs)),
                                               p=on_policy_next_action_probs)

            V_t_1 = np.sum(on_policy_next_action_probs * Q[state_t_1])

            Delta_t = reward + discount_factor * (sigma_t_1 * Q[state_t_1][action_t_1]
                                                  + (1 - sigma_t_1) * V_t_1) - Q[state][action]

            Q[state][action] += alpha * Delta_t

            state = state_t_1

    return stats
def q_lambda_watkins(env, num_episodes, discount_factor=1.0, alpha=0.1, epsilon=0.1):
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes),
                                  episode_error=np.zeros(num_episodes))

    lambda_param = np.array([0, 0.1, 0.15, 0.2, 0.4, 0.6, 0.8, 0.9, 0.95, 0.975, 0.99, 1])
    alpha = np.array([0.1, 0.2, 0.4, 0.6, 0.8, 1])

    All_Rwd_Lambda = np.zeros(shape=(num_episodes, len(lambda_param)))
    All_Lambda_Alpha = np.zeros(shape=(len(lambda_param), len(alpha)))
    All_Error_Lambda = np.zeros(shape=(num_episodes, len(lambda_param)))
    All_Error_Lambda_Alpha = np.zeros(shape=(len(lambda_param), len(alpha)))

    num_experiments = num_episodes

    for l in range(len(lambda_param)):
        print("Lambda Param", lambda_param[l])

        for alpha_param in range(len(alpha)):
            print("Alpha Param", alpha[alpha_param])

            for i_episode in range(num_episodes):
                print("Number of Episodes, Q(lambda) Watkins", i_episode)

                policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)
                state = env.reset()
                next_action = None

                action_probs = policy(state)
                action = np.random.choice(np.arange(len(action_probs)), p=action_probs)

                # initialising eligibility traces
                eligibility = defaultdict(lambda: np.zeros(env.action_space.n))

                for t in itertools.count():
                    next_state, reward, done, _ = env.step(action)

                    stats.episode_rewards[i_episode] += reward
                    stats.episode_lengths[i_episode] = t

                    next_action_probs = policy(next_state)
                    next_action = np.random.choice(np.arange(len(next_action_probs)),
                                                   p=next_action_probs)

                    best_action = np.argmax(Q[next_state])
                    Delta = reward + discount_factor * Q[next_state][best_action] - Q[state][action]

                    # the original computed this error against an undefined V; the TD error
                    # Delta (same target minus Q[state][action]) is used here instead
                    rms_error = np.sqrt(np.sum(Delta ** 2) / num_experiments)
                    stats.episode_error[i_episode] += rms_error

                    eligibility[state][action] = eligibility[state][action] + 1

                    for s in range(env.observation_space.n):
                        for a in range(env.action_space.n):
                            Q[s][a] = Q[s][a] + alpha[alpha_param] * Delta * eligibility[s][a]
                            eligibility[s][a] = eligibility[s][a] * discount_factor * lambda_param[l]

                    if done:
                        break

                    action = next_action
                    state = next_state

            # 1-D rolling means so they can be written into the result matrices
            cum_rwd_per_episode = pd.Series(stats.episode_rewards).rolling(1, min_periods=1).mean().values
            cum_error_per_episode = pd.Series(stats.episode_error).rolling(1, min_periods=1).mean().values

            All_Rwd_Lambda[:, l] = cum_rwd_per_episode
            All_Error_Lambda[:, l] = cum_error_per_episode
            All_Lambda_Alpha[l, alpha_param] = cum_error_per_episode[-1]
            All_Error_Lambda_Alpha[l, alpha_param] = cum_error_per_episode[-1]

    return All_Rwd_Lambda, All_Lambda_Alpha, All_Error_Lambda, All_Error_Lambda_Alpha
def two_step_q_sigma_on_policy(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.9):
    # Expected SARSA: same algorithm steps as Q-Learning;
    # the only difference is that instead of the maximum over next state-action pairs,
    # the expected value is used
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in range(num_episodes):
        state = env.reset()

        # choose a from the policy derived from Q (which is epsilon-greedy)
        action_probs = policy(state)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)

        # steps within each episode
        for t in itertools.count():
            sigma = random.randint(0, 1)
            # if using a random number for sigma:
            # sigma = np.random.rand(1)

            next_state, reward, done, _ = env.step(action)

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            next_action_probs = policy(next_state)
            next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)

            V = np.sum(next_action_probs * Q[next_state])

            One_Sigma_Effect = sigma * Q[next_state][next_action] + (1 - sigma) * V
            One_Step = reward + discount_factor * One_Sigma_Effect

            next_action_selection_probability = np.max(next_action_probs)
            Two_Step = -discount_factor * (1 - sigma) * next_action_selection_probability \
                * Q[next_state][next_action]

            # peek one more step ahead (this advances the environment)
            next_next_state, next_reward, _, _ = env.step(next_action)
            next_next_action_probs = policy(next_next_state)
            next_next_action = np.random.choice(np.arange(len(next_next_action_probs)),
                                                p=next_next_action_probs)

            V_next = np.sum(next_next_action_probs * Q[next_next_state])

            Three_Sigma_Effect = sigma * Q[next_next_state][next_next_action] + (1 - sigma) * V_next
            Int_Three_Step = next_reward + discount_factor * Three_Sigma_Effect
            Three_Step = discount_factor * (1 - sigma) * next_action_selection_probability * Int_Three_Step

            Fourth_Step = -discount_factor * sigma * Q[next_state][next_action]

            Fifth_Sigma_Effect = sigma * Q[next_next_state][next_next_action] + (1 - sigma) * V_next
            Int_Fifth_Step = discount_factor * Fifth_Sigma_Effect
            Int_Int_Fifth_Step = next_reward + Int_Fifth_Step
            Fifth_Step = discount_factor * sigma * Int_Int_Fifth_Step

            td_target = One_Step + Two_Step + Three_Step + Fourth_Step + Fifth_Step
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta

            if done:
                break

            action = next_action
            state = next_state

    return Q, stats
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        sess: Tensorflow Session object
        env: OpenAI environment
        q_estimator: Estimator object used for the q values
        target_estimator: Estimator object used for the targets
        num_episodes: Number of episodes to run for
        experiment_dir: Directory to save Tensorflow summaries in
        replay_memory_size: Size of the replay memory
        update_target_estimator_every: Copy parameters from the Q estimator to the
            target estimator every N steps
        discount_factor: Gamma discount factor
        epsilon_start: Chance to sample a random action when taking an action.
            Epsilon is decayed over time and this is the start value
        epsilon_end: The final minimum value of epsilon after decaying is done

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    # checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")
    macro_dir = os.path.join(experiment_dir, "macro")
    macro_path = os.path.join(macro_dir, "macro.txt")
    worker_dir = os.path.abspath("./{}/{}".format(WORKER_SAVE_DIR, env.spec.id))
    output_path = os.path.join(worker_dir, "output_score")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)
    if not os.path.exists(macro_dir):
        os.makedirs(macro_dir)

    # Saver to save/restore weights
    saver = tf.train.Saver()
    ckpt_idx, ckpt = get_latest_ckpt(os.listdir(checkpoint_dir))
    if ckpt is not None:
        os.system('cp -r {} ./tmp/'.format(checkpoint_dir))
        ckpt = os.path.join('./tmp/checkpoints', ckpt)
        saver.restore(sess, ckpt)
        print('Restore model_{}.ckpt'.format(ckpt_idx))

    # The replay memory
    replay_memory = []

    total_t = sess.run(tf.contrib.framework.get_global_step())  # total_t = 0, for q value
    tf_update_idx = 0  # for plotting graph (reward and epsilon)

    # Restore the replay memory if it exists
    if os.path.isfile(macro_path):
        # [[skills, reward, epsilon], [skills, reward, epsilon], ...]
        macro_log = load_log(macro_path)
        print("Reading macro.txt")
        print("Resume training from {} skills...".format(len(macro_log)))
        for log_skills in macro_log:
            replay_memory = macro_log_to_replay_memory(replay_memory, log_skills,
                                                       VALID_ACTIONS, REWARD_FACTOR)
            tf_update_idx += 1  # for plotting
        assert tf_update_idx >= ckpt_idx, \
            'Unexpected checkpoint. Checkpoint version is higher than the replay memory.'

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, NUM_EXPLORE_SKILL)
    _ = np.full(shape=(MAX_NUM_TOTAL_SKILL - NUM_EXPLORE_SKILL, ), fill_value=epsilon_end)
    epsilons = np.concatenate((epsilons, _), axis=0)

    # The policy we're following
    policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS))

    state = env.reset()
    start_time = time.time()

    # populate the initial skills with randomly generated skills
    while tf_update_idx < NUM_INIT_SKILL:
        # Generate an action sequence
        print("{}/{} skills @ Init replay memory".format(tf_update_idx + 1, NUM_INIT_SKILL))
        actions = []
        state = env.reset()
        epsilon = epsilons[0]  # 1.0
        while True:
            # epsilon = 1.0, totally random search
            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, _, done, _ = env.step(VALID_ACTIONS[action])
            # collect the action sequence
            actions.append(VALID_ACTIONS[action])
            if done:
                break
            else:
                state = next_state

        # map actions to workers
        print("Waiting for mapping...")
        env.map_single_reward(actions, epsilon)  # returns when actions are sent to a worker

        # check whether any macro ensemble has been calculated
        while tf_update_idx < NUM_INIT_SKILL:
            skill, reward, epsilon = read_score(output_path)  # Returns [None, None] if nothing new
            if skill is not None:
                factor_episode_reward = reward * REWARD_FACTOR

                # Write log into tensorboard
                episode_summary = tf.Summary()
                episode_summary.value.add(simple_value=factor_episode_reward,
                                          node_name="factor_episode_reward",
                                          tag="factor_episode_reward")
                episode_summary.value.add(simple_value=reward,
                                          node_name="episode_reward",
                                          tag="episode_reward")
                episode_summary.value.add(simple_value=epsilon,
                                          node_name="epsilon",
                                          tag="epsilon")
                episode_summary.value.add(simple_value=LEN_SKILL,
                                          node_name="episode_length",
                                          tag="episode_length")
                q_estimator.summary_writer.add_summary(episode_summary, tf_update_idx)
                q_estimator.summary_writer.flush()

                tf_update_idx += 1
                write_reward_to_log(macro_path, skill, reward, epsilon,
                                    NUM_SKILL=NUM_SKILL,
                                    MAXLEN_PER_SKILL=MAXLEN_PER_SKILL)

                # add the new macro ensemble into the replay buffer
                skill = np.array(skill).reshape(-1).tolist()
                state = env.reset()
                for a in skill:
                    next_state, _, done, _ = env.step(a)  # a has been mapped to VALID_ACTIONS
                    # Add data to the replay memory
                    if done:
                        replay_memory.append(Transition(state, VALID_ACTIONS.index(a),
                                                        factor_episode_reward, next_state, done))
                    else:
                        replay_memory.append(Transition(state, VALID_ACTIONS.index(a),
                                                        0, next_state, done))
                    state = next_state
            else:
                break

    # train model (number of loaded macros > NUM_INIT_MODEL)
    if tf_update_idx > NUM_INIT_SKILL:
        for i in range(tf_update_idx - NUM_INIT_SKILL):
            partial_memory = replay_memory[:LEN_SKILL * (NUM_INIT_SKILL + i)]
            # Train agent
            if NUM_INIT_SKILL + i > ckpt_idx:
                q_estimator, target_estimator = train_agent(
                    sess=sess,
                    replay_memory=partial_memory,
                    q_estimator=q_estimator,
                    target_estimator=target_estimator,
                    NUM_EPOCH=NUM_EPOCH,
                    discount_factor=discount_factor,
                    BATCH_SIZE=BATCH_SIZE,
                    NUM_TOTAL_SKILL=MAX_NUM_TOTAL_SKILL,
                    update_target_estimator_every=update_target_estimator_every,
                    tf_update_idx=NUM_INIT_SKILL + i,  # current skill id
                )

    # Main training loop
    skill = []  # Data sent back from the worker. Initialize skill to null.

    while tf_update_idx < MAX_NUM_TOTAL_SKILL:
        if time.time() - start_time > MAX_LIMIT_TIME:
            break

        # Save the model
        if tf_update_idx % 50 == 0:
            ckpt = "model_{}.ckpt".format(tf_update_idx)
            print('Saving model_{}.ckpt'.format(tf_update_idx))
            saver.save(sess, os.path.join(checkpoint_dir, ckpt))

        # If our replay memory is full, pop the first element
        while len(replay_memory) > replay_memory_size:
            replay_memory.pop(0)

        if skill is not None:
            # Train agent
            if tf_update_idx > ckpt_idx:
                q_estimator, target_estimator = train_agent(
                    sess=sess,
                    replay_memory=replay_memory,
                    q_estimator=q_estimator,
                    target_estimator=target_estimator,
                    NUM_EPOCH=NUM_EPOCH,
                    discount_factor=discount_factor,
                    BATCH_SIZE=BATCH_SIZE,
                    NUM_TOTAL_SKILL=MAX_NUM_TOTAL_SKILL,
                    update_target_estimator_every=update_target_estimator_every,
                    tf_update_idx=tf_update_idx,
                )

        # populate a new macro ensemble
        if skill is None:
            print("{}/{} th skills".format(tf_update_idx + 1, MAX_NUM_TOTAL_SKILL))
            actions = []
            epsilon = epsilons[tf_update_idx - NUM_INIT_SKILL]  # Epsilon for this macro ensemble
            state = env.reset()
            while True:
                action_probs = policy(sess, state, epsilon)
                action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
                next_state, _, done, _ = env.step(VALID_ACTIONS[action])
                # collect the action sequence
                actions.append(VALID_ACTIONS[action])
                if done:
                    break
                    # if args.dup:  # duplicated macro action in one ensemble is valid.
                    #     break
                    # else:  # add penalty to duplicated macro action
                    #     replay_memory, states, next_states, actions, dones, is_dup = check_duplicated_macro_action(
                    #         replay_memory=replay_memory,
                    #         states=states, next_states=next_states,
                    #         actions=actions, dones=dones)
                    #     if is_dup:
                    #         continue
                    #     else:
                    #         break
                else:
                    state = next_state

            # map actions to workers
            print("Waiting for mapping...")
            env.map_single_reward(actions, epsilon)  # returns when actions are sent to a worker

        # check whether any macro ensemble has been calculated
        skill, reward, epsilon = read_score(output_path)  # Returns [None, None] if nothing new
        if skill is not None:
            factor_episode_reward = reward * REWARD_FACTOR

            # Write log into tensorboard
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=factor_episode_reward,
                                      node_name="factor_episode_reward",
                                      tag="factor_episode_reward")
            episode_summary.value.add(simple_value=reward,
                                      node_name="episode_reward",
                                      tag="episode_reward")
            episode_summary.value.add(simple_value=epsilon,
                                      node_name="epsilon",
                                      tag="epsilon")
            episode_summary.value.add(simple_value=LEN_SKILL,
                                      node_name="episode_length",
                                      tag="episode_length")
            q_estimator.summary_writer.add_summary(episode_summary, tf_update_idx)
            q_estimator.summary_writer.flush()

            tf_update_idx += 1
            write_reward_to_log(macro_path, skill, reward, epsilon,
                                NUM_SKILL=NUM_SKILL,
                                MAXLEN_PER_SKILL=MAXLEN_PER_SKILL)

            # add the new macro ensemble into the replay buffer
            skill = np.array(skill).reshape(-1).tolist()
            state = env.reset()
            for a in skill:
                next_state, _, done, _ = env.step(a)  # a has been mapped to VALID_ACTIONS
                # Add data to the replay memory
                if done:
                    replay_memory.append(Transition(state, VALID_ACTIONS.index(a),
                                                    factor_episode_reward, next_state, done))
                else:
                    replay_memory.append(Transition(state, VALID_ACTIONS.index(a),
                                                    0, next_state, done))
                state = next_state

        READ_REWARD_AGAIN = True

    env.close()
    return stats
def q_learning(env, estimator, num_episodes, discount_factor=1.0, epsilon=0.1, epsilon_decay=1.0):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        env: OpenAI environment.
        estimator: Action-Value function estimator
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma discount factor.
        epsilon: Chance to sample a random action. Float between 0 and 1.
        epsilon_decay: Each episode, epsilon is decayed by this factor

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    for i_episode in range(num_episodes):

        # The policy we're following
        policy = make_epsilon_greedy_policy(estimator,
                                            epsilon * epsilon_decay**i_episode,
                                            env.action_space.n)

        # Print out which episode we're on, useful for debugging.
        # Also print the reward for the last episode
        last_reward = stats.episode_rewards[i_episode - 1]
        sys.stdout.flush()

        # Reset the environment and pick the first action
        state = env.reset()

        # Only used for SARSA, not Q-Learning
        next_action = None

        # One step in the environment
        for t in itertools.count():

            # Choose an action to take.
            # If we're using SARSA we already decided in the previous step
            if next_action is None:
                action_probs = policy(state)
                action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            else:
                action = next_action

            # Take a step
            next_state, reward, done, _ = env.step(action)

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # TD Update
            q_values_next = estimator.predict(next_state)

            # Use this code for Q-Learning
            # Q-Value TD Target
            td_target = reward + discount_factor * np.max(q_values_next)

            # Use this code for the SARSA TD Target for on-policy training:
            # next_action_probs = policy(next_state)
            # next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)
            # td_target = reward + discount_factor * q_values_next[next_action]

            # Update the function approximator using our target
            estimator.update(state, action, td_target)

            print("\rStep {} @ Episode {}/{} ({})".format(
                t, i_episode + 1, num_episodes, last_reward), end="")

            if done:
                break

            state = next_state

    return stats
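# The function-approximation variants (the q_learning above and two_step_tree_backup below)
# assume a make_epsilon_greedy_policy that wraps an estimator rather than a Q table. This is
# a minimal sketch under that assumption, where estimator.predict(state) is taken to return
# one value per action; it is not necessarily the authors' exact helper.
import numpy as np


def make_epsilon_greedy_policy(estimator, epsilon, nA):
    """Return a function state -> action probabilities, epsilon-greedy w.r.t. the estimator."""
    def policy_fn(state):
        action_probs = np.ones(nA, dtype=float) * epsilon / nA
        q_values = estimator.predict(state)      # one value per action
        best_action = np.argmax(q_values)
        action_probs[best_action] += 1.0 - epsilon
        return action_probs
    return policy_fn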
def actor_critic(env, estimator_policy_X, estimator_value_X, trainer_X, num_episodes,
                 discount_factor=1.0, player2=True, positiveRewardFactor=1.0,
                 negativeRewardFactor=1.0, batch_size=1):
    """
    Actor Critic Algorithm. Optimizes the policy function approximator using policy gradient.

    Args:
        env: OpenAI environment.
        estimator_policy_X: Policy Function to be optimized
        estimator_value_X: Value function approximator, used as a critic
        trainer_X: our training class
        num_episodes: Number of episodes to run for
        discount_factor: Time-discount factor
        player2: True if the computer plays player 2, False if the user does
        positiveRewardFactor: Scaling factor applied to the terminal (winning) reward
        negativeRewardFactor: Scaling factor applied to the negated intermediate rewards
        batch_size: Batch size

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """
    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes),
        episode_td_error=np.zeros(num_episodes),
        episode_value_loss=np.zeros(num_episodes),
        episode_policy_loss=np.zeros(num_episodes),
        episode_kl_divergence=np.zeros(num_episodes))

    Transition = collections.namedtuple("Transition",
                                        ["state", "action", "reward", "next_state", "done"])

    batch_board_X = np.zeros((batch_size, 7, 6, 2))
    batch_player_X = np.zeros((batch_size, 2))
    batch_td_target_X = np.zeros((batch_size, 1))
    batch_td_error_X = np.zeros((batch_size, 1))
    batch_action_X = np.zeros((batch_size, 1))
    batch_avaliableColumns_X = np.zeros((batch_size, 7))
    batch_pos_X = 0
    game = 1

    for i_episode in range(num_episodes):
        # Reset the environment and pick the first action
        state = env.reset(i_episode % 2 + 1)
        robotLevel = i_episode % 4 + 1
        episode = []
        probas = None
        last_probas = None  # avoid a NameError on the first KL-divergence check
        last_turn = False
        done = False
        last_state = None
        action = None
        reward = None

        # if game % 5000 == 10:
        #     player2 = True
        # elif game % 5000 == 0:
        #     player2 = False

        if game == num_episodes - 3:
            player2 = False

        # One step in the environment
        for t in itertools.count():
            # Save the available columns
            if not done:
                avaliableColumns = env.getAvaliableColumns()

            currentPlayerBeforeStep = env.getCurrentPlayer()
            action_tmp = action

            # Take a step
            if (currentPlayerBeforeStep == 1 and not done) or \
                    (currentPlayerBeforeStep == 2 and player2 and not done):
                action, probas = estimator_policy_X.predict(env)
                action = action[0]
                probas = probas[0]
            elif not done:
                try:
                    action = int(input("Give a column number: ")) - 1
                except ValueError:
                    print("Wrong input! Setting action to 1")
                    action = 0
                probas = None

            if currentPlayerBeforeStep == 2 and player2 and not done:
                next_state, reward, step_done, action = env.robotStep(robotLevel)
            elif not done:
                next_state, reward, step_done, _ = env.step(action)

            if not done:
                if game == num_episodes - 3:
                    pass
                    # layer1, layer2 = trainer_X.evalFilters(next_state[1])
                    # plotting.plotNNFilter(next_state[1], layer1, layer2)
                if step_done:
                    pass
                if t > 0:
                    state_tmp = last_state
                    last_state = state
                    reward_tmp = -reward * negativeRewardFactor
                else:
                    state_tmp = state
                    last_state = state
                    reward_tmp = -reward * negativeRewardFactor
            elif done and not last_turn:
                state_tmp = episode[-2].next_state
                reward_tmp = reward * positiveRewardFactor
            else:
                break

            if t > 0:
                episode.append(Transition(
                    state=state_tmp, action=action_tmp, reward=reward_tmp,
                    next_state=next_state, done=done))

                player = None
                if episode[-1].state[0][0] == 1:
                    player = "X"
                elif episode[-1].state[0][1] == 1:
                    player = "O"

                # Update statistics
                stats.episode_lengths[i_episode] = t

                # If player 0 (X)
                if episode[-1].state[0][0] == 1 or True:
                    if episode[-1].state[0][0] == 1:
                        stats.episode_rewards[i_episode] += episode[-1].reward

                    # Calculate TD Target
                    value_next = estimator_value_X.predict(episode[-1].next_state)
                    td_target = episode[-1].reward + discount_factor * value_next
                    td_error = td_target - estimator_value_X.predict(episode[-1].state)

                    if episode[-1].state[0][0] == 1:
                        batch_board_X[batch_pos_X] = episode[-1].state[1]
                    else:
                        batch_board_X[batch_pos_X] = invertBoard(episode[-1].state[1])

                    batch_player_X[batch_pos_X] = episode[-1].state[0]
                    batch_td_target_X[batch_pos_X] = td_target
                    batch_td_error_X[batch_pos_X] = td_error
                    batch_action_X[batch_pos_X] = episode[-1].action
                    batch_avaliableColumns_X[batch_pos_X] = avaliableColumns
                    batch_pos_X += 1
                # else:
                #     value_next = estimator_value_O.predict(episode[-1].next_state, )
                #     td_target = episode[-1].reward + discount_factor * value_next
                #     td_error = td_target - estimator_value_O.predict(episode[-1].state)
                #
                #     batch_player_O[batch_pos_O] = episode[-1].state[0]
                #     batch_board_O[batch_pos_O] = episode[-1].state[1]
                #     batch_td_target_O[batch_pos_O] = td_target
                #     batch_td_error_O[batch_pos_O] = td_error
                #     batch_action_O[batch_pos_O] = episode[-1].action
                #     batch_avaliableColumns_O[batch_pos_O] = avaliableColumns
                #
                #     batch_pos_O += 1

                stats.episode_td_error[i_episode] += td_error

                if batch_pos_X == batch_size:
                    # Update both networks
                    loss_X, policyLoss, valueLoss = trainer_X.update(batch_board_X, batch_td_target_X,
                                                                     batch_td_error_X, batch_action_X,
                                                                     batch_avaliableColumns_X)
                    loss_X = loss_X[0][0]
                    policyLoss = policyLoss[0][0]
                    valueLoss = valueLoss[0][0]
                    batch_pos_X = 0
                    print("Updates X network. Loss:", loss_X)
                    stats.episode_value_loss[i_episode] += valueLoss

                # if batch_pos_O == batch_size:
                #     # Update both networks
                #     loss_O = trainer_O.update(batch_board_O, batch_td_target_O, batch_td_error_O,
                #                               batch_action_O, batch_avaliableColumns_O)
                #     loss_O = loss_O[0][0]
                #     batch_pos_O = 0
                #
                #     print("Updates X network. Loss:", loss_O)
                #     stats.episode_value_loss[i_episode] += loss_O

                if probas is not None and last_probas is not None:
                    kl_div = 0
                    for i in range(probas.size):
                        kl_div += probas[i] * np.log(probas[i] / last_probas[i])
                    stats.episode_kl_divergence[i_episode] += kl_div

                # Print out which step we're on, useful for debugging.
                print("\rPlayer {}: Action {}, Reward {:<4}, TD Error {:<20}, TD Target {:<20}, "
                      "Value Next {:<20}, at Step {:<5} @ Game {} @ Episode {}/{} ({})".format(
                          player, int(episode[-1].action + 1), episode[-1].reward, td_error,
                          td_target, value_next, t, game, i_episode + 1, num_episodes,
                          stats.episode_rewards[i_episode - 1]), end="")

                if player == "X" and episode[-1].reward > 0 and robotLevel > 1:  # or i_episode % 100 == 0:
                    for i in range(t):
                        print("Player:", batch_player_X[batch_pos_X - t + i],
                              "Action:", int(batch_action_X[batch_pos_X - t + i]) + 1)
                    print("Robot level:", robotLevel)
                    env.renderHotEncodedState(((1, 0), batch_board_X[batch_pos_X - 1]))

                if game == num_episodes or env.getCurrentPlayer() == 2 and not player2:
                    env.render()
                    if probas is not None:
                        out = " "
                        for i in range(probas.size):
                            out += "%03d " % int(probas[i] * 100 + 0.5)
                        print(out)

                last_probas = probas

            if done:
                last_turn = True
                game += 1

            if step_done:
                done = True

            state = next_state

    return stats
def main():
    global nTrucks
    global state
    global num_of_load
    global num_of_dump
    global num_of_return

    BucketA_capacity = 1.5
    BucketB_capacity = 1.0
    Truck1_capacity = 6
    Truck2_capacity = 3
    Truck1_speed = 15.0
    Truck2_speed = 20.0
    Truck1_speedRatio = Truck1_speed / (Truck1_speed + Truck2_speed)
    Truck2_speedRatio = Truck2_speed / (Truck1_speed + Truck2_speed)

    i_episode = 0

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes),
        episode_loss=np.zeros(num_episodes))

    # seed - TODO seed()
    np.random.seed(0)

    # initialize master and workers
    tf.reset_default_graph()
    with tf.device("/cpu:0"):
        trainer_critic = tf.train.AdamOptimizer(learning_rate=alpha_critic)
        trainer_actor = tf.train.AdamOptimizer(learning_rate=alpha_actor)
        master_network = AC_Network(nS, nA, 'global', None, None)
        # num_threads = multiprocessing.cpu_count()
        # TODO: does each thread run on a different cpu core?
        num_threads = 2
        workers = []
        # create workers
        for i in range(num_threads):
            workers.append(Worker(i, nS, nA, trainer_critic, trainer_actor))

    # set up session
    with tf.Session() as sess:
        coord = tf.train.Coordinator()
        sess.run(tf.global_variables_initializer())

        # start episodes
        while (i_episode + num_threads) < num_episodes:
            # reset vars
            num_of_load = np.zeros(num_episodes)
            num_of_dump = np.zeros(num_episodes)
            num_of_return = np.zeros(num_episodes)
            state[i_episode] = np.zeros(nS)
            old_state = np.zeros((nTrucks, nS))
            old_time = np.zeros(nTrucks)
            old_action = np.zeros(nTrucks).astype(int)
            Iterations = 0  # number of decision iterations in an episode
            Mean_TD_Error = 0

            # initialize environment threads
            env_threads = []
            for worker in workers:
                run_sim_args = [nTrucks, BucketA_capacity, BucketB_capacity,
                                Truck1_capacity, Truck2_capacity,
                                Truck1_speedRatio, Truck2_speedRatio,
                                worker, old_state, old_time, old_action,
                                Iterations, Mean_TD_Error, i_episode, coord, sess]

                # Print the episode number
                print("\rEpisode: ", i_episode + 1, " / ", num_episodes)
                i_episode += 1

                t = threading.Thread(target=run_sim, args=run_sim_args)
                t.start()
                env_threads.append(t)
                # time.sleep(1)

            coord.join(env_threads)

        for i in range(num_episodes):
            if i >= i_episode:
                stats.episode_lengths[i] = Hrs[i_episode - 1]
                stats.episode_rewards[i] = ProdRate[i_episode - 1]
                stats.episode_loss[i] = Mean_Loss[i_episode - 1]
                # print("Mean_Loss[%d] = %r \t eploss[%d] = %r" % (i_episode - 1, Mean_Loss[i_episode - 1], i, stats.episode_loss[i]))
            else:
                stats.episode_lengths[i] = Hrs[i]
                stats.episode_rewards[i] = ProdRate[i]
                stats.episode_loss[i] = Mean_Loss[i]
                # print("Mean_Loss[%d] = %r \t eploss[%d] = %r" % (i, Mean_Loss[i], i, stats.episode_loss[i]))

    plotting.plot_episode_stats(stats, name='A3C', smoothing_window=20)
def q_learning(env, num_episodes, discount_factor=0.9, alpha=0.8):  # , epsilon=0.1):
    """
    Q-Learning algorithm: Off-policy TD control. Finds the optimal greedy policy
    while following an epsilon-greedy policy

    Args:
        env: OpenAI environment.
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma discount factor.
        alpha: TD learning rate.
        epsilon: Chance to sample a random action. Float between 0 and 1.

    Returns:
        A tuple (Q, episode_lengths).
        Q is the optimal action-value function, a dictionary mapping state -> action values.
        stats is an EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    memory = defaultdict(list)

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # The policy we're following
    # policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in range(num_episodes):
        # Print out which episode we're on, useful for debugging.
        # print("Episode ", i_episode)
        if (i_episode + 1) % 100 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes), end="")
            # sys.stdout.flush()

        # Reset the environment and pick the first action
        eigenState = env.reset()

        # One step in the environment
        # total_reward = 0.0
        for t in itertools.count():
            if eigenState in memory:
                memList = memory[eigenState]
                action = memList[0]
                stateValue = memList[1]
                nextState = memList[2]
                if nextState in memory:
                    nextStateValue = memory[nextState][1]
                else:
                    nextStateValue = 0.0
                reward = memList[3]

                Q_program = QuantumProgram()
                qr = Q_program.create_quantum_register("qr", 2)
                cr = Q_program.create_classical_register("cr", 2)
                eigenAction = Q_program.create_circuit("superposition", [qr], [cr])
                eigenAction.h(qr)
                eigenAction, qr = groverIteration(Q_program, eigenAction, qr, action, reward, nextStateValue)
            else:
                # Prepare the n-qubit registers
                Q_program = QuantumProgram()
                qr = Q_program.create_quantum_register("qr", 2)
                cr = Q_program.create_classical_register("cr", 2)
                eigenAction = Q_program.create_circuit("superposition", [qr], [cr])
                eigenAction.h(qr)
                stateValue = 0.0

            action = collapseActionSelectionMethod(Q_program, eigenAction, qr, cr)

            nextEigenState, reward, done = env.step(action)

            if nextEigenState in memory:
                memList = memory[nextEigenState]
                nextStateValue = memList[1]
            else:
                nextStateValue = 0.0

            # Update the state value
            stateValue = stateValue + alpha * (reward + (discount_factor * nextStateValue) - stateValue)
            # print(stateValue)

            memory[eigenState] = (action, stateValue, nextEigenState, reward)

            stats.episode_rewards[i_episode] += (discount_factor**t) * reward
            stats.episode_lengths[i_episode] = t

            if done:
                break

            # state = next_state
            eigenState = nextEigenState

    return Q, stats, memory
def q_learning(env, num_episodes, discount_factor=1, alpha=0.5, epsilon=0.1):
    """
    Args:
        alpha: TD learning rate
    """
    # height = env.unwrapped.game.height
    width = env.unwrapped.game.width

    Q = defaultdict(lambda: np.zeros(ACTION_SPACE))
    # Q = defaultdict(lambda: np.random.rand(ACTION_SPACE))
    # Q = defaultdict(lambda: np.ones(ACTION_SPACE))

    goal_int = helper.convert_state(16, 1, width)
    for i in range(3):
        Q[goal_int][i] = 0

    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes),
        episode_runtime=np.zeros(num_episodes))

    stats_test = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes),
        episode_runtime=np.zeros(num_episodes))

    policy = make_epsilon_greedy_policy(Q, epsilon, ACTION_SPACE)

    for i_episode in range(num_episodes):
        print("------------------------------")
        start_total_runtime = time.time()

        # Reset the env and pick the first action
        previous_state = env.reset()
        state_int = helper.convert_state(previous_state[1], previous_state[0], width)

        for t in range(cf.TIME_RANGE):
            env.render()
            # time.sleep(0.1)

            # Take a step
            action_probs = policy(state_int, i_episode)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            # action = env.action_space.sample()
            if action == 4:
                import ipdb; ipdb.set_trace()
            # print("---------------------------------")

            # 0: UP, 1: DOWN, 2: LEFT, 3: RIGHT
            next_state, reward, done, _ = env.step(action)

            if done:
                reward = 10
            else:
                reward = reward - 1

            previous_state = next_state
            next_state_int = helper.convert_state(next_state[1], next_state[0], width)

            # Update stats
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # TD Update
            best_next_action = np.argmax(Q[next_state_int])
            td_target = reward + discount_factor * Q[next_state_int][best_next_action]
            td_delta = td_target - Q[state_int][action]
            Q[state_int][action] += alpha * td_delta

            if done:
                # import ipdb; ipdb.set_trace()
                break

            previous_state = next_state
            state_int = next_state_int

        stats.episode_runtime[i_episode] += (time.time() - start_total_runtime)
        run_experiment(env, Q, stats_test, i_episode, width, cf.TIME_RANGE)

    return Q, stats, stats_test
def two_step_tree_backup(env, estimator, num_episodes, discount_factor=1.0, epsilon=0.1, epsilon_decay=1.0):
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    policy = make_epsilon_greedy_policy(estimator, epsilon, env.action_space.n)

    for i_episode in range(num_episodes):
        state = env.reset()

        # steps within each episode
        for t in itertools.count():
            # choose A from S using the policy derived from Q (epsilon-greedy)
            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)

            # reward and next state for the action chosen by the epsilon-greedy policy
            next_state, reward, done, _ = env.step(action)

            if done:
                print('Episode is ', i_episode)
                break

            # reward obtained by taking the action under the policy pi
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            next_action_probs = policy(next_state)
            next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)

            # V = sum_a pi(a | s_{t+1}) Q(s_{t+1}, a)
            V = np.sum(next_action_probs * estimator.predict(next_state))

            # peek one more step ahead (this advances the environment)
            next_next_state, next_reward, _, _ = env.step(next_action)
            next_next_action_probs = policy(next_next_state)
            next_next_action = np.random.choice(np.arange(len(next_next_action_probs)),
                                                p=next_next_action_probs)

            next_V = np.sum(next_next_action_probs * estimator.predict(next_next_state))

            # print("Next Action:", next_action)
            # print("Next Action probs :", next_action_probs)

            # Main update equations for two-step tree backup
            Q_next_state_next_action = estimator.predict(next_state)
            Q_next_state_next_action = Q_next_state_next_action[next_action]

            Delta = next_reward + discount_factor * next_V - Q_next_state_next_action

            # print("Delta :", Delta)
            # print("Next Action Prob ", np.max(next_action_probs))

            next_action_selection_probability = np.max(next_action_probs)
            td_target = reward + discount_factor * V \
                + discount_factor * next_action_selection_probability * Delta

            estimator.update(state, action, td_target)

            state = next_state

    return stats
def main():
    global num_of_load
    global num_of_dump
    global num_of_return
    global state
    global old_state
    global old_time
    global Mean_TD_Error
    global Iterations
    global nTrucks
    global num_decisions
    global num_decisions_A
    global num_decisions_B
    global local_decisions_A
    global local_decisions_B
    global idle_count_A
    global idle_count_B
    global actions_performed_without_maintenance_A
    global actions_performed_without_maintenance_B
    global repair_downtime_remaining_A
    global repair_downtime_remaining_B
    global both_excavators_failed_flag
    global g_Truck1_capacity
    global g_Truck2_capacity

    BucketA_capacity = 6
    BucketB_capacity = 3
    Truck1_capacity = g_Truck1_capacity
    Truck2_capacity = g_Truck2_capacity
    Truck1_speed = 20
    Truck2_speed = 20
    Truck1_speedRatio = Truck1_speed / float(Truck1_speed + Truck2_speed)
    Truck2_speedRatio = Truck2_speed / float(Truck1_speed + Truck2_speed)

    # run session (initialise tf global vars)
    sess.run(init)

    num_episodes = 10000

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes),
                                  episode_loss=np.zeros(num_episodes),
                                  episode_decisions_A=np.zeros(num_episodes),
                                  episode_decisions_B=np.zeros(num_episodes),
                                  lastep_decisions_A=[],
                                  lastep_decisions_B=[])

    for i_episode in range(num_episodes):
        # reset global vars
        num_of_load = 0
        num_of_dump = 0
        num_of_return = 0
        state = np.zeros(12)
        old_state = np.zeros((nTrucks, 12))
        old_time = np.zeros(nTrucks)
        Mean_TD_Error = 0
        Iterations = 0
        num_decisions = 0
        num_decisions_A = 0
        num_decisions_B = 0
        idle_count_A = 0
        idle_count_B = 0
        actions_performed_without_maintenance_A = 0
        actions_performed_without_maintenance_B = 0
        repair_downtime_remaining_A = 0
        repair_downtime_remaining_B = 0
        both_excavators_failed_flag = False

        # Print out which episode we're on, useful for debugging.
        print("\rEpisode: ", i_episode + 1, " / ", num_episodes)

        # run the simulation
        run_sim(nTrucks, BucketA_capacity, BucketB_capacity,
                Truck1_capacity, Truck2_capacity,
                Truck1_speedRatio, Truck2_speedRatio)

        stats.episode_lengths[i_episode] = Hrs[i_episode]
        stats.episode_rewards[i_episode] = ProdRate[i_episode]
        stats.episode_loss[i_episode] = abs(Mean_TD_Error)
        stats.episode_decisions_A[i_episode] = num_decisions_A
        stats.episode_decisions_B[i_episode] = num_decisions_B
        stats.lastep_decisions_A.extend(local_decisions_A)
        stats.lastep_decisions_B.extend(local_decisions_B)

        # print("local_decisions_A: ", local_decisions_A)
        # print("stats.lastep_decisions_A: ", stats.lastep_decisions_A)
        # print(stats.lastep_decisions_A == local_decisions_A)

    # plotting.plot_episode_stats(stats, name='Qlearning_20', smoothing_window=20)
    plotting.plot_episode_stats(stats, name='Qlearning_20_linear', smoothing_window=20)
def Q_Sigma_On_Policy_Epsilon_Dependent(env, theta, num_episodes, discount_factor=1.0,
                                        epsilon=0.1, epsilon_decay=0.999):
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))
    cumulative_errors = np.zeros(shape=(num_episodes, 1))

    alpha = 0.1
    tau = 1

    for i_episode in range(num_episodes):
        print("Episode Number On Policy Q(sigma) Epsilon_Sigma", i_episode)

        epsilon_sigma = epsilon * epsilon_decay**i_episode
        if epsilon_sigma <= 0.0001:
            epsilon_sigma = 0.0001

        # off_policy = behaviour_policy_Boltzmann(theta, tau, env.action_space.n)
        policy = make_epsilon_greedy_policy(theta, epsilon * epsilon_decay**i_episode,
                                            env.action_space.n)

        state = env.reset()
        next_action = None

        for t in itertools.count():
            if next_action is None:
                action_probs = policy(state)
                action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            else:
                action = next_action

            state_t_1, reward, done, _ = env.step(action)

            if done:
                break

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # q_values = estimator.predict(state)
            # q_values_state_action = q_values[action]

            # evaluate Q(current state, current action)
            features_state = featurize_state(state)
            q_values = np.dot(theta.T, features_state)
            q_values_state_action = q_values[action]

            if np.random.rand() < epsilon_sigma:
                sigma_t_1 = 0
            else:
                sigma_t_1 = 1

            # select the next action based on the behaviour policy at the next state
            next_action_probs = policy(state_t_1)
            action_t_1 = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)

            # q_values_t_1 = estimator.predict(state_t_1)
            # q_values_next_state_next_action = q_values_t_1[action_t_1]
            features_state_1 = featurize_state(state_t_1)
            q_values_t_1 = np.dot(theta.T, features_state_1)
            q_values_next_state_next_action = q_values_t_1[action_t_1]

            on_policy_next_action_probs = policy(state_t_1)
            on_policy_a_t_1 = np.random.choice(np.arange(len(on_policy_next_action_probs)),
                                               p=on_policy_next_action_probs)

            V_t_1 = np.sum(on_policy_next_action_probs * q_values_t_1)

            Delta_t = reward + discount_factor * (sigma_t_1 * q_values_next_state_next_action
                                                  + (1 - sigma_t_1) * V_t_1) - q_values_state_action

            # 1-step TD target --- G_t(1)
            td_target = q_values_state_action + Delta_t
            td_error = td_target - q_values_state_action

            # estimator.update(state, action, new_td_target)
            theta[:, action] += alpha * td_error * features_state

            # rms_error = np.sqrt(np.sum((td_error)**2))
            # cumulative_errors[i_episode, :] += rms_error

            state = state_t_1

    return stats  # , cumulative_errors
def q_learning(env, num_episodes, discount_factor=1.0, alpha=0.1, epsilon=0.1):
    """
    Q-Learning algorithm: Off-policy TD control. Finds the optimal greedy policy
    while following an epsilon-greedy policy

    Args:
        env: OpenAI environment.
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma discount factor.
        alpha: TD learning rate.
        epsilon: Chance to sample a random action. Float between 0 and 1.

    Returns:
        A tuple (Q, episode_lengths).
        Q is the optimal action-value function, a dictionary mapping state -> action values.
        stats is an EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.nA))

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    # The policy we're following
    policy = make_epsilon_greedy_policy(Q, epsilon, env.nA)

    for i_episode in range(num_episodes):
        # Print out which episode we're on, useful for debugging.
        if (i_episode + 1) % 100 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes), end="")
            sys.stdout.flush()

        # Reset the environment and pick the first action
        state = env.reset()

        # One step in the environment
        # total_reward = 0.0
        for t in itertools.count():
            # Take a step
            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(action)
            # print("----------")
            # print(action, "action")
            # print(reward, "reward")
            # print(next_state, "next_state")

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # TD Update
            best_next_action = np.argmax(Q[next_state])
            td_target = reward + discount_factor * Q[next_state][best_next_action]
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta

            if done:
                break

            state = next_state
            # print(state)

    return Q, stats
def q_learning(env, num_episodes, discount_factor=0.9, alpha=0.9, epsilon=0.1):
    """
    Q-Learning algorithm: Off-policy TD control. Finds the optimal greedy policy
    while following an epsilon-greedy policy

    Args:
        env: OpenAI environment.
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma discount factor.
        alpha: TD learning rate.
        epsilon: Chance to sample a random action. Float between 0 and 1.

    Returns:
        A tuple (Q, episode_lengths).
        Q is the optimal action-value function, a dictionary mapping state -> action values.
        stats is an EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    step_number = 20
    Q = defaultdict(lambda: np.ones(env.action_space.n))
    exprep = ReplayMemory(1)

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # The policy we're following
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in range(num_episodes):
        print('episode no.', i_episode)

        # Reset the environment and pick the first action
        state = env.reset()

        # One step in the environment
        # total_reward = 0.0
        for t in itertools.count():
            # Take a step
            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            tup = env.step(action)
            exprep.push(tup)

            # Update statistics
            stats.episode_rewards[i_episode] += tup[3]
            stats.episode_lengths[i_episode] = t

            if exprep.isReady():
                B = exprep.sampleBatch(1)
                for j in B:
                    # TD Update
                    print('Q for ', j[0], 'before update', Q[tuple(j[0])])
                    prev_state, action, next_state, reward, done = j[0], j[1], j[2], j[3], j[4]
                    best_next_action = np.argmax(Q[tuple(next_state)])
                    td_target = reward + discount_factor * Q[tuple(next_state)][best_next_action] \
                        if not done else reward
                    td_delta = td_target - Q[tuple(prev_state)][action]
                    Q[tuple(prev_state)][action] += alpha * td_delta
                    print('Q for ', j[0], 'after update', Q[tuple(j[0])])
                    # print('Q', Q[(1, 10, 1)])
                    # alpha = alpha**t

            if tup[-1]:  # or t == step_number:
                if tup[-1]:
                    print('found the solution', tup[2], 'prev', tup[0])
                    # z = input()
                # print(Q)
                break

            state = tup[2]

    return Q, stats
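# The q_learning variant above relies on a ReplayMemory helper that is not shown; only its
# push, isReady, and sampleBatch methods are used. The sketch below is consistent with that
# usage but is an assumption (capacity semantics and internals inferred from the calls), not
# the authors' actual class.
import random


class ReplayMemory(object):
    """Tiny FIFO experience buffer matching the push/isReady/sampleBatch usage above."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []

    def push(self, transition):
        # drop the oldest entry once the buffer is full
        if len(self.buffer) >= self.capacity:
            self.buffer.pop(0)
        self.buffer.append(transition)

    def isReady(self):
        # ready once at least one full batch can be drawn
        return len(self.buffer) >= self.capacity

    def sampleBatch(self, batch_size):
        return random.sample(self.buffer, batch_size)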
def q_learning(env, q_estimator, target_estimator, num_episodes, update_target_estimator_every,
               discount_factor=1.0, epsilon_start=1.0, epsilon_end=0.1, epsilon_decay_steps=500000):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        env: OpenAI environment.
        estimator: Action-Value function estimator
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma discount factor.
        epsilon: Chance to sample a random action. Float between 0 and 1.
        epsilon_decay: Each episode, epsilon is decayed by this factor

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """
    batch_size = 32

    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)
    epsilon = epsilons[0]

    Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])

    # Create a replay memory buffer.
    replay_memory = deque(maxlen=10000)

    # Fill the replay buffer.
    state = env.reset()
    for i in itertools.count():
        if i % 100 == 0:
            print("Filling replay buffer... " + str(i) + "/" + str(replay_memory.maxlen), end="\r")
        action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)

        # Record the transition.
        replay_memory.append(Transition(state, action, reward, next_state, done))

        state = next_state
        if done:
            state = env.reset()
        if i >= replay_memory.maxlen:
            break

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    total_t = 0
    for i_episode in range(num_episodes):
        # Print out which episode we're on, useful for debugging.
        # Also print the reward for the last episode
        last_reward = stats.episode_rewards[i_episode - 1]
        print("\rEpisode={}/{}\tTotal timesteps={}\tReward={}\tEpsilon={}".format(
            i_episode + 1, num_episodes, total_t, last_reward, epsilon), end="")
        sys.stdout.flush()

        # Run an episode.
        state = env.reset()
        for t in itertools.count():
            # Copy the target network.
            if total_t % update_target_estimator_every == 0:
                target_estimator.copy_params()
                # print("\nCopied model parameters to target network.")

            # Take an action.
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]
            action_probs = np.ones(env.action_space.n, dtype=float) * epsilon / env.action_space.n
            q_values = q_estimator.predict(state)
            best_action = np.argmax(q_values)
            action_probs[best_action] += (1.0 - epsilon)
            action = np.random.choice(env.action_space.n, p=action_probs)
            next_state, reward, done, info = env.step(action)

            # Record the transition.
            replay_memory.append(Transition(state, action, reward, next_state, done))

            # Record stats.
            stats.episode_lengths[i_episode] = t
            stats.episode_rewards[i_episode] += reward

            # Sample a minibatch from the replay memory
            samples = random.sample(replay_memory, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(
                np.array, zip(*samples))

            # Calculate q values and targets
            q_values_next = target_estimator.predict_batch(next_states_batch)
            targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) \
                * discount_factor * np.amax(q_values_next, axis=1)

            # Update the Q function.
            states_batch = np.array(states_batch)
            q_estimator.update(states_batch, action_batch, targets_batch)

            if done:
                break

            state = next_state
            total_t += 1

        yield i_episode, total_t, stats

    return stats
def q_learning(env, num_episodes, discount_factor=1, alpha=0.5, epsilon=0.1, epsilon_decay=1.0):
    """
    Args:
        alpha: TD learning rate
    """
    height = env.unwrapped.game.height
    width = env.unwrapped.game.width

    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    # 4 actions + 2 for X and Y
    weights = np.random.rand(6)

    for i_episode in range(num_episodes):
        print("------------------------------")

        # The policy we're following
        # policy = make_epsilon_greedy_policy(
        #     epsilon * epsilon_decay**i_episode, env.action_space.n)

        # Print out which episode we're on, useful for debugging.
        # Also print the reward for the last episode
        last_reward = stats.episode_rewards[i_episode - 1]
        sys.stdout.flush()

        # Reset the env and pick the first action
        previous_state = env.reset()
        action_probs = np.ones(4, dtype=float)

        for t in range(TIME_RANGE):
            env.render()
            # time.sleep(0.1)

            # Take a step
            # action_probs = policy(state_int, i_episode)
            normalised_x = int(previous_state[0]) / int(width)
            normalised_y = int(previous_state[1]) / int(height)

            for i in range(0, 4):
                action_probs[i] = weights[i] + normalised_x * weights[4] + normalised_y * weights[5]
                # action_probs[i] = weights[i] + int(previous_state[0])*weights[4] + int(previous_state[1])*weights[5]

            action = np.argmax(action_probs)
            print("action ", action)
            # import ipdb; ipdb.set_trace()
            # action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            # action = env.action_space.sample()

            # 0: UP, 1: DOWN, 2: LEFT, 3: RIGHT
            next_state, reward, done, _ = env.step(action)

            if done:
                reward = 100
            else:
                reward = reward - 1

            # Update stats
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # TD Update
            alpha = 0.01
            # v_now = weights[action] + int(previous_state[0])*weights[4] + int(previous_state[1])*weights[5]
            v_now = weights[action] + normalised_x * weights[4] + normalised_y * weights[5]

            normalised_next_x = int(next_state[0]) / int(width)
            normalised_next_y = int(next_state[1]) / int(height)
            # v_next = weights[action] + int(next_state[0])*weights[4] + int(next_state[1])*weights[5]
            v_next = weights[action] + normalised_next_x * weights[4] + normalised_next_y * weights[5]

            weights_delta = alpha * (reward + discount_factor * v_next - v_now) * weights
            print("weights_delta", weights_delta)
            weights = weights - weights_delta
            print("weights", weights)

            previous_state = next_state

            if done:
                break

        # run_experiment(env, state_int, Q, stats_test, i_episode, width, TIME_RANGE)

    # the original returned an undefined Q; the learned linear weights are returned instead
    return weights, stats
def Q_Sigma_Off_Policy_3_Step(env, theta, num_episodes, discount_factor=1.0, epsilon=0.1, epsilon_decay=1.0):
    # Q(sigma) with linear function approximation:
    # theta parameterises the estimator of Q^w(s, a)
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    alpha = 0.01

    for i_episode in range(num_episodes):
        print("Episode Number Off Policy Q(sigma) 3 Step", i_episode)

        off_policy = behaviour_policy_epsilon_greedy(theta, epsilon * epsilon_decay**i_episode, env.action_space.n)
        policy = make_epsilon_greedy_policy(theta, epsilon * epsilon_decay**i_episode, env.action_space.n)

        state = env.reset()
        next_action = None

        for t in itertools.count():
            if next_action is None:
                action_probs = off_policy(state)
                action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            else:
                action = next_action

            state_t_1, reward, done, _ = env.step(action)

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            if done:
                break

            # Evaluate Q(current state, current action)
            features_state = featurize_state(state)
            q_values = np.dot(theta.T, features_state)
            q_values_state_action = q_values[action]

            # Sample the sigma value for the next step
            probability = 0.5
            sigma_t_1 = binomial_sigma(probability)

            # Select the next action from the behaviour policy at the next state
            next_action_probs = off_policy(state_t_1)
            action_t_1 = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)

            features_state_1 = featurize_state(state_t_1)
            q_values_t_1 = np.dot(theta.T, features_state_1)
            q_values_next_state_next_action = q_values_t_1[action_t_1]

            on_policy_next_action_probs = policy(state_t_1)
            on_policy_a_t_1 = np.random.choice(np.arange(len(on_policy_next_action_probs)), p=on_policy_next_action_probs)
            V_t_1 = np.sum(on_policy_next_action_probs * q_values_t_1)

            Delta_t = reward + discount_factor * (sigma_t_1 * q_values_next_state_next_action + (1 - sigma_t_1) * V_t_1) - q_values_state_action

            state_t_2, next_reward, done, _ = env.step(action_t_1)
            if done:
                break

            next_next_action_probs = off_policy(state_t_2)
            action_t_2 = np.random.choice(np.arange(len(next_next_action_probs)), p=next_next_action_probs)

            features_state_2 = featurize_state(state_t_2)
            q_values_t_2 = np.dot(theta.T, features_state_2)
            q_values_next_next_state_next_next_action = q_values_t_2[action_t_2]

            on_policy_next_next_action_probs = policy(state_t_2)
            on_policy_a_t_2 = np.random.choice(np.arange(len(on_policy_next_next_action_probs)), p=on_policy_next_next_action_probs)
            V_t_2 = np.sum(on_policy_next_next_action_probs * q_values_t_2)

            sigma_t_2 = binomial_sigma(probability)

            Delta_t_1 = next_reward + discount_factor * (sigma_t_2 * q_values_next_next_state_next_next_action + (1 - sigma_t_2) * V_t_2) - q_values_next_state_next_action

            """
            3 step TD Target --- G_t(3)
            """
            state_t_3, next_next_reward, done, _ = env.step(action_t_2)
            if done:
                break

            next_next_next_action_probs = off_policy(state_t_3)
            action_t_3 = np.random.choice(np.arange(len(next_next_next_action_probs)), p=next_next_next_action_probs)

            features_state_3 = featurize_state(state_t_3)
            q_values_t_3 = np.dot(theta.T, features_state_3)
            q_values_next_next_next_state_next_next_next_action = q_values_t_3[action_t_3]

            on_policy_next_next_next_action_probs = policy(state_t_3)
            on_policy_a_t_3 = np.random.choice(np.arange(len(on_policy_next_next_next_action_probs)), p=on_policy_next_next_next_action_probs)
            V_t_3 = np.sum(on_policy_next_next_next_action_probs * q_values_t_3)

            sigma_t_3 = binomial_sigma(probability)

            Delta_t_2 = next_next_reward + discount_factor * (sigma_t_3 * q_values_next_next_next_state_next_next_next_action + (1 - sigma_t_3) * V_t_3) - q_values_next_next_state_next_next_action

            on_policy_action_probability = on_policy_next_action_probs[on_policy_a_t_1]
            off_policy_action_probability = next_action_probs[action_t_1]

            on_policy_next_action_probability = on_policy_next_next_action_probs[on_policy_a_t_2]
            off_policy_next_action_probability = next_next_action_probs[action_t_2]

            td_target = q_values_state_action + Delta_t \
                + discount_factor * ((1 - sigma_t_1) * on_policy_action_probability + sigma_t_1) * Delta_t_1 \
                + discount_factor * ((1 - sigma_t_2) * on_policy_next_action_probability + sigma_t_2) * Delta_t_2

            """
            Computing the Importance Sampling Ratios
            """
            rho = np.divide(on_policy_action_probability, off_policy_action_probability)
            rho_1 = np.divide(on_policy_next_action_probability, off_policy_next_action_probability)

            rho_sigma = sigma_t_1 * rho + 1 - sigma_t_1
            rho_sigma_1 = sigma_t_2 * rho_1 + 1 - sigma_t_2
            all_rho_sigma = rho_sigma * rho_sigma_1

            td_error = td_target - q_values_state_action
            theta[:, action] += alpha * all_rho_sigma * td_error * features_state

            if done:
                break

            state = state_t_1

    return stats
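For context, the multi-step return that these Q(sigma) implementations approximate (the n-step Q(sigma) return of Sutton and Barto) can be written in terms of the per-step errors used above:

    \delta_k = R_{k+1} + \gamma \Bigl( \sigma_{k+1} Q(S_{k+1}, A_{k+1}) + (1 - \sigma_{k+1}) \sum_a \pi(a \mid S_{k+1}) Q(S_{k+1}, a) \Bigr) - Q(S_k, A_k)

    G_{t:t+n} = Q(S_t, A_t) + \sum_{k=t}^{t+n-1} \delta_k \prod_{i=t+1}^{k} \gamma \bigl( (1 - \sigma_i)\,\pi(A_i \mid S_i) + \sigma_i \bigr)

Note that the code above applies each step's correction factor separately rather than accumulating the product across steps, so it is an approximation of the textbook recursion rather than a literal transcription of it.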
def q_learning(env, theta, num_episodes, discount_factor=1.0, alpha=0.1, epsilon=0.1, epsilon_decay=0.999):
    # Q-learning with linear function approximation:
    # theta parameterises the estimator of Q^w(s, a)
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # NOTE: the theta passed in is re-initialised here
    theta = np.random.normal(size=(400, env.action_space.n))

    for i_episode in range(num_episodes):
        print("Episode Number, Q Learning:", i_episode)

        # The behaviour policy is epsilon-greedy; the target policy pi is greedy
        # w.r.t. Q(s, a) (the np.argmax over the next-state values below).
        policy = make_epsilon_greedy_policy(theta, epsilon * epsilon_decay**i_episode, env.action_space.n)
        # If using Boltzmann exploration, a temperature tau would be used here instead:
        # off_policy = behaviour_policy_Boltzmann(theta, epsilon * epsilon_decay**i_episode, env.action_space.n)

        state = env.reset()
        next_action = None

        # For each step in the environment
        for t in itertools.count():
            if next_action is None:
                action_probs = policy(state)
                action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            else:
                action = next_action

            next_state, reward, done, _ = env.step(action)

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # Q values for the current state
            features_state = featurize_state(state)
            q_values = np.dot(theta.T, features_state)
            q_values_state_action = q_values[action]

            # Next action, sampled from the behaviour (epsilon-greedy) policy
            next_action_probs = policy(next_state)
            next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)

            # Next state features and Q(s', .)
            next_features_state = featurize_state(next_state)
            q_values_next = np.dot(theta.T, next_features_state)

            # The target policy is greedy w.r.t. Q(s, a), so the TD target
            # bootstraps on the maximising action at the next state.
            best_next_action = np.argmax(q_values_next)
            td_target = reward + discount_factor * q_values_next[best_next_action]
            td_error = td_target - q_values_state_action

            theta[:, action] += alpha * td_error * features_state

            if done:
                break

            state = next_state

    return stats
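Written out, the update applied to the column of theta for the taken action A_t is the usual Q-learning rule with linear features:

    \theta_{A_t} \leftarrow \theta_{A_t} + \alpha \Bigl( R_{t+1} + \gamma \max_{a'} \theta_{a'}^{\top} \phi(S_{t+1}) - \theta_{A_t}^{\top} \phi(S_t) \Bigr) \phi(S_t)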
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    state_processor,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        sess: Tensorflow Session object
        env: OpenAI environment
        q_estimator: Estimator object used for the q values
        target_estimator: Estimator object used for the targets
        state_processor: A StateProcessor object
        num_episodes: Number of episodes to run for
        experiment_dir: Directory to save Tensorflow summaries in
        replay_memory_size: Size of the replay memory
        replay_memory_init_size: Number of random experiences to sample when
            initializing the replay memory.
        update_target_estimator_every: Copy parameters from the Q estimator to the
            target estimator every N steps
        discount_factor: Gamma discount factor
        epsilon_start: Chance to sample a random action when taking an action.
            Epsilon is decayed over time and this is the start value
        epsilon_end: The final minimum value of epsilon after decaying is done
        epsilon_decay_steps: Number of steps to decay epsilon over
        batch_size: Size of batches to sample from the replay memory
        record_video_every: Record a video every N episodes

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])

    # The replay memory
    replay_memory = []

    # Make model copier object
    estimator_copy = ModelParametersCopier(q_estimator, target_estimator)

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    # For 'system/' summaries, useful to check whether the current process looks healthy
    current_process = psutil.Process()

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)

    # Get the current time step
    total_t = sess.run(tf.train.get_global_step())

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # The policy we're following
    policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS))

    # Populate the replay memory with initial experience
    print("Populating replay memory...")
    state = env.reset()
    state = state_processor.process(sess, state)
    state = np.stack([state] * 4, axis=2)
    for i in range(replay_memory_init_size):
        action_probs = policy(sess, state, epsilons[min(total_t, epsilon_decay_steps - 1)])
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
        next_state = state_processor.process(sess, next_state)
        next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, 2), axis=2)
        replay_memory.append(Transition(state, action, reward, next_state, done))
        if done:
            state = env.reset()
            state = state_processor.process(sess, state)
            state = np.stack([state] * 4, axis=2)
        else:
            state = next_state

    # Record videos: add the env Monitor wrapper
    env = Monitor(env,
                  directory=monitor_path,
                  video_callable=lambda count: count % record_video_every == 0,
                  resume=True)

    for i_episode in range(num_episodes):

        # Save the current checkpoint
        saver.save(tf.get_default_session(), checkpoint_path)

        # Reset the environment
        state = env.reset()
        state = state_processor.process(sess, state)
        state = np.stack([state] * 4, axis=2)
        loss = None

        # One step in the environment
        for t in itertools.count():

            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]

            # Maybe update the target estimator
            if total_t % update_target_estimator_every == 0:
                estimator_copy.make(sess)
                print("\nCopied model parameters to target network.")

            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                t, total_t, i_episode + 1, num_episodes, loss), end="")
            sys.stdout.flush()

            # Take a step
            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
            next_state = state_processor.process(sess, next_state)
            next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, 2), axis=2)

            # If our replay memory is full, pop the first element
            if len(replay_memory) == replay_memory_size:
                replay_memory.pop(0)

            # Save transition to replay memory
            replay_memory.append(Transition(state, action, reward, next_state, done))

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # Sample a minibatch from the replay memory
            samples = random.sample(replay_memory, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(np.array, zip(*samples))

            # Calculate q values and targets
            q_values_next = target_estimator.predict(sess, next_states_batch)
            targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * \
                discount_factor * np.amax(q_values_next, axis=1)

            # Perform gradient descent update
            states_batch = np.array(states_batch)
            loss = q_estimator.update(sess, states_batch, action_batch, targets_batch)

            if done:
                break

            state = next_state
            total_t += 1

        # Add summaries to tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(simple_value=epsilon, tag="episode/epsilon")
        episode_summary.value.add(simple_value=stats.episode_rewards[i_episode], tag="episode/reward")
        episode_summary.value.add(simple_value=stats.episode_lengths[i_episode], tag="episode/length")
        episode_summary.value.add(simple_value=current_process.cpu_percent(), tag="system/cpu_usage_percent")
        episode_summary.value.add(simple_value=current_process.memory_percent(memtype="vms"), tag="system/v_memory_usage_percent")
        q_estimator.summary_writer.add_summary(episode_summary, i_episode)
        q_estimator.summary_writer.flush()

        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode + 1],
            episode_rewards=stats.episode_rewards[:i_episode + 1])

    return stats
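Because the function yields per-episode statistics, it is a generator and only trains while being iterated. A minimal driver sketch (not part of the original file; it assumes the Estimator, StateProcessor, env and sess objects are constructed elsewhere in this codebase with the interfaces used above):

# Hypothetical usage sketch: iterate the generator to run training and
# print progress after every episode.
for total_t, episode_stats in deep_q_learning(sess,
                                              env,
                                              q_estimator=q_estimator,
                                              target_estimator=target_estimator,
                                              state_processor=state_processor,
                                              num_episodes=10000,
                                              experiment_dir="./experiments/dqn"):
    print("\nTotal steps: {}, last episode reward: {}".format(
        total_t, episode_stats.episode_rewards[-1]))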
def sarsa(env, estimator, num_episodes, discount_factor=1.0, alpha=0.1, epsilon=0.1, epsilon_decay=1.0):
    # SARSA with linear function approximation:
    # theta parameterises the estimator of Q^w(s, a)
    # (the estimator argument is not used; a fresh linear theta is created instead)
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    theta = np.random.normal(size=(400, env.action_space.n))

    for i_episode in range(num_episodes):
        print("Episode Number, SARSA:", i_episode)

        # Epsilon-greedy policy w.r.t. the current Q estimate
        policy = make_epsilon_greedy_policy(theta, epsilon * epsilon_decay**i_episode, env.action_space.n)

        state = env.reset()
        action_probs = policy(state)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_action = None

        # For each step in the environment
        for t in itertools.count():
            next_state, reward, done, _ = env.step(action)

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # Q values for the current state
            features_state = featurize_state(state)
            q_values = np.dot(theta.T, features_state)
            q_values_state_action = q_values[action]

            # SARSA is on-policy: the next action is sampled from the same
            # epsilon-greedy policy that generated the current action.
            next_action_probs = policy(next_state)
            next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)

            # Next state features and Q(s', a')
            next_features_state = featurize_state(next_state)
            q_values_next = np.dot(theta.T, next_features_state)
            q_values_next_state_next_action = q_values_next[next_action]

            td_target = reward + discount_factor * q_values_next_state_next_action
            td_error = td_target - q_values_state_action

            theta[:, action] += alpha * td_error * features_state

            if done:
                break

            state = next_state
            action = next_action

    return stats
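The only difference from the Q-learning update above is the bootstrap term: being on-policy, SARSA evaluates the action actually sampled next rather than the maximising one:

    \theta_{A_t} \leftarrow \theta_{A_t} + \alpha \Bigl( R_{t+1} + \gamma\, \theta_{A_{t+1}}^{\top} \phi(S_{t+1}) - \theta_{A_t}^{\top} \phi(S_t) \Bigr) \phi(S_t)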
def three_step_tree_backup(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
    # Three-step Tree Backup: as in Expected SARSA, the bootstrap terms use the
    # expected value over actions under the policy rather than a maximum, and the
    # corrections from deeper steps are weighted by the policy probabilities.
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in range(num_episodes):
        state = env.reset()

        # Steps within each episode
        for t in itertools.count():
            # Choose A from S using the policy derived from Q (epsilon-greedy)
            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, _, _ = env.step(action)

            next_action_probs = policy(next_state)
            next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)
            next_next_state, next_reward, _, _ = env.step(next_action)

            next_next_action_probs = policy(next_next_state)
            next_next_action = np.random.choice(np.arange(len(next_next_action_probs)), p=next_next_action_probs)
            next_next_next_state, next_next_reward, done, _ = env.step(next_next_action)

            next_next_next_action_probs = policy(next_next_next_state)
            next_next_next_action = np.random.choice(np.arange(len(next_next_next_action_probs)), p=next_next_next_action_probs)

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # Updates for the Three Step Tree Backup
            # V = sum_a pi(a | s_{t+1}) Q(s_{t+1}, a)
            V = np.sum(next_action_probs * Q[next_state])
            One_Step = reward + discount_factor * V

            next_V = np.sum(next_next_action_probs * Q[next_next_state])
            Delta_1 = next_reward + discount_factor * next_V - Q[next_state][next_action]

            next_action_selection_probability = np.max(next_action_probs)
            Two_Step = discount_factor * next_action_selection_probability * Delta_1

            next_next_V = np.sum(next_next_next_action_probs * Q[next_next_next_state])
            Delta_2 = next_next_reward + discount_factor * next_next_V - Q[next_next_state][next_next_action]

            next_next_action_selection_probability = np.max(next_next_action_probs)
            Three_Step = discount_factor * next_action_selection_probability * \
                discount_factor * next_next_action_selection_probability * Delta_2

            td_target = One_Step + Two_Step + Three_Step
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta

            if done:
                break

            state = next_state

    return Q, stats
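The One_Step / Two_Step / Three_Step decomposition above corresponds to the three-step tree-backup target written with Expected-SARSA TD errors:

    G_{t:t+3} = Q(S_t, A_t) + \delta_t + \gamma\,\pi(A_{t+1} \mid S_{t+1})\,\delta_{t+1} + \gamma^2\,\pi(A_{t+1} \mid S_{t+1})\,\pi(A_{t+2} \mid S_{t+2})\,\delta_{t+2}

    \delta_k = R_{k+1} + \gamma \sum_a \pi(a \mid S_{k+1}) Q(S_{k+1}, a) - Q(S_k, A_k)

The code approximates \pi(A_k \mid S_k) by np.max(action_probs), i.e. the probability of the greedy action under the epsilon-greedy policy.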
def q_learning(env, num_episodes, discount_factor=0.9, alpha=0.8):  # , epsilon=0.1):
    """
    Q-Learning algorithm: Off-policy TD control.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        env: OpenAI environment.
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma discount factor.
        alpha: TD learning rate.
        epsilon: Chance to sample a random action. Float between 0 and 1.

    Returns:
        A tuple (Q, episode_lengths).
        Q is the optimal action-value function, a dictionary mapping state -> action values.
        stats is an EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    memory = defaultdict(tuple)

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # The policy we're following
    # policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in tqdm(range(num_episodes)):
        if (i_episode + 1) % 100 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes), end="")

        # Reset the environment and pick the first action
        eigen_state = env.reset()

        # One step in the environment
        for t in itertools.count():
            if eigen_state in memory:
                mem_list = memory[eigen_state]
                action = mem_list[0]
                state_value = mem_list[1]
                next_state = mem_list[2]
                if next_state in memory:
                    next_state_value = memory[next_state][1]
                else:
                    next_state_value = 0.0
                reward = mem_list[3]

                # Prepare the 2-qubit register and amplify the remembered action
                circuit = QuantumCircuit(2, 2)
                circuit.h(0)
                circuit.h(1)
                circuit = groverIteration(circuit, action, reward, next_state_value)
            else:
                # Prepare the n-qubit register in uniform superposition
                circuit = QuantumCircuit(2, 2)
                circuit.h(0)
                circuit.h(1)
                state_value = 0.0

            # Measure the register to select an action
            action = collapse_action_select_method(circuit)

            next_eigen_state, reward, done = env.step(action)

            if next_eigen_state in memory:
                mem_list = memory[next_eigen_state]
                next_state_value = mem_list[1]
            else:
                next_state_value = 0.0

            # TD(0) update of the state value
            state_value = state_value + alpha * (reward + (discount_factor * next_state_value) - state_value)

            memory[eigen_state] = (action, state_value, next_eigen_state, reward)

            # Accumulate the discounted return
            stats.episode_rewards[i_episode] += (discount_factor**t) * reward
            stats.episode_lengths[i_episode] = t

            if done:
                break

            eigen_state = next_eigen_state

    return Q, stats, memory
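For reference, the memory update above is a plain TD(0) value estimate per eigen-state, and the reward statistic accumulates the discounted return of the episode:

    V(s) \leftarrow V(s) + \alpha \bigl( r + \gamma V(s') - V(s) \bigr), \qquad G_0 = \sum_t \gamma^t r_t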
def qlearning_alpha_e_greedy(env, n_episodes=2000, gamma=0.99, alpha=0.85, best_enabled=False):
    nS = env.observation_space.n
    nA = env.action_space.n

    if best_enabled:
        # record your best-tuned hyperparams here
        env.seed(0)
        np.random.seed(0)
        alpha = 0.05
        gamma = 0.99
        epsilon_decay = 0.95
        e = 1.0

    Q = np.zeros([nS, nA])
    policy = make_decay_e_greedy_policy(Q, nA)

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(n_episodes),
        episode_rewards=np.zeros(n_episodes))

    for i in range(n_episodes):
        # Useful for debugging
        log_episode(i, n_episodes)

        s = env.reset()
        done = False
        total_reward = 0

        if best_enabled:
            e *= epsilon_decay
        else:
            e = 1.0 / ((i / 10) + 1.0)

        for t in itertools.count():
            # Choose an action from the decaying epsilon-greedy policy
            probs = policy(s, e)
            a = np.random.choice(np.arange(nA), p=probs)

            # Take a step
            next_s, r, done, _ = env.step(a)

            if best_enabled:
                mod_r = modify_reward(r, done)
                td_target = mod_r + gamma * np.max(Q[next_s, :])
            else:
                td_target = r + gamma * np.max(Q[next_s, :])

            td_delta = td_target - Q[s, a]
            Q[s, a] += alpha * td_delta

            s = next_s
            total_reward += r

            if done:
                break

        # Update statistics
        stats.episode_rewards[i] += total_reward
        stats.episode_lengths[i] = t

    return Q, stats
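make_decay_e_greedy_policy is called above but not defined in this section. Given how it is used (policy(s, e) returning a probability vector of length nA over a tabular Q), a minimal sketch consistent with that interface might look like the following; this is an illustration, not the repository's actual helper.

# Hypothetical sketch of make_decay_e_greedy_policy: returns a function that,
# given a state and the current epsilon, produces epsilon-greedy action
# probabilities w.r.t. the (mutable) Q table captured by the closure.
def make_decay_e_greedy_policy(Q, nA):
    def policy_fn(state, epsilon):
        probs = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[state, :])
        probs[best_action] += 1.0 - epsilon
        return probs
    return policy_fn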
                          eps=dropout_prob)

memory = ReplayMemory(max_size=100000)

print('Experiment Number ', e)

loss_per_ep = []
w1_m_per_ep = []
w2_m_per_ep = []
w3_m_per_ep = []
total_reward = []

ep = 0
avg_Rwd = -np.inf
episode_end_msg = 'loss={:2.10f}, w1_m={:3.1f}, w2_m={:3.1f}, w3_m={:3.1f}, total reward={}'

stats = plotting.EpisodeStats(episode_lengths=np.zeros(max_n_ep),
                              episode_rewards=np.zeros(max_n_ep))

# Loop for the number of episodes: for every episode
while avg_Rwd < min_avg_Rwd and ep < max_n_ep:

    if ep >= n_avg_ep:
        avg_Rwd = np.mean(total_reward[ep - n_avg_ep:ep])
        print("EPISODE {}. Average reward over the last {} episodes: {}.".format(ep, n_avg_ep, avg_Rwd))
    else:
        print("EPISODE {}.".format(ep))

    # Contains the loop for every step within the episode
    loss_v, w1_m, w2_m, w3_m, cum_R, step_length, variance_steps = run_episode(
def three_step_tree_backup(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    for i_episode in range(num_episodes):
        print("Episode Number, Three Step Tree Backup:", i_episode)

        # Agent policy: epsilon-greedy w.r.t. the current Q
        policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)
        last_reward = stats.episode_rewards[i_episode - 1]

        state = env.reset()

        # For each step in the environment
        for t in itertools.count():
            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(action)

            if done:
                break

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            next_action_probs = policy(next_state)
            next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)

            V = np.sum(next_action_probs * Q[next_state])
            Delta = reward + discount_factor * V - Q[state][action]

            next_next_state, next_reward, _, _ = env.step(next_action)
            next_next_action_probs = policy(next_next_state)
            next_next_action = np.random.choice(np.arange(len(next_next_action_probs)), p=next_next_action_probs)

            next_V = np.sum(next_next_action_probs * Q[next_next_state])
            Delta_t_1 = next_reward + discount_factor * next_V - Q[next_state][next_action]

            next_next_next_state, next_next_reward, _, _ = env.step(next_next_action)
            next_next_next_action_probs = policy(next_next_next_state)
            next_next_next_action = np.random.choice(np.arange(len(next_next_next_action_probs)), p=next_next_next_action_probs)

            next_next_V = np.sum(next_next_next_action_probs * Q[next_next_next_state])
            Delta_t_2 = next_next_reward + discount_factor * next_next_V - Q[next_next_state][next_next_action]

            next_action_selection_probability = np.max(next_action_probs)
            next_next_action_selection_probability = np.max(next_next_action_probs)

            td_target = Q[state][action] + Delta \
                + discount_factor * next_action_selection_probability * Delta_t_1 \
                + discount_factor * discount_factor * next_action_selection_probability * next_next_action_selection_probability * Delta_t_2

            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta

            state = next_state

    return stats
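Both three-step tree-backup implementations hard-code the three lookahead steps. As a sketch of the same idea for arbitrary n (an illustration under stated assumptions, not part of the original code), the tree-backup target can be computed by folding backwards over a short trajectory:

# Hypothetical helper: compute an n-step tree-backup target from a trajectory
# where rewards[k] and states[k+1] result from taking actions[k] in states[k].
# Q is a dict-like mapping state -> array of action values, and policy(state)
# returns the action-probability vector, as elsewhere in this file.
def tree_backup_target(rewards, states, actions, Q, policy, discount_factor):
    n = len(rewards)
    # Bootstrap with the expected action value at the final state
    G = rewards[n - 1] + discount_factor * np.sum(policy(states[n]) * Q[states[n]])
    # Fold backwards: non-taken actions contribute their expected value,
    # the taken action carries the deeper return, weighted by pi(a | s).
    for k in reversed(range(n - 1)):
        probs = policy(states[k + 1])
        a_taken = actions[k + 1]
        expected_others = np.sum(probs * Q[states[k + 1]]) - probs[a_taken] * Q[states[k + 1]][a_taken]
        G = rewards[k] + discount_factor * (expected_others + probs[a_taken] * G)
    return G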
def Q_Sigma_Off_Policy(env, theta, num_episodes, discount_factor=1.0, epsilon=0.1, epsilon_decay=0.99):
    # Q(sigma) with linear function approximation:
    # theta parameterises the estimator of Q^w(s, a)
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    cumulative_errors = np.zeros(shape=(num_episodes, 1))

    alpha = 0.01
    tau = 1

    for i_episode in range(num_episodes):
        # state_count = np.zeros(shape=(env.observation_space.n, 1))
        print("Episode Number Off Policy Q(sigma)", i_episode)

        off_policy = behaviour_policy_Boltzmann(theta, tau, env.action_space.n)
        policy = make_epsilon_greedy_policy(theta, epsilon * epsilon_decay**i_episode, env.action_space.n)

        state = env.reset()
        next_action = None

        for t in itertools.count():
            if next_action is None:
                action_probs = policy(state)
                action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            else:
                action = next_action

            state_t_1, reward, done, _ = env.step(action)

            if done:
                break

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # Evaluate Q(current state, current action)
            features_state = featurize_state(state)
            q_values = np.dot(theta.T, features_state)
            q_values_state_action = q_values[action]

            # Sample the sigma value
            sigma_t_1 = binomial_sigma(0.5)

            # Select the next action from the behaviour policy at the next state
            next_action_probs = off_policy(state_t_1)
            action_t_1 = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)

            features_state_1 = featurize_state(state_t_1)
            q_values_t_1 = np.dot(theta.T, features_state_1)
            q_values_next_state_next_action = q_values_t_1[action_t_1]

            V_t_1 = np.sum(next_action_probs * q_values_t_1)

            Delta_t = reward + discount_factor * (sigma_t_1 * q_values_next_state_next_action + (1 - sigma_t_1) * V_t_1) - q_values_state_action

            """
            1 step TD Target --- G_t(1)
            """
            td_target = q_values_state_action + Delta_t
            td_error = td_target - q_values_state_action

            theta[:, action] += alpha * td_error * features_state

            rms_error = np.sqrt(np.sum((td_error)**2))
            cumulative_errors[i_episode, :] += rms_error

            state = state_t_1

    return stats, cumulative_errors
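binomial_sigma is used throughout the Q(sigma) functions but not shown in this section. Given how it is called (drawing sigma in {0, 1} with a given probability), it is presumably a single Bernoulli draw; a minimal sketch, assuming exactly that behaviour:

# Presumed implementation of binomial_sigma: one Bernoulli draw deciding whether
# the step behaves like SARSA (sigma = 1) or like an Expected-SARSA / tree-backup
# step (sigma = 0).
def binomial_sigma(probability):
    return np.random.binomial(n=1, p=probability)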
def Q_Sigma_Off_Policy_2_Step(env, num_episodes, discount_factor=1.0, alpha=0.1, epsilon=0.1):
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    tau = 1
    tau_decay = 0.999
    sigma = 1
    sigma_decay = 0.995

    for i_episode in range(num_episodes):
        print("Number of Episodes, Q(sigma) Off Policy 2 Step", i_episode)

        policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)
        off_policy = behaviour_policy_epsilon_greedy(Q, tau, env.action_space.n)

        tau = tau * tau_decay
        if tau < 0.0001:
            tau = 0.0001

        state = env.reset()

        for t in itertools.count():
            action_probs = off_policy(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)

            state_t_1, reward, done, _ = env.step(action)

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            if done:
                # Decay sigma at the end of each episode
                sigma = sigma * sigma_decay
                if sigma < 0.0001:
                    sigma = 0.0001
                break

            # Instead of sampling sigma from a Bernoulli, use the decayed value:
            # sigma_t_1 = binomial_sigma(0.5)
            sigma_t_1 = sigma

            next_action_probs = off_policy(state_t_1)
            action_t_1 = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)

            on_policy_next_action_probs = policy(state_t_1)
            on_policy_a_t_1 = np.random.choice(np.arange(len(on_policy_next_action_probs)), p=on_policy_next_action_probs)
            V_t_1 = np.sum(on_policy_next_action_probs * Q[state_t_1])

            Delta_t = reward + discount_factor * (sigma_t_1 * Q[state_t_1][action_t_1] + (1 - sigma_t_1) * V_t_1) - Q[state][action]

            state_t_2, next_reward, _, _ = env.step(action_t_1)

            next_next_action_probs = off_policy(state_t_2)
            action_t_2 = np.random.choice(np.arange(len(next_next_action_probs)), p=next_next_action_probs)

            on_policy_next_next_action_probs = policy(state_t_2)
            on_policy_a_t_2 = np.random.choice(np.arange(len(on_policy_next_next_action_probs)), p=on_policy_next_next_action_probs)
            V_t_2 = np.sum(on_policy_next_next_action_probs * Q[state_t_2])

            sigma_t_2 = sigma
            Delta_t_1 = next_reward + discount_factor * (sigma_t_2 * Q[state_t_2][action_t_2] + (1 - sigma_t_2) * V_t_2) - Q[state_t_1][action_t_1]

            """
            2 step TD Target --- G_t(2)
            """
            on_policy_action_probability = on_policy_next_action_probs[on_policy_a_t_1]
            off_policy_action_probability = next_action_probs[action_t_1]

            td_target = Q[state][action] + Delta_t + discount_factor * ((1 - sigma_t_1) * on_policy_action_probability + sigma_t_1) * Delta_t_1

            """
            Computing the Importance Sampling Ratio
            """
            rho = np.divide(on_policy_action_probability, off_policy_action_probability)
            rho_sigma = sigma_t_1 * rho + 1 - sigma_t_1

            td_error = td_target - Q[state][action]
            Q[state][action] += alpha * rho_sigma * td_error

            state = state_t_1

    return stats
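The correction applied to the update blends the per-decision importance sampling ratio with 1 according to sigma, so a pure tree-backup step (sigma = 0) needs no correction while a pure SARSA-style step (sigma = 1) receives the full ratio:

    \rho_\sigma = \sigma \, \frac{\pi(A_{t+1} \mid S_{t+1})}{\mu(A_{t+1} \mid S_{t+1})} + (1 - \sigma)

where \pi is the target (epsilon-greedy) policy and \mu the behaviour policy.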
def double_qNetwork(env, n_episodes=3000, gamma=0.99, alpha=0.85, best_enabled=False,
                    log_by_step=False, result_per_episode=100, network_type='LR'):
    nS = env.observation_space.n
    nA = env.action_space.n
    hidden_size = 30

    subRewardList = []
    avgRewardList = []

    def one_hot(x):
        return np.identity(nS)[x:x + 1]

    if best_enabled:
        # record your best-tuned hyperparams here
        env.seed(0)
        np.random.seed(0)
        alpha = 0.003
        gamma = 0.99
        epsilon_decay = 0.95
        e = 1.0

    X = tf.placeholder(shape=[1, nS], dtype=tf.float32)
    Y = tf.placeholder(shape=[1, nA], dtype=tf.float32)

    if network_type == 'NN':
        # Two independent single-hidden-layer networks
        W1_1 = tf.get_variable("W1_1", shape=[nS, hidden_size],
                               initializer=tf.contrib.layers.xavier_initializer())
        Z1_1 = tf.matmul(X, W1_1)
        Z1_1 = tf.nn.tanh(Z1_1)
        W2_1 = tf.get_variable("W2_1", shape=[hidden_size, nA],
                               initializer=tf.contrib.layers.xavier_initializer())
        Qpred_1 = tf.matmul(Z1_1, W2_1)

        W1_2 = tf.get_variable("W1_2", shape=[nS, hidden_size],
                               initializer=tf.contrib.layers.xavier_initializer())
        Z1_2 = tf.matmul(X, W1_2)
        Z1_2 = tf.nn.tanh(Z1_2)
        W2_2 = tf.get_variable("W2_2", shape=[hidden_size, nA],
                               initializer=tf.contrib.layers.xavier_initializer())
        Qpred_2 = tf.matmul(Z1_2, W2_2)
    else:
        # network_type == 'LR': two independent linear models
        W_1 = tf.Variable(tf.random_uniform([nS, nA], 0, 0.01))
        W_2 = tf.Variable(tf.random_uniform([nS, nA], 0, 0.01))
        Qpred_1 = tf.matmul(X, W_1)
        Qpred_2 = tf.matmul(X, W_2)

    loss_1 = tf.reduce_sum(tf.square(Y - Qpred_1))
    train_1 = tf.train.GradientDescentOptimizer(learning_rate=alpha).minimize(loss_1)
    loss_2 = tf.reduce_sum(tf.square(Y - Qpred_2))
    train_2 = tf.train.GradientDescentOptimizer(learning_rate=alpha).minimize(loss_2)

    init = tf.global_variables_initializer()

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(n_episodes),
                                  episode_rewards=np.zeros(n_episodes))

    with tf.Session() as sess:
        sess.run(init)

        for i in range(n_episodes):
            s = env.reset()
            total_reward = 0
            e = 1. / ((i / 10) + 10)  # decay epsilon as training progresses
            done = False
            count = 0

            Qpred_update = None
            Qpred_other = None
            train = None

            for t in itertools.count():
                # With 0.5 probability update network 1 and use network 2 for the
                # target, or vice versa (the double Q-learning trick).
                if np.random.choice(2) == 1:
                    Qpred_update = Qpred_1
                    Qpred_other = Qpred_2
                    train = train_1
                else:
                    Qpred_update = Qpred_2
                    Qpred_other = Qpred_1
                    train = train_2

                Qs = sess.run(Qpred_update, feed_dict={X: one_hot(s)})
                if log_by_step:
                    print(Qs)

                if np.random.rand(1) < e:
                    a = env.action_space.sample()
                else:
                    a = np.argmax(Qs)

                s1, reward, done, _ = env.step(a)
                if log_by_step:
                    print('step %d, curr state : %d, action : %d, next state : %d, reward : %d'
                          % (t, s, a, s1, reward))

                step_reward = modify_reward(reward, done) if best_enabled else reward

                if done:
                    Qs[0, a] = step_reward
                else:
                    # Double Q-learning target: the network being updated selects
                    # the action, the other network evaluates it.
                    Qs1_update = sess.run(Qpred_update, feed_dict={X: one_hot(s1)})
                    Qs1_other = sess.run(Qpred_other, feed_dict={X: one_hot(s1)})
                    best_next_a = np.argmax(Qs1_update)
                    Qs[0, a] = step_reward + gamma * Qs1_other[0, best_next_a]

                sess.run(train, feed_dict={X: one_hot(s), Y: Qs})

                total_reward += reward
                s = s1
                count += 1

                if done:
                    break

            subRewardList.append(total_reward)
            if (i + 1) % result_per_episode == 0:
                avg = sum(subRewardList) / result_per_episode
                avgRewardList.append(avg)
                print(i + 1, ' episode =', total_reward, ', avg =', avg)
                subRewardList = []

            # Update statistics
            stats.episode_rewards[i] += total_reward
            stats.episode_lengths[i] = t

    return stats, subRewardList, avgRewardList
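Written out, the target computed in the non-terminal branch above is the double Q-learning estimator, which decouples action selection from action evaluation to reduce the maximisation bias of a single network:

    y = r + \gamma \, Q_{\text{other}}\bigl(s', \arg\max_{a'} Q_{\text{update}}(s', a')\bigr)

This mirrors the tabular double Q-learning update at the top of this file, with the two tables replaced by the two networks Qpred_1 and Qpred_2.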