def train(env, nb_episodes, alpha=0.000045, gamma=0.98, show_result=False):
    """Implements a full REINFORCE training loop and returns the score
    (sum of all rewards) of every episode."""
    weight = np.random.rand(4, 2)
    episode_rewards = []
    for episode in range(nb_episodes):
        state = env.reset()[None, :]
        gradients = []
        rewards = []
        score = 0
        while True:
            if show_result and (episode % 1000 == 0):
                env.render()
            action, grad = policy_gradient(state, weight)
            next_state, reward, done, _ = env.step(action)
            next_state = next_state[None, :]
            gradients.append(grad)
            rewards.append(reward)
            score += reward
            state = next_state
            if done:
                break
        for i in range(len(gradients)):
            # gamma is raised to the step offset t, not to the reward value
            weight += alpha * gradients[i] * \
                sum(r * gamma ** t for t, r in enumerate(rewards[i:]))
        episode_rewards.append(score)
        print("{}: {}".format(episode, score), end="\r", flush=False)
    return episode_rewards

def train(env, nb_episodes, alpha=0.000045, gamma=0.98, show_result=False):
    """Implements a full training loop over the given environment."""
    np.random.seed(1)
    weight = np.random.rand(4, 2)
    episode_rewards = []
    for e in range(nb_episodes):
        state = env.reset()[None, :]
        grads = []
        rewards = []
        score = 0
        while True:
            if show_result and (e % 1000 == 0):
                env.render()
            action, grad = policy_gradient(state, weight)
            next_state, reward, done, _ = env.step(action)
            next_state = next_state[None, :]
            grads.append(grad)
            rewards.append(reward)
            score += reward
            state = next_state
            if done:
                break
        for i in range(len(grads)):
            # discount by the step offset t
            weight += alpha * grads[i] * \
                sum(r * (gamma ** t) for t, r in enumerate(rewards[i:]))
        episode_rewards.append(score)
    return episode_rewards

def train(env, nb_episodes, alpha=0.000045, gamma=0.98, show_result=False):
    '''Implements a full training.

    Args:
        env: initial environment
        nb_episodes: number of episodes used for training
        alpha: the learning rate
        gamma: the discount factor
    Return: all values of the score (sum of all rewards during one
        episode loop)
    '''
    W = np.random.rand(4, 2)
    episode_rewards = []
    for e in range(nb_episodes):
        state = env.reset()[None, :]
        grads = []
        rewards = []
        score = 0
        while True:
            if show_result and (e % 1000 == 0):
                env.render()
            action, grad = policy_gradient(state, W)
            next_state, reward, done, info = env.step(action)
            next_state = next_state[None, :]
            grads.append(grad)
            rewards.append(reward)
            score += reward
            state = next_state
            if done:
                break
        for i in range(len(grads)):
            # discount by the step offset t
            W += alpha * grads[i] * \
                sum(r * gamma ** t for t, r in enumerate(rewards[i:]))
        episode_rewards.append(score)
        print("{}: {}".format(e, score), end="\r", flush=False)
    return episode_rewards

def eval_round(i, docs):
    print('Training %d' % i)
    theta = policy_gradient.policy_gradient(docs)
    print('Theta:', list(theta))
    print('Evaluating %d' % i)
    doc_pct, cmd_pct = evaluate("data/sendacard_mturk_corpus.tsv", theta,
                                "http://localhost:8000")
    print(i, "Doc Pct:", doc_pct, "Cmd Pct:", cmd_pct)
    return doc_pct, cmd_pct

def train(env, nb_episodes, alpha=0.000045, gamma=0.98, show_result=False):
    """Function to train an agent applying Monte Carlo policy gradient.

    - env is the initial environment
    - nb_episodes is the number of episodes used for training
    - alpha is the learning rate
    - gamma is the discount factor
    Returns: all values of the score (sum of all rewards during one
    episode loop).
    """
    n_obs = env.observation_space.shape[0]
    n_actions = env.action_space.n
    policy_weights = np.random.rand(n_obs, n_actions)
    scores = []
    for episode in range(nb_episodes):
        state = env.reset()[None, :]
        gradients = []
        rewards = []
        score = 0
        # Run an episode
        done = False
        while not done:
            if show_result and episode % 1000 == 0:
                env.render()
            action, gradient = policy_gradient(state, policy_weights)
            state, reward, done, _ = env.step(action)
            state = state[None, :]
            gradients.append(gradient)
            rewards.append(reward)
            score += reward
        scores.append(score)
        # Policy update
        num_steps = len(gradients)
        discount_factor = gamma ** np.arange(num_steps)
        for i in range(num_steps):
            rews_after_step = rewards[i:]
            discount_factors = discount_factor[:len(rews_after_step)]
            disc_reward = np.sum(rews_after_step * discount_factors)
            policy_weights += alpha * gradients[i] * disc_reward
        print("{}: {}".format(episode, score), end="\r", flush=False)
    return scores

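# A minimal usage sketch for the train() variants above, assuming Gym's
# CartPole-v0 and a policy_gradient(state, weight) helper in scope; the
# episode count and seed below are illustrative, not taken from these repos.
import gym
import numpy as np

if __name__ == "__main__":
    env = gym.make('CartPole-v0')
    np.random.seed(1)
    scores = train(env, nb_episodes=10000, alpha=0.000045, gamma=0.98)
    print("\nMean score over the last 100 episodes:",
          np.mean(scores[-100:]))
    env.close()
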
def play_episode(env, weight, i, show_result):
    """Plays a single episode i and records each transition."""
    state = env.reset()[None, :]
    state_action_reward_grad = []
    while True:
        if show_result and (i % 1000 == 0):
            env.render()
        action, grad = policy_gradient(state, weight)
        next_state, reward, done, _ = env.step(action)
        next_state = next_state[None, :]
        # Record the state the action was chosen in, not the resulting one
        state_action_reward_grad.append((state, action, reward, grad))
        state = next_state
        if done:
            break
    env.close()
    return state_action_reward_grad

def single_episode(env, weight, episode, show_result):
    """Plays one episode and returns its (state, action, reward, grad) steps."""
    state = env.reset()[None, :]
    return_grad = []
    while True:
        if show_result and (episode % 1000 == 0):
            env.render()
        action, grad = policy_gradient(state, weight)
        next_state, reward, done, _ = env.step(action)
        next_state = next_state[None, :]
        return_grad.append((state, action, reward, grad))
        state = next_state
        if done:
            break
    env.close()
    return return_grad

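# A sketch of how the (state, action, reward, grad) tuples returned by
# play_episode/single_episode could drive a REINFORCE update; the function
# name and signature are assumptions, but the discounting matches the
# train() variants in this file.
def update_weights(trajectory, weight, alpha=0.000045, gamma=0.98):
    """Apply one REINFORCE weight update per recorded step."""
    rewards = [reward for _, _, reward, _ in trajectory]
    for i, (_, _, _, grad) in enumerate(trajectory):
        # Discounted return from step i onward
        G = sum(r * gamma ** t for t, r in enumerate(rewards[i:]))
        weight += alpha * grad * G
    return weight
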
def train(env, nb_episodes, alpha=0.000045, gamma=0.98, show_result=False):
    """Train an agent on the cartpole game.

    Args:
        -> env: initial environment
        -> nb_episodes: number of episodes used for training
        -> alpha: the learning rate
        -> gamma: the discount factor
        -> show_result: render the environment every 1000 episodes computed
    Return: all values of the score (sum of all rewards during one
        episode loop)
    """
    w = np.random.rand(4, 2)
    all_scores = []
    for ep in range(nb_episodes):
        state = env.reset()[None, :]
        rewards = []
        score = 0
        grads = []
        while True:
            if show_result and (ep % 1000 == 0):
                env.render()
            action, grad = policy_gradient(state, w)
            new_st, reward, done, _ = env.step(action)
            new_st = new_st[None, :]
            grads.append(grad)
            rewards.append(reward)
            score += reward
            state = new_st
            if done:
                break
        for i in range(len(grads)):
            # Discounted return from step i; gamma is raised to the offset t
            discounted = sum(r * gamma ** t
                             for t, r in enumerate(rewards[i:]))
            w += alpha * grads[i] * discounted
        all_scores.append(score)
        print("Ep: {}, Score: {}".format(ep, score), end='\r', flush=False)
    return all_scores

def train(env, nb_episodes, alpha=0.000045, gamma=0.98, show_result=False):
    """Train the policy gradients.

    env: the initial environment (from OpenAI Gym)
    nb_episodes: number of episodes for training
    alpha: learning rate
    gamma: discount factor
    Returns: all values of the score (sum of rewards during each episode)
    """
    # initialize the list of returned scores
    scores = []
    # initialize random starting weights
    weights = np.random.rand(env.observation_space.shape[0],
                             env.action_space.n)
    # loop through episodes, performing steps
    for ep in range(nb_episodes):
        state = env.reset()[None, :]
        # initialize variables for the episode
        grads = []
        rewards = []
        actions = []
        done = False
        counter = 0
        # run the episode
        while not done:
            if show_result and ep % 1000 == 0:
                env.render()  # if using Colab, beware of import changes
            action, grad = pg.policy_gradient(state, weights)
            state, reward, done, info = env.step(action)
            state = state[None, :]
            grads.append(grad)
            rewards.append(reward)
            actions.append(action)
            counter += 1
        # when the episode has ended, compute the returns and new weights
        for i in range(len(grads)):
            # loop through everything that happened in the episode;
            # gamma is raised to the step offset t
            rew = sum(r * (gamma ** t) for t, r in enumerate(rewards[i:]))
            weights += alpha * grads[i] * rew
        scores.append(sum(rewards))
        print(ep, scores[ep], end="\r", flush=False)
    return scores

def train(env, nb_episodes, alpha=0.00045, gamma=0.98, show_result=False):
    """Train a policy with the Monte Carlo/REINFORCE algorithm."""
    scores = []
    weights = np.random.rand(env.observation_space.shape[0],
                             env.action_space.n)
    for episode in range(nb_episodes):
        state = env.reset()[None, :]
        grads = []
        rewards = []
        actions = []
        done = False
        while not done:
            if show_result and not episode % 1000:
                env.render()
            action, grad = policy_gradient(state, weights)
            state, reward, done, info = env.step(action)
            state = state[None, :]
            grads.append(grad)
            rewards.append(reward)
            actions.append(action)
        # Walk the episode backwards, accumulating the discounted return
        total_reward = 0
        for grad, reward, action in zip(grads[::-1], rewards[::-1],
                                        actions[::-1]):
            total_reward = reward + total_reward * gamma
            weights[:, action] += alpha * grad[:, action] * total_reward
        scores.append(sum(rewards))
        print(episode, sum(rewards))
    return scores

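# The reversed loop above accumulates G_t = r_t + gamma * G_{t+1}, which
# equals the forward sums used by the other train() variants; a small
# illustrative self-check (the helper name is an assumption):
def _check_returns(rewards, gamma=0.98):
    backward = []
    G = 0
    for r in reversed(rewards):
        G = r + G * gamma
        backward.append(G)
    backward.reverse()
    forward = [sum(r * gamma ** t for t, r in enumerate(rewards[i:]))
               for i in range(len(rewards))]
    assert np.allclose(backward, forward)
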
def train(env, nb_episodes, alpha=0.00045, gamma=0.98, show_result=False):
    """Train a policy based on Monte Carlo returns."""
    scores = []
    weights = np.random.rand(env.observation_space.shape[0],
                             env.action_space.n)
    for episode in range(nb_episodes):
        state = env.reset()[None, :]
        grads = []
        rewards = []
        actions = []
        done = False
        while not done:
            if show_result and not episode % 1000:
                env.render()
            action, grad = policy_gradient(state, weights)
            state, reward, done, info = env.step(action)
            state = state[None, :]
            grads.append(grad)
            rewards.append(reward)
            actions.append(action)
        # Accumulate the discounted return backwards and update the weights
        total_reward = 0
        for grad, reward, action in zip(grads[::-1], rewards[::-1],
                                        actions[::-1]):
            total_reward = reward + total_reward * gamma
            weights[:, action] += alpha * grad[:, action] * total_reward
        scores.append(sum(rewards))
    return scores

def train(env, nb_episodes, alpha=0.00045, gamma=0.98, show_result=False):
    """
    env: initial environment
    nb_episodes: number of episodes used for training
    alpha: the learning rate
    gamma: the discount factor
    show_result: if True, render the environment every 1000 episodes
        computed.
    """
    weights = np.random.rand(4, 2)
    scores = []
    for i in range(nb_episodes):
        state = env.reset()[None, :]
        grads = []
        rewards = []
        score = 0
        while True:
            if show_result and (i % 1000 == 0):
                env.render()
            action, grad = policy_gradient(state, weights)
            next_state, reward, done, _ = env.step(action)
            next_state = next_state[None, :]
            grads.append(grad)
            rewards.append(reward)
            score += reward
            state = next_state
            if done:
                break
        for j in range(len(grads)):
            # gamma is raised to the step offset t
            weights += alpha * grads[j] * \
                sum(r * gamma ** t for t, r in enumerate(rewards[j:]))
        scores.append(score)
        print("{}: {}".format(i, score), end="\r", flush=False)
    return scores

def train(env, nb_episodes, alpha=0.00045, gamma=0.98, show_result=False):
    """
    *********************************************
    ******Implementation of a full training******
    *********************************************
    @env: initial environment
    @nb_episodes: number of episodes used for training
    @alpha: the learning rate
    @gamma: the discount factor
    Return: all values of the score
        (sum of all rewards during one episode loop)
    """
    # Initiate scores list
    scores = []
    # Initiate θ to random
    # np.random.seed(0)
    # env.seed(0)
    W = np.random.rand(env.observation_space.shape[0],
                       env.action_space.n)
    for ep in range(nb_episodes):
        # **** Generating episode *****************************
        # Resetting the environment each time as per requirement
        state = env.reset()[None, :]
        # initiate needed variables
        done = False
        t = 0
        R = []
        Grads = []
        Actions = []
        while not done:
            # Rendering the environment every 1000 episodes
            if show_result and not ep % 1000:
                env.render()
            # Taking action and gradient
            action, grad = policy_gradient(state, W)
            # Getting the reward and outcome state
            new_state, reward, done, info = env.step(action)
            # Appending needed values
            Actions.append(action)
            R.append(reward)
            Grads.append(grad)
            # Incrementing state
            state = new_state[None, :]
            t += 1
        # Appending summed score
        scores.append(sum(R))
        print("Episode N°: " + str(ep) + " Score: " + str(sum(R)),
              end="\r", flush=False)
        # **** Updating θ ***************************************************
        T = t
        for t in range(T):
            action = Actions[t]
            # Gt = ∑k=0 to T-t-1 (γ^k * R[t+k]); R[t] is already the reward
            # that follows the action at step t, so the sum starts at k = 0
            G = sum(gamma ** k * R[t + k] for k in range(T - t))
            # θ ← θ + α * γ^t * Gt * ∇θlnπθ(At|St); from the Sutton and
            # Barto book:
            # W[:, action] += alpha * Grads[t][:, action] * gamma**t * G
            # θ ← θ + α * ∇θlogπθ(st, at) * vt; from the David Silver course
            W[:, action] += alpha * Grads[t][:, action] * G
    return scores

#!/usr/bin/env python3
"""
Main file
"""
import gym
import numpy as np
from policy_gradient import policy_gradient

env = gym.make('CartPole-v1')
np.random.seed(1)

weight = np.random.rand(4, 2)
state = env.reset()[None, :]
print(weight)
print(state)

action, grad = policy_gradient(state, weight)
print(action)
print(grad)

env.close()

import argparse
import sys

import numpy as np

from data import gen_docs
from policy_gradient import policy_gradient

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Train webtalk to generate a parameter vector")
    # argparse.FileType replaces the Python 2 `file` builtin
    parser.add_argument("corpus_file", type=argparse.FileType('r'),
                        help="A corpus file to train off of",
                        default=sys.stdin, nargs='?')
    parser.add_argument("--url", type=str,
                        help="An initial URL on which to start each "
                             "training document",
                        default="http://localhost:8000")
    parser.add_argument("--iters", type=int,
                        help="Number of iterations to train on all the docs",
                        default=50)
    args = parser.parse_args()

    docs = gen_docs.parse_docs_file(args.corpus_file)
    theta = policy_gradient(docs, args.url, ITERATIONS=args.iters)
    np.savetxt(sys.stdout, theta)
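
# Example invocation (illustrative; the corpus path mirrors the one used in
# eval_round above, and the script name is an assumption):
#   python train.py data/sendacard_mturk_corpus.tsv \
#       --url http://localhost:8000 --iters 50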