def __init__(self, env, lr, critic_lr, gamma, n, policy_path, critic_path, load=False):
    # Initializes A2C.
    # Args:
    # - env: The environment.
    # - lr: Learning rate for the actor model.
    # - critic_lr: Learning rate for the critic model.
    # - gamma: Discount factor.
    # - n: The value of N in N-step A2C.
    # - policy_path / critic_path: Save paths for the actor and critic models.
    # - load: Whether to load saved weights from the given paths.
    Reinforce.__init__(self, env, lr, gamma=gamma, save_path=policy_path, load=load)
    self.critic_path = critic_path
    s_len = self.env.observation_space_shape[0]
    self.critic = CriticNet(critic_lr, s_len=s_len)
    self.n = n
    if load:
        self.critic.load(self.critic_path)
    print("Hyperparameters:\nPolicy LR = {} Critic LR = {} Gamma = {} N = {} \nPolicy Path = {} \nCritic Path = {} \nLoad = {}"
          .format(lr, critic_lr, gamma, n, policy_path, critic_path, load))
    return
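# Context for the `n` argument above: a minimal, hypothetical sketch of how an
# N-step bootstrapped target is typically formed in N-step A2C. The helper name
# `n_step_targets` and its signature are illustrative only and are not part of
# the class above.
import numpy as np

def n_step_targets(rewards, values, gamma, n):
    """rewards[t] and values[t] ~ V(s_t) for one episode; V is taken as 0 past the terminal state."""
    T = len(rewards)
    targets = np.zeros(T)
    for t in range(T):
        end = min(t + n, T)
        # discounted sum of up to n rewards starting at step t
        targets[t] = sum(gamma ** (k - t) * rewards[k] for k in range(t, end))
        # bootstrap with the critic value unless the window runs past the episode end
        if t + n < T:
            targets[t] += gamma ** n * values[t + n]
    return targets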
def test_ret50(self):
    print("==================07================")
    ret50 = test_data["ret50"]
    ret50_expected = expected_data["ret50"]
    reinforce = Reinforce(ret50)
    pprint.pprint("7. ret50 result => " + str(reinforce.event_reco_result))
    self.assertEqual(reinforce.event_reco_result, ret50_expected)
def test_ret10(self):
    print("==================01================")
    ret10 = test_data["ret10"]
    ret10_expected = expected_data["ret10"]
    reinforce = Reinforce(ret10)
    pprint.pprint("1. ret10 result => " + str(reinforce.event_reco_result["event_info_data"]))
    self.assertEqual(reinforce.event_reco_result, ret10_expected)
def test_ret31(self):
    print("==================05================")
    ret31 = test_data["ret31"]
    ret31_expected = expected_data["ret31"]
    reinforce = Reinforce(ret31)
    pprint.pprint("5. ret31 result => " + str(reinforce.event_reco_result))
    self.assertEqual(reinforce.event_reco_result, ret31_expected)
def main():
    if len(sys.argv) != 2 or sys.argv[1] not in {"REINFORCE", "REINFORCE_BASELINE"}:
        print("USAGE: python assignment.py <Model Type>")
        print("<Model Type>: [REINFORCE/REINFORCE_BASELINE]")
        exit()

    env = gym.make("CartPole-v1")
    state_size = env.observation_space.shape[0]
    num_actions = env.action_space.n

    # Initialize model
    if sys.argv[1] == "REINFORCE":
        model = Reinforce(state_size, num_actions)
    elif sys.argv[1] == "REINFORCE_BASELINE":
        model = ReinforceWithBaseline(state_size, num_actions)

    # TODO:
    rewards = []
    # 1) Train your model for 650 episodes, passing in the environment and the agent.
    # 2) Append the total reward of the episode into a list keeping track of all of the rewards.
    for i in range(650):
        print(i + 1)
        rewards.append(train(env, model))

    # 3) After training, print the average of the last 50 rewards you've collected.
    print(tf.reduce_mean(rewards[600:]))

    # TODO: Visualize your rewards.
    visualize_data(rewards)
def main():
    if len(sys.argv) != 2 or sys.argv[1] not in {"REINFORCE", "REINFORCE_BASELINE"}:
        print("USAGE: python assignment.py <Model Type>")
        print("<Model Type>: [REINFORCE/REINFORCE_BASELINE]")
        exit()

    env = gym.make("CartPole-v1")  # environment
    state_size = env.observation_space.shape[0]
    num_actions = env.action_space.n

    # Initialize model
    if sys.argv[1] == "REINFORCE":
        model = Reinforce(state_size, num_actions)
    elif sys.argv[1] == "REINFORCE_BASELINE":
        model = ReinforceWithBaseline(state_size, num_actions)

    # TODO:
    # 1) Train your model for 650 episodes, passing in the environment and the agent.
    # 2) Append the total reward of the episode into a list keeping track of all of the rewards.
    # 3) After training, print the average of the last 50 rewards you've collected.
    epochs = 650
    total_rewards = []
    print('TRAIN STARTS:')
    for epoch in range(epochs):
        total_reward = train(env, model)
        total_rewards.append(total_reward)
        print('\r', 'training process: {0:.2f} %'.format(epoch / epochs * 100), end='')
    print('\nThe average of last 50 rewards: {}'.format(np.mean(total_rewards[-50:])))

    # TODO: Visualize your rewards.
    visualize_data(total_rewards)
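# The assignment mains above and below all delegate the per-episode work to a
# train(env, model) helper that is not shown here. As a rough sketch only: the
# REINFORCE update itself is standard, but names such as model.loss,
# model.optimizer, and the call convention are assumptions, not the actual
# assignment API.
import numpy as np
import tensorflow as tf

def train(env, model, gamma=0.99):
    """One REINFORCE episode: roll out, compute discounted returns, take one gradient step.
    Returns the episode's total (undiscounted) reward."""
    states, actions, rewards = [], [], []
    state = env.reset()
    done = False
    while not done:
        probs = model(np.expand_dims(state, axis=0)).numpy()[0]  # action distribution
        action = np.random.choice(len(probs), p=probs)
        next_state, reward, done, _ = env.step(action)
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        state = next_state

    # Discounted returns G_t = r_t + gamma * G_{t+1}, computed back to front.
    returns = []
    g = 0.0
    for r in reversed(rewards):
        g = r + gamma * g
        returns.append(g)
    returns.reverse()

    with tf.GradientTape() as tape:
        loss = model.loss(np.array(states), np.array(actions), np.array(returns))
    grads = tape.gradient(loss, model.trainable_variables)
    model.optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return sum(rewards)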
def main(): env = gym.make("forex-v0") # environment state_size = env.observation_space.shape[0] * env.observation_space.shape[1] num_actions = env.action_space.n # Initialize model if args.mode == "REINFORCE": model = Reinforce(state_size, num_actions) elif args.mode == "REINFORCE_BASELINE": model = ReinforceWithBaseline(state_size, num_actions) elif args.mode == "DQN": model = DQN(2) elif args.mode == "RANDOM": pass # TODO: # 1) Train your model for 650 episodes, passing in the environment and the agent. # 2) Append the total reward of the episode into a list keeping track of all of the rewards. # 3) After training, print the average of the last 50 rewards you've collected. rewards = [] profits = [] if args.mode == "REINFORCE" or args.mode == "REINFORCE_BASELINE": try: with tf.device('/device:' + args.device): for i in range(650): print(i) reward, profit = train(env, model) rewards.append(reward) profits.append(profit) except RuntimeError as e: print(e) elif args.mode == "DQN": try: with tf.device('/device:' + args.device): print(args.device) for i in range(1000): print(i) reward, profit = train_dqn(env, model, i) rewards.append(reward) profits.append(profit) except RuntimeError as e: print(e) else: for i in range(1000): print(i) reward, profit = random_call(env) rewards.append(reward) profits.append(profit) print("Average of last 50 rewards:", tf.reduce_mean(rewards[-50:])) # Prints average of final 50 rewards # TODO: Visualize your rewards. visualize_data_rewards(rewards, args.mode) visualize_data_profits(profits, args.mode)
def fig_13_1():
    env = Corridor()
    alg = Reinforce(env, None, FIG_13_1_G, FIG_13_1_THE_DIM, pi_gen_corr,
                    logpi_wrap_corr(env, feat_corr), the_0=None)
    benchmark(alg, 'Figure 13.1', 'fig13.1')
class ReinforceRunner(Runner):
    def __init__(self, env_name, algo_params, runner_params):
        super(ReinforceRunner, self).__init__(env_name, 'Reinforce', algo_params, runner_params)

    def _before_sim_loop(self):
        n_state = self._env.observation_space.shape[0]
        n_action = self._env.action_space.n
        self._algo = Reinforce(n_state, n_action, self._algo_params)
        self._score = 0.0
        self._score_sum = 0.0

    def _episode_sim(self, n_epi):
        s = self._env.reset()
        done = False
        self._score = 0.0
        n_step = 0
        while not done:
            prob = self._algo(torch.from_numpy(s).float())
            m = Categorical(prob)
            a = m.sample()
            s_prime, r, done, info = self._step_wrapper(self._env.step(a.item()))
            if self._train:
                self._algo.put_data((r / self._reward_scale, prob[a]))
            if self._save_step_log:
                self._write_step_log(n_step, n_epi, s, a.item(), r, done)
            s = s_prime
            self._score += r
            n_step += 1
        self._score_sum += self._score

    def _after_sim(self, n_epi, print_log, cond_check):
        super()._after_sim(n_epi, print_log, cond_check)
        if not self._done and self._train:
            self._algo.train_net()
def main():
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    num_actions = env.action_space.n
    model = Reinforce(state_size, num_actions)

    score = 0.0
    best_score_so_far = 0.0
    for epoch in range(N_epochs):
        with tf.GradientTape() as tape:
            # sample trajectory
            states, actions, rewards, action_probs = generate_trajectory(env, model)
            discounted_rewards = model.compute_discount(rewards)
            loss = model.loss(states, actions, discounted_rewards, action_probs)
        gradients = tape.gradient(loss, model.trainable_variables)
        model.optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        score += np.sum(rewards)
        if epoch % 20 == 0:
            print("Epoch [%d/%d] : Reward %d" % (epoch, N_epochs, score / 20))
            score = 0

    model.save('reincforce_model')
    print("model saved")
def train(mnist):
    global args
    sess = tf.Session()
    global_step = tf.Variable(0, trainable=False)
    starter_learning_rate = 0.1
    learning_rate = tf.train.exponential_decay(0.99, global_step, 500, 0.96, staircase=True)
    optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)

    # args.max_layers is set in parse_args; policy_network is a function
    reinforce = Reinforce(sess, optimizer, policy_network, args.max_layers, global_step)
    # defines the data and how to handle it
    net_manager = NetManager(num_input=3072,
                             num_classes=10,
                             learning_rate=0.001,
                             mnist=mnist,
                             bathc_size=100,  # ('bathc_size' sic)
                             max_step_per_action=1000)

    MAX_EPISODES = 500
    step = 0
    state = np.array([[10.0, 128.0, 1.0, 1.0] * args.max_layers], dtype=np.float32)
    pre_acc = 0.0
    total_rewards = 0
    for i_episode in range(MAX_EPISODES):
        action = reinforce.get_action(state)
        print("ca:", action)
        # the generator below also loops over the sampled action values
        if all(ai > 0 for ai in action[0][0]):
            reward, pre_acc = net_manager.get_reward(action, step, pre_acc)
            print("=====>", reward, pre_acc)
        else:
            reward = -1.0
        total_rewards += reward

        # In our sample, the action is equal to the state
        state = action[0]
        reinforce.storeRollout(state, reward)

        step += 1
        ls = reinforce.train_step(1)
        log_str = ("current time: " + str(datetime.datetime.now().time()) +
                   " episode: " + str(i_episode) + " loss: " + str(ls) +
                   " last_state: " + str(state) + " last_reward: " + str(reward) + "\n")
        log = open("lg3.txt", "a+")
        log.write(log_str)
        log.close()
        print(log_str)
def fig_13_2():
    env = Corridor()

    def vhat(s, w):
        return w[0]

    def nab_vhat(s, w):
        return np.ones(1)

    fig, ax = plt.subplots()
    fig.suptitle('Figure 13.2', fontsize=BIG_FONT)
    fig.set_size_inches(20, 14)
    xticks, yticks = np.linspace(0, 1000, 6), np.linspace(-90, -10, 9)

    def short_str(x):
        return str(x)[:3]

    xnames, ynames = map(short_str, xticks), map(short_str, yticks)
    alg1 = Reinforce(env, None, FIG_13_2_G, FIG_13_2_THE_DIM, pi_gen_corr,
                     logpi_wrap_corr(env, feat_corr))
    alg2 = ReinforceBaseline(env, FIG_13_2_ALP_BAS_W, FIG_13_2_ALP_BAS_T,
                             FIG_13_2_G, FIG_13_2_THE_DIM, pi_gen_corr,
                             logpi_wrap_corr(env, feat_corr), vhat, nab_vhat,
                             FIG_13_2_W_DIM)
    run(ax, alg2, [FIG_13_2_ALP_BAS_T], FIG_13_2_N_EP, FIG_13_2_N_RUNS, dash=False)
    run(ax, alg1, [FIG_13_2_ALP], FIG_13_2_N_EP, FIG_13_2_N_RUNS)
    plot_figure(ax, '', xticks, xnames, 'Episode', list(yticks) + [0], ynames,
                (f'Total\nReward\non episode\n(Averaged over\n' +
                 f'{FIG_13_1_N_RUNS} runs)'),
                font=MED_FONT, labelpad=40, loc='upper right')
    save_plot('fig13.2', dpi=100)
    plt.show()
def main(flags):
    '''
    Runs an agent in an environment.

    params:
        flags (dict): configuration
    '''
    env = gym.make('CartPole-v0')
    agent = Reinforce(env,
                      gamma=flags.gamma,
                      learning_rate=flags.learning_rate,
                      num_units=flags.num_units,
                      num_layers=flags.num_layers,
                      update_frequency=flags.update_frequency)
    trainer = Trainer(env, agent, flags)
    rewards, lengths = trainer.train(flags.num_episodes, flags.max_steps)
    plot_results(rewards, lengths)
def train(mnist):
    global args
    sess = tf.Session()
    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.exponential_decay(0.99, global_step, 500, 0.96, staircase=True)
    optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)

    reinforce = Reinforce(sess, optimizer, policy_network, args.max_layers, global_step)
    net_manager = NetManager(num_input=784,
                             num_classes=10,
                             learning_rate=0.001,
                             mnist=mnist,
                             batch_size=100)

    MAX_EPISODES = 2500
    step = 0
    state = np.array([[10.0, 128.0, 1.0, 1.0] * args.max_layers], dtype=np.float32)
    pre_acc = 0.0
    total_rewards = 0
    for i_episode in range(MAX_EPISODES):
        action = reinforce.get_action(state)
        print("action:", action)
        if all(ai > 0 for ai in action[0][0]):
            reward, pre_acc = net_manager.get_reward(action, step, pre_acc)
            print('====>', reward, pre_acc)
        else:
            reward = -1.0
        total_rewards += reward

        # In our sample, the action is equal to the state
        state = action[0]
        reinforce.store_rollout(state, reward)

        step += 1
        ls = reinforce.train_step(1)
        log_str = ('current time: ' + str(datetime.datetime.now().time()) +
                   ' episode: ' + str(i_episode) + ' loss: ' + str(ls) +
                   ' last_state: ' + str(state) + ' last_reward: ' + str(reward) + '\n')
        log = open('log.txt', 'a+')
        log.write(log_str)
        log.close()
        print(log_str)
def main():
    if len(sys.argv) != 2 or sys.argv[1] not in {"REINFORCE", "REINFORCE_BASELINE"}:
        print("USAGE: python assignment.py <Model Type>")
        print("<Model Type>: [REINFORCE/REINFORCE_BASELINE]")
        exit()

    env = gym.make("CartPole-v1")  # environment
    state_size = env.observation_space.shape[0]
    num_actions = env.action_space.n

    # Initialize model
    if sys.argv[1] == "REINFORCE":
        model = Reinforce(state_size, num_actions)
    elif sys.argv[1] == "REINFORCE_BASELINE":
        model = ReinforceWithBaseline(state_size, num_actions)

    # TODO:
    # 1) Train your model for 650 episodes, passing in the environment and the agent.
    all_rewards = []
    for i in range(650):
        all_rewards.append(train(env, model))
    print("Reward of past 50:", np.mean(all_rewards[-50:]))
if do_PPO:
    logger = Logger(
        "logs/quadrotor_12d_PPO_%dx%d_std%f_lr%f_kl%f_%d_%d_dyn_%f_%f_%f_%f_seed_%d.pkl" %
        (num_layers, num_hidden_units, noise_std, learning_rate, desired_kl,
         num_rollouts, num_steps_per_rollout, mass_scaling, Ix_scaling,
         Iy_scaling, Iz_scaling, seed))
    solver = PPO(num_iters, learning_rate, desired_kl, discount_factor,
                 num_rollouts, num_steps_per_rollout, dyn,
                 initial_state_sampler, fb, logger)

if do_Reinforce:
    logger = Logger(
        "logs/quadrotor_12d_Reinforce_%dx%d_std%f_lr%f_kl%f_%d_%d_fromzero_%s_dyn_%f_%f_%f_%f_seed_%d_norm_%d_smallweights_tanh.pkl" %
        (num_layers, num_hidden_units, noise_std, learning_rate, desired_kl,
         num_rollouts, num_steps_per_rollout, str(from_zero), mass_scaling,
         Ix_scaling, Iy_scaling, Iz_scaling, seed, norm))
    solver = Reinforce(num_iters, learning_rate, desired_kl, discount_factor,
                       num_rollouts, num_steps_per_rollout, dyn,
                       initial_state_sampler, fb, logger, norm, scale_rewards,
                       state_constraint)

# Set number of threads.
torch.set_num_threads(1)

# Run this guy.
solver.run(plot=False, show_diff=False)

# Dump the log.
logger.dump()
def train(dataset, learning_rate=0.001, batch_size=100, num_input=784, num_classes=10,
          train_size=100000, test_size=10000):
    global args
    sess = tf.Session()
    global_step = tf.Variable(0, trainable=False)
    starter_learning_rate = 0.1
    learning_rate_op = tf.train.exponential_decay(0.99, global_step, 500, 0.96, staircase=True)
    optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate_op)

    reinforce = Reinforce(sess, optimizer, policy_network, args.max_layers, global_step)
    net_manager = NetManager(num_input=num_input,
                             num_classes=num_classes,
                             learning_rate=learning_rate,
                             dataset=dataset,
                             bathc_size=batch_size,  # ('bathc_size' sic)
                             train_size=train_size,
                             test_size=test_size)

    MAX_EPISODES = 2500
    step = 0
    state = np.array([[10.0, 128.0, 1.0, 1.0] * args.max_layers], dtype=np.float32)
    pre_acc = 0.0
    total_rewards = 0
    max_acc = 0.0
    max_action = None
    for i_episode in range(MAX_EPISODES):
        action = reinforce.get_action(state)
        print("ca:", action)
        if all(ai > 0 for ai in action[0][0]):
            reward, pre_acc = net_manager.get_reward(action, step, pre_acc)
            print("=====>", reward, pre_acc)
        else:
            reward = -1.0
        total_rewards += reward

        # In our sample, the action is equal to the state
        state = action[0]
        reinforce.storeRollout(state, reward)

        step += 1
        ls = reinforce.train_step(1)
        log_str = ("current time: " + str(datetime.datetime.now().time()) +
                   " episode: " + str(i_episode) + " loss: " + str(ls) +
                   " last_state: " + str(state) + " last_reward: " + str(reward) + "\n")
        log_max_str = ("current time: " + str(datetime.datetime.now().time()) +
                       " episode: " + str(i_episode) + " max accuracy: " + str(max_acc) +
                       " action: " + str(max_action) + "\n")
        log = open("lg3.txt", "a+")
        log.write(log_str)
        log.write(log_max_str)
        log.close()
        print(log_str)
        print(log_max_str)
import torch
#w1 = fb._w1(x)
#print("w1: ", w1)

#w2 = fb._w2(x)
#print("w2: ", w2)

#mu = fb.feedback(x, v)
#nu = fb.noisy_feedback(x, v)
#print("mean u = ", mu)
#print("noisy u = ", nu)

#lp = fb.log_prob(u, x, v)
#print("log prob of [1, 1] is ", lp)

# Generate v, y desired and check that we can match with no model mismatch.
from reinforce import Reinforce

r = Reinforce(1, 1, 1, 1, 100, dyn, None, fb)

current_x = np.array([[0.1], [0.0], [0.1], [0.0]])
vs = r._generate_v()
y_desireds = r._generate_y(current_x, vs)

ys = []
print("current x: ", current_x)
for v, y_desired in zip(vs, y_desireds):
    u = dyn._f_q(current_x) + dyn._M_q(current_x) @ v
    current_x = dyn.integrate(current_x, u, dt=0.001)
    ys.append(dyn.observation(current_x))

print("ys: ", ys)
print("y_desireds:", y_desireds)
def train(mnist):
    # use the global variable args
    global args
    # create a session to run the graph
    sess = tf.Session()
    # global step counter
    global_step = tf.Variable(0, trainable=False)
    # initial learning rate
    # TODO: unused variable; probably meant to be the first argument of exponential_decay below
    starter_learning_rate = 0.1
    # exponential learning-rate decay: start_lr=0.99, decay_step=500, decay_rate=0.96,
    # staircase=True => decay at discrete intervals
    learning_rate = tf.train.exponential_decay(0.99, global_step, 500, 0.96, staircase=True)
    # RMSPropOptimizer using the decayed learning rate
    optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)

    # build the REINFORCE controller
    reinforce = Reinforce(sess, optimizer, policy_network, args.max_layers, global_step)
    # network manager for training subnetworks with reinforcement learning
    net_manager = NetManager(num_input=784,        # input dim, 28x28 MNIST
                             num_classes=10,       # number of MNIST classes
                             learning_rate=0.001,  # initial learning rate
                             mnist=mnist,          # MNIST dataset (TensorFlow dataset object)
                             bathc_size=100)       # mini-batch size ('bathc_size' sic)

    # maximum number of training episodes
    MAX_EPISODES = 2500
    # step counter starts from 0
    step = 0
    # initial state: [cnn_filter_size, cnn_filter_num, maxpool_ksize, dropout_rate] * max_layers
    state = np.array([[10.0, 128.0, 1.0, 1.0] * args.max_layers], dtype=np.float32)
    # initialize previous accuracy and total rewards
    pre_acc = 0.0
    total_rewards = 0
    # run episodes
    for i_episode in range(MAX_EPISODES):
        # get the next action from the REINFORCE controller
        action = reinforce.get_action(state)
        # print the action
        print("ca:", action)
        # if all action values are greater than 0 (valid), evaluate the sampled network for a reward
        if all(ai > 0 for ai in action[0][0]):
            reward, pre_acc = net_manager.get_reward(action, step, pre_acc)
            print("=====>", reward, pre_acc)
        # otherwise penalize with reward -1
        else:
            reward = -1.0
        # accumulate rewards
        total_rewards += reward

        # In our sample, the action is equal to the state
        state = action[0]
        # store the rollout; see the Reinforce class
        reinforce.storeRollout(state, reward)

        # advance the step counter
        step += 1
        # one controller training step
        ls = reinforce.train_step(1)
        # logging
        log_str = ("current time: " + str(datetime.datetime.now().time()) +
                   " episode: " + str(i_episode) + " loss: " + str(ls) +
                   " last_state: " + str(state) + " last_reward: " + str(reward) + "\n")
        log = open("lg3.txt", "a+")
        log.write(log_str)
        log.close()
        print(log_str)
                          estimator=args.algorithm)
else:
    # policy based methods
    # Initialize the policy
    if env.actionType == 'discrete':
        policy = policies.DiscretePolicy(env, args.hiddenLayers)
    elif env.actionType == 'continuous':
        policy = policies.GaussianPolicy(env, args.hiddenLayers, args.explorationNoise)
    else:
        raise Exception("Unreachable.")

    # Select a training algorithm.
    if args.algorithm == 'Reinforce' or args.algorithm == 'PG':
        from reinforce import Reinforce
        algo = Reinforce(policy, gamma=args.gamma, learnrate=args.learningRate)
    elif args.algorithm == 'CMAES':
        from cmaes_korali import CMAESKorali
        algo = CMAESKorali(policy, populationSize=args.populationSize, sigma=args.explorationNoise)
    elif args.algorithm == 'ES':
        from evolution_strategies import EvolutionStrategies
        algo = EvolutionStrategies(policy, populationSize=args.populationSize,
                                   sigma=args.explorationNoise, learnRate=args.learningRate)

    # Train / update / improve the policy using the selected algorithm.
    algo.trainPolicy(policy, env, args.numIterations)
help="Random seed for the environment.") parser.add_argument('--num_episodes', type=int, default=1, help="Number of test episodes.") parser.add_argument('--stochastic', action='store_true', help="Use stochastic policy in testing.") parser.add_argument('--record', action='store_true', help="Record videos of test episodes.") parser.add_argument('--video_dir', help="Directory to store recorded videos.") args = parser.parse_args() env = gym.make('LunarLander-v2') env.seed(args.seed) if args.record: env = gym.wrappers.Monitor(env, args.video_dir, force=True) if args.agent_type == 'reinforce': agent = Reinforce(env, 0) elif args.agent_type == 'a2c': agent = A2C(env, 0, args.n) else: print('Unknown agent type %s' % args.agent_type) exit(1) agent.model.load_state_dict( torch.load(args.model_path, map_location=lambda storage, loc: storage)) stochastic = True if args.stochastic else False r_avg, r_std = agent.eval(args.num_episodes, stochastic=stochastic) print('Reward average %.6f std %.6f' % (r_avg, r_std))
import torch
import numpy as np

from feedback_linearization import FeedbackLinearization
from reinforce import Reinforce
from quadrotor_14d import Quadrotor14D

dyn = Quadrotor14D(1.0, 1.0, 1.0, 1.0, 0.01)
fb = FeedbackLinearization(dyn, 2, 32, torch.nn.Tanh(), 0.1)
solver = Reinforce(1, 1, 1, 1, 1, 10000, dyn, None, fb, None, 1, 1, None)

x0 = np.zeros((14, 1))
#x0[0, 0] = x0[1, 0] = 0.1
#x0[2, 0] = -0.1
x0[9, 0] = 9.81

ref, K = solver._generate_reference()
ys, xs = solver._generate_ys(x0, ref, K)

import matplotlib.pyplot as plt

plt.figure()
plt.plot([x[0, 0] for x in xs], label="x")
plt.plot([x[1, 0] for x in xs], label="y")
plt.plot([x[3, 0] for x in xs], label="yaw")
plt.plot([x[4, 0] for x in xs], label="pitch")
plt.plot([x[5, 0] for x in xs], label="roll")
plt.plot([r[0, 0] for r in ref], label="x_ref")
plt.plot([r[4, 0] for r in ref], label="y_ref")
plt.plot([r[12, 0] for r in ref], label="yaw_ref")
plt.legend()
def train():
    global args
    sess = tf.Session()
    global_step = tf.Variable(0, trainable=False)
    starter_learning_rate = 0.1
    num_of_hyperparameters = 1
    learning_rate = tf.train.exponential_decay(0.99, global_step, 500, 0.96, staircase=True)
    optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)

    reinforce = Reinforce(sess, optimizer, policy_network, args.max_layers,
                          global_step, num_of_hyperparameters)
    workflow_manager = WorkflowManager(num_of_hyperparameters,
                                       ser_url=None,
                                       usr_name=None,
                                       password=None)

    MAX_EPISODES = 2500
    step = 0
    state = np.array([[10.0] * num_of_hyperparameters * args.max_layers], dtype=np.float32)
    pre_acc = 0.0
    total_rewards = 0
    min_action = 0
    max_action = 30
    for i_episode in range(MAX_EPISODES):
        action = reinforce.get_action(state)
        print("ca:", action)
        if all(ai > min_action for ai in action[0][0]) and all(ai < max_action for ai in action[0][0]):
            reward, pre_acc = workflow_manager.get_reward(action, step, pre_acc)
            print("=====>", reward, pre_acc)
        else:
            reward = -1.0
        total_rewards += reward

        # In our sample, the action is equal to the state
        print('action', action)
        state = action[0]
        reinforce.storeRollout(state, reward)
        print('state', state)

        step += 1
        ls = reinforce.train_step(1)
        log_str = ("current time: " + str(datetime.datetime.now().time()) +
                   " episode: " + str(i_episode) + " loss: " + str(ls) +
                   " last_state: " + str(state) + " last_reward: " + str(reward) + "\n")
        log = open("lg3.txt", "a+")
        log.write(log_str)
        log.close()
        print(log_str)