Example #1
 def __init__(self,
              env,
              lr,
              critic_lr,
              gamma,
              n,
              policy_path,
              critic_path,
              load=False):
     # Initializes A2C.
     # Args:
     # - env: The environment to train on.
     # - lr: Learning rate for the actor (policy) model.
     # - critic_lr: Learning rate for the critic model.
     # - gamma: Discount factor.
     # - n: The value of N in N-step A2C.
     # - policy_path: Path used to save/load the actor model.
     # - critic_path: Path used to save/load the critic model.
     # - load: Whether to load previously saved weights.
     Reinforce.__init__(self,
                        env,
                        lr,
                        gamma=gamma,
                        save_path=policy_path,
                        load=load)
     self.critic_path = critic_path
     s_len = self.env.observation_space.shape[0]
     self.critic = CriticNet(critic_lr, s_len=s_len)
     self.n = n
     if load:
         self.critic.load(self.critic_path)
     print(
         "Hyperparameters:\nPolicy LR = {} Critic LR = {} Gamma = {} N = {} \nPolicy Path = {} \nCritic Path = {} \nLoad = {}"
         .format(lr, critic_lr, gamma, n, policy_path, critic_path, load))
     return
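CriticNet is not defined in this snippet. A minimal sketch of what such a state-value critic could look like, assuming a small Keras MLP and the load/save interface used above; the architecture and method names are assumptions, not the original implementation:

import tensorflow as tf

class CriticNet:
    # Hypothetical state-value critic matching the calls above; sizes are assumptions.
    def __init__(self, critic_lr, s_len):
        self.model = tf.keras.Sequential([
            tf.keras.layers.Dense(64, activation='relu', input_shape=(s_len,)),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(1)  # V(s)
        ])
        self.model.compile(optimizer=tf.keras.optimizers.Adam(critic_lr), loss='mse')

    def load(self, path):
        self.model.load_weights(path)

    def save(self, path):
        self.model.save_weights(path)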
	def test_ret50(self):
		print("==================07================")
		ret50 = test_data["ret50"]
		ret50_expected = expected_data["ret50"]		
		reinforce = Reinforce(ret50)			
		pprint.pprint("7. ret50 result => "+str(reinforce.event_reco_result))
		self.assertEqual(reinforce.event_reco_result, ret50_expected)									
	def test_ret10(self):		
		print("==================01================")
		ret10 = test_data["ret10"]
		ret10_expected = expected_data["ret10"]
		reinforce = Reinforce(ret10)
		pprint.pprint("1.ret10 result => "+str(reinforce.event_reco_result["event_info_data"]))		
		self.assertEqual(reinforce.event_reco_result, ret10_expected)	
	def test_ret31(self):
		print("==================05================")
		ret31 = test_data["ret31"]
		ret31_expected = expected_data["ret31"]		
		reinforce = Reinforce(ret31)			
		pprint.pprint("5. ret31 result => "+str(reinforce.event_reco_result))
		self.assertEqual(reinforce.event_reco_result, ret31_expected)					
Example #5
def main():
    if len(sys.argv) != 2 or sys.argv[1] not in {
            "REINFORCE", "REINFORCE_BASELINE"
    }:
        print("USAGE: python assignment.py <Model Type>")
        print("<Model Type>: [REINFORCE/REINFORCE_BASELINE]")
        exit()

    env = gym.make("CartPole-v1")
    state_size = env.observation_space.shape[0]
    num_actions = env.action_space.n

    # Initialize model
    if sys.argv[1] == "REINFORCE":
        model = Reinforce(state_size, num_actions)
    elif sys.argv[1] == "REINFORCE_BASELINE":
        model = ReinforceWithBaseline(state_size, num_actions)

    # TODO:
    rewards = []
    # 1) Train your model for 650 episodes, passing in the environment and the agent.
    # 2) Append the total reward of the episode into a list keeping track of all of the rewards.
    for i in range(650):
        print(i + 1)
        rewards.append(train(env, model))
    # 3) After training, print the average of the last 50 rewards you've collected.
    print(tf.reduce_mean(rewards[600:]))
    # TODO: Visualize your rewards.
    visualize_data(rewards)
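The train(env, model) helper called in the loop above is not shown. A minimal sketch of one possible single-episode training step, assuming the model exposes call(states), loss(states, actions, discounted_rewards), and an optimizer attribute (all assumed names, not the assignment's exact API):

import numpy as np
import tensorflow as tf

def train(env, model):
    # Hypothetical single-episode REINFORCE step; the model interface is an assumption.
    states, actions, rewards = [], [], []
    state = env.reset()
    done = False
    while not done:
        probs = np.squeeze(model.call(np.expand_dims(state, axis=0)).numpy())
        action = np.random.choice(len(probs), p=probs)
        next_state, reward, done, _ = env.step(action)
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        state = next_state

    # Discounted returns; gamma = 0.99 is an assumption.
    discounted = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + 0.99 * running
        discounted[t] = running

    with tf.GradientTape() as tape:
        loss = model.loss(np.asarray(states, dtype=np.float32),
                          np.asarray(actions), discounted)
    grads = tape.gradient(loss, model.trainable_variables)
    model.optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return sum(rewards)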
Example #6
def main():
    if len(sys.argv) != 2 or sys.argv[1] not in {"REINFORCE", "REINFORCE_BASELINE"}:
        print("USAGE: python assignment.py <Model Type>")
        print("<Model Type>: [REINFORCE/REINFORCE_BASELINE]")
        exit()

    env = gym.make("CartPole-v1")  # environment
    state_size = env.observation_space.shape[0]
    num_actions = env.action_space.n

    # Initialize model
    if sys.argv[1] == "REINFORCE":
        model = Reinforce(state_size, num_actions) 
    elif sys.argv[1] == "REINFORCE_BASELINE":
        model = ReinforceWithBaseline(state_size, num_actions)

    # TODO: 
    # 1) Train your model for 650 episodes, passing in the environment and the agent.
    # 2) Append the total reward of the episode into a list keeping track of all of the rewards.
    # 3) After training, print the average of the last 50 rewards you've collected.
    epochs = 650
    total_rewards = []
    print('TRAIN STARTS:')
    for epoch in range(epochs):
        total_reward = train(env, model)
        total_rewards.append(total_reward)
        print('\r', 'training process: {0:.2f} %'.format(epoch / epochs * 100), end='')
    print('\nThe average of last 50 rewards: {}'.format(np.mean(total_rewards[-50:])))

    # TODO: Visualize your rewards.
    visualize_data(total_rewards)
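visualize_data is imported elsewhere in the assignment; a minimal matplotlib sketch of what such a helper might do (a plain reward-per-episode plot; purely an assumption):

import matplotlib.pyplot as plt

def visualize_data(total_rewards):
    # Hypothetical plotting helper: total reward per training episode.
    plt.plot(range(len(total_rewards)), total_rewards)
    plt.xlabel('Episode')
    plt.ylabel('Total reward')
    plt.title('Reward per episode')
    plt.show()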
def main():

    env = gym.make("forex-v0")  # environment
    state_size = env.observation_space.shape[0] * env.observation_space.shape[1]
    num_actions = env.action_space.n

    # Initialize model
    if args.mode == "REINFORCE":
        model = Reinforce(state_size, num_actions)
    elif args.mode == "REINFORCE_BASELINE":
        model = ReinforceWithBaseline(state_size, num_actions)
    elif args.mode == "DQN":
        model = DQN(2)
    elif args.mode == "RANDOM":
        pass

    # TODO:
    # 1) Train your model for 650 episodes, passing in the environment and the agent.
    # 2) Append the total reward of the episode into a list keeping track of all of the rewards.
    # 3) After training, print the average of the last 50 rewards you've collected.
    rewards = []
    profits = []
    if args.mode == "REINFORCE" or args.mode == "REINFORCE_BASELINE":
        try:
            with tf.device('/device:' + args.device):
                for i in range(650):
                    print(i)
                    reward, profit = train(env, model)
                    rewards.append(reward)
                    profits.append(profit)
        except RuntimeError as e:
            print(e)
    elif args.mode == "DQN":
        try:
            with tf.device('/device:' + args.device):
                print(args.device)
                for i in range(1000):
                    print(i)
                    reward, profit = train_dqn(env, model, i)
                    rewards.append(reward)
                    profits.append(profit)
        except RuntimeError as e:
            print(e)
    else:
        for i in range(1000):
            print(i)
            reward, profit = random_call(env)
            rewards.append(reward)
            profits.append(profit)
    print("Average of last 50 rewards:",
          tf.reduce_mean(rewards[-50:]))  # Prints average of final 50 rewards
    # TODO: Visualize your rewards.
    visualize_data_rewards(rewards, args.mode)
    visualize_data_profits(profits, args.mode)
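random_call(env), used for the RANDOM baseline above, is not shown. A minimal sketch assuming the forex environment's info dict exposes a 'total_profit' entry (as gym-anytrading's forex-v0 does); this is an assumption about the helper, not its original definition:

def random_call(env):
    # Hypothetical random-policy baseline: sample random actions until the episode ends.
    state = env.reset()
    done = False
    total_reward, profit = 0.0, 0.0
    while not done:
        action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        total_reward += reward
        profit = info.get('total_profit', profit)
    return total_reward, profit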
Example #8
def fig_13_1():
    env = Corridor()

    alg = Reinforce(env,
                    None,
                    FIG_13_1_G,
                    FIG_13_1_THE_DIM,
                    pi_gen_corr,
                    logpi_wrap_corr(env, feat_corr),
                    the_0=None)
    benchmark(alg, 'Figure 13.1', 'fig13.1')
class ReinforceRunner(Runner):
    def __init__(self, env_name, algo_params, runner_params):
        super(ReinforceRunner, self).__init__(env_name, 'Reinforce',
                                              algo_params, runner_params)

    def _before_sim_loop(self):
        n_state = self._env.observation_space.shape[0]
        n_action = self._env.action_space.n
        self._algo = Reinforce(n_state, n_action, self._algo_params)
        self._score = 0.0
        self._score_sum = 0.0

    def _episode_sim(self, n_epi):
        s = self._env.reset()
        done = False
        self._score = 0.0
        n_step = 0

        while not done:
            prob = self._algo(torch.from_numpy(s).float())
            m = Categorical(prob)
            a = m.sample()
            s_prime, r, done, info = self._step_wrapper(
                self._env.step(a.item()))

            if self._train:
                self._algo.put_data((r / self._reward_scale, prob[a]))
            if self._save_step_log:
                self._write_step_log(n_step, n_epi, s, a.item(), r, done)

            s = s_prime
            self._score += r
            n_step += 1

        self._score_sum += self._score

    def _after_sim(self, n_epi, print_log, cond_check):
        super()._after_sim(n_epi, print_log, cond_check)

        if not self._done and self._train:
            self._algo.train_net()
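The Reinforce algorithm object this runner instantiates is not shown. A minimal PyTorch sketch in the style the runner expects (a callable policy returning action probabilities, plus put_data and train_net); the network size and the 'gamma'/'lr' keys in algo_params are assumptions:

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Reinforce(nn.Module):
    def __init__(self, n_state, n_action, algo_params):
        super().__init__()
        self.gamma = algo_params.get('gamma', 0.98)  # assumed key
        self.data = []
        self.fc1 = nn.Linear(n_state, 128)
        self.fc2 = nn.Linear(128, n_action)
        self.optimizer = optim.Adam(self.parameters(), lr=algo_params.get('lr', 5e-4))  # assumed key

    def forward(self, x):
        # Action probabilities for a single state vector.
        return F.softmax(self.fc2(F.relu(self.fc1(x))), dim=0)

    def put_data(self, item):
        # item is a (reward, prob_of_taken_action) pair, as used in _episode_sim.
        self.data.append(item)

    def train_net(self):
        # REINFORCE update: accumulate -log pi(a|s) * G_t backwards over the episode.
        R = 0.0
        self.optimizer.zero_grad()
        for r, prob in self.data[::-1]:
            R = r + self.gamma * R
            loss = -torch.log(prob) * R
            loss.backward()
        self.optimizer.step()
        self.data = []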
Example #10
def main():
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    num_actions = env.action_space.n

    model = Reinforce(state_size, num_actions)

    score = 0.0
    best_score_so_far = 0.0
    for epoch in range(N_epochs):
        with tf.GradientTape() as tape:
            # sample trajectory
            states, actions, rewards, action_probs = generate_trajectory(
                env, model)
            discounted_rewards = model.compute_discount(rewards)
            loss = model.loss(states, actions, discounted_rewards,
                              action_probs)
        gradients = tape.gradient(loss, model.trainable_variables)
        model.optimizer.apply_gradients(
            zip(gradients, model.trainable_variables))
        score += np.sum(rewards)

        if epoch % 20 == 0:
            print("Epoch [%d/%d] : Reward %d" % (epoch, N_epochs, score / 20))
            score = 0

    model.save('reinforce_model')
    print("model saved")
Example #11
def train(mnist):
    global args
    sess = tf.Session()
    global_step = tf.Variable(0, trainable=False)
    starter_learning_rate = 0.1
    learning_rate = tf.train.exponential_decay(0.99,
                                               global_step,
                                               500,
                                               0.96,
                                               staircase=True)

    optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)

    ### args.max_layers is set in parse_args; policy_network is a function
    reinforce = Reinforce(sess, optimizer, policy_network, args.max_layers,
                          global_step)
    ### define the dataset and how it is handled
    net_manager = NetManager(num_input=3072,
                             num_classes=10,
                             learning_rate=0.001,
                             mnist=mnist,
                             bathc_size=100,
                             max_step_per_action=1000)

    MAX_EPISODES = 500
    step = 0
    state = np.array([[10.0, 128.0, 1.0, 1.0] * args.max_layers],
                     dtype=np.float32)
    pre_acc = 0.0
    total_rewards = 0

    for i_episode in range(MAX_EPISODES):
        action = reinforce.get_action(state)
        print("ca:", action)
        ### the all(...) below also loops over the sampled action values
        if all(ai > 0 for ai in action[0][0]):
            reward, pre_acc = net_manager.get_reward(action, step, pre_acc)
            print("=====>", reward, pre_acc)
        else:
            reward = -1.0
        total_rewards += reward

        # In our sample, the action is equal to the state
        state = action[0]
        reinforce.storeRollout(state, reward)

        step += 1
        ls = reinforce.train_step(1)
        log_str = "current time:  " + str(
            datetime.datetime.now().time()) + " episode:  " + str(
                i_episode) + " loss:  " + str(ls) + " last_state:  " + str(
                    state) + " last_reward:  " + str(reward) + "\n"
        log = open("lg3.txt", "a+")
        log.write(log_str)
        log.close()
        print(log_str)
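policy_network, passed to the Reinforce controller above, is not shown here. A sketch of the kind of definition this NAS example typically uses (an NASCell RNN over the state vector, TF 1.x); treat the layer sizes and bias initialization as assumptions:

def policy_network(state, max_layers):
    # RNN controller: one output per state entry (4 hyperparameters per layer).
    with tf.name_scope("policy_network"):
        nas_cell = tf.contrib.rnn.NASCell(4 * max_layers)
        outputs, _ = tf.nn.dynamic_rnn(nas_cell,
                                       tf.expand_dims(state, -1),
                                       dtype=tf.float32)
        bias = tf.Variable([0.05] * 4 * max_layers)
        outputs = tf.nn.bias_add(outputs, bias)
        # Return the predictions at the last time step.
        return outputs[:, -1:, :]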
Example #12
def fig_13_2():
    env = Corridor()

    def vhat(s, w):
        return w[0]

    def nab_vhat(s, w):
        return np.ones(1)

    fig, ax = plt.subplots()
    fig.suptitle('Figure 13.2', fontsize=BIG_FONT)
    fig.set_size_inches(20, 14)
    xticks, yticks = np.linspace(0, 1000, 6), np.linspace(-90, -10, 9)

    def short_str(x):
        return str(x)[:3]

    xnames, ynames = map(short_str, xticks), map(short_str, yticks)
    alg1 = Reinforce(env, None, FIG_13_2_G, FIG_13_2_THE_DIM, pi_gen_corr,
                     logpi_wrap_corr(env, feat_corr))
    alg2 = ReinforceBaseline(env, FIG_13_2_ALP_BAS_W, FIG_13_2_ALP_BAS_T,
                             FIG_13_2_G, FIG_13_2_THE_DIM, pi_gen_corr,
                             logpi_wrap_corr(env, feat_corr), vhat, nab_vhat,
                             FIG_13_2_W_DIM)
    run(ax,
        alg2, [FIG_13_2_ALP_BAS_T],
        FIG_13_2_N_EP,
        FIG_13_2_N_RUNS,
        dash=False)
    run(ax, alg1, [FIG_13_2_ALP], FIG_13_2_N_EP, FIG_13_2_N_RUNS)
    plot_figure(ax,
                '',
                xticks,
                xnames,
                'Episode',
                list(yticks) + [0],
                ynames, (f'Total\nReward\non episode\n(Averaged over\n' +
                         f'{FIG_13_2_N_RUNS} runs)'),
                font=MED_FONT,
                labelpad=40,
                loc='upper right')
    save_plot('fig13.2', dpi=100)
    plt.show()
Example #13
def main(flags):
    '''
        Runs an agent in an environment.
        params:
            flags (dict): configuration
    '''
    env = gym.make('CartPole-v0')

    agent = Reinforce(env,
                      gamma=flags.gamma,
                      learning_rate=flags.learning_rate,
                      num_units=flags.num_units,
                      num_layers=flags.num_layers,
                      update_frequency=flags.update_frequency)

    trainer = Trainer(env, agent, flags)
    rewards, lengths = trainer.train(flags.num_episodes, flags.max_steps)

    plot_results(rewards, lengths)
Example #14
def train(mnist):
    global args
    sess = tf.Session()
    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.exponential_decay(0.99,
                                               global_step,
                                               500,
                                               0.96,
                                               staircase=True)
    optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)

    reinforce = Reinforce(sess, optimizer, policy_network, args.max_layers,
                          global_step)
    net_manager = NetManager(num_input=784,
                             num_classes=10,
                             learning_rate=0.001,
                             mnist=mnist,
                             batch_size=100)

    MAX_EPISODES = 2500
    step = 0
    state = np.array([[10.0, 128.0, 1.0, 1.0] * args.max_layers],
                     dtype=np.float32)
    pre_acc = 0.0
    total_rewards = 0

    for i_episode in range(MAX_EPISODES):
        action = reinforce.get_action(state)
        print("action:", action)
        if all(ai > 0 for ai in action[0][0]):
            reward, pre_acc = net_manager.get_reward(action, step, pre_acc)
            print('====>', reward, pre_acc)
        else:
            reward = -1.0
        total_rewards += reward

        # In our sample, the action is equal to the state
        state = action[0]
        reinforce.store_rollout(state, reward)

        step += 1
        ls = reinforce.train_step(1)
        log_str = 'current time: ' + str(
            datetime.datetime.now().time()) + ' episode: ' + str(
                i_episode) + ' loss: ' + str(ls) + ' last_state: ' + str(
                    state) + ' last_reward: ' + str(reward) + '\n'
        log = open('log.txt', 'a+')
        log.write(log_str)
        log.close()
        print(log_str)
Example #15
def main():
    if len(sys.argv) != 2 or sys.argv[1] not in {
            "REINFORCE", "REINFORCE_BASELINE"
    }:
        print("USAGE: python assignment.py <Model Type>")
        print("<Model Type>: [REINFORCE/REINFORCE_BASELINE]")
        exit()

    env = gym.make("CartPole-v1")  # environment
    state_size = env.observation_space.shape[0]
    num_actions = env.action_space.n

    # Initialize model
    if sys.argv[1] == "REINFORCE":
        model = Reinforce(state_size, num_actions)
    elif sys.argv[1] == "REINFORCE_BASELINE":
        model = ReinforceWithBaseline(state_size, num_actions)

    # TODO:
    # 1) Train your model for 650 episodes, passing in the environment and the agent.
    all_rewards = []
    for i in range(650):
        all_rewards.append(train(env, model))
        print("Reward of past 50:", np.mean(all_rewards[-50:]))
if do_PPO:
    logger = Logger(
        "logs/quadrotor_12d_PPO_%dx%d_std%f_lr%f_kl%f_%d_%d_dyn_%f_%f_%f_%f_seed_%d.pkl"
        % (num_layers, num_hidden_units, noise_std, learning_rate, desired_kl,
           num_rollouts, num_steps_per_rollout, mass_scaling, Ix_scaling,
           Iy_scaling, Iz_scaling, seed))
    solver = PPO(num_iters, learning_rate, desired_kl, discount_factor,
                 num_rollouts, num_steps_per_rollout, dyn,
                 initial_state_sampler, fb, logger)

if do_Reinforce:
    logger = Logger(
        "logs/quadrotor_12d_Reinforce_%dx%d_std%f_lr%f_kl%f_%d_%d_fromzero_%s_dyn_%f_%f_%f_%f_seed_%d_norm_%d_smallweights_tanh.pkl"
        % (num_layers, num_hidden_units, noise_std, learning_rate, desired_kl,
           num_rollouts, num_steps_per_rollout, str(from_zero), mass_scaling,
           Ix_scaling, Iy_scaling, Iz_scaling, seed, norm))
    solver = Reinforce(num_iters, learning_rate, desired_kl, discount_factor,
                       num_rollouts, num_steps_per_rollout, dyn,
                       initial_state_sampler, fb, logger, norm, scale_rewards,
                       state_constraint)

# Set number of threads.
torch.set_num_threads(1)

# Run this guy.
solver.run(plot=False, show_diff=False)

# Dump the log.
logger.dump()
Example #17
def train(dataset,
          learning_rate=0.001,
          batch_size=100,
          num_input=784,
          num_classes=10,
          train_size=100000,
          test_size=10000):
    global args
    sess = tf.Session()
    global_step = tf.Variable(0, trainable=False)
    starter_learning_rate = 0.1
    learning_rate_op = tf.train.exponential_decay(0.99,
                                                  global_step,
                                                  500,
                                                  0.96,
                                                  staircase=True)

    optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate_op)

    reinforce = Reinforce(sess, optimizer, policy_network, args.max_layers,
                          global_step)
    net_manager = NetManager(num_input=num_input,
                             num_classes=num_classes,
                             learning_rate=learning_rate,
                             dataset=dataset,
                             bathc_size=batch_size,
                             train_size=train_size,
                             test_size=test_size)

    MAX_EPISODES = 2500
    step = 0
    state = np.array([[10.0, 128.0, 1.0, 1.0] * args.max_layers],
                     dtype=np.float32)
    pre_acc = 0.0
    total_rewards = 0
    max_acc = 0.0
    max_action = None
    for i_episode in range(MAX_EPISODES):
        action = reinforce.get_action(state)
        print("ca:", action)
        if all(ai > 0 for ai in action[0][0]):
            reward, pre_acc = net_manager.get_reward(action, step, pre_acc)
            print("=====>", reward, pre_acc)
        else:
            reward = -1.0
        total_rewards += reward

        # In our sample, the action is equal to the state
        state = action[0]
        reinforce.storeRollout(state, reward)

        step += 1
        ls = reinforce.train_step(1)
        log_str = "current time:  " + str(
            datetime.datetime.now().time()) + " episode:  " + str(
                i_episode) + " loss:  " + str(ls) + " last_state:  " + str(
                    state) + " last_reward:  " + str(reward) + "\n"
        log_max_str = "current time:  " + str(datetime.datetime.now().time(
        )) + " episode:  " + str(i_episode) + " max accuracy:  " + str(
            max_acc) + " action:  " + str(max_action) + "\n"
        log = open("lg3.txt", "a+")
        log.write(log_str)
        log.write(log_max_str)
        log.close()
        print(log_str)
        print(log_max_str)
Example #18
import torch
#w1 = fb._w1(x)
#print("w1: ", w1)

#w2 = fb._w2(x)
#print("w2: ", w2)

#mu = fb.feedback(x, v)
#nu = fb.noisy_feedback(x, v)
#print("mean u = ", mu)
#print("noisy u = ", nu)

#lp = fb.log_prob(u, x, v)
#print("log prob of [1, 1] is ", lp)

# Generate v, y desired and check that we can match with no model mismatch.
from reinforce import Reinforce
r = Reinforce(1, 1, 1, 1, 100, dyn, None, fb)
current_x = np.array([[0.1], [0.0], [0.1], [0.0]])
vs = r._generate_v()
y_desireds = r._generate_y(current_x, vs)

ys = []
print("current x: ", current_x)
for v, y_desired in zip(vs, y_desireds):
    u = dyn._f_q(current_x) + dyn._M_q(current_x) @ v
    current_x = dyn.integrate(current_x, u, dt=0.001)
    ys.append(dyn.observation(current_x))

print("ys: ", ys)
print("y_desireds:", y_desireds)
Example #20
def train(mnist):
    # use the global variable args
    global args
    # create session to run code
    sess = tf.Session()
    # create the global_step variable
    global_step = tf.Variable(0, trainable=False)
    # initial learning rate  # TODO: unused variable; probably intended as the first argument of exponential_decay below
    starter_learning_rate = 0.1
    # apply exponential learning rate decay: start_lr=0.99, decay_steps=500, decay_rate=0.96, staircase=True
    # staircase=True makes the decay exponent use integer division (stepwise decay)
    learning_rate = tf.train.exponential_decay(0.99, global_step,
                                           500, 0.96, staircase=True)
    # RMSPropOptimizer uses the decayed learning rate above
    optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)

    # create the Reinforce controller
    reinforce = Reinforce(sess, optimizer, policy_network, args.max_layers, global_step)
    # network manager for training subnetworks with Reinforcement Learning
    net_manager = NetManager(num_input=784,         # input dim 28x28 mnist
                             num_classes=10,        # number of classes 10 mnist
                             learning_rate=0.001,   # initial learning rate
                             mnist=mnist,           # dataset mnist (tensorflow dataset object)
                             bathc_size=100)        # mini-batch size

    # maximum number of training episodes
    MAX_EPISODES = 2500
    # step counter starts at 0
    step = 0
    # initial state: [cnn_filter_size, cnn_filter_num, maxpool_ksize, dropout_rate] * max_layers
    state = np.array([[10.0, 128.0, 1.0, 1.0]*args.max_layers], dtype=np.float32)
    # init previous accuracy and total rewards
    pre_acc = 0.0
    total_rewards = 0

    # run episodes
    for i_episode in range(MAX_EPISODES):
        # get next action from reinforce
        action = reinforce.get_action(state)
        # print action
        print("ca:", action)
        # if all action values are greater than 0 (valid), get the reward from the network manager...
        if all(ai > 0 for ai in action[0][0]):
            reward, pre_acc = net_manager.get_reward(action, step, pre_acc)
            print("=====>", reward, pre_acc)
        # ...otherwise the reward is -1
        else:
            reward = -1.0
        # sum all rewards
        total_rewards += reward

        # In our sample, the action is equal to the state
        state = action[0]
        # store the rollout; see the Reinforce implementation
        reinforce.storeRollout(state, reward)

        # increment the step counter
        step += 1
        # train step
        ls = reinforce.train_step(1)
        # logging
        log_str = "current time:  "+str(datetime.datetime.now().time())+" episode:  "+str(i_episode)+" loss:  "+str(ls)+" last_state:  "+str(state)+" last_reward:  "+str(reward)+"\n"
        log = open("lg3.txt", "a+")
        log.write(log_str)
        log.close()
        print(log_str)
Example #22
                              estimator=args.algorithm)
    else:  # policy based methods
        # Initialize the policy
        if env.actionType == 'discrete':
            policy = policies.DiscretePolicy(env, args.hiddenLayers)
        elif env.actionType == 'continuous':
            policy = policies.GaussianPolicy(env, args.hiddenLayers,
                                             args.explorationNoise)
        else:
            raise Exception("Unreachable.")

        # Select a training algorithm.
        if args.algorithm == 'Reinforce' or args.algorithm == 'PG':
            from reinforce import Reinforce
            algo = Reinforce(policy,
                             gamma=args.gamma,
                             learnrate=args.learningRate)
        elif args.algorithm == 'CMAES':
            from cmaes_korali import CMAESKorali
            algo = CMAESKorali(policy,
                               populationSize=args.populationSize,
                               sigma=args.explorationNoise)
        elif args.algorithm == 'ES':
            from evolution_strategies import EvolutionStrategies
            algo = EvolutionStrategies(policy,
                                       populationSize=args.populationSize,
                                       sigma=args.explorationNoise,
                                       learnRate=args.learningRate)

    # Train / update / improve the policy using the selected algorithm.
    algo.trainPolicy(policy, env, args.numIterations)
Example #23
                        help="Random seed for the environment.")
    parser.add_argument('--num_episodes',
                        type=int,
                        default=1,
                        help="Number of test episodes.")
    parser.add_argument('--stochastic',
                        action='store_true',
                        help="Use stochastic policy in testing.")
    parser.add_argument('--record',
                        action='store_true',
                        help="Record videos of test episodes.")
    parser.add_argument('--video_dir',
                        help="Directory to store recorded videos.")
    args = parser.parse_args()
    env = gym.make('LunarLander-v2')
    env.seed(args.seed)
    if args.record:
        env = gym.wrappers.Monitor(env, args.video_dir, force=True)
    if args.agent_type == 'reinforce':
        agent = Reinforce(env, 0)
    elif args.agent_type == 'a2c':
        agent = A2C(env, 0, args.n)
    else:
        print('Unknown agent type %s' % args.agent_type)
        exit(1)
    agent.model.load_state_dict(
        torch.load(args.model_path, map_location=lambda storage, loc: storage))
    stochastic = True if args.stochastic else False
    r_avg, r_std = agent.eval(args.num_episodes, stochastic=stochastic)
    print('Reward average %.6f std %.6f' % (r_avg, r_std))
import torch
import numpy as np
from feedback_linearization import FeedbackLinearization
from reinforce import Reinforce
from quadrotor_14d import Quadrotor14D

dyn = Quadrotor14D(1.0, 1.0, 1.0, 1.0, 0.01)
fb = FeedbackLinearization(dyn, 2, 32, torch.nn.Tanh(), 0.1)
solver = Reinforce(1, 1, 1, 1, 1, 10000, dyn, None, fb, None, 1, 1, None)

x0 = np.zeros((14, 1))
#x0[0, 0] = x0[1, 0] = 0.1
#x0[2, 0] = -0.1
x0[9, 0] = 9.81

ref, K = solver._generate_reference()
ys, xs = solver._generate_ys(x0, ref, K)

import matplotlib.pyplot as plt
plt.figure()
plt.plot([x[0, 0] for x in xs], label="x")
plt.plot([x[1, 0] for x in xs], label="y")
plt.plot([x[3, 0] for x in xs], label="yaw")
plt.plot([x[4, 0] for x in xs], label="pitch")
plt.plot([x[5, 0] for x in xs], label="roll")

plt.plot([r[0, 0] for r in ref], label="x_ref")
plt.plot([r[4, 0] for r in ref], label="y_ref")
plt.plot([r[12, 0] for r in ref], label="yaw_ref")

plt.legend()
def train():
    global args
    sess = tf.Session()
    global_step = tf.Variable(0, trainable=False)
    starter_learning_rate = 0.1
    num_of_hyperparameters = 1
    learning_rate = tf.train.exponential_decay(0.99,
                                               global_step,
                                               500,
                                               0.96,
                                               staircase=True)

    optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)

    reinforce = Reinforce(sess, optimizer, policy_network, args.max_layers,
                          global_step, num_of_hyperparameters)
    workflow_manager = WorkflowManager(num_of_hyperparameters,
                                       ser_url=None,
                                       usr_name=None,
                                       password=None)

    MAX_EPISODES = 2500
    step = 0
    state = np.array([[10.0] * num_of_hyperparameters * args.max_layers],
                     dtype=np.float32)
    pre_acc = 0.0
    total_rewards = 0
    min_action = 0
    max_action = 30

    for i_episode in range(MAX_EPISODES):
        action = reinforce.get_action(state)
        print("ca:", action)
        if all(ai > min_action
               for ai in action[0][0]) and all(ai < max_action
                                               for ai in action[0][0]):
            reward, pre_acc = workflow_manager.get_reward(
                action, step, pre_acc)
            print("=====>", reward, pre_acc)
        else:
            reward = -1.0
        total_rewards += reward
        # In our sample, the action is equal to the state

        print('action', action)

        state = action[0]
        reinforce.storeRollout(state, reward)

        print('state', state)

        step += 1
        ls = reinforce.train_step(1)
        log_str = "current time:  " + str(
            datetime.datetime.now().time()) + " episode:  " + str(
                i_episode) + " loss:  " + str(ls) + " last_state:  " + str(
                    state) + " last_reward:  " + str(reward) + "\n"
        log = open("lg3.txt", "a+")
        log.write(log_str)
        log.close()
        print(log_str)