Example #1
def main():
    args = arg_parse_TRPO()
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs, ), clip=5)
    running_reward = ZFilter((1, ), demean=False, clip=10)

    policy_net = Policy(num_inputs, num_actions)
    value_net = Value(num_inputs)
    train(args, env, policy_net, value_net, running_state)
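Every example on this page normalizes observations (and sometimes rewards) with `ZFilter`, a running z-score filter: y = (x - mean) / std using running estimates of the mean and standard deviation, optionally clipped. As a point of reference, a minimal sketch consistent with the attributes the examples touch later (`rs._M`, `rs._S`, `rs._n`, `demean`, `clip`) could look like the following; this is an illustrative reconstruction, not necessarily the exact implementation each repository imports.

import numpy as np

class RunningStat:
    """Welford-style running mean/variance over arrays of a fixed shape."""
    def __init__(self, shape):
        self._n = 0
        self._M = np.zeros(shape)
        self._S = np.zeros(shape)

    def push(self, x):
        x = np.asarray(x)
        self._n += 1
        if self._n == 1:
            self._M[...] = x
        else:
            old_M = self._M.copy()
            self._M[...] = old_M + (x - old_M) / self._n
            self._S[...] = self._S + (x - old_M) * (x - self._M)

    @property
    def mean(self):
        return self._M

    @property
    def std(self):
        var = self._S / (self._n - 1) if self._n > 1 else np.square(self._M)
        return np.sqrt(var)

class ZFilter:
    """y = (x - mean) / std using running estimates; optionally clipped."""
    def __init__(self, shape, demean=True, destd=True, clip=10.0):
        self.demean, self.destd, self.clip = demean, destd, clip
        self.rs = RunningStat(shape)

    def __call__(self, x, update=True):
        if update:
            self.rs.push(x)
        if self.demean:
            x = x - self.rs.mean
        if self.destd:
            x = x / (self.rs.std + 1e-8)
        if self.clip:
            x = np.clip(x, -self.clip, self.clip)
        return x

Under this sketch, `running_state = ZFilter((num_inputs,), clip=5)` behaves the way the examples use it: each call updates the running statistics and returns the normalized, clipped observation.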
Example #2
    def __init__(self, env, args):
        self.env = env
        self.args = args

        # define the network
        self.net = Network(self.env.observation_space.shape[0],
                           self.env.action_space.shape[0])
        self.old_net = Network(self.env.observation_space.shape[0],
                               self.env.action_space.shape[0])

        # make sure the net and old net have the same parameters
        self.old_net.load_state_dict(self.net.state_dict())

        # define the optimizer
        self.optimizer = torch.optim.Adam(self.net.critic.parameters(),
                                          lr=self.args.lr)

        # define the running mean filter
        self.running_state = ZFilter((self.env.observation_space.shape[0], ),
                                     clip=5)

        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        self.model_path = self.args.save_dir + self.args.env_name
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)

        self.start_episode = 0
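The constructor above creates the save directory with an exists-check followed by `os.mkdir`, and builds the model path by string concatenation. An alternative sketch (not the original code, with hypothetical stand-ins for `args.save_dir` and `args.env_name`) uses `os.path.join` and `os.makedirs(..., exist_ok=True)`, which also creates intermediate directories and avoids the race between the check and the creation:

import os

# Hypothetical stand-ins for self.args.save_dir and self.args.env_name above.
save_dir = 'saved_models'
env_name = 'Walker2d-v2'

model_path = os.path.join(save_dir, env_name)
os.makedirs(model_path, exist_ok=True)  # creates both levels, no error if they already exist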
Example #3
def sim_episode(env, policy, max_episode_steps, result_writer):
    """Simulate an episode and store the resulting render."""
    try:
        running_state = ZFilter((env.observation_space.shape[0], ), clip=5)

        state = env.reset()
        state = running_state(state)
        frames_store = []
        for t in range(max_episode_steps):  # don't loop forever
            # Simulate one episode, i.e. until the agent reaches a terminal
            # state or has taken max_episode_steps steps in the environment.
            action_mean, action_log_std, action_std = policy(
                Variable(torch.Tensor([state])))
            action = torch.normal(action_mean,
                                  action_std).detach().data[0].cpu().numpy()
            next_state, reward, done, _ = env.step(action)
            next_state = running_state(next_state)

            state = next_state
            frames = env.render('rgb_array')
            frames_store.append(frames)
            if done:
                break

        return frames_store

    except Exception as e:
        print(f'Tried running simulation, but got error: {e}')

        return []
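A possible way to use `sim_episode` (a hedged usage sketch, not from the original repository; `imageio`, the environment name, and the step budget are assumptions, and `policy` must be a trained Policy network such as the one built in Example #1) is to dump the collected frames to a GIF:

import gym
import imageio  # assumed available for writing the GIF

env = gym.make('Walker2d-v2')                    # hypothetical environment choice
policy = Policy(env.observation_space.shape[0],  # placeholder; use a trained policy in practice
                env.action_space.shape[0])
frames = sim_episode(env, policy, max_episode_steps=1000, result_writer=None)
if frames:
    imageio.mimsave('episode.gif', frames, fps=30)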
Example #4
    def __init__(self, envs, args, net, env_type='atari'):
        self.envs = envs
        self.args = args
        self.env_type = env_type
        # define the network...
        self.net = net
        self.old_net = copy.deepcopy(self.net)
        # if cuda is used...
        if self.args.cuda:
            self.net.cuda()
            self.old_net.cuda()
        # define the optimizer...
        self.optimizer = optim.Adam(self.net.parameters(), self.args.lr, eps=self.args.eps)
        # running filter...
        if self.env_type == 'mujoco':
            num_states = self.envs.observation_space.shape[0]
            self.running_state = ZFilter((num_states, ), clip=5)
        # check the saving folder...
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # env folder...
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
        # get the observation
        self.batch_ob_shape = (self.args.num_workers * self.args.nsteps, ) + self.envs.observation_space.shape
        self.obs = np.zeros((self.args.num_workers, ) + self.envs.observation_space.shape, dtype=self.envs.observation_space.dtype.name)
        if self.env_type == 'mujoco':
            self.obs[:] = np.expand_dims(self.running_state(self.envs.reset()), 0)
        else:
            self.obs[:] = self.envs.reset()
        self.dones = [False for _ in range(self.args.num_workers)]
Example #5
    def sample(self, policy, params=None, gamma=0.95, device='cpu'):
        episodes = BatchEpisodes(batch_size=self.batch_size,
                                 gamma=gamma,
                                 device=device)
        for i in range(self.batch_size):
            self.queue.put(i)
        for _ in range(self.num_workers):
            self.queue.put(None)
        observations, batch_ids = self.envs.reset()
        running_state = ZFilter((observations.shape[1], ), clip=5)
        for index in range(observations.shape[0]):
            observations[index, :] = running_state(observations[index, :])
        dones = [False]
        while (not all(dones)) or (not self.queue.empty()):
            with torch.no_grad():
                observations_tensor = torch.from_numpy(observations).to(
                    device=device)
                actions_tensor = policy(observations_tensor,
                                        params=params).sample()
                actions = actions_tensor.cpu().numpy()
            new_observations, rewards, dones, new_batch_ids, _ = self.envs.step(
                actions)
            episodes.append(observations, actions, rewards, batch_ids)
            observations, batch_ids = new_observations, new_batch_ids
            for index in range(observations.shape[0]):
                observations[index, :] = running_state(observations[index, :])
        return episodes
Example #6
    def __init__(self, env, policy_network, value_network, gamma=0.99, policy_lr=0.0005, value_lr=0.0005, tau=0.95, 
                    value_update_step=10, policy_update_step=10, 
                    epsilon=0.2, batch_size=64, use_cuda=True, saved_path='saved_models/Walker2d-v1/'):
        # define the parameters...
        self.env = env
        self.gamma = gamma
        self.policy_lr = policy_lr
        self.value_lr = value_lr
        self.tau = tau
        self.value_update_step = value_update_step
        self.policy_update_step = policy_update_step
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.saved_path = saved_path
        # check if cuda is available...
        self.use_cuda = torch.cuda.is_available() and use_cuda

        print('CUDA is available and enabled: ' + str(self.use_cuda))

        # define the network...
        self.policy_network = policy_network
        self.value_network = value_network

        if self.use_cuda:
            self.policy_network.cuda()
            self.value_network.cuda()

        # define the optimizer
        self.optimizer_value = torch.optim.Adam(self.value_network.parameters(), lr=self.value_lr)
        self.optimizer_policy = torch.optim.Adam(self.policy_network.parameters(), lr=self.policy_lr)

        # init the Filter...
        self.running_state = ZFilter((self.env.observation_space.shape[0],), clip=5)
Example #7
    def __init__(self, env, policy_lr, value_lr, tau, gamma, buffer_size,
                 max_time_step, observate_time, batch_size, path,
                 soft_update_step, use_cuda):
        self.env = env
        self.policy_lr = policy_lr
        self.value_lr = value_lr
        self.use_cuda = bool(use_cuda)
        self.tau = tau
        self.gamma = gamma
        self.buffer_size = buffer_size
        self.max_time_step = max_time_step
        self.observate_time = observate_time
        self.batch_size = batch_size
        self.global_time_step = 0
        self.path = path
        self.soft_update_step = soft_update_step

        print('IF USE CUDA: ' + str(self.use_cuda))

        num_inputs = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.shape[0]

        # the scale of the action space....
        self.action_scale = self.env.action_space.high[0]

        # build up the network....
        # build the actor_network firstly...
        self.actor_net = models.Policy(num_inputs, self.num_actions)
        self.actor_target_net = models.Policy(num_inputs, self.num_actions)

        # build the critic_network....
        self.critic_net = models.Critic(num_inputs, self.num_actions)
        self.critic_target_net = models.Critic(num_inputs, self.num_actions)

        # if use cuda...
        if self.use_cuda:
            self.actor_net.cuda()
            self.actor_target_net.cuda()

            self.critic_net.cuda()
            self.critic_target_net.cuda()

        # init the same parameters....
        self.actor_target_net.load_state_dict(self.actor_net.state_dict())
        self.critic_target_net.load_state_dict(self.critic_net.state_dict())

        # define the optimizers... add L2 regularization in the critic optimizer here...
        self.optimizer_actor = torch.optim.Adam(self.actor_net.parameters(),
                                                lr=self.policy_lr)
        self.optimizer_critic = torch.optim.Adam(self.critic_net.parameters(),
                                                 lr=self.value_lr,
                                                 weight_decay=1e-2)

        # init the filter...
        self.running_state = ZFilter((num_inputs, ), clip=5)
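Example #7 sets up actor/critic target networks together with `tau` and `soft_update_step`. The update code itself is not shown on this page; a hedged sketch of the standard DDPG-style soft target update these parameters usually imply (one common convention, not necessarily this repository's exact code) is:

import torch

def soft_update(target_net, source_net, tau):
    # target <- tau * source + (1 - tau) * target, applied parameter-wise
    for target_param, param in zip(target_net.parameters(),
                                   source_net.parameters()):
        target_param.data.copy_(tau * param.data +
                                (1.0 - tau) * target_param.data)

# e.g. soft_update(self.actor_target_net, self.actor_net, self.tau)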
Example #8
    def __init__(self, args, env):
        # define the arguments and environments...
        self.args = args
        self.env = env
        # define the number of inputs and number of actions
        num_inputs = self.env.observation_space.shape[0]
        num_actions = self.env.action_space.shape[0]
        # define the model save dir...
        self.saved_path = self.args.save_dir + self.args.env_name + '/'
        # check the path
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        if not os.path.exists(self.saved_path):
            os.mkdir(self.saved_path)
        # define the networks...
        self.policy_network = models.Policy(num_inputs, num_actions)
        self.value_network = models.Value(num_inputs)
        # define the optimizer
        self.optimizer_value = torch.optim.Adam(self.value_network.parameters(), lr=self.args.value_lr, weight_decay=self.args.l2_reg)
        # init the filter...
        self.running_state = ZFilter((num_inputs,), clip=5)
Example #9
    def __init__(self, env, args):
        # define the parameters...
        self.env = env
        # get the environment's input size and output size
        num_inputs = self.env.observation_space.shape[0]
        num_actions = self.env.action_space.shape[0]
        # get the parameters
        self.args = args
        self.saved_path = 'saved_models/' + str(self.args.env_name) + '/'
        # check the path
        if not os.path.exists(self.saved_path):
            os.mkdir(self.saved_path)

        # check if cuda is available...
        self.use_cuda = torch.cuda.is_available() and self.args.cuda
        print('CUDA is available: ' + str(torch.cuda.is_available()))
        print('Use CUDA: ' + str(self.args.cuda))

        # define the network...
        self.policy_network = models.Policy(num_inputs, num_actions)
        self.value_network = models.Value(num_inputs)

        if self.use_cuda:
            self.policy_network.cuda()
            self.value_network.cuda()

        # define the optimizer
        self.optimizer_value = torch.optim.Adam(
            self.value_network.parameters(),
            lr=self.args.value_lr,
            weight_decay=self.args.l2_reg)
        self.optimizer_policy = torch.optim.Adam(
            self.policy_network.parameters(),
            lr=self.args.policy_lr,
            weight_decay=self.args.l2_reg)

        # init the Filter...
        self.running_state = ZFilter((num_inputs, ), clip=5)
Example #10
        return action_loss.mean()

    def get_kl():
        mean1, log_std1, std1 = policy_net(Variable(states))

        mean0 = Variable(mean1.data)
        log_std0 = Variable(log_std1.data)
        std0 = Variable(std1.data)
        kl = log_std1 - log_std0 + (std0.pow(2) + (mean0 - mean1).pow(2)) / (
            2.0 * std1.pow(2)) - 0.5
        return kl.sum(1, keepdim=True)

    trpo_step(policy_net, get_loss, get_kl, args.max_kl, args.damping)


running_state = ZFilter((num_inputs, ), clip=5)
running_reward = ZFilter((1, ), demean=False, clip=10)

for i_episode in count(1):
    memory = Memory()

    num_steps = 0
    reward_batch = 0
    num_episodes = 0
    while num_steps < args.batch_size:
        state = env.reset()
        state = running_state(state)

        reward_sum = 0
        for t in range(10000):  # Don't infinite loop while learning
            action = select_action(state)
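The `get_kl` closure above computes the KL divergence between the old diagonal-Gaussian policy (mean0, std0), held fixed, and the current one: KL = log(std1/std0) + (std0^2 + (mean0 - mean1)^2) / (2 * std1^2) - 0.5, summed over action dimensions. A small self-contained check against `torch.distributions` (an illustrative sketch, not part of the original code) confirms the expression:

import torch
from torch.distributions import Normal, kl_divergence

mean0, log_std0 = torch.zeros(1, 3), torch.zeros(1, 3)
mean1, log_std1 = torch.full((1, 3), 0.2), torch.full((1, 3), -0.1)
std0, std1 = log_std0.exp(), log_std1.exp()

# Same expression as in get_kl(), with the old policy treated as constant.
kl_manual = (log_std1 - log_std0
             + (std0.pow(2) + (mean0 - mean1).pow(2)) / (2.0 * std1.pow(2))
             - 0.5).sum(1, keepdim=True)

kl_torch = kl_divergence(Normal(mean0, std0), Normal(mean1, std1)).sum(1, keepdim=True)
assert torch.allclose(kl_manual, kl_torch)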
Example #11
def main(gamma=0.995, env_name='Walker2d-v2', tau=0.97, seed=543, number_of_batches=500,\
        batch_size=5000, maximum_steps=10000, render=False, log_interval=1, entropy_coeff=0.0,\
        clip_epsilon=0.2, use_joint_pol_val=False):

    torch.set_default_tensor_type('torch.DoubleTensor')
    PI = torch.DoubleTensor([3.1415926])

    env = gym.make(env_name)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    env.seed(seed)
    torch.manual_seed(seed)

    policy_net = Policy(num_inputs, num_actions)
    value_net = Value(num_inputs)
    opt_policy = optim.Adam(policy_net.parameters(), lr=0.001)
    opt_value = optim.Adam(value_net.parameters(), lr=0.001)

    running_state = ZFilter((num_inputs,), clip=5)
    running_reward = ZFilter((1,), demean=False, clip=10)
    episode_lengths = []
    plot_rew = []
    for i_episode in range(number_of_batches):
        memory = Memory()

        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < batch_size:
            state = env.reset()
            state = running_state(state)

            reward_sum = 0
            for t in range(maximum_steps): # Don't infinite loop while learning
                action = select_action(state, policy_net)
                action = action.data[0].numpy()
                next_state, reward, done, _ = env.step(action)
                reward_sum += reward

                next_state = running_state(next_state)

                mask = 1
                if done:
                    mask = 0

                memory.push(state, np.array([action]), mask, next_state, reward)

                if render:
                    env.render()
                if done:
                    break

                state = next_state
            num_steps += (t-1)
            num_episodes += 1
            reward_batch += reward_sum

        reward_batch /= num_episodes
        batch = memory.sample()
        plot_rew.append(reward_batch)
        update_params(batch, policy_net, value_net, gamma, opt_policy, opt_value)

        if i_episode % log_interval == 0:
            print('Episode {}\tLast reward: {}\tAverage reward {:.2f}'.format(
                i_episode, reward_sum, reward_batch))
    
    plot_epi = list(range(number_of_batches))
    trace = go.Scatter(x=plot_epi, y=plot_rew)
    layout = go.Layout(
        title='PPO',
        xaxis=dict(title='Episodes',
                   titlefont=dict(family='Courier New, monospace', size=18, color='#7f7f7f')),
        yaxis=dict(title='Average Reward',
                   titlefont=dict(family='Courier New, monospace', size=18, color='#7f7f7f')))

    plotly.offline.plot({"data": [trace], "layout": layout}, filename='PPO.html', image='jpeg')
Example #12
def main(gamma=0.995, env_name="Walker2d-v2", tau=0.97, number_of_batches=500,\
        batch_size=5000, maximum_steps=10000, render=False,\
        seed=543, log_interval=1, entropy_coeff=0.0, clip_epsilon=0.2):
    env = gym.make(env_name)
    #Get number of inputs for A3CActor
    num_inputs = env.observation_space.shape[0]
    #Get number of outputs required for describing action
    num_actions = env.action_space.shape[0]
    env.seed(seed)
    torch.manual_seed(seed)

    actor_net = A3CActor(num_inputs, num_actions)
    actor_optimizer = optim.Adam(actor_net.parameters(), lr=0.001)

    running_state = ZFilter((num_inputs, ), clip=5)
    running_reward = ZFilter((1, ), demean=False, clip=10)
    episode_lengths = []

    for i_episode in range(number_of_batches):
        memory = Memory()

        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < batch_size:
            state = env.reset()
            state = running_state(state)

            reward_sum = 0
            for t in range(maximum_steps):
                action = select_action(state, actor_net)
                action = action.data[0].numpy()
                next_state, reward, done, _ = env.step(action)
                reward_sum += reward

                next_state = running_state(next_state)

                mask = 1
                if done:
                    mask = 0

                memory.push(state, np.array([action]), mask, next_state,
                            reward)

                if render:
                    env.render()
                if done:
                    break

                state = next_state
            num_steps += (t - 1)
            num_episodes += 1
            reward_batch += reward_sum

        reward_batch /= num_episodes
        batch = memory.sample()
        update_params(batch, actor_net, actor_optimizer, gamma, tau,
                      clip_epsilon)
        if i_episode % log_interval == 0:
            print('Episode {}\t Last reward: {}\tAverage reward {:.2f}'.format(
                i_episode, reward_sum, reward_batch))
    return
Example #13
    policy_net = Observations_Encoder(rows,
                                      cols,
                                      3,
                                      z_dim,
                                      num_goals,
                                      num_actions,
                                      threshold,
                                      device=device).to(device)

    policy_net.train()

    #### Initializing Environment
    env = env_BP_w_display(num_goals)

    #### running z-score filter on the observations, which improves training
    running_state = ZFilter((3, rows, cols), clip=5)
    running_reward = ZFilter((1, ), demean=False, clip=10)

    if model_path != "":
        print("MODEL LOADED")
        ckpt = torch.load(model_path)
        policy_net.load_state_dict(ckpt['policy_net'])
        value_net.load_state_dict(ckpt['value_net'])

        running_state.rs._M = ckpt['running_M']
        running_state.rs._S = ckpt['running_S']
        running_state.rs._n = ckpt['running_n']
    else:
        print("NO RL MODEL LOADED")

    ##### Logging Folders #####
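Example #13 (and Example #15 below) restore the ZFilter statistics from the checkpoint alongside the network weights. The matching save side is not shown on this page; a hedged sketch that would produce the same keys is:

import torch

# Hypothetical helper mirroring the keys read above; the networks and the
# ZFilter instance are assumed to come from the surrounding training code.
def save_checkpoint(path, policy_net, value_net, running_state):
    torch.save({
        'policy_net': policy_net.state_dict(),
        'value_net': value_net.state_dict(),
        # running statistics of the observation ZFilter
        'running_M': running_state.rs._M,
        'running_S': running_state.rs._S,
        'running_n': running_state.rs._n,
    }, path)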
Example #14
def test():
    env_id="MotorEnv-v0"
    env = gym.make(env_id)   #创造环境

    num_inputs = env.observation_space.shape[0]
    
    net = torch.load('./models/model6.pkl')
    print(net)
    with torch.no_grad():
        running_state = ZFilter((num_inputs,), clip=5)

        episodes =[]
        eval_rewards =[]
        eval_done = []
        eval_states = []
        eval_input = []
        eval_delta = []
        eval_L = []

        state = env.reset()
        state = running_state(state)
        render = False

        for t in range(int(50 / env.steps)):
            # action = env.action_space.sample()  # randomly sample an action
            # observation, reward, done, info = env.step(1)  # interact with the environment, get the next step

            action = select_action(state, net)  # action
            action = action.data[0].numpy()
            observation, reward, done, info = env.step(action)  # interact with the environment, get the next step
            state = running_state(observation)

            C = np.array([0, 1, 0, 0])
            L = np.dot(C, observation.reshape(4, 1))

            # if done:
            #     break
            #     pass
            # env.render()           # render the scene
            # count += 1
            # time.sleep(0.001)      # wait 0.2 s each step
            # print(info['input'], env.state, env.counts)
            # print(env.counts)

            episodes.append(env.counts)
            eval_states.append(observation)
            eval_rewards.append(reward)
            eval_done.append(done)
            eval_L.append(L)

            eval_input.append(info['input'])
            eval_delta.append(info['delta'])
            
        episodes = np.array(episodes)
        eval_rewards = np.array(eval_rewards)
        eval_states = np.array(eval_states)
        eval_done = np.array(eval_done)
        eval_input = np.array(eval_input)
        eval_delta = np.array(eval_delta)

        fig = plt.figure("VibrationEnv-states")
        plt.plot(episodes, eval_L)
        plt.title("%s"%env_id)
        plt.xlabel("Episode")
        plt.ylabel("eval_states")
        plt.legend(["x","y","p","q"])
        plt.grid()
        plt.show()
            
        fig = plt.figure("VibrationEnv-u")
        plt.plot(episodes, eval_input)
        plt.title("%s"%env_id)
        plt.xlabel("Episode")
        plt.ylabel("eval_states")
        plt.legend(["u"])
        plt.grid()
        plt.show()    

    env.close()
Example #15
value_net = Value(num_inputs)

summary_writer = tensorboardX.SummaryWriter(log_dir)


def select_action(state, deterministic=False):
    state = torch.from_numpy(state).unsqueeze(0)
    action_mean, _, action_std = policy_net(Variable(state))
    if not deterministic:
        action = torch.normal(action_mean, action_std)
    else:
        action = action_mean  # action is mode
    return action


running_state = ZFilter((num_inputs, ), clip=5)

ckpt = torch.load(args.checkpoint)
policy_net.load_state_dict(ckpt['policy_net'])
value_net.load_state_dict(ckpt['value_net'])

running_state.rs._M = ckpt['running_M']
running_state.rs._S = ckpt['running_S']
running_state.rs._n = ckpt['running_n']

eval_hole = []
eval_rewards = []
eval_completed = []
eval_touched = []

for eval_episode in range(args.eval_eps):
Example #16
def test(rank, args, shared_model, opt_ac):
    best_result = -1000
    torch.manual_seed(args.seed + rank)
    torch.set_default_tensor_type('torch.DoubleTensor')
    num_inputs = args.feature
    num_actions = 9
    last_state = numpy.zeros(41)

    if args.render:
        env = RunEnv(visualize=True)
    else:
        env = RunEnv(visualize=False)

    running_state = ZFilter((num_inputs, ), clip=5)
    running_reward = ZFilter((1, ), demean=False, clip=10)
    episode_lengths = []

    PATH_TO_MODEL = '../models/' + str(args.bh)

    ac_net = ActorCritic(num_inputs, num_actions)

    start_time = time.time()

    for i_episode in count(1):
        memory = Memory()
        ac_net.load_state_dict(shared_model.state_dict())

        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < args.batch_size:
            #state = env.reset()
            #print(num_steps)
            state = env.reset(difficulty=0)
            state = numpy.array(state)
            #global last_state
            #last_state = state
            #last_state,_ = update_observation(last_state,state)
            #last_state,state = update_observation(last_state,state)
            #print(state.shape[0])
            #print(state[41])
            state = running_state(state)

            reward_sum = 0
            for t in range(10000):  # Don't infinite loop while learning
                #print(t)
                #timer = time.time()
                if args.use_sep_pol_val:
                    action = select_action(state)
                else:
                    action = select_action_actor_critic(state, ac_net)

                #print(action)
                action = action.data[0].numpy()
                if numpy.any(numpy.isnan(action)):
                    print(action)
                    puts('ERROR')
                    return
                #print('NN take:')
                #print(time.time()-timer)
                #print(action)
                #print("------------------------")

                #timer = time.time()
                if args.skip:
                    #env.step(action)
                    _, reward, _, _ = env.step(action)
                    reward_sum += reward
                next_state, reward, done, _ = env.step(action)
                next_state = numpy.array(next_state)
                reward_sum += reward

                #print('env take:')
                #print(time.time()-timer)

                #timer = time.time()

                #last_state ,next_state = update_observation(last_state,next_state)
                next_state = running_state(next_state)
                #print(next_state[41:82])

                mask = 1
                if done:
                    mask = 0

                #print('update take:')
                #print(time.time()-timer)

                #timer = time.time()

                memory.push(state, np.array([action]), mask, next_state,
                            reward)

                #print('memory take:')
                #print(time.time()-timer)

                #if args.render:
                #    env.render()
                if done:
                    break

                state = next_state

            num_steps += (t - 1)
            num_episodes += 1
            #print(num_episodes)
            reward_batch += reward_sum

        #print(num_episodes)
        reward_batch /= num_episodes
        batch = memory.sample()

        #update_params_actor_critic(batch,args,shared_model,ac_net,opt_ac)
        time.sleep(60)

        if i_episode % args.log_interval == 0:
            File = open(PATH_TO_MODEL + '/record.txt', 'a+')
            File.write("Time {}, episode reward {}, Average reward {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, reward_batch))
            File.close()
            #print('TestEpisode {}\tLast reward: {}\tAverage reward {:.2f}'.format(
            #    i_episode, reward_sum, reward_batch))
            print("Time {}, episode reward {}, Average reward {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, reward_batch))
            #print('!!!!')

        epoch = i_episode
        if reward_batch > best_result:
            best_result = reward_batch
            save_model(
                {
                    'epoch': epoch,
                    'bh': args.bh,
                    'state_dict': shared_model.state_dict(),
                    'optimizer': opt_ac.state_dict(),
                }, PATH_TO_MODEL, 'best')

        if epoch % 30 == 1:
            save_model(
                {
                    'epoch': epoch,
                    'bh': args.bh,
                    'state_dict': shared_model.state_dict(),
                    'optimizer': opt_ac.state_dict(),
                }, PATH_TO_MODEL, epoch)
Example #17
def main(gamma=0.995, env_name="Walker2d-v2", tau=0.97, number_of_batches=500,\
        batch_size=5000, maximum_steps=10000, render=False,\
        seed=543, log_interval=1, entropy_coeff=0.0, clip_epsilon=0.2):
    env = gym.make(env_name)
    #Get number of inputs for A3CActor
    num_inputs = env.observation_space.shape[0]
    #Get number of outputs required for describing action
    num_actions = env.action_space.shape[0]
    env.seed(seed)
    torch.manual_seed(seed)

    actor_net = A3CActor(num_inputs, num_actions)
    actor_optimizer = optim.Adam(actor_net.parameters(), lr=0.001)

    running_state = ZFilter((num_inputs,), clip=5)
    running_reward = ZFilter((1, ), demean=False, clip=10)
    episode_lengths = []
    plot_rew = []
    for i_episode in range(number_of_batches):
        memory = Memory()

        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < batch_size:
            state = env.reset()
            state = running_state(state)

            reward_sum = 0
            for t in range(maximum_steps):
                action = select_action(state, actor_net)
                action = action.data[0].numpy()
                next_state, reward, done, _ = env.step(action)
                reward_sum += reward

                next_state = running_state(next_state)

                mask = 1
                if done:
                    mask = 0

                memory.push(state, np.array([action]), mask, next_state, reward)

                if render:
                    env.render()
                if done:
                    break

                state = next_state
            num_steps += (t-1)
            num_episodes += 1
            reward_batch += reward_sum

        reward_batch /= num_episodes
        batch = memory.sample()
        plot_rew.append(reward_batch)
        update_params(batch, actor_net, actor_optimizer, gamma, tau, clip_epsilon)
        if i_episode % log_interval == 0:
            print('Episode {}\t Last reward: {}\tAverage reward {:.2f}'.format(
                i_episode, reward_sum, reward_batch))

    plot_epi = list(range(number_of_batches))
    trace = go.Scatter(x=plot_epi, y=plot_rew)
    layout = go.Layout(
        title='A2C',
        xaxis=dict(title='Episodes',
                   titlefont=dict(family='Courier New, monospace', size=18, color='#7f7f7f')),
        yaxis=dict(title='Average Reward',
                   titlefont=dict(family='Courier New, monospace', size=18, color='#7f7f7f')))

    plotly.offline.plot({"data": [trace], "layout": layout}, filename='PPO.html', image='jpeg')

    return