Example #1
def test_random_action():
    env = gym.make('gym_kinova_gripper:kinovagripper-v0')
    obs, done = env.reset(), False
    noise = OUNoise(3)
    max_action = float(env.action_space.high[0])
    correct = 0
    noise.reset()
    cum_reward = 0.0
    for i in range(100):
        finger_actions = noise.noise().clip(-max_action, max_action)
        # actions = np.array([0.0, finger_actions[0], finger_actions[1], finger_actions[2]])
        actions = np.array([0.4, 0.5, 0.5, 0.5])
        obs, reward, done, _ = env.step(actions)
        inputs = torch.FloatTensor(np.array(obs)).to(device)
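Every example on this page imports an OUNoise class from its own project, and the interfaces differ slightly. Example #1 calls noise.reset() and noise.noise() and clips the result; several later examples also anneal a scale attribute. A minimal sketch of the kind of class that interface implies, with illustrative defaults that are not taken from the original repositories:

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, action_dimension, scale=0.1, mu=0.0, theta=0.15, sigma=0.2):
        # Defaults here are illustrative, not from the original projects.
        self.action_dimension = action_dimension
        self.scale = scale      # annealed externally in several examples below
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Restart the process at its long-run mean.
        self.state = np.ones(self.action_dimension) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1); return the scaled state.
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state * self.scale
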
Example #2
def main():
    my_env = env()

    agent = NAF_CNN(0.99, 0.001, 128, my_env.observation_space.shape[0],
                    my_env.action_space)

    parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
    parser.add_argument('--noise_scale',
                        type=float,
                        default=0.3,
                        metavar='G',
                        help='initial noise scale (default: 0.3)')
    parser.add_argument('--final_noise_scale',
                        type=float,
                        default=0.3,
                        metavar='G',
                        help='final noise scale (default: 0.3)')
    parser.add_argument('--exploration_end',
                        type=int,
                        default=100,
                        metavar='N',
                        help='number of episodes with noise (default: 100)')
    args = parser.parse_args()

    ounoise = OUNoise(my_env.action_space.shape[0])
    ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
        0, args.exploration_end -
        1) / args.exploration_end + args.final_noise_scale
    ounoise.reset()

    state = my_env.reset()
    i = 10
    while i > 0:
        action = agent.select_action(state, ounoise)
        print("action: {}".format(action))
        next_state, reward, done = my_env.step(action)
        if done:
            break
        print(reward)
        i = i - 1
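
The ounoise.scale assignment above (and the one repeated in Examples #3, #4, and #6 below) linearly anneals the noise scale from noise_scale down to final_noise_scale over the first exploration_end episodes and holds it constant afterwards. The same arithmetic restated as a small hypothetical helper (the function name is mine, not from these projects):

def annealed_noise_scale(noise_scale, final_noise_scale, exploration_end, i_episode):
    # Fraction of the exploration phase still remaining; 0 once i_episode
    # reaches exploration_end, so the scale settles at final_noise_scale.
    remaining = max(0, exploration_end - i_episode) / exploration_end
    return (noise_scale - final_noise_scale) * remaining + final_noise_scale
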
Example #3
class DDPGB(object):
    # x is the value vector
    # b is the code vector
    # C is the standard codebook matrix
    # action_output_num is the dimensionality of the code output
    # replay_size is the maximum length of the replay memory queue
    # new_b holds the newly computed b
    # env is the environment that produces actions and observations
    # agent is the actual DDPG learner
    # The noise parameters are kept only so the random-exploration branch can be reached
    def __init__(self,
                 C,
                 b,
                 x,
                 action_output_num,
                 actor_size,
                 replay_size=1000000,
                 ou_noise=True,
                 param_noise=True,
                 noise_scale=0.3,
                 final_noise_scale=0.3):
        self.C = C
        self.b = b
        self.x = x
        self.hd = action_output_num
        self.actor_size = actor_size
        self.memory = ReplayMemory(replay_size)
        self.new_b = None
        self.env = None
        self.agent = None
        self.ou_noise = ou_noise
        self.noise_scale = noise_scale
        self.final_noise_scale = final_noise_scale
        self.ounoise = OUNoise(action_output_num) if ou_noise else None
        self.param_noise = AdaptiveParamNoiseSpec(
            initial_stddev=0.05,
            desired_action_stddev=noise_scale,
            adaptation_coefficient=1.05) if param_noise else None

    def update_B(self, c, b, x):
        self.C = c
        self.b = b
        self.x = x

    # coff is the candidate weighting ratio inside the reward, e.g. [0.2, 0.8]
    def generate_B(self,
                   coff,
                   gamma,
                   tau,
                   hidden_size,
                   num_inputs,
                   actor_size,
                   num_episodes=60000,
                   exploration_end=150,
                   batch_size=512,
                   updates_per_step=5000):

        self.env = QuantizationEnv(self.C, self.b, self.x, self.hd, coff)
        self.agent = DDPG(gamma, tau, hidden_size, self.env.action_bin,
                          num_inputs, actor_size)
        rewards = []
        total_numsteps = 0
        updates = 0
        max_trail = 10000
        best_bb = 10000

        # Search for the best solution over num_episodes episodes
        for i_episode in range(num_episodes):
            state = torch.Tensor([self.env.reset()])
            if self.ou_noise:
                self.ounoise.scale = (self.noise_scale - self.final_noise_scale) * max(0, exploration_end - i_episode) \
                                     / exploration_end + self.final_noise_scale
                self.ounoise.reset()
            if self.param_noise:
                self.agent.perturb_actor_parameters(self.param_noise)

            episode_reward = 0
            continuous_neg = 0
            continuous_pos = 0
            temp_trail = 0

            control_bit = 0
            next_state = self.env.compute_Cbx(self.b)
            next_state = torch.Tensor([next_state])

            while True:
                # yyj
                if control_bit > 15:
                    control_bit = control_bit % 16
                state = next_state
                action = self.agent.select_action(state, self.ounoise,
                                                  self.param_noise)
                next_state, reward, done, bb = self.env.step(
                    action, control_bit, self.actor_size)
                # print(control_bit, next_state[0], reward, done, bb)
                control_bit = control_bit + 1
                total_numsteps += 1
                episode_reward += reward
                # bb is the c_v value
                if best_bb > bb:
                    best_bb = bb
                    self.new_b = action

                if reward > 0:
                    continuous_pos += 1
                    continuous_neg = 0
                    if continuous_pos > 10:
                        done = True
                if reward < 0:
                    continuous_neg += 1
                    continuous_pos = 0
                    if continuous_neg > 10:
                        done = True
                if temp_trail > max_trail:
                    done = True

                action = torch.Tensor(action)
                mask = torch.Tensor([not done])
                next_state = torch.Tensor([next_state])
                reward = torch.Tensor([reward])

                self.memory.push(state, action, mask, next_state, reward)

                # state = next_state
                temp_trail += 1

                # Skipped until the memory holds more than batch_size samples
                if len(self.memory) > batch_size:
                    for _ in range(updates_per_step):
                        transitions = self.memory.sample(1)
                        batch = Transition(*zip(*transitions))

                        # value_loss belongs to the right-hand (critic) network, policy_loss to the left-hand (actor) one
                        value_loss, policy_loss = self.agent.update_parameters(
                            batch)
                        print("epoch:", i_episode, "updates", updates,
                              "value_loss:", value_loss, " policy_loss:",
                              policy_loss)

                        updates += 1
                if done:
                    break

            if self.param_noise:
                episode_transitions = self.memory.memory[
                    self.memory.position - batch_size:self.memory.position]
                states = torch.cat(
                    [transition[0] for transition in episode_transitions], 0)
                unperturbed_actions = self.agent.select_action(
                    states, None, None)
                perturbed_actions = torch.cat(
                    [transition[1] for transition in episode_transitions], 0)

                ddpg_dist = ddpg_distance_metric(perturbed_actions.numpy(),
                                                 unperturbed_actions.numpy())
                self.param_noise.adapt(ddpg_dist)

            rewards.append(episode_reward)
            continuous_neg = 0
            continuous_pos = 0
            temp_trail = 0
            if i_episode % 10 == 0 and i_episode != 0:
                state = torch.Tensor([self.env.reset()])
                episode_reward = 0
                control_bit = 0
                while True:
                    action = self.agent.select_action(state)
                    next_state, reward, done, bb = self.env.step(
                        action.numpy()[0], control_bit)
                    episode_reward += reward
                    if best_bb > bb:
                        best_bb = bb
                        self.new_b = action

                    if reward > 0:
                        continuous_pos += 1
                        continuous_neg = 0
                        if continuous_pos > 10:
                            done = True
                    if reward < 0:
                        continuous_neg += 1
                        continuous_pos = 0
                        if continuous_neg > 10:
                            done = True
                    if temp_trail > max_trail:
                        done = True

                    next_state = torch.Tensor([next_state])

                    state = next_state
                    temp_trail += 1

                    if done:
                        break

                rewards.append(episode_reward)
                print(
                    "Episode: {}, total numsteps: {}, reward: {}, average reward: {}"
                    .format(i_episode, total_numsteps, rewards[-1],
                            np.mean(rewards[-10:])))

        return self.new_b
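
Examples #3 and #4 push transitions into a ReplayMemory and rebuild batches with Transition(*zip(*transitions)). A sketch of the ring-buffer memory and named tuple those calls imply; the field order follows the memory.push(state, action, mask, next_state, reward) calls above, while the rest is an assumption about the imported implementation:

import random
from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'mask', 'next_state', 'reward'))

class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        # Append until full, then overwrite the oldest entries in a ring.
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)
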
Example #4
def fit_nash():
    suffix = 'Nash_{}_RC_{}_AttackMode_{}_RewardMode_{}'.format(args.NashMode, RC, args.AttackMode, args.RewardMode)
    # reward_file = open('reward' + suffix + '.txt', 'w')
    # attack_file = open('attacker_action' + suffix + '.txt', 'w')
    # weight_file = open('vehicle_weight' + suffix + '.txt', 'w')
    # distance_file = open('Distance' + suffix + '.txt', 'w')



#     reward_file.write("""
# Environment Initializing...
# The initial head car velocity is {}
# The initial safe distance is     {}
# The Nash Eq* Factor RC is        {}
# The Reward Calculation Mode is   {}
# The Attack Mode is               {}
# The Nash Mode is                 {}
# """.format(env.v_head, env.d0, RC, env.reward_mode, env.attack_mode, args.Nash))

    # reward_file.close()
    # attack_file.close()
    # weight_file.close()
    # distance_file.close()

    agent_vehicle = NAF(args.gamma, args.tau, args.hidden_size,
                        env.observation_space, env.vehicle_action_space, 'veh')
    agent_attacker = NAF(args.gamma, args.tau, args.hidden_size,
                         env.observation_space, env.attacker_action_space, 'att')
    try:
        agent_vehicle.load_model('models/vehicle_' + suffix)
        print('Load vehicle RL model successfully')

    except:
        print('No existing vehicle RL model')
    try:
        agent_attacker.load_model('models/attacker_' + suffix)
        print('Load attacker RL model successfully')

    except:
        print('No existing attacker RL model')
    try:
        policy_vehicle = load_model('models/vehicle_' + suffix + '.h5')
        print('Load vehicle SL model successfully')
    except:
        policy_vehicle = create_SL_model(env.observation_space, env.vehicle_action_space, 'vehicle')
    try:
        policy_attacker = load_model('models/attacker_' + suffix + '.h5')
        print('Load attacker SL model successfully')
    except:
        policy_attacker = create_SL_model(env.observation_space, env.attacker_action_space, 'attacker')
    print('*'*20, '\n\n\n')
    memory_vehicle = ReplayMemory(100000)
    memory_attacker = ReplayMemory(100000)

    memory_SL_vehicle = ReplayMemory(400000)
    memory_SL_attacker = ReplayMemory(400000)

    ounoise_vehicle = OUNoise(env.vehicle_action_space) if args.ou_noise else None
    ounoise_attacker = OUNoise(env.attacker_action_space) if args.ou_noise else None

    param_noise_vehicle = AdaptiveParamNoiseSpec(initial_stddev=0.05,
                                                 desired_action_stddev=args.noise_scale,
                                                 adaptation_coefficient=1.05) if args.param_noise else None
    param_noise_attacker = AdaptiveParamNoiseSpec(initial_stddev=0.05,
                                                  desired_action_stddev=args.noise_scale,
                                                  adaptation_coefficient=1.05) if args.param_noise else None
    res_data = pd.DataFrame(columns=['Weight', 'Attack', 'Eva_distance'])
    reward_data = pd.DataFrame(columns=['Reward'])
    rewards = []
    total_numsteps = 0
    for i_episode in range(args.num_episodes):
        if i_episode % 100 == 0 and i_episode != 0:
            print('Writing to CSV files...')
            reward_data.to_csv(suffix + '_reward.csv', index=False)
            res_data.to_csv(suffix + '.csv', index=False)

        if args.NashMode == 0:
            ETA = 0
        elif args.NashMode == 1:
            ETA = 0.5
        elif args.NashMode == 2:
            ETA = 0.1 - i_episode/args.num_episodes * 0.1

        print('No.{} episode starts... ETA is {}'.format(i_episode, ETA))

        # reward_file = open('reward' + suffix + '.txt', 'a')
        # attack_file = open('attacker_action' + suffix + '.txt', 'a')
        # weight_file = open('vehicle_weight' + suffix + '.txt', 'a')
        # distance_file = open('Distance' + suffix + '.txt', 'a')

        local_steps = 0
        state = env.reset()
        state_record = [np.array([state])]
        episode_steps = 0
        while len(state_record) < 20:
            a, b = env.random_action()
            s, _, _ = env.step(np.array([a]), np.zeros(4))
            local_steps += 1
            state_record.append(s)
        if args.ou_noise:
            ounoise_vehicle.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end -
                                                                                      i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_vehicle.reset()

            ounoise_attacker.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end -
                                                                                       i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_attacker.reset()
        episode_reward = 0
        local_steps = 0
        while True:
            sigma = random.random()
            if sigma > ETA:
                # print(state_record[-20:])
                # print('rl', torch.Tensor(state_record[-20:]).shape)
                action_vehicle = agent_vehicle.select_action(torch.Tensor(state_record[-20:]), ounoise_vehicle,
                                                             param_noise_vehicle)[:, -1, :]
                # print('rl', action_vehicle.shape)
                action_attacker = agent_attacker.select_action(torch.Tensor(state_record[-20:]), ounoise_attacker,
                                                               param_noise_attacker)[:, -1, :]
                # print('rl', action_vehicle.shape)
            else:
                action_vehicle = torch.Tensor(
                    [policy_vehicle.predict(state_record[-1].reshape(-1, 4)) / policy_vehicle.predict(
                        state_record[-1].reshape(-1, 4)).sum()])[0]
                action_attacker = torch.Tensor(
                    [policy_attacker.predict(state_record[-1].reshape(-1, 4)) / policy_attacker.predict(
                        state_record[-1].reshape(-1, 4)).sum()])[0]

            # Constrain the weights to sum to 1
            action_vehicle = action_vehicle.numpy()[0]/(action_vehicle.numpy()[0].sum())
            action_attacker = action_attacker.numpy()[0]

            next_state, reward, done = env.step(action_vehicle, action_attacker)
            res_data = res_data.append([{'Attack':env.action_attacker, 'Weight':action_vehicle, 'Eva_distance':env.d}])

            # Overwrite the action with the processed attack value from the environment
            action_attacker = env.action_attacker

            total_numsteps += 1
            episode_reward += reward

            state_record.append(next_state)
            local_steps += 1
            episode_steps += 1

            if sigma > ETA:
                memory_SL_vehicle.append(state_record[-1], action_vehicle)
                memory_SL_attacker.append(state_record[-1], action_attacker)

            action_vehicle = torch.Tensor(action_vehicle.reshape(1,4))
            action_attacker = torch.Tensor(action_attacker.reshape(1,4))

            mask = torch.Tensor([not done])
            prev_state = torch.Tensor(state_record[-20:]).transpose(0, 1)
            next_state = torch.Tensor([next_state])

            reward_vehicle = torch.Tensor([reward])
            reward_attacker = torch.Tensor([RC - reward])

            memory_vehicle.push(prev_state, torch.Tensor(action_vehicle), mask, next_state, reward_vehicle)
            memory_attacker.push(prev_state, torch.Tensor(action_attacker), mask, next_state, reward_attacker)

            if done:
                rewards.append(episode_reward)
                print('Episode {} ends, instant reward is {:.2f}'.format(i_episode, episode_reward))
                reward_data = reward_data.append([{'Reward': episode_reward}])
                    # reward_file.write('Episode {} ends, instant reward is {:.2f}\n'.format(i_episode, episode_reward))
                break

        if min(len(memory_vehicle), len(memory_SL_vehicle)) > args.batch_size:  # start training
            for _ in range(args.updates_per_step):
                transitions_vehicle = memory_vehicle.sample(args.batch_size)
                batch_vehicle = Transition(*zip(*transitions_vehicle))

                transitions_attacker = memory_attacker.sample(args.batch_size)
                batch_attacker = Transition(*zip(*transitions_attacker))

                trans_veh = memory_SL_vehicle.sample(args.batch_size)
                trans_att = memory_SL_attacker.sample(args.batch_size)

                states_veh = []
                actions_veh = []
                states_att = []
                actions_att = []
                for sample in trans_veh:
                    state_veh, act_veh = sample
                    states_veh.append(state_veh)
                    actions_veh.append(act_veh)
                for sample in trans_att:
                    state_att, act_att = sample
                    states_att.append(state_att)
                    actions_att.append(act_att)

                states_veh = np.reshape(states_veh, (-1, env.observation_space))
                states_att = np.reshape(states_att, (-1, env.observation_space))
                actions_veh = np.reshape(actions_veh, (-1, env.vehicle_action_space))
                actions_att = np.reshape(actions_att, (-1, env.attacker_action_space))

                policy_vehicle.fit(states_veh, actions_veh, verbose=False)
                policy_attacker.fit(states_att, actions_att, verbose=False)
                agent_vehicle.update_parameters(batch_vehicle)
                agent_attacker.update_parameters(batch_attacker)

                # writer.add_scalar('loss/value', value_loss, updates)
                # writer.add_scalar('loss/policy', policy_loss, updates)

        if i_episode % 10 == 0 and i_episode != 0:

            eva_res_data = pd.DataFrame(columns=['Eva_reward', 'Eva_distance'])


            # distance_file.write('{} episode starts, recording distance...\n'.format(i_episode))
            state = env.reset()
            state_record = [np.array([state])]
            evaluate_reward = 0
            while len(state_record) < 20:
                a, b = env.random_action()
                s, _, _ = env.step(np.array([a]), np.zeros(4))
                local_steps += 1
                state_record.append(s)
            while True:
                if random.random() < ETA:
                    action_vehicle = agent_vehicle.select_action(torch.Tensor(state_record[-20:]), ounoise_vehicle,
                                                                 param_noise_vehicle)[:, -1, :]
                    # print('rl', action_vehicle.shape)
                    action_attacker = agent_attacker.select_action(torch.Tensor(state_record[-20:]), ounoise_attacker,
                                                                   param_noise_attacker)[:, -1, :]
                else:
                    action_vehicle = torch.Tensor(
                        [policy_vehicle.predict(state_record[-1].reshape(-1, 4)) / policy_vehicle.predict(
                            state_record[-1].reshape(-1, 4)).sum()])[0]
                    action_attacker = torch.Tensor(
                        [policy_attacker.predict(state_record[-1].reshape(-1, 4))])[0]

                action_vehicle = action_vehicle.numpy()[0] / action_vehicle.numpy()[0].sum()
                action_attacker = action_attacker.numpy()[0]
                next_state, reward, done = env.step(action_vehicle, action_attacker, attack_mode=2)

                eva_res_data = eva_res_data.append([{'Eva_reward':evaluate_reward, 'Eva_distance':env.d}])
                evaluate_reward += reward


                if done:
                    print("Episode: {}, total numsteps: {}, reward: {}, average reward: {}".format(i_episode,
                                                                                                   total_numsteps,
                                                                                                   evaluate_reward,
                                                                                                   np.mean(rewards[-10:])))
                    # reward_file.write("Episode: {}, total numsteps: {}, reward: {}, average reward: {}\n".format(i_episode,
                    #                                                                                              total_numsteps,
                    #                                                                                              evaluate_reward,
                    #                                                                                              np.mean(rewards[-10:])))
                    break
        #         # writer.add_scalar('reward/test', episode_reward, i_episode)
        # reward_file.close()
        # attack_file.close()
        # weight_file.close()
        # distance_file.close()
    env.close()
    reward_data.to_csv(suffix+'_reward.csv', index=False)
    res_data.to_csv(suffix+'.csv', index=False)
    eva_res_data.to_csv(suffix+'_eva.csv', index=False)

    # save model
    agent_vehicle.save_model('vehicle_'+suffix)
    agent_attacker.save_model('attacker_'+suffix)

    policy_attacker.save('models/attacker_'+suffix+'.h5')
    policy_vehicle.save('models/vehicle_'+suffix+'.h5')
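
Examples #3 and #4 also build an AdaptiveParamNoiseSpec and adapt it with ddpg_distance_metric. A sketch of those helpers in the spirit of the parameter-space-noise recipe; the constructor arguments mirror the calls above, but the adaptation rule and the metric are assumptions about the imported implementation:

import numpy as np

class AdaptiveParamNoiseSpec:
    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.1,
                 adaptation_coefficient=1.01):
        self.desired_action_stddev = desired_action_stddev
        self.adaptation_coefficient = adaptation_coefficient
        self.current_stddev = initial_stddev

    def adapt(self, distance):
        # Shrink the parameter perturbation when it moves actions more than
        # desired, grow it when the perturbation is too timid.
        if distance > self.desired_action_stddev:
            self.current_stddev /= self.adaptation_coefficient
        else:
            self.current_stddev *= self.adaptation_coefficient

def ddpg_distance_metric(actions1, actions2):
    # RMS distance between perturbed and unperturbed actions.
    diff = actions1 - actions2
    return np.sqrt(np.mean(np.square(diff)))
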
Example #5
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 num_all_agents,
                 seed,
                 batch_size,
                 buffer_size=int(1e6),
                 gamma=0.99,
                 tau=1e-3,
                 lr_actor=4e-4,
                 lr_critic=4e-4,
                 weight_decay=0,
                 discrete_actions=False):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_all_agents (int): number of agents
            seed (int): random seed
            batch_size (int): minibatch size
            buffer_size (int): replay buffer size
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            lr_actor (float): learning rate of the actor
            lr_critic (float): learning rate of the critic
            weight_decay (float): L2 weight decay
        """
        random.seed(seed)

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.num_all_agents = num_all_agents
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau

        self.noise = OUNoise(action_size, seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size,
                                 action_size,
                                 seed,
                                 use_batch_norm_layers=False).to(device)
        self.actor_target = Actor(state_size,
                                  action_size,
                                  seed,
                                  use_batch_norm_layers=False).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        if discrete_actions:
            action_size = 1
        self.critic_local = Critic(state_size * num_all_agents,
                                   action_size * num_all_agents,
                                   seed).to(device)
        self.critic_target = Critic(state_size * num_all_agents,
                                    action_size * num_all_agents,
                                    seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, tau, agent_index):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states_all, actions_all, rewards_all, next_states_all, dones, actions_next_target_all, actions_next_local_all = experiences

        rewards_self = rewards_all[:, agent_index]
        states_self = states_all.view(-1, self.num_all_agents,
                                      self.state_size)[:, agent_index, :]
        del rewards_all

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        Q_targets_next = self.critic_target(next_states_all,
                                            actions_next_target_all)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards_self + (gamma * Q_targets_next) * (1 - dones)
        # Compute critic loss
        Q_expected = self.critic_local(states_all, actions_all)
        critic_loss = F.mse_loss(Q_expected.view(-1, self.batch_size),
                                 Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actor_loss = -self.critic_local(states_all,
                                        actions_next_local_all).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, tau)
        self.soft_update(self.actor_local, self.actor_target, tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
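
The OUNoise(action_size, seed) used in Example #5 (and again in Example #8) exposes a sample() method whose output is added to the actor's action before clipping. A sketch of that seeded variant; mu, theta, and sigma defaults are chosen for illustration, not taken from the original project:

import copy
import numpy as np

class OUNoise:
    """Seeded Ornstein-Uhlenbeck process with a sample() method."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        np.random.seed(seed)
        self.reset()

    def reset(self):
        # Return the internal state to the long-run mean.
        self.state = copy.copy(self.mu)

    def sample(self):
        # Advance the process and return the new state as the noise sample.
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state
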
Example #6
def main():
    global subdata
    t_start = time.time()

    parser = argparse.ArgumentParser(description='PyTorch X-job')
    parser.add_argument('--env_name',
                        default="OurEnv-v0",
                        help='name of the environment')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.99,
                        metavar='G',
                        help='discount factor for reward (default: 0.99)')
    parser.add_argument('--tau',
                        type=float,
                        default=0.001,
                        help='discount factor for model (default: 0.001)')
    parser.add_argument('--ou_noise', type=bool, default=True)
    parser.add_argument('--noise_scale',
                        type=float,
                        default=0.4,
                        metavar='G',
                        help='initial noise scale (default: 0.4)')
    parser.add_argument('--final_noise_scale',
                        type=float,
                        default=0.3,
                        metavar='G',
                        help='final noise scale (default: 0.3)')
    parser.add_argument('--exploration_end',
                        type=int,
                        default=33,
                        metavar='N',
                        help='number of episodes with noise (default: 33)')
    parser.add_argument('--seed',
                        type=int,
                        default=4,
                        metavar='N',
                        help='random seed (default: 4)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=512,
                        metavar='N',
                        help='batch size (default: 512)')
    parser.add_argument('--num_steps',
                        type=int,
                        default=300,
                        metavar='N',
                        help='max episode length (default: 300)')
    parser.add_argument('--num_episodes',
                        type=int,
                        default=50,
                        metavar='N',
                        help='number of episodes (default: 50)')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=128,
                        metavar='N',
                        help='hidden size (default: 128)')
    parser.add_argument('--replay_size',
                        type=int,
                        default=1000000,
                        metavar='N',
                        help='size of replay buffer (default: 1000000)')
    parser.add_argument('--save_agent',
                        type=bool,
                        default=True,
                        help='save model to file')
    parser.add_argument('--load_agent',
                        type=bool,
                        default=False,
                        help='load model from file')
    parser.add_argument('--train_model',
                        type=bool,
                        default=True,
                        help='Training or run')
    parser.add_argument('--load_exp',
                        type=bool,
                        default=False,
                        help='load saved experience')
    parser.add_argument('--state_plot',
                        type=bool,
                        default=True,
                        help='plot Q values for environment')
    parser.add_argument('--greedy_steps',
                        type=int,
                        default=5,
                        metavar='N',
                        help='amount of times greedy goes (default: 100)')

    args = parser.parse_args()

    #env = gym.make(args.env_name)

    env = Env()

    #env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    # -- initialize agent, Q and Q' --
    agent = NAF(args.gamma, args.tau, args.hidden_size,
                env.observation_space.shape[0], env.action_space)

    # -- declare memory buffer and random process N
    memory = ReplayMemory(args.replay_size)
    memory_g = ReplayMemory(args.replay_size)
    ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None

    # -- load existing model --
    if args.load_agent:
        agent.load_model(args.env_name, args.batch_size, '.pth')
        print("agent: naf_{}_{}_{}, is loaded").format(args.env_name,
                                                       args.batch_size, '.pth')
    # -- load experience buffer --
    if args.load_exp:
        with open('/home/aass/catkin_workspace/src/panda_demos/exp_replay.pk1',
                  'rb') as input:
            memory.memory = pickle.load(input)
            memory.position = len(memory)

    #sate_Q_plot(agent, 50)

    rewards = []
    total_numsteps = 0
    greedy_reward = []
    avg_greedy_reward = []
    upper_reward = []
    lower_reward = []
    steps_to_goal = []
    avg_steps_to_goal = []
    state_plot = []

    sim_reset_start()

    pub = rospy.Publisher('/ee_rl/act', DesiredErrorDynamicsMsg, queue_size=10)
    rospy.Subscriber("/ee_rl/state", StateMsg, callback)
    rate = rospy.Rate(9)
    rate.sleep()

    for i_episode in range(args.num_episodes + 1):
        # -- reset environment for every episode --
        sim_reset()
        state = torch.Tensor(subdata).unsqueeze(0)

        # -- initialize noise (random process N) --
        if args.ou_noise:
            ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end -
                i_episode) / args.exploration_end + args.final_noise_scale
            ounoise.reset()

        episode_reward = 0

        while True:
            # -- action selection, observation and store transition --
            action = agent.select_action(
                state,
                ounoise) if args.train_model else agent.select_action(state)
            a = action.numpy()[0] * 50
            act_pub = [a[0], a[1]]
            pub.publish(act_pub)
            next_state = torch.Tensor(subdata).unsqueeze(0)
            reward, done, _ = env.calc_shaped_reward(next_state)

            total_numsteps += 1
            episode_reward += reward

            action = torch.Tensor(action)
            mask = torch.Tensor([not done])
            reward = torch.Tensor([reward])

            memory.push(state, action, mask, next_state, reward)
            # if done:
            #     for i in range(total_numsteps % args.num_steps):
            #         a = i+1
            #         memory_g.memory.append(memory.memory[-a])
            #         memory_g.position += 1

            state = next_state

            #-- training --
            # if len(memory_g) > args.batch_size / 2 and len(memory) > args.batch_size/2 and args.train_model:
            #     for _ in range(10):
            #         transitions_b = memory.sample(args.batch_size/2)
            #         transitions_g = memory_g.sample(args.batch_size/2)
            #         for i in range(transitions_g):
            #             transitions_b.append(transitions_g[i])
            #         batch = Transition(*zip(*transitions_b))
            #         agent.update_parameters(batch)

            if len(memory) > args.batch_size and args.train_model:
                for _ in range(10):
                    transitions = memory.sample(args.batch_size)
                    batch = Transition(*zip(*transitions))
                    agent.update_parameters(batch)

            else:
                time.sleep(0.1)
            rate.sleep()

            if done or total_numsteps % args.num_steps == 0:
                break

        pub.publish([0, 0])
        rewards.append(episode_reward)

        # -- plot Q value --
        if i_episode % 10 == 0:

            sate_Q_plot(agent, i_episode)
            # -- saves model --
            if args.save_agent:
                agent.save_model(args.env_name, args.batch_size, i_episode,
                                 '.pth')
                with open('exp_replay.pk1', 'wb') as output:
                    pickle.dump(memory.memory, output, pickle.HIGHEST_PROTOCOL)
                #with open('exp_replay_g.pk1', 'wb') as output:
                #pickle.dump(memory_g.memory, output, pickle.HIGHEST_PROTOCOL)

        if args.train_model:
            greedy_episode = max(args.num_episodes / 100, 5)
        else:
            greedy_episode = 10
        greedy_range = min(args.greedy_steps, greedy_episode)

        # -- calculates episode without noise --
        if i_episode % greedy_episode == 0 and not i_episode == 0:
            for _ in range(0, greedy_range + 1):
                # -- reset environment for every episode --
                sim_reset()
                state_visited = []
                action_taken = []
                print("Greedy episode ongoing")

                state = torch.Tensor(subdata).unsqueeze(0)
                episode_reward = 0
                steps = 0

                state_plot.append([])
                st = state.numpy()[0]
                sta = [st[0], st[1]]
                state_plot[_].append(sta)

                while True:
                    action = agent.select_action(state)
                    a = action.numpy()[0] * 50
                    act_pub = [a[0], a[1]]
                    pub.publish(act_pub)
                    next_state = torch.Tensor(subdata).unsqueeze(0)
                    reward, done, obs_hit = env.calc_shaped_reward(next_state)
                    episode_reward += reward

                    state_visited.append(state)
                    action_taken.append(action)

                    state = next_state

                    steps += 1
                    if done or steps == args.num_steps:
                        greedy_reward.append(episode_reward)
                        break
                    rate.sleep()

                if obs_hit:
                    steps = 300

                steps_to_goal.append(steps)

                # -- plot path --
                if i_episode % 10 == 0:
                    agent.plot_path(state_visited, action_taken, i_episode)

            upper_reward.append((np.max(greedy_reward[-greedy_range:])))
            lower_reward.append((np.min(greedy_reward[-greedy_range:])))
            avg_greedy_reward.append((np.mean(greedy_reward[-greedy_range:])))
            avg_steps_to_goal.append((np.mean(steps_to_goal[-greedy_range:])))

            print(
                "Episode: {}, total numsteps: {}, avg_greedy_reward: {}, average reward: {}"
                .format(i_episode, total_numsteps, avg_greedy_reward[-1],
                        np.mean(rewards[-greedy_episode:])))

    #-- saves model --
    if args.save_agent:
        agent.save_model(args.env_name, args.batch_size, i_episode, '.pth')
        with open('exp_replay.pk1', 'wb') as output:
            pickle.dump(memory.memory, output, pickle.HIGHEST_PROTOCOL)
        #with open('exp_replay_g.pk1', 'wb') as output:
        #    pickle.dump(memory_g.memory, output, pickle.HIGHEST_PROTOCOL)

    print('Training ended after {} minutes'.format(
        (time.time() - t_start) / 60))
    print('Time per ep : {} s'.format(
        (time.time() - t_start) / args.num_episodes))
    print('Mean greedy reward: {}'.format(np.mean(greedy_reward)))
    print('Mean reward: {}'.format(np.mean(rewards)))
    print('Max reward: {}'.format(np.max(rewards)))
    print('Min reward: {}'.format(np.min(rewards)))

    # -- plot learning curve --
    pos_greedy = []
    for pos in range(0, len(lower_reward)):
        pos_greedy.append(pos * greedy_episode)

    plt.title('Greedy policy outcome')
    plt.fill_between(pos_greedy,
                     lower_reward,
                     upper_reward,
                     facecolor='red',
                     alpha=0.3)
    plt.plot(pos_greedy, avg_greedy_reward, 'r')
    plt.xlabel('Number of episodes')
    plt.ylabel('Rewards')
    fname1 = 'plot1_obs_{}_{}_{}'.format(args.env_name, args.batch_size,
                                         '.png')
    plt.savefig(fname1)
    plt.close()

    plt.title('Steps to reach goal')
    plt.plot(steps_to_goal)
    plt.ylabel('Number of steps')
    plt.xlabel('Number of episodes')
    fname2 = 'plot2_obs_{}_{}_{}'.format(args.env_name, args.batch_size,
                                         '.png')
    plt.savefig(fname2)
    plt.close()
Example #7
class Agent(object):
    def __init__(self, state_space, action_space, max_action, device):
        self.state_size = state_space.shape[0]
        self.action_size = action_space.shape[0]
        self.max_action = max_action
        self.device = device
        self.actor_local = Actor(state_space.shape, action_space.high.size,
                                 max_action)
        self.actor_target = Actor(state_space.shape, action_space.high.size,
                                  max_action)
        self.actor_optimizer = optimizers.Adam(LR_ACTOR)
        # let target be equal to local
        self.actor_target.set_weights(self.actor_local.get_weights())

        self.critic_local = Critic(state_space.shape, action_space.high.size)
        self.critic_target = Critic(state_space.shape, action_space.high.size)
        self.critic_optimizer = optimizers.Adam(LR_CRITIC)
        # let target be equal to local
        self.critic_target.set_weights(self.critic_local.get_weights())

        self.noise = OUNoise(self.action_size)
        self.memory = ReplayBuffer(BUFFER_SIZE)

        self.current_steps = 0

    def step(self,
             state,
             action,
             reward,
             done,
             next_state,
             train=True) -> None:
        self.memory.store(state, action, reward, done, next_state)
        if train and self.memory.count > BATCH_SIZE and self.memory.count > MIN_MEM_SIZE:
            if self.current_steps % UPDATE_STEPS == 0:
                experiences = self.memory.sample(BATCH_SIZE)
                self.learn(experiences, GAMMA)
            self.current_steps += 1

    @tf.function
    def critic_train(self, states, actions, rewards, dones, next_states):
        with tf.device(self.device):
            # Compute yi
            u_t = self.actor_target(next_states)
            q_t = self.critic_target([next_states, u_t])
            yi = tf.cast(rewards, dtype=tf.float64) + \
                 tf.cast(GAMMA, dtype=tf.float64) * \
                 tf.cast((1 - tf.cast(dones, dtype=tf.int64)), dtype=tf.float64) * \
                 tf.cast(q_t, dtype=tf.float64)

            # Compute MSE
            with tf.GradientTape() as tape:
                q_l = tf.cast(self.critic_local([states, actions]),
                              dtype=tf.float64)
                loss = (q_l - yi) * (q_l - yi)
                loss = tf.reduce_mean(loss)
                # Update critic by minimizing loss
                dloss_dql = tape.gradient(loss,
                                          self.critic_local.trainable_weights)
            self.critic_optimizer.apply_gradients(
                zip(dloss_dql, self.critic_local.trainable_weights))
        return

    @tf.function
    def actor_train(self, states):
        with tf.device(self.device):
            with tf.GradientTape(watch_accessed_variables=False) as tape:
                tape.watch(self.actor_local.trainable_variables)
                u_l = self.actor_local(states)
                q_l = -tf.reduce_mean(self.critic_local([states, u_l]))
            j = tape.gradient(q_l, self.actor_local.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(j, self.actor_local.trainable_variables))
        return

    def learn(self, experiences, gamma) -> None:
        states, actions, rewards, dones, next_states = experiences
        states = np.array(states).reshape(BATCH_SIZE, self.state_size)
        states = tf.convert_to_tensor(states)
        actions = np.array(actions).reshape(BATCH_SIZE, self.action_size)
        actions = tf.convert_to_tensor(actions)
        rewards = np.array(rewards).reshape(BATCH_SIZE, 1)
        next_states = np.array(next_states).reshape(BATCH_SIZE,
                                                    self.state_size)
        dones = np.array(dones).reshape(BATCH_SIZE, 1)

        self.critic_train(states, actions, rewards, dones, next_states)
        self.actor_train(states)
        self.update_local()
        return

    def update_local(self):
        def soft_updates(local_model: tf.keras.Model,
                         target_model: tf.keras.Model) -> np.ndarray:
            local_weights = np.array(local_model.get_weights())
            target_weights = np.array(target_model.get_weights())

            assert len(local_weights) == len(target_weights)
            new_weights = TAU * local_weights + (1 - TAU) * target_weights
            return new_weights

        self.actor_target.set_weights(
            soft_updates(self.actor_local, self.actor_target))
        self.critic_target.set_weights(
            soft_updates(self.critic_local, self.critic_target))

    def store_weights(self, episode: int) -> None:
        self.actor_target.save_weights(
            join(CKPTS_PATH, ACTOR_CKPTS, f'cp-{episode}'))
        self.critic_target.save_weights(
            join(CKPTS_PATH, CRITIC_CKPTS, f'cp-{episode}'))
        return

    def act(self, state, add_noise=True) -> (float, float):
        state = np.array(state).reshape(1, self.state_size)
        pure_action = self.actor_local.predict(state)[0]
        action = self.noise.get_action(pure_action)
        return action, pure_action

    def reset(self):
        self.noise.reset()
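
Example #7 uses yet another interface: self.noise.get_action(pure_action) perturbs the deterministic action directly and returns the bounded result. A sketch of that wrapper; the action bounds and defaults are assumptions (the original project presumably derives them from the action space):

import numpy as np

class OUNoise:
    """OU noise wrapper that perturbs a deterministic action directly."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.3,
                 low=-1.0, high=1.0):
        # low/high are assumed action bounds, not from the original project.
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.low = low
        self.high = high
        self.reset()

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def evolve_state(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state

    def get_action(self, action):
        # Add the evolved OU state to the policy action and clip to bounds.
        return np.clip(action + self.evolve_state(), self.low, self.high)
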
Example #8
class Agent:
    """Interacts and learns from the environment"""
    def __init__(self,
                 device,
                 state_size,
                 action_size,
                 random_seed,
                 fc1=128,
                 fc2=128,
                 lr_actor=1e-04,
                 lr_critic=1e-04,
                 weight_decay=0,
                 buffer_size=100000,
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3):
        """
        Parameters
        ----------
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            fc1 (int): 1st fully connected layer size for model (actor & critic)
            fc2 (int): 2nd fully connected layer size for model (actor & critic)
            device: CPU/GPU

            lr_actor (float): learning rate for Actor
            lr_critic (float): learning rate for Critic
            weight_decay (float): weight decay used in model optimizer
            buffer_size (int): replay buffer size
            batch_size (int): batch size to sample from buffer
            gamma (float): parameter used to calculate Q target
            tau (float): soft update interpolation parameter
        """
        self.device = device

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau

        # Actor network (with target)
        self.actor_local = Actor(self.state_size,
                                 self.action_size,
                                 random_seed,
                                 fc1_units=fc1,
                                 fc2_units=fc2).to(device)
        self.actor_target = Actor(self.state_size,
                                  self.action_size,
                                  random_seed,
                                  fc1_units=fc1,
                                  fc2_units=fc2).to(device)
        self.actor_optimizer = optim.Adam(params=self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic actor
        self.critic_local = Critic(self.state_size,
                                   self.action_size,
                                   random_seed,
                                   fc1_units=fc1,
                                   fc2_units=fc2).to(device)
        self.critic_target = Critic(self.state_size,
                                    self.action_size,
                                    random_seed,
                                    fc1_units=fc1,
                                    fc2_units=fc2).to(device)
        self.critic_optimizer = optim.Adam(
            params=self.critic_local.parameters(),
            lr=lr_critic,
            weight_decay=weight_decay)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size,
                                   self.device, random_seed)

        self.make_copy(self.critic_local, self.critic_target)
        self.make_copy(self.actor_local, self.actor_target)

        print("Initilized agent with state size = {} and action size = {}".
              format(self.state_size, self.action_size))

    def make_copy(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(local_param.data)

    def step(self, state, action, reward, next_state, done):
        """
        Save experience in replay memory, and use random sample from buffer to learn
        """

        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > self.batch_size:
            batch = self.memory.sample()
            self.learn(batch)

    def act(self, state, add_noise=True):
        """
        Returns actions for given state as per current policy.
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.actor_local.eval()

        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()

        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, batch):
        """
        Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Parameters
        ----------
            batch (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = batch

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_target_next = self.critic_target(next_states, actions_next)
        # compute Q targets for next states (y_i)
        Q_targets = rewards + (self.gamma * Q_target_next * (1.0 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimise loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimise loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def soft_update(self, local_model, target_model):
        """
        Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Parameters
        ----------
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
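
For context, a hypothetical driver loop for the Agent above, using the classic four-tuple Gym step API that the other examples on this page rely on; the environment name, episode count, and action rescaling are placeholders rather than details from the original project:

import gym
import numpy as np
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
env = gym.make('Pendulum-v1')                       # placeholder environment
agent = Agent(device,
              state_size=env.observation_space.shape[0],
              action_size=env.action_space.shape[0],
              random_seed=0)

for i_episode in range(200):
    state = env.reset()
    agent.reset()                                   # restart the OU noise
    episode_reward = 0.0
    while True:
        action = agent.act(state).reshape(-1)       # noisy action in [-1, 1]
        scaled = action * float(env.action_space.high[0])
        next_state, reward, done, _ = env.step(scaled)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward
        if done:
            break
    print('Episode {} reward: {:.2f}'.format(i_episode, episode_reward))
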
Example #9
def fit_nash():
    agent_vehicle = NAF(args.gamma, args.tau, args.hidden_size,
                        env.observation_space, env.vehicle_action_space)
    agent_attacker = NAF(args.gamma, args.tau, args.hidden_size,
                         env.observation_space, env.attacker_action_space)

    policy_vehicle = create_SL_model(env.observation_space, env.vehicle_action_space)
    policy_attacker = create_SL_model(env.observation_space, env.attacker_action_space)

    memory_vehicle = ReplayMemory(1000000)
    memory_attacker = ReplayMemory(1000000)

    memory_SL_vehicle = ReplayMemory(100000)
    memory_SL_attacker = ReplayMemory(100000)

    ounoise_vehicle = OUNoise(env.vehicle_action_space) if args.ou_noise else None
    ounoise_attacker = OUNoise(env.attacker_action_space) if args.ou_noise else None

    param_noise_vehicle = AdaptiveParamNoiseSpec(initial_stddev=0.05,
                                                 desired_action_stddev=args.noise_scale,
                                                 adaptation_coefficient=1.05) if args.param_noise else None
    param_noise_attacker = AdaptiveParamNoiseSpec(initial_stddev=0.05,
                                                  desired_action_stddev=args.noise_scale,
                                                  adaptation_coefficient=1.05) if args.param_noise else None

    rewards = []
    eva_reward = []
    ave_reward = []
    eva_ac_veh = []
    eva_ac_att = []
    total_numsteps = 0
    updates = 0
    # while len(state_record) < 20:
    #     s, _, _ = env.step(env.random_action())
    #     state_record.append(s)
    for i_episode in range(args.num_episodes):
        state = env.reset()
        if args.ou_noise:
            ounoise_vehicle.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end -
                                                                                      i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_vehicle.reset()

            ounoise_attacker.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end -
                                                                                       i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_attacker.reset()
        episode_reward = 0
        while True:
            if random.random() < ETA:
                action_vehicle = agent_vehicle.select_action(torch.Tensor([[state]]), ounoise_vehicle,
                                                             param_noise_vehicle)
                action_attacker = agent_attacker.select_action(torch.Tensor([[state]]), ounoise_attacker,
                                                               param_noise_attacker)
            else:
                action_vehicle = torch.Tensor(
                    [policy_vehicle.predict(state.reshape(-1, 4)) / policy_vehicle.predict(state.reshape(-1, 4)).sum()])
                action_attacker = torch.Tensor([policy_attacker.predict(state.reshape(-1, 4)) / policy_attacker.predict(
                    state.reshape(-1, 4)).sum()])
            if is_cuda:
                ac_v, ac_a = action_vehicle.cpu().numpy()[0], action_attacker.cpu().numpy()[0]
            else:
                ac_v, ac_a = action_vehicle.numpy()[0], action_attacker.numpy()[0]
            next_state, reward, done = env.step(ac_v, ac_a)
            total_numsteps += 1
            episode_reward += reward

            memory_SL_vehicle.append(state, ac_v)
            memory_SL_attacker.append(state, ac_a)

            action_vehicle = torch.Tensor(action_vehicle)
            action_attacker = torch.Tensor(action_attacker)

            mask = torch.Tensor([not done])
            next_state = torch.Tensor([next_state])

            reward_vehicle = torch.Tensor([reward])
            reward_attacker = torch.Tensor([env.RC - reward])
            memory_vehicle.push(torch.Tensor([[state]]), action_vehicle, mask, next_state, reward_vehicle)
            memory_attacker.push(torch.Tensor([[state]]), action_attacker, mask, next_state, reward_attacker)

            state = next_state.numpy()[0][0]

            if done:
                rewards.append(episode_reward)
                if i_episode % 100 == 0:
                    print('Episode {} ends, instant reward is {:.2f}'.format(i_episode, episode_reward))
                break

        if len(memory_vehicle) > args.batch_size:  # start training
            # print('begin training')
            for _ in range(args.updates_per_step):
                transitions_vehicle = memory_vehicle.sample(args.batch_size)
                batch_vehicle = Transition(*zip(*transitions_vehicle))

                transitions_attacker = memory_attacker.sample(args.batch_size)
                batch_attacker = Transition(*zip(*transitions_attacker))

                trans_veh = memory_SL_vehicle.sample(args.batch_size)
                trans_att = memory_SL_attacker.sample(args.batch_size)

                states_veh = []
                actions_veh = []
                states_att = []
                actions_att = []
                for sample in trans_veh:
                    state_veh, act_veh = sample
                    states_veh.append(state_veh)
                    actions_veh.append(act_veh)
                for sample in trans_att:
                    state_att, act_att = sample
                    states_att.append(state_att)
                    actions_att.append(act_att)

                states_veh = np.reshape(states_veh, (-1, env.observation_space))
                states_att = np.reshape(states_att, (-1, env.observation_space))
                actions_veh = np.reshape(actions_veh, (-1, env.vehicle_action_space))
                actions_att = np.reshape(actions_att, (-1, env.attacker_action_space))

                policy_vehicle.fit(states_veh, actions_veh, verbose=False)
                policy_attacker.fit(states_att, actions_att, verbose=False)
                value_loss_vehicle, policy_loss_vehicle = agent_vehicle.update_parameters(batch_vehicle)
                value_loss_attacker, policy_loss_attacker = agent_attacker.update_parameters(batch_attacker)

                # writer.add_scalar('loss/value', value_loss, updates)
                # writer.add_scalar('loss/policy', policy_loss, updates)

                updates += 1

        if i_episode % 10 == 0:
            state = env.reset()
            evaluate_reward = 0
            while True:
                if random.random() < ETA:
                    action_vehicle = agent_vehicle.select_action(torch.Tensor([[state]]), ounoise_vehicle,
                                                                 param_noise_vehicle)
                    action_attacker = agent_attacker.select_action(torch.Tensor([[state]]), ounoise_attacker,
                                                                   param_noise_attacker)
                else:
                    action_vehicle = torch.Tensor([policy_vehicle.predict(
                        state.reshape(-1, 4)) / policy_vehicle.predict(state.reshape(-1, 4)).sum()])
                    action_attacker = torch.Tensor([policy_attacker.predict(
                        state.reshape(-1, 4)) / policy_attacker.predict(state.reshape(-1, 4)).sum()])
                if is_cuda:
                    ac_v, ac_a = action_vehicle.cpu().numpy()[0], action_attacker.cpu().numpy()[0]
                else:
                    ac_v, ac_a = action_vehicle.numpy()[0], action_attacker.numpy()[0]
                next_state, reward, done = env.step(ac_v, ac_a)
                total_numsteps += 1
                evaluate_reward += reward

                state = next_state[0]
                if done:
                    average_reward = np.mean(rewards[-10:])
                    print("{} % Episode finished, total numsteps: {}, reward: {}, average reward: {}".format(
                        i_episode / args.num_episodes * 100,
                        total_numsteps,
                        evaluate_reward,
                        average_reward))
                    eva_reward.append(evaluate_reward)
                    ave_reward.append(average_reward)
                    print(ac_v[0])
                    eva_ac_veh.append((ac_v[0] + 1) / sum(ac_v[0] + 1))
                    eva_ac_att.append((ac_a[0] + 1) / sum(ac_a[0] + 1))
                    break
            # writer.add_scalar('reward/test', episode_reward, i_episode)
    env.close()
    f = plt.figure()
    plt.plot(eva_reward, label='Eva_reward')
    plt.plot(ave_reward, label='Tra_ave_reward')
    plt.legend()
    plt.show()
    AC_veh = np.array(eva_ac_veh)
    AC_att = np.array(eva_ac_att)
    # print(AC_veh.shape)
    # print(AC_veh)
    plt.plot(AC_veh[:, 0], label='Bacon1')
    plt.plot(AC_veh[:, 1], label='Bacon2')
    plt.plot(AC_veh[:, 2], label='Bacon3')
    plt.plot(AC_veh[:, 3], label='Bacon4')
    # plt.plot(ave_reward, label='Tra_ave_reward')
    plt.legend()
    plt.savefig('Veh_result.png', dpi=300)
    plt.show()
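fit_nash() above only relies on OUNoise exposing a mutable scale attribute, reset() and a per-step sample; a minimal sketch with that interface is given below. The class name SimpleOUNoise and the mu/theta/sigma defaults are assumptions for illustration and may differ from the OUNoise actually imported by the original script.

import numpy as np

class SimpleOUNoise:
    """Ornstein-Uhlenbeck process with the scale/reset/noise interface used above."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.scale = 1.0          # annealed externally, as in fit_nash()
        self.reset()

    def reset(self):
        # restart the process at its mean
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, I)
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state * self.scale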
Ejemplo n.º 10
0
class DDPG():
    """DDPG agent with own actor and critic."""
    def __init__(self, agent_id, model, action_size=2, seed=0):
        """Initialize an Agent object.
        """
        self.seed = random.seed(seed)
        self.id = agent_id
        self.action_size = action_size

        # Actor Network
        self.actor_local = model.actor_local
        self.actor_target = model.actor_target
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network
        self.critic_local = model.critic_local
        self.critic_target = model.critic_target
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Initialize the target actor and critic with the same weights as their local networks
        self.hard_copy_weights(self.actor_target, self.actor_local)
        self.hard_copy_weights(self.critic_target, self.critic_local)

        # Noise process
        self.noise = OUNoise(action_size, seed)

    def hard_copy_weights(self, target, source):
        """ copy weights from source to target network (part of initialization)"""
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)

    def act(self, state, noise_weight=1.0, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            self.noise_val = self.noise.sample() * noise_weight
            action += self.noise_val
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, agent_id, experiences, gamma, all_next_actions,
              all_actions):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # get predicted next-state actions and Q values from target models
        self.critic_optimizer.zero_grad()
        agent_id = torch.tensor([agent_id]).to(device)
        actions_next = torch.cat(all_next_actions, dim=1).to(device)
        with torch.no_grad():
            q_targets_next = self.critic_target(next_states, actions_next)
        # compute Q targets for current states (y_i)
        q_expected = self.critic_local(states, actions)
        # q_targets = reward of this timestep + discount * Q(st+1,at+1) from target network
        q_targets = rewards.index_select(
            1, agent_id) + (gamma * q_targets_next *
                            (1 - dones.index_select(1, agent_id)))
        # compute critic loss
        critic_loss = F.mse_loss(q_expected, q_targets.detach())
        # minimize loss
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # compute actor loss
        self.actor_optimizer.zero_grad()
        # detach actions from other agents
        actions_pred = [
            actions if i == self.id else actions.detach()
            for i, actions in enumerate(all_actions)
        ]
        actions_pred = torch.cat(actions_pred, dim=1).to(device)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # minimize loss
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
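To make the per-agent TD target inside learn() above concrete, here is a tiny worked example with made-up numbers: rewards and done flags are stored with one column per agent, index_select picks out this agent's column, and the usual r + gamma * Q_next * (1 - done) target follows.

import torch

gamma = 0.99
agent_id = torch.tensor([1])                        # this agent owns column 1
rewards = torch.tensor([[0.0, 1.0], [0.5, -0.1]])   # [batch, n_agents]
dones = torch.tensor([[0.0, 0.0], [0.0, 1.0]])      # [batch, n_agents]
q_targets_next = torch.tensor([[2.0], [3.0]])       # critic_target output, [batch, 1]

q_targets = rewards.index_select(1, agent_id) + (
    gamma * q_targets_next * (1 - dones.index_select(1, agent_id)))
print(q_targets)  # tensor([[ 2.9800], [-0.1000]])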
Ejemplo n.º 11
0
class Agent():
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 device,
                 gamma=GAMMA,
                 tau=TAU,
                 lr_actor=LR_ACTOR,
                 lr_critic=LR_CRITIC,
                 random_seed=0):
        """
            Initialize an Agent object.
        :param state_size: size of state
        :param action_size: size of action
        :param num_agents: number of agents
        :param gamma: discount factor
        :param tau: factor for soft update of target parameters
        :param lr_actor: Learning rate of actor
        :param lr_critic: Learning rate of critic
        :param random_seed: Random seed
        :param device: cuda or cpu
        """

        self.device = device
        self.gamma = gamma
        self.tau = tau

        self.num_agents = num_agents

        self.state_size = state_size
        self.action_size = action_size
        self.full_state_size = state_size * num_agents
        self.full_action_size = action_size * num_agents
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, device,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, device,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(self.full_state_size,
                                   self.full_action_size,
                                   device=device,
                                   random_seed=random_seed).to(device)
        self.critic_target = Critic(self.full_state_size,
                                    self.full_action_size,
                                    device=device,
                                    random_seed=random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=0)

        self.noise = OUNoise(action_size, random_seed)

    def save_model(self, agent_number):
        torch.save(self.actor_local.state_dict(),
                   f'models/checkpoint_actor_{agent_number}.pth')
        torch.save(self.critic_local.state_dict(),
                   f'models/checkpoint_critic_{agent_number}.pth')

    def load_model(self, agent_number):
        checkpoint = torch.load(f'models/checkpoint_actor_{agent_number}.pth',
                                map_location=torch.device('cpu'))
        self.actor_local.load_state_dict(checkpoint)

        checkpoint = torch.load(f'models/checkpoint_critic_{agent_number}.pth',
                                map_location=torch.device('cpu'))
        self.critic_local.load_state_dict(checkpoint)

    def act(self, state, noise=0., train=False):
        """Returns actions for given state as per current policy.
        :param state: state as seen from single agent
        """

        if train is True:
            self.actor_local.train()
        else:
            self.actor_local.eval()

        action = self.actor_local(state)
        if noise > 0:
            noise = torch.tensor(noise * self.noise.sample(),
                                 dtype=state.dtype,
                                 device=state.device)
        return action + noise

    def target_act(self, state, noise=0.):
        # move the action to cpu(), since the noise is generated on the cpu
        self.actor_target.eval()
        action = self.actor_target(state).cpu()
        if noise > 0.:
            noise = torch.tensor(noise * self.noise.sample(),
                                 dtype=state.dtype,
                                 device=state.device)
        return action + noise

    def update_critic(self, rewards, dones, all_states, all_actions,
                      all_next_states, all_next_actions):
        with torch.no_grad():
            Q_targets_next = self.critic_target(all_next_states,
                                                all_next_actions)
        # Compute Q targets for current states (y_i)
        q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        q_expected = self.critic_local(all_states, all_actions)
        # critic_loss = F.mse_loss(q_expected, q_targets)
        critic_loss = ((q_expected - q_targets.detach())**2).mean()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

    def update_actor(self, all_states, all_predicted_actions):
        """Update actor network

        :param all_states: all states
        :param all_predicted_actions: all predicted actions
        """
        actor_loss = -self.critic_local(all_states,
                                        all_predicted_actions).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor_optimizer.step()

    def update_targets(self):
        self.soft_update(self.actor_local, self.actor_target, self.tau)
        self.soft_update(self.critic_local, self.critic_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def reset(self):
        self.noise.reset()
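The Agent above exposes update_critic, update_actor and update_targets separately and leaves the multi-agent wiring to the caller. The sketch below shows one plausible way to drive a list of such agents from a sampled batch; the learn_step helper and the tensor layout (per-agent observations of shape [batch, n_agents, state_size] flattened into joint states for the centralized critics) are assumptions for illustration, not the original training loop.

import torch

def learn_step(agents, obs, actions, rewards, next_obs, dones):
    """obs/next_obs: [batch, n_agents, state_size], actions: [batch, n_agents, action_size],
    rewards/dones: float tensors of shape [batch, n_agents]; agents is a list of the Agent above."""
    batch = obs.shape[0]
    all_states = obs.reshape(batch, -1)
    all_actions = actions.reshape(batch, -1)
    all_next_states = next_obs.reshape(batch, -1)
    # every target actor proposes an action for its own next observation
    all_next_actions = torch.cat(
        [a.actor_target(next_obs[:, i, :]) for i, a in enumerate(agents)], dim=1)
    for i, agent in enumerate(agents):
        agent.update_critic(rewards[:, i:i + 1], dones[:, i:i + 1],
                            all_states, all_actions,
                            all_next_states, all_next_actions)
        # re-predict actions with the local actors, detaching the other agents
        predicted = torch.cat(
            [a.actor_local(obs[:, j, :]) if j == i
             else a.actor_local(obs[:, j, :]).detach()
             for j, a in enumerate(agents)], dim=1)
        agent.update_actor(all_states, predicted)
        agent.update_targets()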
Ejemplo n.º 12
0
def fit_nash():
    agent_vehicle = NAF(args.gamma, args.tau, args.hidden_size,
                        env.observation_space, env.vehicle_action_space)
    agent_attacker = NAF(args.gamma, args.tau, args.hidden_size,
                         env.observation_space, env.attacker_action_space)

    policy_vehicle = create_SL_model(env.observation_space, env.vehicle_action_space)
    policy_attacker = create_SL_model(env.observation_space, env.attacker_action_space)

    memory_vehicle = ReplayMemory(1000000)
    memory_attacker = ReplayMemory(1000000)

    memory_SL_vehicle = ReplayMemory(100000)
    memory_SL_attacker = ReplayMemory(100000)

    ounoise_vehicle = OUNoise(env.vehicle_action_space) if args.ou_noise else None
    ounoise_attacker = OUNoise(env.attacker_action_space) if args.ou_noise else None

    param_noise_vehicle = AdaptiveParamNoiseSpec(initial_stddev=0.05,
                                                 desired_action_stddev=args.noise_scale,
                                                 adaptation_coefficient=1.05) if args.param_noise else None
    param_noise_attacker = AdaptiveParamNoiseSpec(initial_stddev=0.05,
                                                  desired_action_stddev=args.noise_scale,
                                                  adaptation_coefficient=1.05) if args.param_noise else None

    rewards = []
    eva_reward = []
    ave_reward = []
    tra_ac_veh = []
    tra_ac_att = []
    All_reward = []
    total_numsteps = 0
    updates = 0
    state_record = [env.reset()]
    # while len(state_record) < 20:
    #     s, _, _ = env.step(*env.random_action())
    #     state_record.append(s)
    # print(torch.Tensor([state_record[-20:]]).shape)
    for i_episode in range(args.num_episodes):
        local_steps = 0
        state = env.reset()
        state_record = [np.array([state])]
        episode_steps = 0
        while len(state_record) < 20:
            a, b = env.random_action()
            s, _, _ = env.step(np.array([a]), np.array([b]))
            local_steps += 1
            state_record.append(s)
        if args.ou_noise:
            ounoise_vehicle.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end -
                                                                                      i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_vehicle.reset()

            ounoise_attacker.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end -
                                                                                       i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_attacker.reset()
        episode_reward = 0
        local_steps = 0
        while True:
            if random.random() < ETA:
                # print(state_record[-20:])
                # print('rl', torch.Tensor(state_record[-20:]).shape)
                action_vehicle = agent_vehicle.select_action(torch.Tensor(state_record[-20:]), ounoise_vehicle,
                                                             param_noise_vehicle)[:, -1, :]
                # print('rl', action_vehicle.shape)
                action_attacker = agent_attacker.select_action(torch.Tensor(state_record[-20:]), ounoise_attacker,
                                                               param_noise_attacker)[:, -1, :]
                # print('rl', action_vehicle.shape)
            else:
                action_vehicle = torch.Tensor(
                    [policy_vehicle.predict(state_record[-1].reshape(-1, 4)) / policy_vehicle.predict(
                        state_record[-1].reshape(-1, 4)).sum()])[0]
                action_attacker = torch.Tensor(
                    [policy_attacker.predict(state_record[-1].reshape(-1, 4)) / policy_attacker.predict(
                        state_record[-1].reshape(-1, 4)).sum()])[0]
                # print('sl', action_vehicle.shape)
                # print('sl', action_vehicle.shape)
            if is_cuda:
                ac_v, ac_a = action_vehicle.cpu().numpy(), action_attacker.cpu().numpy()
            else:
                ac_v, ac_a = action_vehicle.numpy(), action_attacker.numpy()
            next_state, reward, done = env.step(ac_v, ac_a)
            # print('tra_reward', reward)
            # print(np.shape(state_record), next_state[0].shape)
            state_record.append(next_state)
            local_steps += 1
            total_numsteps += 1
            episode_steps += 1
            episode_reward += reward
            # print('sl-mem',state.shape,ac_v.shape)
            # print('sl state mem', state.shape, ac_a.shape)
            memory_SL_vehicle.append(state_record[-1], ac_v)
            memory_SL_attacker.append(state_record[-1], ac_a)

            action_vehicle = torch.Tensor(action_vehicle)
            action_attacker = torch.Tensor(action_attacker)

            mask = torch.Tensor([not done])

            prev_state = torch.Tensor(state_record[-20:]).transpose(0, 1)
            next_state = torch.Tensor([next_state])
            # print(prev_state.shape, next_state.shape)
            reward_vehicle = torch.Tensor([reward])
            reward_attacker = torch.Tensor([env.RC - reward])
            # print(state_record[-20:])
            # print(torch.Tensor([state_record[-20:]]).shape)
            memory_vehicle.push(prev_state, action_vehicle, mask, next_state, reward_vehicle)
            memory_attacker.push(prev_state, action_attacker, mask, next_state, reward_attacker)

            state = next_state.numpy()[0]
            # print(state_record[-1].shape)

            if done:
                rewards.append(episode_reward)
                if i_episode % 100 == 0:
                    print('Episode {} ends, local_steps {}. total_steps {}, instant ave-reward is {:.4f}'.format(
                        i_episode, local_steps, total_numsteps, episode_reward))

                break

        if len(memory_vehicle) > args.batch_size:  # start training
            # print('begin training')
            for _ in range(args.updates_per_step):
                transitions_vehicle = memory_vehicle.sample(args.batch_size)
                batch_vehicle = Transition(*zip(*transitions_vehicle))

                transitions_attacker = memory_attacker.sample(args.batch_size)
                batch_attacker = Transition(*zip(*transitions_attacker))
                # print(batch_vehicle)

                trans_veh = memory_SL_vehicle.sample(args.batch_size)
                trans_att = memory_SL_attacker.sample(args.batch_size)

                states_veh = []
                actions_veh = []
                states_att = []
                actions_att = []
                for sample in trans_veh:
                    state_veh, act_veh = sample
                    states_veh.append(state_veh)
                    actions_veh.append(act_veh)
                for sample in trans_att:
                    state_att, act_att = sample
                    states_att.append(state_att)
                    actions_att.append(act_att)

                states_veh = np.reshape(states_veh, (-1, env.observation_space))
                states_att = np.reshape(states_att, (-1, env.observation_space))
                actions_veh = np.reshape(actions_veh, (-1, env.vehicle_action_space))
                actions_att = np.reshape(actions_att, (-1, env.attacker_action_space))

                policy_vehicle.fit(states_veh, actions_veh, verbose=False)
                policy_attacker.fit(states_att, actions_att, verbose=False)
                value_loss_vehicle, policy_loss_vehicle = agent_vehicle.update_parameters(batch_vehicle)
                value_loss_attacker, policy_loss_attacker = agent_attacker.update_parameters(batch_attacker)

                # writer.add_scalar('loss/value', value_loss, updates)
                # writer.add_scalar('loss/policy', policy_loss, updates)

                updates += 1

        if i_episode % 10 == 0 and i_episode > 0:
            state = env.reset()
            state_record = [np.array([state])]
            while len(state_record) < 20:
                a, b = env.random_action()
                s, _, _ = env.step(np.array([a]), np.array([b]))
                local_steps += 1
                state_record.append(s)
            evaluate_reward = 0
            while True:
                # la = np.random.randint(0, len(state_record) - 20, 1)[0]
                if random.random() < ETA:
                    action_vehicle = agent_vehicle.select_action(torch.Tensor(state_record[-20:]),
                                                                 ounoise_vehicle,
                                                                 param_noise_vehicle)[:, -1, :]
                    action_attacker = agent_attacker.select_action(torch.Tensor(state_record[-20:]),
                                                                   ounoise_attacker,
                                                                   param_noise_attacker)[:, -1, :]
                else:
                    action_vehicle = torch.Tensor([policy_vehicle.predict(
                        state_record[-1].reshape(-1, 4)) / policy_vehicle.predict(
                        state_record[-1].reshape(-1, 4)).sum()])[0]
                    action_attacker = torch.Tensor([policy_attacker.predict(
                        state_record[-1].reshape(-1, 4)) / policy_attacker.predict(
                        state_record[-1].reshape(-1, 4)).sum()])[0]
                ac_v, ac_a = action_vehicle.numpy(), action_attacker.numpy()
                next_state, reward, done = env.step(ac_v, ac_a)
                real_ac_v = ac_v[0].clip(-1, 1) + 1
                tra_ac_veh.append(real_ac_v / (sum(real_ac_v) + 0.0000001))
                tra_ac_att.append(ac_a[0])
                state_record.append(next_state)
                total_numsteps += 1
                local_steps += 1
                # print('eva_reward', reward)
                evaluate_reward += reward

                state = next_state[0]
                if done:
                    average_reward = np.mean(rewards[-10:])
                    print("{} % Episode finished, total numsteps: {}, eva-reward: {}, average reward: {}".format(
                        i_episode / args.num_episodes * 100,
                        total_numsteps,
                        evaluate_reward,
                        average_reward))
                    eva_reward.append(evaluate_reward)
                    ave_reward.append(average_reward)
                    # print(ac_v[0])
                    break
            # writer.add_scalar('reward/test', episode_reward, i_episode)
    env.close()
    df = pd.DataFrame()
    df['Eva'] = pd.Series(eva_reward)
    df['Tra'] = pd.Series(ave_reward)
    df2 = pd.DataFrame()
    df2['Weight'] = pd.Series(tra_ac_veh)
    df2['Attack'] = pd.Series(tra_ac_att)
    df.to_csv('./Result/reward_result_30.csv', index=False)
    df2.to_csv('./Result/action_result_30.csv', index=False)
    # np.savetxt('./Result/eva_result.csv', eva_reward, delimiter=',')
    # np.savetxt('./Result/ave_result.csv', ave_reward, delimiter=',')

    f = plt.figure()
    plt.plot(rewards[5:], label='Train_reward')
    plt.show()
    AC_veh = np.array(tra_ac_veh)
    AC_att = np.array(tra_ac_att)
    # print(AC_veh.shape)
    # print(AC_veh)
    plt.plot(AC_veh[:, 0], label='Bacon1', alpha=0.2)
    plt.plot(AC_veh[:, 1], label='Bacon2', alpha=0.2)
    plt.plot(AC_veh[:, 2], label='Bacon3', alpha=0.2)
    plt.plot(AC_veh[:, 3], label='Bacon4', alpha=0.2)
    # plt.plot(ave_reward, label='Tra_ave_reward')
    plt.legend()
    plt.savefig('./Result/Veh_result_30.png', dpi=300)
    plt.show()
    # print(AC_veh.shape)
    # print(AC_veh)
    plt.plot(AC_att[:, 0], label='Attack1', alpha=0.2)
    plt.plot(AC_att[:, 1], label='Attack2', alpha=0.2)
    plt.plot(AC_att[:, 2], label='Attack3', alpha=0.2)
    plt.plot(AC_att[:, 3], label='Attack4', alpha=0.2)
    # plt.plot(ave_reward, label='Tra_ave_reward')
    # plt.title('')
    plt.legend()
    plt.savefig('./Result/Att_result_30.png', dpi=300)
    plt.show()
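Both fit_nash() variants depend on a create_SL_model helper that is not shown: a supervised "average policy" network that is fit on stored (state, action) pairs and queried with predict. A hedged sketch of such a model follows; the Keras layer sizes, the softmax head and the loss/optimizer choice are assumptions, not the original definition.

from keras.models import Sequential
from keras.layers import Dense

def create_SL_model(observation_space, action_space):
    """Assumed shape of the supervised average-policy network used by fit_nash()."""
    model = Sequential([
        Dense(64, activation='relu', input_shape=(observation_space,)),
        Dense(64, activation='relu'),
        Dense(action_space, activation='softmax'),  # outputs a mixed strategy
    ])
    # fit_nash() later calls model.fit(states, actions) and model.predict(state)
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    return model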
class Agent(object):
    """
    The Agent interacts with and learns from the environment.
    """
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 random_seed=0,
                 params=params):
        """
        Initialize an Agent object.
        Params
        ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        num_agents (int): number of agents
        random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.params = params

        # Actor (Policy) Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(self.params['DEVICE'])
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(self.params['DEVICE'])
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.params['LR_ACTOR'])

        # Critic (Value) Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(self.params['DEVICE'])
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(self.params['DEVICE'])
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.params['LR_CRITIC'],
            weight_decay=self.params['WEIGHT_DECAY'])

        # Initialize target and local to same weights
        self.hard_update(self.actor_local, self.actor_target)
        self.hard_update(self.critic_local, self.critic_target)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.params['BUFFER_SIZE'],
                                   self.params['BATCH_SIZE'], random_seed)

    def hard_update(self, local_model, target_model):
        """
        Hard update model parameters.
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(local_param.data)

    def step(self, states, actions, rewards, next_states, dones):
        """
        Save experiences in replay memory and use random sample from buffer to learn.
        """

        # Save experience / reward; handle the case of multiple agents acting at once
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn if enough samples are available in memory
        if len(self.memory) > self.params['BATCH_SIZE']:
            experiences = self.memory.sample()
            self.learn(experiences, self.params['GAMMA'])

    def act(self, states, add_noise=True):
        """
        Returns actions for a given state as per current policy.
        """
        states = torch.from_numpy(states).float().to(self.params['DEVICE'])
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for i, state in enumerate(states):
                actions[i, :] = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma=params['GAMMA']):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Update Critic(Value)
        # Get predicted next-state actions and Q-Values from target Network
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimise the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(
            self.critic_local.parameters(),
            1)  # Stabilize learning per benchmark guidelines
        self.critic_optimizer.step()

        # Update Actor (Policy)
        # Compute Actor Loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target networks
        self.soft_update(self.critic_local,
                         self.critic_target,
                         tau=self.params['TAU'])
        self.soft_update(self.actor_local,
                         self.actor_target,
                         tau=self.params['TAU'])

    def soft_update(self, local_model, target_model, tau=params['TAU']):
        """
        Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
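The Agent above leans on a ReplayBuffer offering add, sample and __len__, with sample() returning ready-to-use tensors. A minimal sketch with that interface follows; the deque/namedtuple layout and the device handling are assumptions and may differ from the project's actual ReplayBuffer.

import random
from collections import deque, namedtuple

import numpy as np
import torch

Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])

class SimpleReplayBuffer:
    def __init__(self, action_size, buffer_size, batch_size, seed, device="cpu"):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, k=self.batch_size)

        def stack(values):
            # stack into a [batch, dim] float tensor on the configured device
            return torch.from_numpy(np.vstack(values)).float().to(self.device)

        return (stack([e.state for e in batch]),
                stack([e.action for e in batch]),
                stack([e.reward for e in batch]),
                stack([e.next_state for e in batch]),
                stack([float(e.done) for e in batch]))

    def __len__(self):
        return len(self.memory)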
Ejemplo n.º 14
0
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.session = K.get_session()
        init = tf.global_variables_initializer()
        self.session.run(init)
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.score = -math.inf
        self.best_score = -math.inf
        self.last_loss = math.inf

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        self.noise_scale = (self.exploration_mu, self.exploration_theta,
                            self.exploration_sigma)
        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 16
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        self.total_reward = 0
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.total_reward += reward

        # Learn, if enough samples are available in memory
        print("Memory Size: {}, Batch Size: {}".format(len(self.memory),
                                                       self.batch_size))
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        #state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(np.array([state]))[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        print("Fitting model iteration ...")
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.array([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.array(
            [e.next_state for e in experiences if e is not None])
        print("Next states shape: {}".format(next_states.shape))
        self.score = rewards.mean()
        self.best_score = max(self.score, self.best_score)

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        r = self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        self.last_loss = np.mean(-action_gradients * actions)

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"
        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)

    if __name__ == "__main__":
        state_size = (84, 296, 9)
        action_low = np.array([1, 0, 1])
        action_high = np.array([10, 359, 2000])
        net = Actor(state_size, 3, action_low, action_high)
        #net = Critic(state_size, 3)
        net.model.summary()
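The Keras agent above calls two opaque callables, critic_local.get_action_gradients([states, actions, 0]) and actor_local.train_fn([states, action_gradients, 1]). Under the TF1-era keras.backend API they are commonly wired up as K.function objects, roughly as sketched below; the helper names, the learning rate and the assumption that the critic takes [states, actions] as its two inputs are illustrative, not the Actor/Critic classes used here.

from keras import backend as K
from keras import layers, optimizers

def build_action_gradient_fn(critic_model):
    # dQ/dA with respect to the critic's action input; called as fn([states, actions, 0])
    action_gradients = K.gradients(critic_model.output, critic_model.input[1])
    return K.function(inputs=[*critic_model.input, K.learning_phase()],
                      outputs=action_gradients)

def build_actor_train_fn(actor_model, action_size, lr=1e-4):
    # follow the critic's action gradients to (locally) maximize Q
    action_gradients = layers.Input(shape=(action_size,))
    loss = K.mean(-action_gradients * actor_model.output)
    updates = optimizers.Adam(lr=lr).get_updates(
        params=actor_model.trainable_weights, loss=loss)
    return K.function(inputs=[actor_model.input, action_gradients, K.learning_phase()],
                      outputs=[], updates=updates)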
Ejemplo n.º 15
0
class ddpg_agent:
    def __init__(self, args, env):
        self.args = args
        self.env = env
        # get the number of inputs...
        num_inputs = self.env.observation_space.shape[0]
        num_actions = self.env.action_space.shape[0]
        self.action_scale = self.env.action_space.high[0]
        # build up the network
        self.actor_net = Actor(num_inputs, num_actions)
        self.critic_net = Critic(num_inputs, num_actions)
        # get the target network...
        self.actor_target_net = Actor(num_inputs, num_actions)
        self.critic_target_net = Critic(num_inputs, num_actions)
        if self.args.cuda:
            self.actor_net.cuda()
            self.critic_net.cuda()
            self.actor_target_net.cuda()
            self.critic_target_net.cuda()
        # copy the parameters..
        self.actor_target_net.load_state_dict(self.actor_net.state_dict())
        self.critic_target_net.load_state_dict(self.critic_net.state_dict())
        # setup the optimizer...
        self.optimizer_actor = torch.optim.Adam(self.actor_net.parameters(),
                                                lr=self.args.actor_lr)
        self.optimizer_critic = torch.optim.Adam(
            self.critic_net.parameters(),
            lr=self.args.critic_lr,
            weight_decay=self.args.critic_l2_reg)
        # setting up the noise
        self.ou_noise = OUNoise(num_actions)
        # check some dir
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        self.model_path = self.args.save_dir + self.args.env_name + '/'
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)

    # start to train the network..
    def learn(self):
        # init the brain memory
        replay_buffer = []
        total_timesteps = 0
        running_reward = None
        for episode_idx in range(self.args.max_episode):
            state = self.env.reset()
            # get the scale of the ou noise...
            self.ou_noise.scale = (self.args.noise_scale - self.args.final_noise_scale) * max(0, self.args.exploration_length - episode_idx) / \
                                self.args.exploration_length + self.args.final_noise_scale
            self.ou_noise.reset()
            # start the training
            reward_total = 0
            while True:
                state_tensor = torch.tensor(state,
                                            dtype=torch.float32).unsqueeze(0)
                if self.args.cuda:
                    state_tensor = state_tensor.cuda()
                with torch.no_grad():
                    policy = self.actor_net(state_tensor)
                # start to select the actions...
                actions = self._select_actions(policy)
                # step
                state_, reward, done, _ = self.env.step(actions *
                                                        self.action_scale)
                total_timesteps += 1
                reward_total += reward
                # start to store the samples...
                replay_buffer.append((state, reward, actions, done, state_))
                # check if the buffer size is out of range
                if len(replay_buffer) > self.args.replay_size:
                    replay_buffer.pop(0)
                if len(replay_buffer) > self.args.batch_size:
                    mini_batch = random.sample(replay_buffer,
                                               self.args.batch_size)
                    # start to update the network
                    _, _ = self._update_network(mini_batch)
                if done:
                    break
                state = state_
            running_reward = reward_total if running_reward is None else running_reward * 0.99 + reward_total * 0.01
            if episode_idx % self.args.display_interval == 0:
                torch.save(self.actor_net.state_dict(),
                           self.model_path + 'model.pt')
                print('[{}] Episode: {}, Frames: {}, Rewards: {}'.format(
                    datetime.now(), episode_idx, total_timesteps,
                    running_reward))

        self.env.close()

    # select actions
    def _select_actions(self, policy):
        actions = policy.detach().cpu().numpy()[0]
        actions = actions + self.ou_noise.noise()
        actions = np.clip(actions, -1, 1)
        return actions

    # update the network
    def _update_network(self, mini_batch):
        state_batch = np.array([element[0] for element in mini_batch])
        state_batch = torch.tensor(state_batch, dtype=torch.float32)
        # reward batch
        reward_batch = np.array([element[1] for element in mini_batch])
        reward_batch = torch.tensor(reward_batch,
                                    dtype=torch.float32).unsqueeze(1)
        # done batch
        done_batch = np.array([int(element[3]) for element in mini_batch])
        done_batch = 1 - done_batch
        done_batch = torch.tensor(done_batch, dtype=torch.float32).unsqueeze(1)
        # action batch
        actions_batch = np.array([element[2] for element in mini_batch])
        actions_batch = torch.tensor(actions_batch, dtype=torch.float32)
        # next state
        state_next_batch = np.array([element[4] for element in mini_batch])
        state_next_batch = torch.tensor(state_next_batch, dtype=torch.float32)
        # check if use the cuda
        if self.args.cuda:
            state_batch = state_batch.cuda()
            reward_batch = reward_batch.cuda()
            done_batch = done_batch.cuda()
            actions_batch = actions_batch.cuda()
            state_next_batch = state_next_batch.cuda()

        # update the critic network...
        with torch.no_grad():
            actions_out = self.actor_target_net(state_next_batch)
            expected_q_value = self.critic_target_net(state_next_batch,
                                                      actions_out)
        # get the target value
        target_value = reward_batch + self.args.gamma * expected_q_value * done_batch
        target_value = target_value.detach()
        values = self.critic_net(state_batch, actions_batch)
        critic_loss = (target_value - values).pow(2).mean()
        self.optimizer_critic.zero_grad()
        critic_loss.backward()
        self.optimizer_critic.step()
        # start to update the actor network
        actor_loss = -self.critic_net(state_batch,
                                      self.actor_net(state_batch)).mean()
        self.optimizer_actor.zero_grad()
        actor_loss.backward()
        self.optimizer_actor.step()
        # then, start to softupdate the network...
        self._soft_update_target_network(self.critic_target_net,
                                         self.critic_net)
        self._soft_update_target_network(self.actor_target_net, self.actor_net)

        return actor_loss.item(), critic_loss.item()

    # soft update the network
    def _soft_update_target_network(self, target, source):
        # polyak-average the source parameters into the target network
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(self.args.tau * param.data +
                                    (1 - self.args.tau) * target_param.data)

    # functions to test the network
    def test_network(self):
        model_path = self.args.save_dir + self.args.env_name + '/model.pt'
        self.actor_net.load_state_dict(
            torch.load(model_path, map_location=lambda storage, loc: storage))
        self.actor_net.eval()
        # start to test
        for _ in range(5):
            state = self.env.reset()
            reward_sum = 0
            while True:
                self.env.render()
                state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
                with torch.no_grad():
                    actions = self.actor_net(state)
                actions = actions.detach().numpy()[0]
                state_, reward, done, _ = self.env.step(self.action_scale *
                                                        actions)
                reward_sum += reward
                if done:
                    break
                state = state_
            print('The reward of this episode is {}.'.format(reward_sum))
        self.env.close()
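ddpg_agent reads every hyperparameter from an args namespace. The hedged argparse sketch below supplies each field the class touches (gamma, tau, learning rates, noise schedule, buffer and episode settings, save paths); the flag names mirror the attributes accessed above, while all default values are assumptions made for illustration.

import argparse

def get_args():
    parser = argparse.ArgumentParser(description='DDPG arguments (sketch)')
    parser.add_argument('--env-name', type=str, default='Pendulum-v0')
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--tau', type=float, default=0.001)
    parser.add_argument('--actor-lr', type=float, default=1e-4)
    parser.add_argument('--critic-lr', type=float, default=1e-3)
    parser.add_argument('--critic-l2-reg', type=float, default=1e-2)
    parser.add_argument('--noise-scale', type=float, default=0.3)
    parser.add_argument('--final-noise-scale', type=float, default=0.3)
    parser.add_argument('--exploration-length', type=int, default=100)
    parser.add_argument('--replay-size', type=int, default=1000000)
    parser.add_argument('--batch-size', type=int, default=128)
    parser.add_argument('--max-episode', type=int, default=1000)
    parser.add_argument('--display-interval', type=int, default=10)
    parser.add_argument('--save-dir', type=str, default='saved_models/')
    parser.add_argument('--cuda', action='store_true')
    return parser.parse_args()

# usage sketch:
#   args = get_args()
#   agent = ddpg_agent(args, gym.make(args.env_name))
#   agent.learn()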
Ejemplo n.º 16
0
def main():
    sess = tf.Session()
    K.set_session(sess)
    env = gym.make("MountainCarContinuous-v0")

    #Parameters
    memory_size = 100000
    batch_size = 32
    tau = 0.001
    lr_actor = 0.0001
    lr_critic = 0.001
    discount_factor = 0.99
    episodes = 1001
    time_steps = 501
    collect_experience = 50000
    save_frequency = 250
    ep_reward = []
    training = False

    # Noise object
    noise = OUNoise(env.action_space)
    #Initialize actor and critic objects
    actor = Actor(env, sess, lr_actor, tau)

    # Uncomment the following line to save the actor model architecture as a json
    # file; it only needs to be saved once.

    # actor.save_model_architecture("Actor_model_architecture.json")
    critic = Critic(env, sess, lr_critic, tau, discount_factor)

    #Initialize replay memory of size defined by memory_size
    replay_memory = ReplayMemory(memory_size)

    #Toggle between true and false for debugging purposes. For training it is always true
    run = True
    if run:
        # Loop over the number of episodes. At each new episode reset the environment, reset the noise
        # state and set the total episode reward to 0
        for episode in range(episodes):
            state = env.reset()
            noise.reset()
            episode_reward = 0

            #Loop over the number of steps in an episode
            for time in range(time_steps):
                # Uncomment the following line if you want to visualize the mountain car during training.
                # It can also be trained without visualization when we are using
                # position and velocity as the state variables.

                # env.render()

                #Predict an action from the actor model using the current state
                action = actor.predict_action(state.reshape((1, 2)))[0]

                # Add Ornstein-Uhlenbeck noise to the predicted action to encourage exploration of the environment
                exploratory_action = noise.get_action(action, time)

                #Take the noisy action to enter the next state
                next_state, reward, done, _ = env.step(exploratory_action)

                #Predict the action to be taken given the next_state. This next state action is predicted
                #using the actor's target model
                next_action = actor.predict_next_action(
                    next_state.reshape((1, 2)))[0]

                #Append this experience sample to the replay memory
                replay_memory.append(state, exploratory_action, reward,
                                     next_state, next_action, done)

                # Only start training once a minimum number of experience samples is available in
                # memory
                if replay_memory.count() == collect_experience:
                    training = True
                    print('Start training')

                #When training:
                if training:
                    # 1)first draw a random batch of samples from the replay memory
                    batch = replay_memory.sample(batch_size)
                    # 2) using this sample calculate dQ/dA from the critic model
                    grads = critic.calc_grads(batch)
                    # 3) calculate dA/dTheta from the actor using the same batch
                    # 4) multiply dA/dTheta by negative dQ/dA to get dJ/dTheta
                    # 5) Update actor weights such that dJ/dTheta is maximized
                    # 6) The above operation is easily performed by minimizing the value obtained in (4)
                    t_grads = actor.train(batch, grads)

                    # update critic weights by minimizing the bellman loss. Use actor target to compute
                    # next action in the next state (already computed and stored in replay memory)
                    # in order to compute TD target
                    critic.train(batch)

                    #After each weight update of the actor and critic online model perform soft updates
                    # of their targets so that they can smoothly and slowly track the online model's
                    #weights
                    actor.update_target()
                    critic.update_target()

                #Add each step reward to the episode reward
                episode_reward += reward

                #Set current state as next state
                state = next_state

                #If target reached before the max allowed time steps, break the inner for loop
                if done:
                    break

            #Store episode reward
            ep_reward.append([episode, episode_reward])

            #Print info for each episode to track training progress
            print(
                "Completed in {} steps.... episode: {}/{}, episode reward: {} "
                .format(time, episode, episodes, episode_reward))

            #Save model's weights and episode rewards after each save_frequency episode
            if training and (episode % save_frequency) == 0:
                print('Data saved at episode:', episode)
                actor.save_weights(
                    './Model/DDPG_actor_model_{}.h5'.format(episode))
                pickle.dump(
                    ep_reward,
                    open('./Rewards/rewards_{}.dump'.format(episode), 'wb'))

        # Close the mountain car environment
        env.close()
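Here the noise helper is built directly from the gym action space and queried with get_action(action, time): it adds OU noise, clips to the action bounds and decays sigma over time. A hedged sketch of such a wrapper follows; the min/max sigma and decay_period values are assumptions, not the parameters of the OUNoise class imported above.

import numpy as np

class DecayingOUNoise:
    def __init__(self, action_space, mu=0.0, theta=0.15,
                 max_sigma=0.3, min_sigma=0.05, decay_period=100000):
        self.mu = mu
        self.theta = theta
        self.sigma = max_sigma
        self.max_sigma = max_sigma
        self.min_sigma = min_sigma
        self.decay_period = decay_period
        self.action_dim = action_space.shape[0]
        self.low = action_space.low
        self.high = action_space.high
        self.reset()

    def reset(self):
        # restart the process at its mean
        self.state = np.ones(self.action_dim) * self.mu

    def evolve_state(self):
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state

    def get_action(self, action, t=0):
        noise = self.evolve_state()
        # linearly decay sigma from max_sigma to min_sigma over decay_period steps
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) \
                     * min(1.0, t / self.decay_period)
        return np.clip(action + noise, self.low, self.high)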
Ejemplo n.º 17
0
class Agent():
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_high = task.action_high
        self.action_low = task.action_low

        # actor policy model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_high, self.action_low)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_high, self.action_low)

        # critic value model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        # noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.25
        self.exploration_sigma = 0.3
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # replay buffer
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # algorithm parameters
        self.gamma = 0.9  # discount rate
        self.tau = 0.1  # soft update parameter

        self.total_reward = 0
        self.count = 0
        self.score = 0
        self.best_score = -np.inf

        self.reset_episode()

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # keep track of rewards
        self.total_reward += reward
        self.count += 1
        # save experience/reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        # if there are enough experiences, learn from them
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        self.last_state = next_state

    def act(self, states):
        # returns action for a given state(s) as per the current policy
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())

    def learn(self, experiences):
        self.score = self.total_reward / float(
            self.count) if self.count else 0.0

        # update the policy and value parameters given batch of experience tuples
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # get predicted next state and Q values from target models
        next_actions = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, next_actions])

        # compute Q targets for current state and train local critic model
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # train local actor model
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom train function

        # soft update target models
        self.soft_update(self.actor_local.model, self.actor_target.model)
        self.soft_update(self.critic_local.model, self.critic_target.model)

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
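
# --- Illustrative sketch (not part of the class above) ---
# soft_update above blends each weight tensor as
# theta_target = tau * theta_local + (1 - tau) * theta_target.
# A tiny numeric check of that rule on made-up weight lists (values are arbitrary):
import numpy as np

tau = 0.1
local_weights = [np.array([1.0, 2.0]), np.array([[3.0], [4.0]])]
target_weights = [np.array([0.0, 0.0]), np.array([[0.0], [0.0]])]

new_weights = [tau * lw + (1 - tau) * tw
               for lw, tw in zip(local_weights, target_weights)]
# After one update the target has moved 10% of the way toward the local weights:
# new_weights[0] -> array([0.1, 0.2])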
Ejemplo n.º 18
0
def main():
    agent_vehicle = NAF(args.gamma, args.tau, args.hidden_size,
                         env.observation_space, env.vehicle_action_space)
    agent_attacker = NAF(args.gamma, args.tau, args.hidden_size,
                         env.observation_space, env.attacker_action_space)


    vehicle_memory = ReplayMemory(1000000)
    attacker_memory = ReplayMemory(1000000)


    vehicle_ounoise = OUNoise(env.vehicle_action_space) if args.ou_noise else None
    attacker_ounoise = OUNoise(env.attacker_action_space) if args.ou_noise else None

    param_noise_vehicle = AdaptiveParamNoiseSpec(initial_stddev=0.05,
                                         desired_action_stddev=args.noise_scale,
                                         adaptation_coefficient=1.05) if args.param_noise else None
    param_noise_attacker = AdaptiveParamNoiseSpec(initial_stddev=0.05,
                                         desired_action_stddev=args.noise_scale,
                                         adaptation_coefficient=1.05) if args.param_noise else None

    rewards = []
    total_numsteps = 0
    updates = 0

    for i_episode in range(args.num_episodes):
        state = torch.Tensor([[env.reset()]])  # 4-dimensional velocity observation

        if args.ou_noise:
            vehicle_ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end -
                                                                              i_episode) / args.exploration_end + args.final_noise_scale
            vehicle_ounoise.reset()

            attacker_ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end -
                                                                                      i_episode) / args.exploration_end + args.final_noise_scale
            attacker_ounoise.reset()

        episode_reward = 0
        episode_steps = 0  # number of steps taken this episode (used for the param-noise adaptation below)

        while True:
            action_vehicle = agent_vehicle.select_action(state, vehicle_ounoise, param_noise_vehicle)
            action_attacker = agent_attacker.select_action(state, attacker_ounoise, param_noise_attacker)

            next_state, reward, done = env.step(action_vehicle.numpy()[0], action_attacker.numpy()[0])
            total_numsteps += 1
            episode_steps += 1
            episode_reward += reward

            action_vehicle = torch.Tensor(action_vehicle)
            action_attacker = torch.Tensor(action_attacker)

            mask = torch.Tensor([not done])
            next_state = torch.Tensor([next_state])

            reward_vehicle = torch.Tensor([-reward])
            reward_attacker = torch.Tensor([env.RC+reward])

            vehicle_memory.push(state, action_vehicle, mask, next_state, reward_vehicle)
            attacker_memory.push(state, action_attacker, mask, next_state, reward_attacker)

            state = next_state

            if len(vehicle_memory) > args.batch_size:
                for _ in range(args.updates_per_step):
                    transitions_vehicle = vehicle_memory.sample(args.batch_size)
                    batch_vehicle = Transition(*zip(*transitions_vehicle))

                    transition_attacker = attacker_memory.sample(args.batch_size)
                    batch_attacker = Transition(*zip(*transition_attacker))

                    value_loss_1, policy_loss_1 = agent_vehicle.update_parameters(batch_vehicle)
                    value_loss_2, policy_loss_2 = agent_attacker.update_parameters(batch_attacker)

                    # writer.add_scalar('loss/value', value_loss, updates)
                    # writer.add_scalar('loss/policy', policy_loss, updates)

                    updates += 1

            if done:
                break

        # writer.add_scalar('reward/train', episode_reward, i_episode)

        # Update param_noise based on distance metric
        if args.param_noise:
            episode_transitions_vehicle = vehicle_memory.memory[vehicle_memory.position - episode_steps:vehicle_memory.position]
            states_vehicle = torch.cat([transition[0] for transition in episode_transitions_vehicle], 0)
            unperturbed_actions_vehicle = agent_vehicle.select_action(states_vehicle, None, None)
            perturbed_actions_vehicle = torch.cat([transition[1] for transition in episode_transitions_vehicle], 0)

            ddpg_dist_vehicle = ddpg_distance_metric(perturbed_actions_vehicle.numpy(), unperturbed_actions_vehicle.numpy())
            param_noise_vehicle.adapt(ddpg_dist_vehicle)

            episode_transitions_attacker = attacker_memory.memory[attacker_memory.position - episode_steps:attacker_memory.position]
            states_attacker = torch.cat([transition[0] for transition in episode_transitions_attacker], 0)
            unperturbed_actions_attacker = agent_attacker.select_action(states_attacker, None, None)
            perturbed_actions_attacker = torch.cat([transition[1] for transition in episode_transitions_attacker], 0)

            ddpg_dist_attacker = ddpg_distance_metric(perturbed_actions_attacker.numpy(), unperturbed_actions_attacker.numpy())
            param_noise_attacker.adapt(ddpg_dist_attacker)

        rewards.append(episode_reward)

        if i_episode % 10 == 0:
            state = torch.Tensor([[env.reset()]])
            episode_reward = 0
            while True:
                action_vehicle = agent_vehicle.select_action(state, vehicle_ounoise, param_noise_vehicle)
                action_attacker = agent_attacker.select_action(state, attacker_ounoise, param_noise_attacker)

                next_state, reward, done = env.step(action_vehicle.numpy()[0], action_attacker.numpy()[0])
                episode_reward += reward

                next_state = torch.Tensor([[next_state]])

                state = next_state
                if done:
                    break

            # writer.add_scalar('reward/test', episode_reward, i_episode)

            rewards.append(episode_reward)
            print("Episode: {}, total numsteps: {}, reward: {}, average reward: {}".format(i_episode, total_numsteps,
                                                                                           rewards[-1],
                                                                                           np.mean(rewards[-10:])))

    env.close()
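
# --- Illustrative sketch (not part of the example above) ---
# The parameter-noise block above compares actions from the perturbed policy with
# actions from the unperturbed policy and adapts the perturbation size from their
# distance. Neither ddpg_distance_metric nor AdaptiveParamNoiseSpec is shown in
# this example, so the sketch below only illustrates the usual scheme and is an
# assumption about their behaviour:
import numpy as np

def action_distance(perturbed_actions, unperturbed_actions):
    # RMS difference between the two action batches
    diff = perturbed_actions - unperturbed_actions
    return np.sqrt(np.mean(np.square(diff)))

class SimpleAdaptiveParamNoise:
    def __init__(self, initial_stddev=0.05, desired_action_stddev=0.3,
                 adaptation_coefficient=1.05):
        self.current_stddev = initial_stddev
        self.desired_action_stddev = desired_action_stddev
        self.adaptation_coefficient = adaptation_coefficient

    def adapt(self, distance):
        # shrink the perturbation when it moves actions too much, grow it otherwise
        if distance > self.desired_action_stddev:
            self.current_stddev /= self.adaptation_coefficient
        else:
            self.current_stddev *= self.adaptation_coefficient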
Ejemplo n.º 19
0
class DDPG(object):
    """Interacts with and learns from the environment.
    There are two agents and the observations of each agent has 24 dimensions. Each agent's action has 2 dimensions.
    Will use two separate actor networks (one for each agent using each agent's observations only and output that agent's action).
    The critic for each agents gets to see the actions and observations of all agents. """
    def __init__(self, state_size, action_size, num_agents):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state for each agent
            action_size (int): dimension of each action for each agent
        """
        self.state_size = state_size
        self.action_size = action_size

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size).to(DEVICE)
        self.actor_target = Actor(state_size, action_size).to(DEVICE)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR,
                                          weight_decay=WEIGHT_DECAY_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(num_agents * state_size,
                                   num_agents * action_size).to(DEVICE)
        self.critic_target = Critic(num_agents * state_size,
                                    num_agents * action_size).to(DEVICE)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY_critic)

        # Noise process
        self.noise = OUNoise(action_size)  #single agent only
        self.noise_scale = NOISE_START

        # Make sure target is initialized with the same weight as the source (makes a big difference)
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)

    def act(self, states, i_episode, add_noise=True):
        """Returns actions for given state as per current policy."""

        if i_episode > EPISODES_BEFORE_TRAINING and self.noise_scale > NOISE_END:
            #self.noise_scale *= NOISE_REDUCTION
            self.noise_scale = NOISE_REDUCTION**(i_episode -
                                                 EPISODES_BEFORE_TRAINING)
        #else keep the previous value

        if not add_noise:
            self.noise_scale = 0.0

        states = torch.from_numpy(states).float().to(DEVICE)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()

        #add noise
        actions += self.noise_scale * self.add_noise2(
        )  #works much better than OU Noise process
        #actions += self.noise_scale*self.noise.sample()

        return np.clip(actions, -1, 1)

    def add_noise2(self):
        # Gaussian noise with sigma 0.5; a sigma of 1 would get a lot of actions clipped
        noise = 0.5 * np.random.randn(1, self.action_size)
        return noise

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        #for MADDPG
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        full_states, actor_full_actions, full_actions, agent_rewards, agent_dones, full_next_states, critic_full_next_actions = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get Q values from target models
        Q_target_next = self.critic_target(full_next_states,
                                           critic_full_next_actions)
        # Compute Q targets for current states (y_i)
        Q_target = agent_rewards + gamma * Q_target_next * (1 - agent_dones)
        # Compute critic loss
        Q_expected = self.critic_local(full_states, full_actions)
        critic_loss = F.mse_loss(
            input=Q_expected, target=Q_target
        )  #target=Q_targets.detach() #not necessary to detach
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        #torch.nn.utils.clip_grad_norm(self.critic_local.parameters(), 1.0) #clip the gradient for the critic network (Udacity hint)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actor_loss = -self.critic_local.forward(
            full_states, actor_full_actions).mean(
            )  # negative sign because we want to perform gradient ascent on Q
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

    def soft_update_all(self):
        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_update(self, target, source):
        for target_param, source_param in zip(target.parameters(),
                                              source.parameters()):
            target_param.data.copy_(source_param.data)
class DDPG:
    def __init__(self, task):
        # Hyper parameters
        self.learning_rate_actor = 1e-4
        self.learning_rate_critic = 1e-3
        self.gamma = 0.99
        self.tau = 0.001

        # Define net
        self.sess = tf.Session()
        self.task = task
        self.actor = ActorNet(self.sess, self.task.state_size, self.task.action_size, self.learning_rate_actor, \
                     self.task.action_low, self.task.action_high, self.tau)
        self.critic = CriticNet(self.sess, self.task.state_size, self.task.action_size, self.learning_rate_critic, self.tau)

        # Define noise
        self.mu = 0
        self.theta = 0.15
        self.sigma = 0.20
        self.noise = OUNoise(self.task.action_size, self.mu, self.theta, self.sigma)

        # Define memory replay
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = Replay(self.buffer_size, self.batch_size)

        # Score
        self.best_score = -np.inf
        self.best_reward = -np.inf

    def reset(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.total_reward = 0.0
        self.count = 0
        return state

    def learn(self, experience):
        # Turn into different np arrays
        state_batch = np.vstack([e[0] for e in experience])
        action_batch = np.vstack([e[1] for e in experience])
        reward_batch = np.vstack([e[2] for e in experience])
        next_state_batch = np.vstack([e[3] for e in experience])
        done_batch = np.vstack([e[4] for e in experience])

        # Calculate next_state q value
        next_action_batch = self.actor.target_actions(next_state_batch)
        next_q_targets = self.critic.targetQ(next_state_batch, next_action_batch)

        # Train critic net
        q_targets = reward_batch + self.gamma * next_q_targets * (1 - done_batch)
        self.critic.train(state_batch, action_batch, q_targets)

        # Train actor net
        action_gradients = self.critic.gradients(state_batch, action_batch)
        self.actor.train(action_gradients, state_batch)

        # Update target network
        self.actor.update_target(False)
        self.critic.update_target(False)

    def step(self, action, reward, next_state, done):
        self.memory.add([self.last_state, action, reward, next_state, done])
        self.total_reward += reward
        self.count += 1
        if done:
            self.score = self.total_reward / float(self.count) if self.count else 0.0
            self.best_score = max(self.best_score, self.score)
            self.best_reward = max(self.total_reward, self.best_reward)

        if len(self.memory.buffer) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        self.last_state = next_state

    def act(self, states):
        states = np.reshape(states, [-1, self.task.state_size])
        action = self.actor.actions(states)[0]
        return list(action + self.noise.sample())
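
# --- Illustrative sketch (not part of the classes above) ---
# Both DDPG variants above fit their critics to the same TD target,
# Q_target = r + gamma * Q_target_next * (1 - done), where (1 - done) zeroes the
# bootstrap term at terminal states. A small numeric example with made-up values:
import numpy as np

gamma = 0.99
rewards = np.array([[1.0], [0.5]])
q_targets_next = np.array([[2.0], [3.0]])
dones = np.array([[0], [1]])    # the second transition is terminal

q_targets = rewards + gamma * q_targets_next * (1 - dones)
# q_targets -> [[2.98], [0.5]]: the terminal row keeps only its immediate reward.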
Ejemplo n.º 21
0
class Agent:
    """Interacts with and learns from the environment."""
    def __init__(
        self,
        num_agents,
        state_size,
        action_size,
        buffer_size=int(1e5),
        batch_size=128,
        gamma=0.99,
        tau=1e-3,
        lr_actor=1e-4,
        lr_critic=1e-3,
        weight_decay=0,
        random_seed=2,
    ):
        """Initialize an Agent object.

        Params
        ======
            num_agents (int): number of agents
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(
            action_size=action_size,
            buffer_size=buffer_size,
            batch_size=batch_size,
            seed=random_seed,
        )

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(),
                                       1)  # clip gradients at 1
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
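
# --- Illustrative sketch (not part of the classes above or below) ---
# The OUNoise process used throughout these examples is not shown here. A minimal
# Ornstein-Uhlenbeck process along the usual lines, dx = theta*(mu - x) + sigma*N(0, 1),
# might look like the following; the constructor arguments are assumptions and the
# real OUNoise classes in these examples may differ (some also take a seed).
import numpy as np

class SimpleOUNoise:
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # restart the process at its long-run mean
        self.state = self.mu.copy()

    def sample(self):
        # mean-reverting drift plus Gaussian perturbation -> temporally correlated noise
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(*self.state.shape)
        self.state = self.state + dx
        return self.state

# noise = SimpleOUNoise(2); noise.sample() gives correlated exploration noise per step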
class DDPGAgent:
    '''
    DDPG Agent implementation
    '''
    def __init__(self, agent_id, state_size, action_size, rand_seed,
                 meta_agent):
        """ Creates a new DDPG Agent """

        self.agent_id = agent_id
        self.action_size = action_size

        # Defines the Actor Networks
        self.actor_local = Actor(state_size, action_size, rand_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  rand_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Defines the Critic Networks
        self.critic_local = Critic(state_size, action_size,
                                   meta_agent.agents_qty, rand_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    meta_agent.agents_qty,
                                    rand_seed).to(device)

        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=LR_CRITIC)  #, weight_decay=WEIGHT_DECAY)

        self.noise = OUNoise(action_size, rand_seed)

        # Refers to the MA agent memory
        self.memory = meta_agent.memory

        self.t_step = 0

    def step(self):

        # Take a step; learn only every UPDATE_EVERY steps
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def learn(self, experiences, gamma):
        (states_list, actions_list, rewards, next_states_list,
         dones) = experiences

        # Get the target actions for all the states
        l_all_next_actions = []
        for states in states_list:
            l_all_next_actions.append(self.actor_target(states))

        # Convert the experiences into Torch tensors
        all_next_actions = torch.cat(l_all_next_actions, dim=1).to(device)
        all_next_states = torch.cat(next_states_list, dim=1).to(device)
        all_states = torch.cat(states_list, dim=1).to(device)
        all_actions = torch.cat(actions_list, dim=1).to(device)

        Q_targets_next = self.critic_target(all_next_states, all_next_actions)

        # Calculates the Q function using all the next states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(all_states, all_actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # --------------------------- update actor ---------------------------
        actions_pred = []
        for states in states_list:
            actions_pred.append(self.actor_local(states))

        actions_pred = torch.cat(actions_pred, dim=1).to(device)

        actor_loss = -self.critic_local(all_states, actions_pred).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ---------------------- update target networks ----------------------
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def act(self, states, add_noise=True):
        """ Returns the actions to take by the agent"""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """ Performs the softupdate """
        iter_params = zip(target_model.parameters(), local_model.parameters())
        for target_param, local_param in iter_params:
            tensor_aux = tau * local_param.data + (1.0 -
                                                   tau) * target_param.data
            target_param.data.copy_(tensor_aux)

Ejemplo n.º 23
0
def main():
    parser = argparse.ArgumentParser(description='PyTorch X-job')
    parser.add_argument('--env_name',
                        default="Pendulum-v0",
                        help='name of the environment')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.99,
                        metavar='G',
                        help='discount factor for reward (default: 0.99)')
    parser.add_argument('--tau',
                        type=float,
                        default=0.001,
                        help='soft update coefficient for the target network (default: 0.001)')
    parser.add_argument('--ou_noise', type=bool, default=True)
    parser.add_argument('--noise_scale',
                        type=float,
                        default=0.4,
                        metavar='G',
                        help='initial noise scale (default: 0.4)')
    parser.add_argument('--final_noise_scale',
                        type=float,
                        default=0.3,
                        metavar='G',
                        help='final noise scale (default: 0.3)')
    parser.add_argument('--exploration_end',
                        type=int,
                        default=33,
                        metavar='N',
                        help='number of episodes with noise (default: 33)')
    parser.add_argument('--seed',
                        type=int,
                        default=4,
                        metavar='N',
                        help='random seed (default: 4)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=200,
                        metavar='N',
                        help='batch size (default: 200)')
    parser.add_argument('--num_steps',
                        type=int,
                        default=100,
                        metavar='N',
                        help='max episode length (default: 100)')
    parser.add_argument('--num_episodes',
                        type=int,
                        default=5000,
                        metavar='N',
                        help='number of episodes (default: 5000)')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=128,
                        metavar='N',
                        help='hidden size (default: 128)')
    parser.add_argument('--updates_per_step',
                        type=int,
                        default=5,
                        metavar='N',
                        help='model updates per simulator step (default: 5)')
    parser.add_argument('--replay_size',
                        type=int,
                        default=1000000,
                        metavar='N',
                        help='size of replay buffer (default: 1000000)')
    parser.add_argument('--save_agent',
                        type=bool,
                        default=True,
                        help='save model to file')
    parser.add_argument('--train_model',
                        type=bool,
                        default=True,
                        help='Training or run')
    parser.add_argument('--load_agent',
                        type=bool,
                        default=False,
                        help='load model from file')
    parser.add_argument('--load_exp',
                        type=bool,
                        default=False,
                        help='load saved experience')
    parser.add_argument('--greedy_steps',
                        type=int,
                        default=10,
                        metavar='N',
                        help='amount of times greedy goes (default: 10)')

    args = parser.parse_args()

    env = ManipulateEnv()
    #env = gym.make(args.env_name)
    writer = SummaryWriter('runs/')

    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # -- initialize agent --
    agent = NAF(args.gamma, args.tau, args.hidden_size,
                env.observation_space.shape[0], env.action_space)

    # -- declare memory buffer and random process N
    memory = ReplayMemory(args.replay_size)
    ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None

    # -- load existing model --
    if args.load_agent:
        agent.load_model(args.env_name, args.batch_size, args.num_episodes,
                         '.pth')
        print("agent: naf_{}_{}_{}_{}, is loaded".format(
            args.env_name, args.batch_size, args.num_episodes, '.pth'))

    # -- load experience buffer --
    if args.load_exp:
        with open(
                '/home/quantao/Workspaces/catkin_ws/src/panda_demos/naf_env/src/exp_replay.pk1',
                'rb') as input:
            memory.memory = pickle.load(input)
            memory.position = len(memory)

    rewards = []
    total_numsteps = 0
    updates = 0

    #env.init_ros()
    #env.reset()

    t_start = time.time()

    for i_episode in range(args.num_episodes + 1):
        # -- reset environment for every episode --
        #state = env.reset()
        state = torch.Tensor([env.reset()])

        # -- initialize noise (random process N) --
        if args.ou_noise:
            ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end -
                i_episode) / args.exploration_end + args.final_noise_scale
            ounoise.reset()

        episode_reward = 0
        while True:
            # -- action selection, observation and store transition --
            action = agent.select_action(
                state,
                ounoise) if args.train_model else agent.select_action(state)

            next_state, reward, done, info = env.step(action)

            #env.render()
            total_numsteps += 1
            episode_reward += reward

            action = torch.Tensor(action)
            mask = torch.Tensor([not done])
            reward = torch.Tensor([reward])
            next_state = torch.Tensor([next_state])

            #print('reward:', reward)
            memory.push(state, action, mask, next_state, reward)

            state = next_state

            #else:
            #    time.sleep(0.005)

            #env.render()
            #time.sleep(0.005)
            #env.rate.sleep()

            if done or total_numsteps % args.num_steps == 0:
                break

        if len(memory) >= args.batch_size and args.train_model:
            env.reset()
            print("Training model")

            for _ in range(args.updates_per_step * args.num_steps):
                transitions = memory.sample(args.batch_size)
                batch = Transition(*zip(*transitions))
                value_loss, policy_loss = agent.update_parameters(batch)

                writer.add_scalar('loss/value', value_loss, updates)
                writer.add_scalar('loss/policy', policy_loss, updates)

                updates += 1
        writer.add_scalar('reward/train', episode_reward, i_episode)
        print("Train Episode: {}, total numsteps: {}, reward: {}".format(
            i_episode, total_numsteps, episode_reward))

        rewards.append(episode_reward)

        greedy_numsteps = 0
        if i_episode % 10 == 0:
            #state = env.reset()
            state = torch.Tensor([env.reset()])

            episode_reward = 0
            while True:
                action = agent.select_action(state)

                next_state, reward, done, info = env.step(action)
                episode_reward += reward
                greedy_numsteps += 1

                #state = next_state
                state = torch.Tensor([next_state])

                #env.render()
                #time.sleep(0.01)
                #   env.rate.sleep()

                if done or greedy_numsteps % args.num_steps == 0:
                    break

            writer.add_scalar('reward/test', episode_reward, i_episode)

            rewards.append(episode_reward)
            print(
                "Episode: {}, total numsteps: {}, reward: {}, average reward: {}"
                .format(i_episode, total_numsteps, rewards[-1],
                        np.mean(rewards[-10:])))

    #-- saves model --
    if args.save_agent:
        agent.save_model(args.env_name, args.batch_size, args.num_episodes,
                         '.pth')
        with open('exp_replay.pk1', 'wb') as output:
            pickle.dump(memory.memory, output, pickle.HIGHEST_PROTOCOL)

    print('Training ended after {} minutes'.format(
        (time.time() - t_start) / 60))
    print('Time per episode: {} s'.format(
        (time.time() - t_start) / args.num_episodes))
    print('Mean reward: {}'.format(np.mean(rewards)))
    print('Max reward: {}'.format(np.max(rewards)))
    print('Min reward: {}'.format(np.min(rewards)))
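
# --- Illustrative sketch (not part of the example above) ---
# The ReplayMemory/Transition pair used above (push, sample, and the
# Transition(*zip(*transitions)) batching trick) is not shown. A minimal buffer in
# that style is sketched below; it is an assumption about the implementation, not
# the example's own class.
import random
from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'mask', 'next_state', 'reward'))

class SimpleReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        # overwrite the oldest entry once the buffer is full
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

# transitions = memory.sample(batch_size)
# batch = Transition(*zip(*transitions))  # batch.state is a tuple of all sampled states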
Ejemplo n.º 24
0
    # Add expert data into replay buffer
    from expert_data import generate_Data
    replay_buffer = generate_Data(env, 300, "random", replay_buffer)

    # Evaluate untrained policy
    evaluations = [eval_policy(policy, args.env_name, args.seed)]

    state, done = env.reset(), False
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0

    model_save_path = "kinova_gripper_learning{}.pt".format(args.model)

    noise = OUNoise(4)
    noise.reset()
    expl_noise = OUNoise(4, sigma=0.001)
    expl_noise.reset()
    for t in range(int(args.max_timesteps)):

        episode_timesteps += 1

        # Select action randomly or according to policy
        if t < args.start_timesteps:
            # action = env.action_space.sample()
            # action = noise.noise().clip(-max_action, max_action)
            obs = torch.FloatTensor(np.array(state).reshape(1, -1)).to(device)
            action = pretrained_network(obs).cpu().data.numpy().flatten()

        else:
            # action = (
Ejemplo n.º 25
0
ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.05, 
    desired_action_stddev=args.noise_scale, adaptation_coefficient=1.05) if args.param_noise else None

rewards = []
total_numsteps = 0
updates = 0

for i_episode in range(args.num_episodes):
    state = torch.Tensor([env.reset()])

    if args.ou_noise: 
        ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end -
                                                                      i_episode) / args.exploration_end + args.final_noise_scale
        ounoise.reset()

    if args.param_noise and args.algo == "DDPG":
        agent.perturb_actor_parameters(param_noise)

    episode_reward = 0
    while True:
        action = agent.select_action(state, ounoise, param_noise)
        next_state, reward, done, _ = env.step(action.numpy()[0])
        total_numsteps += 1
        episode_reward += reward

        action = torch.Tensor(action)
        mask = torch.Tensor([not done])
        next_state = torch.Tensor([next_state])
        reward = torch.Tensor([reward])
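
# --- Illustrative sketch (not part of the fragment above) ---
# The ounoise.scale line above anneals exploration linearly from noise_scale down to
# final_noise_scale over exploration_end episodes and then holds it constant. The same
# schedule as a stand-alone helper (the function and argument names are assumptions):
def annealed_noise_scale(i_episode, noise_scale=0.4, final_noise_scale=0.1,
                         exploration_end=100):
    frac = max(0, exploration_end - i_episode) / exploration_end
    return (noise_scale - final_noise_scale) * frac + final_noise_scale

# annealed_noise_scale(0)   -> 0.4
# annealed_noise_scale(50)  -> 0.25
# annealed_noise_scale(200) -> 0.1  (no further decay after exploration_end)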
Ejemplo n.º 26
0
class Agent():
    """ DDPG Agent, interacts with environment and learns from environment """
    def __init__(self, device, state_size, n_agents, action_size, random_seed, \
                         buffer_size, batch_size, gamma, TAU, lr_actor, lr_critic, weight_decay,  \
                         learn_interval, learn_num, ou_sigma, ou_theta, checkpoint_folder = './'):

        # Set Computational device
        self.DEVICE = device

        # Init State, action and agent dimensions
        self.state_size = state_size
        self.n_agents = n_agents
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.l_step = 0
        self.log_interval = 200

        # Init Hyperparameters
        self.BUFFER_SIZE = buffer_size
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        self.TAU = TAU
        self.LR_ACTOR = lr_actor
        self.LR_CRITIC = lr_critic
        self.WEIGHT_DECAY = weight_decay
        self.LEARN_INTERVAL = learn_interval
        self.LEARN_NUM = learn_num

        # Init Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Init Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

        # Init Noise Process
        self.noise = OUNoise((n_agents, action_size),
                             random_seed,
                             mu=0.,
                             theta=ou_theta,
                             sigma=ou_sigma)

        # Init Replay Memory
        self.memory = ReplayBuffer(device, action_size, buffer_size,
                                   batch_size, random_seed)

    # think
    def act(self, states, add_noise=True):
        """ Decide what action to take next """

        # evaluate state through actor_local
        states = torch.from_numpy(states).float().to(self.DEVICE)
        actions = np.zeros((self.n_agents, self.action_size))

        self.actor_local.eval()  # put actor_local network in "evaluation" mode
        with torch.no_grad():
            for n, state in enumerate(states):
                actions[n, :] = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()  # put actor_local back into "training" mode

        # add noise for better performance
        if add_noise:
            actions += self.noise.sample()

        return np.clip(actions, -1, 1)

    # embody
    def step(self, t, s, a, r, s_, done):
        """ Commit step into the brain """

        # Save SARS' to replay buffer --- state-action-reward-next_state tuple
        for n in range(self.n_agents):
            # self.memory.add(s, a, r, s_, done)

            # print ("going to learn 10 times")

            self.memory.add(s[n], a[n], r[n], s_[n], done[n])

        if t % self.LEARN_INTERVAL != 0:
            return

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.BATCH_SIZE:
            # print ("going to learn 10 times")
            for _ in range(self.LEARN_NUM):
                experiences = self.memory.sample()  # get a memory sample
                self.learn(experiences, self.GAMMA)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """ Learn from experiences, with discount factor gamma
        
        Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        
        Params:
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        # ------ Update Critic ------ #

        # get predicted next-state actions and Q values from target networks
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # minimize loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        #         torch.nn.utils.clip_grad_norm(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ------ Update Actor ------ #

        # compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # minimize loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ------ Update Target Networks ------ #
        self.soft_update(self.critic_local, self.critic_target, self.TAU)
        self.soft_update(self.actor_local, self.actor_target, self.TAU)

        # keep count of steps taken
        # self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Ejemplo n.º 27
0
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.3
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        self.score = 0
        self.best_score = -np.inf
        self.noise_scale = 0.1

    def reset_episode(self):
        self.noise.reset()
        self.total_reward = 0.0
        self.count = 0
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.total_reward += reward
        self.count += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        self.score = self.total_reward / float(
            self.count) if self.count else 0.0
        if self.score > self.best_score:
            self.best_score = self.score
            self.noise_scale = max(0.5 * self.noise_scale, 0.01)
        else:
            self.noise_scale = min(2.0 * self.noise_scale, 3.2)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
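
# --- Illustrative sketch (not part of the class above) ---
# learn() above adapts exploration multiplicatively: the noise scale is halved
# (floored at 0.01) whenever the running score beats the best score so far, and
# doubled (capped at 3.2) otherwise. The same rule as a stand-alone helper
# (the function and argument names are assumptions):
def adapt_noise_scale(noise_scale, score, best_score,
                      min_scale=0.01, max_scale=3.2):
    if score > best_score:
        # improving: explore less
        return max(0.5 * noise_scale, min_scale), score
    # not improving: explore more
    return min(2.0 * noise_scale, max_scale), best_score

# adapt_noise_scale(0.1, score=1.2, best_score=1.0) -> (0.05, 1.2)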
Ejemplo n.º 28
0
class DDPGAgent:
    def __init__(self, config, state_size, action_size):
        super(DDPGAgent, self).__init__()
        l1 = config['network']['hidden']
        l2 = int(config['network']['hidden'] / 2)
        self.actor = Actor(state_size, action_size, config['seed']['agent'],
                           l1, l2).to(device)
        self.critic = Critic(state_size, action_size, config['seed']['agent'],
                             l1, l2).to(device)
        self.target_actor = Actor(state_size, action_size,
                                  config['seed']['agent'], l1, l2).to(device)
        self.target_critic = Critic(state_size, action_size,
                                    config['seed']['agent'], l1, l2).to(device)

        self.noise = OUNoise(action_size,
                             mu=config['noise']['mu'],
                             sigma=config['noise']['sigma'],
                             theta=config['noise']['theta'])

        # initialize targets same as original networks
        self.hard_update(self.target_actor, self.actor)
        self.hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(),
                                    lr=config['LR_ACTOR'])
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=config['LR_CRITIC'])

    def resetNoise(self):
        self.noise.reset()

    def act(self, obs, noise=0.0):
        action = self.actor(obs) + noise * self.noise.noise()
        action = np.clip(action.detach().numpy(), -1, 1)
        return action

    def target_act(self, obs, noise=0.0):
        action = self.target_actor(obs) + noise * self.noise.noise()
        return action

    def learn(self, experiences, gamma, tau):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.target_actor(next_states)
        Q_targets_next = self.target_critic(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic(states, actions)

        critic_loss = F.mse_loss(Q_expected, Q_targets)
        cl = critic_loss.cpu().detach().item()
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        #from https://github.com/hortovanyi/DRLND-Continuous-Control/blob/master/ddpg_agent.py
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor(states)
        actor_loss = -self.critic(states, actions_pred).mean()
        al = actor_loss.cpu().detach().item()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic, self.target_critic, tau)
        self.soft_update(self.actor, self.target_actor, tau)

        return [al, cl]

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    # https://github.com/ikostrikov/pytorch-ddpg-naf/blob/master/ddpg.py#L15
    def hard_update(self, target, source):
        """
        Copy network parameters from source to target
        Inputs:
            target (torch.nn.Module): Net to copy parameters to
            source (torch.nn.Module): Net whose parameters to copy
        """
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
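Note: the agent above depends on an OUNoise helper that is not included in this excerpt. Across these examples it is constructed with the action dimension (optionally mu, sigma, theta), exposes reset() and noise(), and carries a scale attribute that some snippets anneal between episodes. Below is a minimal sketch consistent with that interface; the default parameter values and the torch return type are assumptions, not the original implementation.

import numpy as np
import torch


class OUNoise:
    """Ornstein-Uhlenbeck exploration noise (minimal sketch, not the original code)."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.scale = 1.0  # annealed externally in some of the examples
        self.state = np.ones(self.action_dim) * self.mu

    def reset(self):
        # restart the process at its mean value
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, I); return the scaled new state
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return torch.as_tensor(self.state * self.scale, dtype=torch.float32)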
Ejemplo n.º 29
0
        In every generation, the population is evaluated, ranked, mutated, and re-inserted into the population
        '''
        evo.evaluate_pop()
        evo.rank_pop_selection_mutation()

        print("Evolutionary Fitness = " + str(evo.best_policy.fitness))
        '''
        #############
        The DDPG part
        #############
        '''
        state = torch.Tensor([env.reset()])  # algo line 6
        ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
            0, args.exploration_end -
            i_episode) / args.exploration_end + args.final_noise_scale
        ounoise.reset()
        episode_reward = 0

        for t in range(args.num_steps):  # line 7
            # forward pass through the actor network
            action = agent.select_action(state, ounoise)  # line 8
            next_state, reward, done, _ = env.step(action.numpy()[0])  # line 9
            episode_reward += reward

            action = torch.Tensor(action)
            mask = torch.Tensor([not done])
            next_state = torch.Tensor([next_state])
            reward = torch.Tensor([reward])

            # if i_episode % 10 == 0:
            #     env.render()
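The scale assignment above anneals the exploration noise linearly from noise_scale down to final_noise_scale over the first exploration_end episodes and keeps it at final_noise_scale afterwards. A small worked example of the schedule (the 0.05 final scale is illustrative only; with the defaults used in these snippets both scales are 0.3, so the scale stays constant):

def annealed_scale(i_episode, noise_scale=0.3, final_noise_scale=0.05, exploration_end=100):
    # linear decay over the first exploration_end episodes, then clamped
    frac = max(0, exploration_end - i_episode) / exploration_end
    return (noise_scale - final_noise_scale) * frac + final_noise_scale


print(annealed_scale(0))    # 0.3   -> full exploration at the start
print(annealed_scale(50))   # 0.175 -> halfway through the schedule
print(annealed_scale(200))  # 0.05  -> stays at final_noise_scale afterwards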
Ejemplo n.º 30
0
def main():
    cfg = ConfigParser()
    cfg.read('config.ini')

    IP = cfg.get('server', 'ip')
    PORT = cfg.getint('server', 'port')
    FILE = cfg.get('file', 'file')
    SIZE = cfg.getint('env', 'buffer_size')
    TIME = cfg.getfloat('env', 'time')
    EPISODE = cfg.getint('env', 'episode')

    parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')

    parser.add_argument('--gamma',
                        type=float,
                        default=0.99,
                        metavar='G',
                        help='discount factor for reward (default: 0.99)')
    parser.add_argument('--tau',
                        type=float,
                        default=0.001,
                        metavar='G',
                        help='soft update coefficient tau (default: 0.001)')

    parser.add_argument('--noise_scale',
                        type=float,
                        default=0.3,
                        metavar='G',
                        help='initial noise scale (default: 0.3)')
    parser.add_argument('--final_noise_scale',
                        type=float,
                        default=0.3,
                        metavar='G',
                        help='final noise scale (default: 0.3)')
    parser.add_argument('--exploration_end',
                        type=int,
                        default=100,
                        metavar='N',
                        help='number of episodes with noise (default: 100)')

    parser.add_argument('--hidden_size',
                        type=int,
                        default=128,
                        metavar='N',
                        help='number of hidden size (default: 128)')
    parser.add_argument('--replay_size',
                        type=int,
                        default=1000000,
                        metavar='N',
                        help='size of replay buffer (default: 1000000)')
    parser.add_argument('--updates_per_step',
                        type=int,
                        default=5,
                        metavar='N',
                        help='model updates per simulator step (default: 5)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=64,
                        metavar='N',
                        help='batch size (default: 64)')

    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.connect((IP, PORT))
    fd = sock.fileno()
    my_env = env(fd=fd, buff_size=SIZE, time=TIME, k=8, l=0.01, n=0.03, p=0.05)
    mpsched.persist_state(fd)

    args = parser.parse_args()
    agent = NAF_CNN(args.gamma, args.tau, args.hidden_size,
                    my_env.observation_space.shape[0], my_env.action_space)
    memory = ReplayMemory(args.replay_size)
    ounoise = OUNoise(my_env.action_space.shape[0])

    rewards = []
    times = []
    for i_episode in range(EPISODE):
        if (i_episode < 0.9 * EPISODE):  # training
            io = io_thread(sock=sock, filename=FILE, buffer_size=SIZE)
            io.start()

            state = my_env.reset()

            ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end -
                i_episode) / args.exploration_end + args.final_noise_scale
            ounoise.reset()
            print(state)
            episode_reward = 0
            while True:
                state = torch.FloatTensor(state)
                #print("state: {}\n ounoise: {}".format(state, ounoise.scale))
                action = agent.select_action(state, ounoise)
                #print("action: {}".format(action))
                next_state, reward, count, recv_buff_size, done = my_env.step(
                    action)
                #print("buff size: ",recv_buff_size)
                #print("reward: ", reward)
                episode_reward += reward

                action = torch.FloatTensor(action)
                mask = torch.Tensor([not done])
                next_state = torch.FloatTensor(next_state)
                reward = torch.FloatTensor([float(reward)])
                memory.push(state, action, mask, next_state, reward)

                state = next_state

                if len(memory) > args.batch_size * 5:
                    for _ in range(args.updates_per_step):
                        transitions = memory.sample(args.batch_size)
                        batch = Transition(*zip(*transitions))
                        #print("update",10*'--')
                        agent.update_parameters(batch)

                if done:
                    break
            rewards.append(episode_reward)
            io.join()
        else:  # testing
            io = io_thread(sock=sock, filename=FILE, buffer_size=SIZE)
            io.start()
            state = my_env.reset()
            episode_reward = 0
            start_time = time.time()
            while True:
                state = torch.FloatTensor(state)
                #print("state: {}\n".format(state))
                action = agent.select_action(state)
                #print("action: {}".format(action))
                # match the 5-value step return used in the training branch
                next_state, reward, count, recv_buff_size, done = my_env.step(action)
                episode_reward += reward
                state = next_state

                if done:
                    break
            rewards.append(episode_reward)
            times.append(str(time.time() - start_time) + "\n")
            io.join()
        #print("Episode: {}, noise: {}, reward: {}, average reward: {}".format(i_episode, ounoise.scale, rewards[-1], np.mean(rewards[-100:])))
        # write the completion times collected so far during testing
        fo = open("times.txt", "w")
        fo.writelines(times)
        fo.close()

    sock.close()
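main() expects a config.ini providing the server, file, and env sections read at the top of the function. A minimal sketch that generates such a file with placeholder values (only the section and key names come from the cfg.get* calls above; every value here is an assumption):

from configparser import ConfigParser

cfg = ConfigParser()
cfg['server'] = {'ip': '127.0.0.1', 'port': '8888'}
cfg['file'] = {'file': 'payload.bin'}
cfg['env'] = {'buffer_size': '4096', 'time': '0.1', 'episode': '100'}

with open('config.ini', 'w') as f:
    cfg.write(f)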