Code Example #1
def main():
    my_env = env()

    agent = NAF_CNN(0.99, 0.001, 128, my_env.observation_space.shape[0],
                    my_env.action_space)

    parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
    parser.add_argument('--noise_scale',
                        type=float,
                        default=0.3,
                        metavar='G',
                        help='initial noise scale (default: 0.3)')
    parser.add_argument('--final_noise_scale',
                        type=float,
                        default=0.3,
                        metavar='G',
                        help='final noise scale (default: 0.3)')
    parser.add_argument('--exploration_end',
                        type=int,
                        default=100,
                        metavar='N',
                        help='number of episodes with noise (default: 100)')
    args = parser.parse_args()

    ounoise = OUNoise(my_env.action_space.shape[0])
    ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
        0, args.exploration_end -
        1) / args.exploration_end + args.final_noise_scale
    ounoise.reset()

    state = my_env.reset()
    i = 10
    while i > 0:
        action = agent.select_action(state, ounoise)
        print("action: {}".format(action))
        next_state, reward, done = my_env.step(action)
        if done:
            break
        print(reward)
        state = next_state  # advance to the new observation before selecting the next action
        i = i - 1
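Note: each of these examples constructs an OUNoise object, anneals its scale attribute, and calls reset(), but the class itself is not shown on this page. Below is a minimal sketch of an Ornstein-Uhlenbeck noise process with that interface; only the class name, the scale/reset()/noise() interface, and the action-dimension argument come from the snippets, while the process parameters (mu, theta, sigma) are assumptions.

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process; `scale` multiplies the returned noise sample."""
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.scale = 1.0
        self.state = np.ones(self.action_dim) * self.mu

    def reset(self):
        # restart the process at its mean
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1); the sample is scaled by self.scale
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state * self.scale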
Code Example #2
File: main.py Project: tuladhay/Evo_RL_Summer18
        '''
        Here, num_episodes corresponds to the generations in Algo 1.
        In every generation, the population is evaluated, ranked, mutated, and re-inserted into the population.
        '''
        evo.evaluate_pop()
        evo.rank_pop_selection_mutation()

        print("Evolutionary Fitness = " + str(evo.best_policy.fitness))
        '''
        #############
        The DDPG part
        #############
        '''
        state = torch.Tensor([env.reset()])  # algo line 6
        ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
            0, args.exploration_end -
            i_episode) / args.exploration_end + args.final_noise_scale
        ounoise.reset()
        episode_reward = 0

        for t in range(args.num_steps):  # line 7
            # forward pass through the actor network
            action = agent.select_action(state, ounoise)  # line 8
            next_state, reward, done, _ = env.step(action.numpy()[0])  # line 9
            episode_reward += reward

            action = torch.Tensor(action)
            mask = torch.Tensor([not done])
            next_state = torch.Tensor([next_state])
            reward = torch.Tensor([reward])
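Note: every example on this page pushes tuples into a ReplayMemory and later rebuilds batches with Transition(*zip(*transitions)); Example #3 also touches memory.memory and memory.position directly. A minimal sketch of a buffer with that interface follows. The field order of Transition matches the push(state, action, mask, next_state, reward) calls in the snippets, while the ring-buffer capacity handling is an assumption. The supervised-learning buffers in Examples #5, #7 and #9 are used with a two-argument append(state, action), so they presumably wrap a simpler list of (state, action) pairs.

import random
from collections import namedtuple

# Field order matches push(state, action, mask, next_state, reward) in the examples.
Transition = namedtuple('Transition', ('state', 'action', 'mask', 'next_state', 'reward'))

class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        # ring buffer: overwrite the oldest entry once capacity is reached
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)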
Code Example #3
File: main.py Project: votegrasp/real_robot_grasping
def main():
    global subdata
    t_start = time.time()

    parser = argparse.ArgumentParser(description='PyTorch X-job')
    parser.add_argument('--env_name',
                        default="OurEnv-v0",
                        help='name of the environment')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.99,
                        metavar='G',
                        help='discount factor for reward (default: 0.99)')
    parser.add_argument('--tau',
                        type=float,
                        default=0.001,
                        help='soft target update coefficient (default: 0.001)')
    parser.add_argument('--ou_noise', type=bool, default=True)  # note: argparse's type=bool treats any non-empty string as True
    parser.add_argument('--noise_scale',
                        type=float,
                        default=0.4,
                        metavar='G',
                        help='initial noise scale (default: 0.4)')
    parser.add_argument('--final_noise_scale',
                        type=float,
                        default=0.3,
                        metavar='G',
                        help='final noise scale (default: 0.3)')
    parser.add_argument('--exploration_end',
                        type=int,
                        default=33,
                        metavar='N',
                        help='number of episodes with noise (default: 33)')
    parser.add_argument('--seed',
                        type=int,
                        default=4,
                        metavar='N',
                        help='random seed (default: 4)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=512,
                        metavar='N',
                        help='batch size (default: 512)')
    parser.add_argument('--num_steps',
                        type=int,
                        default=300,
                        metavar='N',
                        help='max episode length (default: 300)')
    parser.add_argument('--num_episodes',
                        type=int,
                        default=50,
                        metavar='N',
                        help='number of episodes (default: 50)')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=128,
                        metavar='N',
                        help='hidden size (default: 128)')
    parser.add_argument('--replay_size',
                        type=int,
                        default=1000000,
                        metavar='N',
                        help='size of replay buffer (default: 1000000)')
    parser.add_argument('--save_agent',
                        type=bool,
                        default=True,
                        help='save model to file')
    parser.add_argument('--load_agent',
                        type=bool,
                        default=False,
                        help='load model from file')
    parser.add_argument('--train_model',
                        type=bool,
                        default=True,
                        help='Training or run')
    parser.add_argument('--load_exp',
                        type=bool,
                        default=False,
                        help='load saved experience')
    parser.add_argument('--state_plot',
                        type=bool,
                        default=True,
                        help='plot Q values for environment')
    parser.add_argument('--greedy_steps',
                        type=int,
                        default=5,
                        metavar='N',
                        help='number of greedy evaluation runs (default: 5)')

    args = parser.parse_args()

    #env = gym.make(args.env_name)

    env = Env()

    #env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    # -- initialize agent, Q and Q' --
    agent = NAF(args.gamma, args.tau, args.hidden_size,
                env.observation_space.shape[0], env.action_space)

    # -- declare memory buffer and random process N
    memory = ReplayMemory(args.replay_size)
    memory_g = ReplayMemory(args.replay_size)
    ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None

    # -- load existing model --
    if args.load_agent:
        agent.load_model(args.env_name, args.batch_size, '.pth')
        print("agent: naf_{}_{}_{}, is loaded").format(args.env_name,
                                                       args.batch_size, '.pth')
    # -- load experience buffer --
    if args.load_exp:
        with open('/home/aass/catkin_workspace/src/panda_demos/exp_replay.pk1',
                  'rb') as input:
            memory.memory = pickle.load(input)
            memory.position = len(memory)

    #sate_Q_plot(agent, 50)

    rewards = []
    total_numsteps = 0
    greedy_reward = []
    avg_greedy_reward = []
    upper_reward = []
    lower_reward = []
    steps_to_goal = []
    avg_steps_to_goal = []
    state_plot = []

    sim_reset_start()

    pub = rospy.Publisher('/ee_rl/act', DesiredErrorDynamicsMsg, queue_size=10)
    rospy.Subscriber("/ee_rl/state", StateMsg, callback)
    rate = rospy.Rate(9)
    rate.sleep()

    for i_episode in range(args.num_episodes + 1):
        # -- reset environment for every episode --
        sim_reset()
        state = torch.Tensor(subdata).unsqueeze(0)

        # -- initialize noise (random process N) --
        if args.ou_noise:
            ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end -
                i_episode) / args.exploration_end + args.final_noise_scale
            ounoise.reset()

        episode_reward = 0

        while True:
            # -- action selection, observation and store transition --
            action = agent.select_action(
                state,
                ounoise) if args.train_model else agent.select_action(state)
            a = action.numpy()[0] * 50
            act_pub = [a[0], a[1]]
            pub.publish(act_pub)
            next_state = torch.Tensor(subdata).unsqueeze(0)
            reward, done, _ = env.calc_shaped_reward(next_state)

            total_numsteps += 1
            episode_reward += reward

            action = torch.Tensor(action)
            mask = torch.Tensor([not done])
            reward = torch.Tensor([reward])

            memory.push(state, action, mask, next_state, reward)
            # if done:
            #     for i in range(total_numsteps % args.num_steps):
            #         a = i+1
            #         memory_g.memory.append(memory.memory[-a])
            #         memory_g.position += 1

            state = next_state

            #-- training --
            # if len(memory_g) > args.batch_size / 2 and len(memory) > args.batch_size/2 and args.train_model:
            #     for _ in range(10):
            #         transitions_b = memory.sample(args.batch_size/2)
            #         transitions_g = memory_g.sample(args.batch_size/2)
            #         for i in range(transitions_g):
            #             transitions_b.append(transitions_g[i])
            #         batch = Transition(*zip(*transitions_b))
            #         agent.update_parameters(batch)

            if len(memory) > args.batch_size and args.train_model:
                for _ in range(10):
                    transitions = memory.sample(args.batch_size)
                    batch = Transition(*zip(*transitions))
                    agent.update_parameters(batch)

            else:
                time.sleep(0.1)
            rate.sleep()

            if done or total_numsteps % args.num_steps == 0:
                break

        pub.publish([0, 0])
        rewards.append(episode_reward)

        # -- plot Q value --
        if i_episode % 10 == 0:

            sate_Q_plot(agent, i_episode)
            # -- saves model --
            if args.save_agent:
                agent.save_model(args.env_name, args.batch_size, i_episode,
                                 '.pth')
                with open('exp_replay.pk1', 'wb') as output:
                    pickle.dump(memory.memory, output, pickle.HIGHEST_PROTOCOL)
                #with open('exp_replay_g.pk1', 'wb') as output:
                #pickle.dump(memory_g.memory, output, pickle.HIGHEST_PROTOCOL)

        if args.train_model:
            greedy_episode = max(args.num_episodes // 100, 5)  # integer division so it can be used for modulo and slicing
        else:
            greedy_episode = 10
        greedy_range = min(args.greedy_steps, greedy_episode)

        # -- calculates episode without noise --
        if i_episode % greedy_episode == 0 and not i_episode == 0:
            for _ in range(0, greedy_range + 1):
                # -- reset environment for every episode --
                sim_reset()
                state_visited = []
                action_taken = []
                print("Greedy episode ongoing")

                state = torch.Tensor(subdata).unsqueeze(0)
                episode_reward = 0
                steps = 0

                state_plot.append([])
                st = state.numpy()[0]
                sta = [st[0], st[1]]
                state_plot[-1].append(sta)  # use the list appended just above, not a stale index on later greedy rounds

                while True:
                    action = agent.select_action(state)
                    a = action.numpy()[0] * 50
                    act_pub = [a[0], a[1]]
                    pub.publish(act_pub)
                    next_state = torch.Tensor(subdata).unsqueeze(0)
                    reward, done, obs_hit = env.calc_shaped_reward(next_state)
                    episode_reward += reward

                    state_visited.append(state)
                    action_taken.append(action)

                    state = next_state

                    steps += 1
                    if done or steps == args.num_steps:
                        greedy_reward.append(episode_reward)
                        break
                    rate.sleep()

                if obs_hit:
                    steps = 300

                steps_to_goal.append(steps)

                # -- plot path --
                if i_episode % 10 == 0:
                    agent.plot_path(state_visited, action_taken, i_episode)

            upper_reward.append((np.max(greedy_reward[-greedy_range:])))
            lower_reward.append((np.min(greedy_reward[-greedy_range:])))
            avg_greedy_reward.append((np.mean(greedy_reward[-greedy_range:])))
            avg_steps_to_goal.append((np.mean(steps_to_goal[-greedy_range:])))

            print(
                "Episode: {}, total numsteps: {}, avg_greedy_reward: {}, average reward: {}"
                .format(i_episode, total_numsteps, avg_greedy_reward[-1],
                        np.mean(rewards[-greedy_episode:])))

    #-- saves model --
    if args.save_agent:
        agent.save_model(args.env_name, args.batch_size, i_episode, '.pth')
        with open('exp_replay.pk1', 'wb') as output:
            pickle.dump(memory.memory, output, pickle.HIGHEST_PROTOCOL)
        #with open('exp_replay_g.pk1', 'wb') as output:
        #    pickle.dump(memory_g.memory, output, pickle.HIGHEST_PROTOCOL)

    print('Training ended after {} minutes'.format(
        (time.time() - t_start) / 60))
    print('Time per episode: {} s'.format(
        (time.time() - t_start) / args.num_episodes))
    print('Mean greedy reward: {}'.format(np.mean(greedy_reward)))
    print('Mean reward: {}'.format(np.mean(rewards)))
    print('Max reward: {}'.format(np.max(rewards)))
    print('Min reward: {}'.format(np.min(rewards)))

    # -- plot learning curve --
    pos_greedy = []
    for pos in range(0, len(lower_reward)):
        pos_greedy.append(pos * greedy_episode)

    plt.title('Greedy policy outcome')
    plt.fill_between(pos_greedy,
                     lower_reward,
                     upper_reward,
                     facecolor='red',
                     alpha=0.3)
    plt.plot(pos_greedy, avg_greedy_reward, 'r')
    plt.xlabel('Number of episodes')
    plt.ylabel('Rewards')
    fname1 = 'plot1_obs_{}_{}_{}'.format(args.env_name, args.batch_size,
                                         '.png')
    plt.savefig(fname1)
    plt.close()

    plt.title('Steps to reach goal')
    plt.plot(steps_to_goal)
    plt.ylabel('Number of steps')
    plt.xlabel('Number of episodes')
    fname2 = 'plot2_obs_{}_{}_{}'.format(args.env_name, args.batch_size,
                                         '.png')
    plt.savefig(fname2)
    plt.close()
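Note: the noise-scale annealing line repeated across these examples linearly decays the Ornstein-Uhlenbeck scale from --noise_scale down to --final_noise_scale over the first --exploration_end episodes. The small helper below computes the same value in isolation; the function name is introduced here for illustration only, and the printed endpoints use the defaults from Example #3 (0.4 to 0.3 over 33 episodes).

def annealed_noise_scale(i_episode, noise_scale, final_noise_scale, exploration_end):
    """Linear decay from noise_scale to final_noise_scale over exploration_end episodes."""
    frac = max(0, exploration_end - i_episode) / exploration_end
    return (noise_scale - final_noise_scale) * frac + final_noise_scale

print(annealed_noise_scale(0, 0.4, 0.3, 33))   # -> 0.4 (full exploration noise at the start)
print(annealed_noise_scale(33, 0.4, 0.3, 33))  # -> 0.3 (floor value from episode 33 onwards)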
Code Example #4
File: main.py Project: lenvdv/pytorch-ddpg-naf
memory = ReplayMemory(args.replay_size)

ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.05, 
    desired_action_stddev=args.noise_scale, adaptation_coefficient=1.05) if args.param_noise else None

rewards = []
total_numsteps = 0
updates = 0

for i_episode in range(args.num_episodes):
    state = torch.Tensor([env.reset()])

    if args.ou_noise: 
        ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end -
                                                                      i_episode) / args.exploration_end + args.final_noise_scale
        ounoise.reset()

    if args.param_noise and args.algo == "DDPG":
        agent.perturb_actor_parameters(param_noise)

    episode_reward = 0
    while True:
        action = agent.select_action(state, ounoise, param_noise)
        next_state, reward, done, _ = env.step(action.numpy()[0])
        total_numsteps += 1
        episode_reward += reward

        action = torch.Tensor(action)
        mask = torch.Tensor([not done])
        next_state = torch.Tensor([next_state])
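Note: Examples #4, #5, #7, #8 and #9 optionally build an AdaptiveParamNoiseSpec, and Example #8 calls its adapt() method with a DDPG action-distance. The sketch below follows the adaptive parameter-noise scheme from Plappert et al. used by the reference DDPG/NAF code these snippets appear to be based on: the perturbation grows when its measured effect on actions is below the desired level and shrinks otherwise. Treat it as a reference sketch, not the exact class shipped with the projects above.

class AdaptiveParamNoiseSpec:
    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.2,
                 adaptation_coefficient=1.01):
        self.desired_action_stddev = desired_action_stddev
        self.adaptation_coefficient = adaptation_coefficient
        self.current_stddev = initial_stddev

    def adapt(self, distance):
        # distance: how far perturbed actions drifted from unperturbed ones
        if distance > self.desired_action_stddev:
            self.current_stddev /= self.adaptation_coefficient  # too much drift: shrink the noise
        else:
            self.current_stddev *= self.adaptation_coefficient  # too little drift: grow the noise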
Code Example #5
def fit_nash():
    agent_vehicle = NAF(args.gamma, args.tau, args.hidden_size,
                        env.observation_space, env.vehicle_action_space)
    agent_attacker = NAF(args.gamma, args.tau, args.hidden_size,
                         env.observation_space, env.attacker_action_space)

    policy_vehicle = create_SL_model(env.observation_space, env.vehicle_action_space)
    policy_attacker = create_SL_model(env.observation_space, env.attacker_action_space)

    memory_vehicle = ReplayMemory(1000000)
    memory_attacker = ReplayMemory(1000000)

    memory_SL_vehicle = ReplayMemory(100000)
    memory_SL_attacker = ReplayMemory(100000)

    ounoise_vehicle = OUNoise(env.vehicle_action_space) if args.ou_noise else None
    ounoise_attacker = OUNoise(env.attacker_action_space) if args.ou_noise else None

    param_noise_vehicle = AdaptiveParamNoiseSpec(initial_stddev=0.05,
                                                 desired_action_stddev=args.noise_scale,
                                                 adaptation_coefficient=1.05) if args.param_noise else None
    param_noise_attacker = AdaptiveParamNoiseSpec(initial_stddev=0.05,
                                                  desired_action_stddev=args.noise_scale,
                                                  adaptation_coefficient=1.05) if args.param_noise else None

    rewards = []
    eva_reward = []
    ave_reward = []
    eva_ac_veh = []
    eva_ac_att = []
    total_numsteps = 0
    updates = 0
    # while len(state_record) < 20:
    #     s, _, _ = env.step(env.random_action())
    #     state_record.append(s)
    for i_episode in range(args.num_episodes):
        state = env.reset()
        if args.ou_noise:
            ounoise_vehicle.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end -
                                                                                      i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_vehicle.reset()

            ounoise_attacker.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end -
                                                                                       i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_attacker.reset()
        episode_reward = 0
        while True:
            if random.random() < ETA:
                action_vehicle = agent_vehicle.select_action(torch.Tensor([[state]]), ounoise_vehicle,
                                                             param_noise_vehicle)
                action_attacker = agent_attacker.select_action(torch.Tensor([[state]]), ounoise_attacker,
                                                               param_noise_attacker)
            else:
                action_vehicle = torch.Tensor(
                    [policy_vehicle.predict(state.reshape(-1, 4)) / policy_vehicle.predict(state.reshape(-1, 4)).sum()])
                action_attacker = torch.Tensor([policy_attacker.predict(state.reshape(-1, 4)) / policy_attacker.predict(
                    state.reshape(-1, 4)).sum()])
            if is_cuda:
                ac_v, ac_a = action_vehicle.cpu().numpy()[0], action_attacker.cpu().numpy()[0]
            else:
                ac_v, ac_a = action_vehicle.numpy()[0], action_attacker.numpy()[0]
            next_state, reward, done = env.step(ac_v, ac_a)
            total_numsteps += 1
            episode_reward += reward

            memory_SL_vehicle.append(state, ac_v)
            memory_SL_attacker.append(state, ac_a)

            action_vehicle = torch.Tensor(action_vehicle)
            action_attacker = torch.Tensor(action_attacker)

            mask = torch.Tensor([not done])
            next_state = torch.Tensor([next_state])

            reward_vehicle = torch.Tensor([reward])
            reward_attacker = torch.Tensor([env.RC - reward])
            memory_vehicle.push(torch.Tensor([[state]]), action_vehicle, mask, next_state, reward_vehicle)
            memory_attacker.push(torch.Tensor([[state]]), action_attacker, mask, next_state, reward_attacker)

            state = next_state.numpy()[0][0]

            if done:
                rewards.append(episode_reward)
                if i_episode % 100 == 0:
                    print('Episode {} ends, instant reward is {:.2f}'.format(i_episode, episode_reward))
                break

        if len(memory_vehicle) > args.batch_size:  # start training
            # print('begin training')
            for _ in range(args.updates_per_step):
                transitions_vehicle = memory_vehicle.sample(args.batch_size)
                batch_vehicle = Transition(*zip(*transitions_vehicle))

                transitions_attacker = memory_attacker.sample(args.batch_size)
                batch_attacker = Transition(*zip(*transitions_attacker))

                trans_veh = memory_SL_vehicle.sample(args.batch_size)
                trans_att = memory_SL_attacker.sample(args.batch_size)

                states_veh = []
                actions_veh = []
                states_att = []
                actions_att = []
                for sample in trans_veh:
                    state_veh, act_veh = sample
                    states_veh.append(state_veh)
                    actions_veh.append(act_veh)
                for sample in trans_att:
                    state_att, act_att = sample
                    states_att.append(state_att)
                    actions_att.append(act_att)

                states_veh = np.reshape(states_veh, (-1, env.observation_space))
                states_att = np.reshape(states_att, (-1, env.observation_space))
                actions_veh = np.reshape(actions_veh, (-1, env.vehicle_action_space))
                actions_att = np.reshape(actions_att, (-1, env.attacker_action_space))

                policy_vehicle.fit(states_veh, actions_veh, verbose=False)
                policy_attacker.fit(states_att, actions_att, verbose=False)
                value_loss_vehicle, policy_loss_vehicle = agent_vehicle.update_parameters(batch_vehicle)
                value_loss_attacker, policy_loss_attacker = agent_attacker.update_parameters(batch_attacker)

                # writer.add_scalar('loss/value', value_loss, updates)
                # writer.add_scalar('loss/policy', policy_loss, updates)

                updates += 1

        if i_episode % 10 == 0:
            state = env.reset()
            evaluate_reward = 0
            while True:
                if random.random() < ETA:
                    action_vehicle = agent_vehicle.select_action(torch.Tensor([[state]]), ounoise_vehicle,
                                                                 param_noise_vehicle)
                    action_attacker = agent_attacker.select_action(torch.Tensor([[state]]), ounoise_attacker,
                                                                   param_noise_attacker)
                else:
                    action_vehicle = torch.Tensor([policy_vehicle.predict(
                        state.reshape(-1, 4)) / policy_vehicle.predict(state.reshape(-1, 4)).sum()])
                    action_attacker = torch.Tensor([policy_attacker.predict(
                        state.reshape(-1, 4)) / policy_attacker.predict(state.reshape(-1, 4)).sum()])
                if is_cuda:
                    ac_v, ac_a = action_vehicle.cpu().numpy()[0], action_attacker.cpu().numpy()[0]
                else:
                    ac_v, ac_a = action_vehicle.numpy()[0], action_attacker.numpy()[0]
                next_state, reward, done = env.step(ac_v, ac_a)
                total_numsteps += 1
                evaluate_reward += reward

                state = next_state[0]
                if done:
                    average_reward = np.mean(rewards[-10:])
                    print("{} % Episode finished, total numsteps: {}, reward: {}, average reward: {}".format(
                        i_episode / args.num_episodes * 100,
                        total_numsteps,
                        evaluate_reward,
                        average_reward))
                    eva_reward.append(evaluate_reward)
                    ave_reward.append(average_reward)
                    print(ac_v[0])
                    eva_ac_veh.append((ac_v[0] + 1) / sum(ac_v[0] + 1))
                    eva_ac_att.append((ac_a[0] + 1) / sum(ac_a[0] + 1))
                    break
            # writer.add_scalar('reward/test', episode_reward, i_episode)
    env.close()
    f = plt.figure()
    plt.plot(eva_reward, label='Eva_reward')
    plt.plot(ave_reward, label='Tra_ave_reward')
    plt.legend()
    plt.show()
    AC_veh = np.array(eva_ac_veh)
    AC_att = np.array(eva_ac_att)
    # print(AC_veh.shape)
    # print(AC_veh)
    plt.plot(AC_veh[:, 0], label='Bacon1')
    plt.plot(AC_veh[:, 1], label='Bacon2')
    plt.plot(AC_veh[:, 2], label='Bacon3')
    plt.plot(AC_veh[:, 3], label='Bacon4')
    # plt.plot(ave_reward, label='Tra_ave_reward')
    plt.legend()
    plt.savefig('Veh_result.png', dpi=300)
    plt.show()
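Note: Examples #5, #7 and #9 build a supervised "average policy" with create_SL_model(...) and then call .predict() and .fit() on it, which suggests a small Keras model; its architecture is not shown anywhere on this page. A minimal sketch under that assumption follows: the layer sizes, activations and loss are guesses, and only the constructor signature and the predict/fit usage come from the snippets.

from tensorflow import keras

def create_SL_model(observation_space, action_space, name='policy'):
    """Map an observation vector to a normalized action vector (softmax output)."""
    model = keras.Sequential([
        keras.layers.Dense(64, activation='relu', input_shape=(observation_space,)),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(action_space, activation='softmax'),
    ], name=name)
    model.compile(optimizer='adam', loss='mse')
    return model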
Code Example #6
def main():
    parser = argparse.ArgumentParser(description='PyTorch X-job')
    parser.add_argument('--env_name',
                        default="Pendulum-v0",
                        help='name of the environment')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.99,
                        metavar='G',
                        help='discount factor for reward (default: 0.99)')
    parser.add_argument('--tau',
                        type=float,
                        default=0.001,
                        help='soft target update coefficient (default: 0.001)')
    parser.add_argument('--ou_noise', type=bool, default=True)
    parser.add_argument('--noise_scale',
                        type=float,
                        default=0.4,
                        metavar='G',
                        help='initial noise scale (default: 0.4)')
    parser.add_argument('--final_noise_scale',
                        type=float,
                        default=0.3,
                        metavar='G',
                        help='final noise scale (default: 0.3)')
    parser.add_argument('--exploration_end',
                        type=int,
                        default=33,
                        metavar='N',
                        help='number of episodes with noise (default: 33)')
    parser.add_argument('--seed',
                        type=int,
                        default=4,
                        metavar='N',
                        help='random seed (default: 4)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=200,
                        metavar='N',
                        help='batch size (default: 200)')
    parser.add_argument('--num_steps',
                        type=int,
                        default=100,
                        metavar='N',
                        help='max episode length (default: 100)')
    parser.add_argument('--num_episodes',
                        type=int,
                        default=5000,
                        metavar='N',
                        help='number of episodes (default: 5000)')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=128,
                        metavar='N',
                        help='hidden size (default: 128)')
    parser.add_argument('--updates_per_step',
                        type=int,
                        default=5,
                        metavar='N',
                        help='model updates per simulator step (default: 5)')
    parser.add_argument('--replay_size',
                        type=int,
                        default=1000000,
                        metavar='N',
                        help='size of replay buffer (default: 1000000)')
    parser.add_argument('--save_agent',
                        type=bool,
                        default=True,
                        help='save model to file')
    parser.add_argument('--train_model',
                        type=bool,
                        default=True,
                        help='Training or run')
    parser.add_argument('--load_agent',
                        type=bool,
                        default=False,
                        help='load model from file')
    parser.add_argument('--load_exp',
                        type=bool,
                        default=False,
                        help='load saved experience')
    parser.add_argument('--greedy_steps',
                        type=int,
                        default=10,
                        metavar='N',
                        help='number of greedy evaluation runs (default: 10)')

    args = parser.parse_args()

    env = ManipulateEnv()
    #env = gym.make(args.env_name)
    writer = SummaryWriter('runs/')

    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # -- initialize agent --
    agent = NAF(args.gamma, args.tau, args.hidden_size,
                env.observation_space.shape[0], env.action_space)

    # -- declare memory buffer and random process N
    memory = ReplayMemory(args.replay_size)
    ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None

    # -- load existing model --
    if args.load_agent:
        agent.load_model(args.env_name, args.batch_size, args.num_episodes,
                         '.pth')
        print("agent: naf_{}_{}_{}_{}, is loaded".format(
            args.env_name, args.batch_size, args.num_episodes, '.pth'))

    # -- load experience buffer --
    if args.load_exp:
        with open(
                '/home/quantao/Workspaces/catkin_ws/src/panda_demos/naf_env/src/exp_replay.pk1',
                'rb') as input:
            memory.memory = pickle.load(input)
            memory.position = len(memory)

    rewards = []
    total_numsteps = 0
    updates = 0

    #env.init_ros()
    #env.reset()

    t_start = time.time()

    for i_episode in range(args.num_episodes + 1):
        # -- reset environment for every episode --
        #state = env.reset()
        state = torch.Tensor([env.reset()])

        # -- initialize noise (random process N) --
        if args.ou_noise:
            ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end -
                i_episode) / args.exploration_end + args.final_noise_scale
            ounoise.reset()

        episode_reward = 0
        while True:
            # -- action selection, observation and store transition --
            action = agent.select_action(
                state,
                ounoise) if args.train_model else agent.select_action(state)

            next_state, reward, done, info = env.step(action)

            #env.render()
            total_numsteps += 1
            episode_reward += reward

            action = torch.Tensor(action)
            mask = torch.Tensor([not done])
            reward = torch.Tensor([reward])
            next_state = torch.Tensor([next_state])

            #print('reward:', reward)
            memory.push(state, action, mask, next_state, reward)

            state = next_state

            #else:
            #    time.sleep(0.005)

            #env.render()
            #time.sleep(0.005)
            #env.rate.sleep()

            if done or total_numsteps % args.num_steps == 0:
                break

        if len(memory) >= args.batch_size and args.train_model:
            env.reset()
            print("Training model")

            for _ in range(args.updates_per_step * args.num_steps):
                transitions = memory.sample(args.batch_size)
                batch = Transition(*zip(*transitions))
                value_loss, policy_loss = agent.update_parameters(batch)

                writer.add_scalar('loss/value', value_loss, updates)
                writer.add_scalar('loss/policy', policy_loss, updates)

                updates += 1
        writer.add_scalar('reward/train', episode_reward, i_episode)
        print("Train Episode: {}, total numsteps: {}, reward: {}".format(
            i_episode, total_numsteps, episode_reward))

        rewards.append(episode_reward)

        greedy_numsteps = 0
        if i_episode % 10 == 0:
            #state = env.reset()
            state = torch.Tensor([env.reset()])

            episode_reward = 0
            while True:
                action = agent.select_action(state)

                next_state, reward, done, info = env.step(action)
                episode_reward += reward
                greedy_numsteps += 1

                #state = next_state
                state = torch.Tensor([next_state])

                #env.render()
                #time.sleep(0.01)
                #   env.rate.sleep()

                if done or greedy_numsteps % args.num_steps == 0:
                    break

            writer.add_scalar('reward/test', episode_reward, i_episode)

            rewards.append(episode_reward)
            print(
                "Episode: {}, total numsteps: {}, reward: {}, average reward: {}"
                .format(i_episode, total_numsteps, rewards[-1],
                        np.mean(rewards[-10:])))

    #-- saves model --
    if args.save_agent:
        agent.save_model(args.env_name, args.batch_size, args.num_episodes,
                         '.pth')
        with open('exp_replay.pk1', 'wb') as output:
            pickle.dump(memory.memory, output, pickle.HIGHEST_PROTOCOL)

    print('Training ended after {} minutes'.format(
        (time.time() - t_start) / 60))
    print('Time per episode: {} s'.format(
        (time.time() - t_start) / args.num_episodes))
    print('Mean reward: {}'.format(np.mean(rewards)))
    print('Max reward: {}'.format(np.max(rewards)))
    print('Min reward: {}'.format(np.min(rewards)))
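Note: every training loop on this page converts a sampled list of transitions into a column-wise batch with Transition(*zip(*transitions)) before calling agent.update_parameters(batch). A tiny illustration of that transpose, using the Transition namedtuple sketched after Example #2 and dummy scalar values:

transitions = [
    Transition(state=1, action=2, mask=True, next_state=3, reward=0.5),
    Transition(state=4, action=5, mask=False, next_state=6, reward=1.0),
]
batch = Transition(*zip(*transitions))
print(batch.state)   # (1, 4)
print(batch.reward)  # (0.5, 1.0)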
Code Example #7
def fit_nash():
    agent_vehicle = NAF(args.gamma, args.tau, args.hidden_size,
                        env.observation_space, env.vehicle_action_space)
    agent_attacker = NAF(args.gamma, args.tau, args.hidden_size,
                         env.observation_space, env.attacker_action_space)

    policy_vehicle = create_SL_model(env.observation_space, env.vehicle_action_space)
    policy_attacker = create_SL_model(env.observation_space, env.attacker_action_space)

    memory_vehicle = ReplayMemory(1000000)
    memory_attacker = ReplayMemory(1000000)

    memory_SL_vehicle = ReplayMemory(100000)
    memory_SL_attacker = ReplayMemory(100000)

    ounoise_vehicle = OUNoise(env.vehicle_action_space) if args.ou_noise else None
    ounoise_attacker = OUNoise(env.attacker_action_space) if args.ou_noise else None

    param_noise_vehicle = AdaptiveParamNoiseSpec(initial_stddev=0.05,
                                                 desired_action_stddev=args.noise_scale,
                                                 adaptation_coefficient=1.05) if args.param_noise else None
    param_noise_attacker = AdaptiveParamNoiseSpec(initial_stddev=0.05,
                                                  desired_action_stddev=args.noise_scale,
                                                  adaptation_coefficient=1.05) if args.param_noise else None

    rewards = []
    eva_reward = []
    ave_reward = []
    tra_ac_veh = []
    tra_ac_att = []
    All_reward=[]
    total_numsteps = 0
    updates = 0
    state_record = [env.reset()]
    # while len(state_record) < 20:
    #     s, _, _ = env.step(*env.random_action())
    #     state_record.append(s)
    # print(torch.Tensor([state_record[-20:]]).shape)
    for i_episode in range(args.num_episodes):
        local_steps = 0
        state = env.reset()
        state_record = [np.array([state])]
        episode_steps = 0
        while len(state_record) < 20:
            a, b = env.random_action()
            s, _, _ = env.step(np.array([a]), np.array([b]))
            local_steps += 1
            state_record.append(s)
        if args.ou_noise:
            ounoise_vehicle.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end -
                                                                                      i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_vehicle.reset()

            ounoise_attacker.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end -
                                                                                       i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_attacker.reset()
        episode_reward = 0
        local_steps = 0
        while True:
            if random.random() < ETA:
                # print(state_record[-20:])
                # print('rl', torch.Tensor(state_record[-20:]).shape)
                action_vehicle = agent_vehicle.select_action(torch.Tensor(state_record[-20:]), ounoise_vehicle,
                                                             param_noise_vehicle)[:, -1, :]
                # print('rl', action_vehicle.shape)
                action_attacker = agent_attacker.select_action(torch.Tensor(state_record[-20:]), ounoise_attacker,
                                                               param_noise_attacker)[:, -1, :]
                # print('rl', action_vehicle.shape)
            else:
                action_vehicle = torch.Tensor(
                    [policy_vehicle.predict(state_record[-1].reshape(-1, 4)) / policy_vehicle.predict(
                        state_record[-1].reshape(-1, 4)).sum()])[0]
                action_attacker = torch.Tensor(
                    [policy_attacker.predict(state_record[-1].reshape(-1, 4)) / policy_attacker.predict(
                        state_record[-1].reshape(-1, 4)).sum()])[0]
                # print('sl', action_vehicle.shape)
                # print('sl', action_vehicle.shape)
            if is_cuda:
                ac_v, ac_a = action_vehicle.cpu().numpy(), action_attacker.cpu().numpy()[0]
            else:
                ac_v, ac_a = action_vehicle.numpy(), action_attacker.numpy()
            next_state, reward, done = env.step(ac_v, ac_a)
            # print('tra_reward', reward)
            # print(np.shape(state_record), next_state[0].shape)
            state_record.append(next_state)
            local_steps += 1
            total_numsteps += 1
            episode_steps += 1
            episode_reward += reward
            # print('sl-mem',state.shape,ac_v.shape)
            # print('sl state mem', state.shape, ac_a.shape)
            memory_SL_vehicle.append(state_record[-1], ac_v)
            memory_SL_attacker.append(state_record[-1], ac_a)

            action_vehicle = torch.Tensor(action_vehicle)
            action_attacker = torch.Tensor(action_attacker)

            mask = torch.Tensor([not done])

            prev_state = torch.Tensor(state_record[-20:]).transpose(0, 1)
            next_state = torch.Tensor([next_state])
            # print(prev_state.shape, next_state.shape)
            reward_vehicle = torch.Tensor([reward])
            reward_attacker = torch.Tensor([env.RC - reward])
            # print(state_record[-20:])
            # print(torch.Tensor([state_record[-20:]]).shape)
            memory_vehicle.push(prev_state, action_vehicle, mask, next_state, reward_vehicle)
            memory_attacker.push(prev_state, action_attacker, mask, next_state, reward_attacker)

            state = next_state.numpy()[0]
            # print(state_record[-1].shape)

            if done:
                rewards.append(episode_reward)
                if i_episode % 100 == 0:
                    print('Episode {} ends, local_steps {}. total_steps {}, instant ave-reward is {:.4f}'.format(
                        i_episode, local_steps, total_numsteps, episode_reward))

                break

        if len(memory_vehicle) > args.batch_size:  # start training
            # print('begin training')
            for _ in range(args.updates_per_step):
                transitions_vehicle = memory_vehicle.sample(args.batch_size)
                batch_vehicle = Transition(*zip(*transitions_vehicle))

                transitions_attacker = memory_attacker.sample(args.batch_size)
                batch_attacker = Transition(*zip(*transitions_attacker))
                # print(batch_vehicle)

                trans_veh = memory_SL_vehicle.sample(args.batch_size)
                trans_att = memory_SL_attacker.sample(args.batch_size)

                states_veh = []
                actions_veh = []
                states_att = []
                actions_att = []
                for sample in trans_veh:
                    state_veh, act_veh = sample
                    states_veh.append(state_veh)
                    actions_veh.append(act_veh)
                for sample in trans_att:
                    state_att, act_att = sample
                    states_att.append(state_att)
                    actions_att.append(act_att)

                states_veh = np.reshape(states_veh, (-1, env.observation_space))
                states_att = np.reshape(states_att, (-1, env.observation_space))
                actions_veh = np.reshape(actions_veh, (-1, env.vehicle_action_space))
                actions_att = np.reshape(actions_att, (-1, env.attacker_action_space))

                policy_vehicle.fit(states_veh, actions_veh, verbose=False)
                policy_attacker.fit(states_att, actions_att, verbose=False)
                value_loss_vehicle, policy_loss_vehicle = agent_vehicle.update_parameters(batch_vehicle)
                value_loss_attacker, policy_loss_attacker = agent_attacker.update_parameters(batch_attacker)

                # writer.add_scalar('loss/value', value_loss, updates)
                # writer.add_scalar('loss/policy', policy_loss, updates)

                updates += 1

        if i_episode % 10 == 0 and i_episode > 0:
            state = env.reset()
            state_record = [np.array([state])]
            while len(state_record) < 20:
                a, b = env.random_action()
                s, _, _ = env.step(np.array([a]), np.array([b]))
                local_steps += 1
                state_record.append(s)
            evaluate_reward = 0
            while True:
                # la = np.random.randint(0, len(state_record) - 20, 1)[0]
                if random.random() < ETA:
                    action_vehicle = agent_vehicle.select_action(torch.Tensor(state_record[-20:]),
                                                                 ounoise_vehicle,
                                                                 param_noise_vehicle)[:, -1, :]
                    action_attacker = agent_attacker.select_action(torch.Tensor(state_record[-20:]),
                                                                   ounoise_attacker,
                                                                   param_noise_attacker)[:, -1, :]
                else:
                    action_vehicle = torch.Tensor([policy_vehicle.predict(
                        state_record[-1].reshape(-1, 4)) / policy_vehicle.predict(
                        state_record[-1].reshape(-1, 4)).sum()])[0]
                    action_attacker = torch.Tensor([policy_attacker.predict(
                        state_record[-1].reshape(-1, 4)) / policy_attacker.predict(
                        state_record[-1].reshape(-1, 4)).sum()])[0]
                ac_v, ac_a = action_vehicle.numpy(), action_attacker.numpy()
                next_state, reward, done = env.step(ac_v, ac_a)
                real_ac_v = ac_v[0].clip(-1, 1) + 1
                tra_ac_veh.append(real_ac_v / (sum(real_ac_v) + 0.0000001))
                tra_ac_att.append(ac_a[0])
                state_record.append(next_state)
                total_numsteps += 1
                local_steps += 1
                # print('eva_reward', reward)
                evaluate_reward += reward

                state = next_state[0]
                if done:
                    average_reward = np.mean(rewards[-10:])
                    print("{} % Episode finished, total numsteps: {}, eva-reward: {}, average reward: {}".format(
                        i_episode / args.num_episodes * 100,
                        total_numsteps,
                        evaluate_reward,
                        average_reward))
                    eva_reward.append(evaluate_reward)
                    ave_reward.append(average_reward)
                    # print(ac_v[0])
                    break
            # writer.add_scalar('reward/test', episode_reward, i_episode)
    env.close()
    df = pd.DataFrame()
    df['Eva'] = pd.Series(eva_reward)
    df['Tra'] = pd.Series(ave_reward)
    df2 = pd.DataFrame()
    df2['Weight'] = pd.Series(tra_ac_veh)
    df2['Attack'] = pd.Series(tra_ac_att)
    df.to_csv('./Result/reward_result_30.csv', index=None)
    df2.to_csv('./Result/action_result_30.csv', index=None)
    # np.savetxt('./Result/eva_result.csv', eva_reward, delimiter=',')
    # np.savetxt('./Result/ave_result.csv', ave_reward, delimiter=',')

    f = plt.figure()
    plt.plot(rewards[5:], label='Eva_reward')
    plt.show()
    AC_veh = np.array(tra_ac_veh)
    AC_att = np.array(tra_ac_att)
    # print(AC_veh.shape)
    # print(AC_veh)
    plt.plot(AC_veh[:, 0], label='Bacon1', alpha=0.2)
    plt.plot(AC_veh[:, 1], label='Bacon2', alpha=0.2)
    plt.plot(AC_veh[:, 2], label='Bacon3', alpha=0.2)
    plt.plot(AC_veh[:, 3], label='Bacon4', alpha=0.2)
    # plt.plot(ave_reward, label='Tra_ave_reward')
    plt.legend()
    plt.savefig('./Result/Veh_result_30.png', dpi=300)
    plt.show()
    # print(AC_veh.shape)
    # print(AC_veh)
    plt.plot(AC_att[:, 0], label='Attack1', alpha=0.2)
    plt.plot(AC_att[:, 1], label='Attack2', alpha=0.2)
    plt.plot(AC_att[:, 2], label='Attack3', alpha=0.2)
    plt.plot(AC_att[:, 3], label='Attack4', alpha=0.2)
    # plt.plot(ave_reward, label='Tra_ave_reward')
    # plt.title('')
    plt.legend()
    plt.savefig('./Result/Att_result_30.png', dpi=300)
    plt.show()
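Note: Examples #7 and #9 feed the agents a sliding window of the last 20 observations and keep only the final time step of the network output via [:, -1, :]. The shape bookkeeping is easier to see in isolation; the check below assumes 4-dimensional observations stored as (1, 4) arrays, as in those snippets.

import numpy as np
import torch

state_record = [np.random.randn(1, 4) for _ in range(25)]  # stand-in observations, one (1, 4) array per step

window = torch.as_tensor(np.stack(state_record[-20:]), dtype=torch.float32)  # (20, 1, 4): 20 steps, batch of 1
prev_state = window.transpose(0, 1)                                          # (1, 20, 4): batch-first sequence
print(window.shape, prev_state.shape)

The snippets call torch.Tensor(state_record[-20:]) directly on the list, which yields the same (20, 1, 4) shape.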
Code Example #8
def main():
    agent_vehicle = NAF(args.gamma, args.tau, args.hidden_size,
                         env.observation_space, env.vehicle_action_space)
    agent_attacker = NAF(args.gamma, args.tau, args.hidden_size,
                         env.observation_space, env.attacker_action_space)


    vehicle_memory = ReplayMemory(1000000)
    attacker_memory = ReplayMemory(1000000)


    vehicle_ounoise = OUNoise(env.vehicle_action_space) if args.ou_noise else None
    attacker_ounoise = OUNoise(env.attacker_action_space) if args.ou_noise else None

    param_noise_vehicle = AdaptiveParamNoiseSpec(initial_stddev=0.05,
                                         desired_action_stddev=args.noise_scale,
                                         adaptation_coefficient=1.05) if args.param_noise else None
    param_noise_attacker = AdaptiveParamNoiseSpec(initial_stddev=0.05,
                                         desired_action_stddev=args.noise_scale,
                                         adaptation_coefficient=1.05) if args.param_noise else None

    rewards = []
    total_numsteps = 0
    updates = 0

    for i_episode in range(args.num_episodes):
        state = torch.Tensor([[env.reset()]])  # 4-dimensional velocity observation

        if args.ou_noise:
            vehicle_ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end -
                                                                              i_episode) / args.exploration_end + args.final_noise_scale
            vehicle_ounoise.reset()

            attacker_ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end -
                                                                                      i_episode) / args.exploration_end + args.final_noise_scale
            attacker_ounoise.reset()

        episode_reward = 0
        episode_steps = 0  # steps taken this episode; used below to slice this episode's transitions out of memory

        while True:
            action_vehicle = agent_vehicle.select_action(state, vehicle_ounoise, param_noise_vehicle)
            action_attacker = agent_attacker.select_action(state, attacker_ounoise, param_noise_attacker)

            next_state, reward, done = env.step(action_vehicle.numpy()[0], action_attacker.numpy()[0])
            total_numsteps += 1
            episode_steps += 1
            episode_reward += reward

            action_vehicle = torch.Tensor(action_vehicle)
            action_attacker = torch.Tensor(action_attacker)

            mask = torch.Tensor([not done])
            next_state = torch.Tensor([next_state])

            reward_vehicle = torch.Tensor([-reward])
            reward_attacker = torch.Tensor([env.RC+reward])

            vehicle_memory.push(state, action_vehicle, mask, next_state, reward_vehicle)
            attacker_memory.push(state, action_attacker, mask, next_state, reward_attacker)

            state = next_state

            if len(vehicle_memory) > args.batch_size:
                for _ in range(args.updates_per_step):
                    transitions_vehicle = vehicle_memory.sample(args.batch_size)
                    batch_vehicle = Transition(*zip(*transitions_vehicle))

                    transition_attacker = attacker_memory.sample(args.batch_size)
                    batch_attacker = Transition(*zip(*transition_attacker))

                    value_loss_1, policy_loss_1 = agent_vehicle.update_parameters(batch_vehicle)
                    value_loss_2, policy_loss_2 = agent_attacker.update_parameters(batch_attacker)

                    # writer.add_scalar('loss/value', value_loss, updates)
                    # writer.add_scalar('loss/policy', policy_loss, updates)

                    updates += 1

            if done:
                break

        # writer.add_scalar('reward/train', episode_reward, i_episode)

        # Update param_noise based on distance metric
        if args.param_noise:
            episode_transitions_vehicle = vehicle_memory.memory[vehicle_memory.position - episode_steps:vehicle_memory.position]
            states_vehicle = torch.cat([transition[0] for transition in episode_transitions_vehicle], 0)
            unperturbed_actions_vehicle = agent_vehicle.select_action(states_vehicle, None, None)
            perturbed_actions_vehicle = torch.cat([transition[1] for transition in episode_transitions_vehicle], 0)

            ddpg_dist_vehicle = ddpg_distance_metric(perturbed_actions_vehicle.numpy(), unperturbed_actions_vehicle.numpy())
            param_noise_vehicle.adapt(ddpg_dist_vehicle)

            episode_transitions_attacker = attacker_memory.memory[attacker_memory.position - episode_steps:attacker_memory.position]
            states_attacker = torch.cat([transition[0] for transition in episode_transitions_attacker], 0)
            unperturbed_actions_attacker = agent_attacker.select_action(states_attacker, None, None)
            perturbed_actions_attacker = torch.cat([transition[1] for transition in episode_transitions_attacker], 0)

            ddpg_dist_attacker = ddpg_distance_metric(perturbed_actions_attacker.numpy(), unperturbed_actions_attacker.numpy())
            param_noise_attacker.adapt(ddpg_dist_attacker)

        rewards.append(episode_reward)

        if i_episode % 10 == 0:
            state = torch.Tensor([[env.reset()]])
            episode_reward = 0
            while True:
                action_vehicle = agent_vehicle.select_action(state, vehicle_ounoise, param_noise_vehicle)
                action_attacker = agent_attacker.select_action(state, attacker_ounoise, param_noise_attacker)

                next_state, reward, done = env.step(action_vehicle.numpy()[0], action_attacker.numpy()[0])
                episode_reward += reward

                next_state = torch.Tensor([[next_state]])

                state = next_state
                if done:
                    break

            # writer.add_scalar('reward/test', episode_reward, i_episode)

            rewards.append(episode_reward)
            print("Episode: {}, total numsteps: {}, reward: {}, average reward: {}".format(i_episode, total_numsteps,
                                                                                           rewards[-1],
                                                                                           np.mean(rewards[-10:])))

    env.close()
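Note: Example #8 adapts the parameter noise with ddpg_distance_metric(perturbed_actions, unperturbed_actions), which is not defined on this page. In the parameter-noise reference code this metric is the root-mean-square difference between the two action batches; a sketch under that assumption:

import numpy as np

def ddpg_distance_metric(actions1, actions2):
    """RMS distance between two equally shaped batches of actions."""
    diff = actions1 - actions2
    mean_diff = np.mean(np.square(diff), axis=0)
    return np.sqrt(np.mean(mean_diff))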
Code Example #9
def fit_nash():
    suffix = 'Nash_{}_RC_{}_AttackMode_{}_RewardMode_{}'.format(args.NashMode, RC, args.AttackMode, args.RewardMode)
    # reward_file = open('reward' + suffix + '.txt', 'w')
    # attack_file = open('attacker_action' + suffix + '.txt', 'w')
    # weight_file = open('vehicle_weight' + suffix + '.txt', 'w')
    # distance_file = open('Distance' + suffix + '.txt', 'w')



#     reward_file.write("""
# Environment Initializing...
# The initial head car velocity is {}
# The initial safe distance is     {}
# The Nash Eq* Factor RC is        {}
# The Reward Calculation Mode is   {}
# The Attack Mode is               {}
# The Nash Mode is                 {}
# """.format(env.v_head, env.d0, RC, env.reward_mode, env.attack_mode, args.Nash))

    # reward_file.close()
    # attack_file.close()
    # weight_file.close()
    # distance_file.close()

    agent_vehicle = NAF(args.gamma, args.tau, args.hidden_size,
                        env.observation_space, env.vehicle_action_space, 'veh')
    agent_attacker = NAF(args.gamma, args.tau, args.hidden_size,
                         env.observation_space, env.attacker_action_space, 'att')
    try:
        agent_vehicle.load_model('models/vehicle_' + suffix)
        print('Load vehicle RL model successfully')

    except:
        print('No existed vehicle RL model')
    try:
        agent_attacker.load_model('models/attacker_' + suffix)
        print('Load attacker RL model successfully')

    except:
        print('No existed attacker RL model')
    try:
        policy_vehicle = load_model('models/vehicle_' + suffix + '.h5')
        print('Load vehicle SL model successfully')
    except:
        policy_vehicle = create_SL_model(env.observation_space, env.vehicle_action_space, 'vehicle')
    try:
        policy_attacker = load_model('models/attacker_' + suffix + '.h5')
        print('Load attacker SL model successfully')
    except:
        policy_attacker = create_SL_model(env.observation_space, env.attacker_action_space, 'attacker')
    print('*'*20, '\n\n\n')
    memory_vehicle = ReplayMemory(100000)
    memory_attacker = ReplayMemory(100000)

    memory_SL_vehicle = ReplayMemory(400000)
    memory_SL_attacker = ReplayMemory(400000)

    ounoise_vehicle = OUNoise(env.vehicle_action_space) if args.ou_noise else None
    ounoise_attacker = OUNoise(env.attacker_action_space) if args.ou_noise else None

    param_noise_vehicle = AdaptiveParamNoiseSpec(initial_stddev=0.05,
                                                 desired_action_stddev=args.noise_scale,
                                                 adaptation_coefficient=1.05) if args.param_noise else None
    param_noise_attacker = AdaptiveParamNoiseSpec(initial_stddev=0.05,
                                                  desired_action_stddev=args.noise_scale,
                                                  adaptation_coefficient=1.05) if args.param_noise else None
    res_data = pd.DataFrame(columns=['Weight', 'Attack', 'Eva_distance'])
    reward_data = pd.DataFrame(columns=['Reward'])
    rewards = []
    total_numsteps = 0
    for i_episode in range(args.num_episodes):
        if i_episode % 100 == 0 and i_episode != 0:
            print('Writing to CSV files...')
            # write to separate files so the action/weight data does not overwrite the reward data
            reward_data.to_csv('reward_' + suffix + '.csv', index=False)
            res_data.to_csv('action_' + suffix + '.csv', index=False)

        if args.NashMode == 0:
            ETA = 0
        elif args.NashMode == 1:
            ETA = 0.5
        elif args.NashMode == 2:
            ETA = 0.1 - i_episode/args.num_episodes * 0.1

        print('No.{} episode starts... ETA is {}'.format(i_episode, ETA))

        # reward_file = open('reward' + suffix + '.txt', 'a')
        # attack_file = open('attacker_action' + suffix + '.txt', 'a')
        # weight_file = open('vehicle_weight' + suffix + '.txt', 'a')
        # distance_file = open('Distance' + suffix + '.txt', 'a')

        local_steps = 0
        state = env.reset()
        state_record = [np.array([state])]
        episode_steps = 0
        while len(state_record) < 20:
            a, b = env.random_action()
            s, _, _ = env.step(np.array([a]), np.zeros(4))
            local_steps += 1
            state_record.append(s)
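        # Linearly anneal the OU exploration noise from noise_scale down to
        # final_noise_scale over the first exploration_end episodes.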
        if args.ou_noise:
            ounoise_vehicle.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end -
                                                                                      i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_vehicle.reset()

            ounoise_attacker.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end -
                                                                                       i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_attacker.reset()
        episode_reward = 0
        local_steps = 0
        while True:
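            # Mixed action selection: with probability 1 - ETA act with the RL (NAF)
            # networks plus exploration noise, otherwise fall back to the supervised
            # (average-policy) networks.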
            sigma = random.random()
            if sigma > ETA:
                # print(state_record[-20:])
                # print('rl', torch.Tensor(state_record[-20:]).shape)
                action_vehicle = agent_vehicle.select_action(torch.Tensor(state_record[-20:]), ounoise_vehicle,
                                                             param_noise_vehicle)[:, -1, :]
                # print('rl', action_vehicle.shape)
                action_attacker = agent_attacker.select_action(torch.Tensor(state_record[-20:]), ounoise_attacker,
                                                               param_noise_attacker)[:, -1, :]
                # print('rl', action_vehicle.shape)
            else:
                action_vehicle = torch.Tensor(
                    [policy_vehicle.predict(state_record[-1].reshape(-1, 4)) / policy_vehicle.predict(
                        state_record[-1].reshape(-1, 4)).sum()])[0]
                action_attacker = torch.Tensor(
                    [policy_attacker.predict(state_record[-1].reshape(-1, 4)) / policy_attacker.predict(
                        state_record[-1].reshape(-1, 4)).sum()])[0]

            # Normalize the vehicle action so its weights sum to 1
            action_vehicle = action_vehicle.numpy()[0] / action_vehicle.numpy()[0].sum()
            action_attacker = action_attacker.numpy()[0]

            next_state, reward, done = env.step(action_vehicle, action_attacker)
            res_data = res_data.append([{'Attack':env.action_attacker, 'Weight':action_vehicle, 'Eva_distance':env.d}])

            # Overwrite the raw attacker action with the attack value the environment actually applied
            action_attacker = env.action_attacker

            total_numsteps += 1
            episode_reward += reward

            state_record.append(next_state)
            local_steps += 1
            episode_steps += 1

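            # Only actions chosen by the RL networks (sigma > ETA) are stored in the
            # SL buffers, so the supervised policies fit the agents' own behaviour.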
            if sigma > ETA:
                memory_SL_vehicle.append(state_record[-1], action_vehicle)
                memory_SL_attacker.append(state_record[-1], action_attacker)

            action_vehicle = torch.Tensor(action_vehicle.reshape(1,4))
            action_attacker = torch.Tensor(action_attacker.reshape(1,4))

            mask = torch.Tensor([not done])
            prev_state = torch.Tensor(state_record[-20:]).transpose(0, 1)
            next_state = torch.Tensor([next_state])

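            # Constant-sum rewards: the attacker receives RC - reward, so vehicle and
            # attacker play a constant-sum game around RC.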
            reward_vehicle = torch.Tensor([reward])
            reward_attacker = torch.Tensor([RC - reward])

            memory_vehicle.push(prev_state, torch.Tensor(action_vehicle), mask, next_state, reward_vehicle)
            memory_attacker.push(prev_state, torch.Tensor(action_attacker), mask, next_state, reward_attacker)

            if done:
                rewards.append(episode_reward)
                print('Episode {} ends, instant reward is {:.2f}'.format(i_episode, episode_reward))
                reward_data = reward_data.append([{'Reward': episode_reward}])
                # reward_file.write('Episode {} ends, instant reward is {:.2f}\n'.format(i_episode, episode_reward))
                break

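        # Train once both the RL and SL buffers hold more than one batch: the NAF
        # agents update from sampled transitions while the supervised policies fit
        # the stored (state, action) pairs.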
        if min(len(memory_vehicle), len(memory_SL_vehicle)) > args.batch_size:  # start training
            for _ in range(args.updates_per_step):
                transitions_vehicle = memory_vehicle.sample(args.batch_size)
                batch_vehicle = Transition(*zip(*transitions_vehicle))

                transitions_attacker = memory_attacker.sample(args.batch_size)
                batch_attacker = Transition(*zip(*transitions_attacker))

                trans_veh = memory_SL_vehicle.sample(args.batch_size)
                trans_att = memory_SL_attacker.sample(args.batch_size)

                states_veh = []
                actions_veh = []
                states_att = []
                actions_att = []
                for sample in trans_veh:
                    state_veh, act_veh = sample
                    states_veh.append(state_veh)
                    actions_veh.append(act_veh)
                for sample in trans_att:
                    state_att, act_att = sample
                    states_att.append(state_att)
                    actions_att.append(act_att)

                states_veh = np.reshape(states_veh, (-1, env.observation_space))
                states_att = np.reshape(states_att, (-1, env.observation_space))
                actions_veh = np.reshape(actions_veh, (-1, env.vehicle_action_space))
                actions_att = np.reshape(actions_att, (-1, env.attacker_action_space))

                policy_vehicle.fit(states_veh, actions_veh, verbose=False)
                policy_attacker.fit(states_att, actions_att, verbose=False)
                agent_vehicle.update_parameters(batch_vehicle)
                agent_attacker.update_parameters(batch_attacker)

                # writer.add_scalar('loss/value', value_loss, updates)
                # writer.add_scalar('loss/policy', policy_loss, updates)

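        # Every 10 episodes run one evaluation episode (attack_mode=2) and log its
        # cumulative reward and the distance env.d.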
        if i_episode % 10 == 0 and i_episode != 0:

            # distance_file.write('{} episode starts, recording distance...\n'.format(i_episode))
            state = env.reset()
            state_record = [np.array([state])]
            evaluate_reward = 0
            while len(state_record) < 20:
                a, b = env.random_action()
                s, _, _ = env.step(np.array([a]), np.zeros(4))
                local_steps += 1
                state_record.append(s)
            while True:
                if random.random() < ETA:
                    action_vehicle = agent_vehicle.select_action(torch.Tensor(state_record[-20:]), ounoise_vehicle,
                                                                 param_noise_vehicle)[:, -1, :]
                    # print('rl', action_vehicle.shape)
                    action_attacker = agent_attacker.select_action(torch.Tensor(state_record[-20:]), ounoise_attacker,
                                                                   param_noise_attacker)[:, -1, :]
                else:
                    action_vehicle = torch.Tensor(
                        [policy_vehicle.predict(state_record[-1].reshape(-1, 4)) / policy_vehicle.predict(
                            state_record[-1].reshape(-1, 4)).sum()])[0]
                    action_attacker = torch.Tensor(
                        [policy_attacker.predict(state_record[-1].reshape(-1, 4))])[0]

                action_vehicle = action_vehicle.numpy()[0] / action_vehicle.numpy()[0].sum()
                action_attacker = action_attacker.numpy()[0]
                next_state, reward, done = env.step(action_vehicle, action_attacker, attack_mode=2)

                eva_res_data = eva_res_data.append([{'Eva_reward':evaluate_reward, 'Eva_distance':env.d}])
                evaluate_reward += reward


                if done:
                    print("Episode: {}, total numsteps: {}, reward: {}, average reward: {}".format(i_episode,
                                                                                                   total_numsteps,
                                                                                                   evaluate_reward,
                                                                                                   np.mean(rewards[-10:])))
                    # reward_file.write("Episode: {}, total numsteps: {}, reward: {}, average reward: {}\n".format(i_episode,
                    #                                                                                              total_numsteps,
                    #                                                                                              evaluate_reward,
                    #                                                                                              np.mean(rewards[-10:])))
                    break
        #         # writer.add_scalar('reward/test', episode_reward, i_episode)
        # reward_file.close()
        # attack_file.close()
        # weight_file.close()
        # distance_file.close()
    env.close()
    reward_data.to_csv(suffix+'_reward.csv', index=False)
    res_data.to_csv(suffix+'.csv', index=False)
    eva_res_data.to_csv(suffix+'_eva.csv', index=False)

    # save model
    agent_vehicle.save_model('vehicle_'+suffix)
    agent_attacker.save_model('attacker_'+suffix)

    policy_attacker.save('models/attacker_'+suffix+'.h5')
    policy_vehicle.save('models/vehicle_'+suffix+'.h5')
コード例 #10
0
def main():
    cfg = ConfigParser()
    cfg.read('config.ini')

    IP = cfg.get('server', 'ip')
    PORT = cfg.getint('server', 'port')
    FILE = cfg.get('file', 'file')
    SIZE = cfg.getint('env', 'buffer_size')
    TIME = cfg.getfloat('env', 'time')
    EPISODE = cfg.getint('env', 'episode')

    parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')

    parser.add_argument('--gamma',
                        type=float,
                        default=0.99,
                        metavar='G',
                        help='discount factor for reward (default: 0.99)')
    parser.add_argument('--tau',
                        type=float,
                        default=0.001,
                        metavar='G',
                        help='discount factor for model (default: 0.001)')

    parser.add_argument('--noise_scale',
                        type=float,
                        default=0.3,
                        metavar='G',
                        help='initial noise scale (default: 0.3)')
    parser.add_argument('--final_noise_scale',
                        type=float,
                        default=0.3,
                        metavar='G',
                        help='final noise scale (default: 0.3)')
    parser.add_argument('--exploration_end',
                        type=int,
                        default=100,
                        metavar='N',
                        help='number of episodes with noise (default: 100)')

    parser.add_argument('--hidden_size',
                        type=int,
                        default=128,
                        metavar='N',
                        help='number of hidden size (default: 128)')
    parser.add_argument('--replay_size',
                        type=int,
                        default=1000000,
                        metavar='N',
                        help='size of replay buffer (default: 1000000)')
    parser.add_argument('--updates_per_step',
                        type=int,
                        default=5,
                        metavar='N',
                        help='model updates per simulator step (default: 5)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=64,
                        metavar='N',
                        help='batch size (default: 64)')

    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.connect((IP, PORT))
    fd = sock.fileno()
    my_env = env(fd=fd, buff_size=SIZE, time=TIME, k=8, l=0.01, n=0.03, p=0.05)
    mpsched.persist_state(fd)

    args = parser.parse_args()
    agent = NAF_CNN(args.gamma, args.tau, args.hidden_size,
                    my_env.observation_space.shape[0], my_env.action_space)
    memory = ReplayMemory(args.replay_size)
    ounoise = OUNoise(my_env.action_space.shape[0])

    rewards = []
    times = []
    for i_episode in range(EPISODE):
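        # First 90% of the episodes train with exploration noise; the remaining 10%
        # run the learned policy without noise and record completion times.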
        if (i_episode < 0.9 * EPISODE):  # training
            io = io_thread(sock=sock, filename=FILE, buffer_size=SIZE)
            io.start()

            state = my_env.reset()

            ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end -
                i_episode) / args.exploration_end + args.final_noise_scale
            ounoise.reset()
            print(state)
            episode_reward = 0
            while True:
                state = torch.FloatTensor(state)
                #print("state: {}\n ounoise: {}".format(state, ounoise.scale))
                action = agent.select_action(state, ounoise)
                #print("action: {}".format(action))
                next_state, reward, count, recv_buff_size, done = my_env.step(
                    action)
                #print("buff size: ",recv_buff_size)
                #print("reward: ", reward)
                episode_reward += reward

                action = torch.FloatTensor(action)
                mask = torch.Tensor([not done])
                next_state = torch.FloatTensor(next_state)
                reward = torch.FloatTensor([float(reward)])
                memory.push(state, action, mask, next_state, reward)

                state = next_state

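                # Once the replay buffer holds several batches, perform multiple NAF
                # updates per environment step.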
                if len(memory) > args.batch_size * 5:
                    for _ in range(args.updates_per_step):
                        transitions = memory.sample(args.batch_size)
                        batch = Transition(*zip(*transitions))
                        #print("update",10*'--')
                        agent.update_parameters(batch)

                if done:
                    break
            rewards.append(episode_reward)
            io.join()
        else:  # testing
            io = io_thread(sock=sock, filename=FILE, buffer_size=SIZE)
            io.start()
            state = my_env.reset()
            episode_reward = 0
            start_time = time.time()
            while True:
                state = torch.FloatTensor(state)
                #print("state: {}\n".format(state))
                action = agent.select_action(state)
                #print("action: {}".format(action))
                next_state, reward, count, done = my_env.step(action)
                episode_reward += reward
                state = next_state

                if done:
                    break
            rewards.append(episode_reward)
            times.append(str(time.time() - start_time) + "\n")
            io.join()
        #print("Episode: {}, noise: {}, reward: {}, average reward: {}".format(i_episode, ounoise.scale, rewards[-1], np.mean(rewards[-100:])))
        # Persist the recorded transfer times (the file is rewritten every episode)
        with open("times.txt", "w") as fo:
            fo.writelines(times)

    sock.close()