Code example #1
def record_model(args_path, weight_path):
    """Record a sequence of episodes from a neural network.
    Args:
        args (dict): Relevant options to play the episodes.
        weight_path (str): Path of the neural network parameters to load.
    Returns:
        dict: Sequence of episode images buffer.
    """
    with open(args_path, 'r') as file_args:
        args = json.load(file_args)
    args['--gui'] = True
    print(args)

    # Experiment options
    episode_count = 5
    width, height = 300, 300
    steps_per_second = 20
    # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    device = 'cpu'

    # We create the environment
    env = make_intrusion_env(args)
    env.seed(args['--train_seed'])

    # We create the actor-critic network
    action_dim = env.action_space.shape[0]
    state_dim = env.observation_space.shape[0]
    actor_net = PolicyNetwork(state_dim, action_dim,
                              args['--hidden']).to(device)

    # We load the weights
    checkpoint = torch.load(weight_path, map_location=device)
    actor_net.load_state_dict(checkpoint['actor_model_state_dict'])
    actor_net.eval()

    # We view the model
    data = {}

    with torch.no_grad():
        for episode in range(episode_count):
            done = False
            data['episode_{}'.format(episode)] = []

            state = env.reset(intruder_position=[100, 100])
            episode_reward = 0
            step = 0

            while not done:
                step += 1
                print('step', step)
                img = Image.fromarray(env.render(mode='rgb_array'))
                data['episode_{}'.format(episode)].append(encode_img(img))
                model_state = torch.FloatTensor(state).to(device)
                action = actor_net(model_state).detach().cpu().numpy()
                # action = ou_noise.get_action(action, step)
                state, reward, done, _ = env.step(action)
    env.close()
    return data
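The encode_img helper used above is not shown in these examples. A minimal sketch follows, assuming it serializes each rendered frame to a base64-encoded PNG string so the returned buffer stays JSON-serializable; the implementation below is an assumption, not the original code.

import base64
import io


def encode_img(img):
    # Assumed helper: convert a PIL image into a base64-encoded PNG string
    # suitable for the JSON episode buffer returned by record_model.
    buffer = io.BytesIO()
    img.save(buffer, format='PNG')
    return base64.b64encode(buffer.getvalue()).decode('ascii')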
Code example #2
def train_model(args):
    seed_experiment(args['--train_seed'])
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # We create the environment
    env = make_intrusion_env(args)

    args['--port'] = env.link.connector.simu_port
    parser._write_options(args['--exp'], 'exp_options.json', args)

    args["--save_interval_performance"] = 1000
    qdrl_algo = QDRLAlgo(args, device, env)
    qdrl_algo.train_model()
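train_model expects the already-parsed option dictionary. A minimal invocation sketch, reusing the parser classes that appear in code example #6 (the exact command-line options are defined by those parsers and are not reproduced here):

if __name__ == '__main__':
    # Sketch only: build the option dictionary and launch training.
    parser = TrainExperimentParser(
        ParserIntrusion(ParserIntruder(), ParserGuard(), ParserFixedGuard()),
        ParserQDRLIntrusion(ParserEpsilonGreedy()))
    args = parser.parse()
    train_model(args)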
Code example #3
def record_architecture(args):
    """Construct an image of the neural network architecture.

    Args:
        args (dict): Relevant options to construct the image.
    Returns:
        dict: Visualization data for the defined actor and critic networks.
    """

    # We create the environment
    env = make_intrusion_env(args)

    # We create the actor-critic network
    action_dim = env.action_space.shape[0]
    state_dim = env.observation_space.shape[0]
    actor_net = PolicyNetwork(state_dim, action_dim, args['--hidden'])
    critic_net = ActionValueNetwork(state_dim, action_dim, args['--hidden'])

    # We send the architecture image
    return {
        **actor_net.visualize(state_dim),
        **critic_net.visualize(state_dim, action_dim)
    }
Code example #4
def plot_ac(args, actors, device):
    fig = plt.figure()
    fig.set_dpi(100)
    fig.set_size_inches(7, 6.5)
    number_of_actors = len(actors)

    ax = plt.axes(xlim=(-100, 100), ylim=(-100, 100))
    global done_gv
    done_gv = [False] * number_of_actors
    envs = [make_intrusion_env(args) for _ in range(number_of_actors)]
    global state_gv
    state_gv = [envi.reset() for envi in envs]
    goal = plt.Circle((0, 0), 5, fc='r')
    obstacles = []
    for i, obstacle in enumerate(envs[0].space.obstacle_list):
        obstacles.append(
            plt.Rectangle((obstacle.center.x - obstacle.dimensions.x / 2,
                           obstacle.center.y - obstacle.dimensions.y / 2),
                          obstacle.dimensions.x,
                          obstacle.dimensions.y,
                          fc='g'))

    intruders = [None] * number_of_actors
    for id_env, envi in enumerate(envs):
        for i, intruder_id in envi.action_intruder.items():
            entity_obj = envi.space.get_pedestrian(intruder_id)
            if entity_obj is not None:
                intruders[id_env] = plt.Circle(
                    (entity_obj.geometry.position.x,
                     entity_obj.geometry.position.y),
                    1,
                    fc='C' + str(id_env))

    def init():
        for obs in obstacles:
            ax.add_patch(obs)
        for intruder in intruders:
            ax.add_patch(intruder)
        ax.add_patch(goal)
        return [goal] + obstacles + intruders

    def animate(i):
        global done_gv
        global state_gv

        for id_env, envi in enumerate(envs):
            if not done_gv[id_env]:
                action = actors[id_env](state_gv[id_env])
                next_state, reward, done_, info_ = envi.step(action)
                state_gv[id_env] = next_state
                #   print('done', done_, next_state[0],next_state[1],math.sqrt(((next_state[0]-.5)*100)**2+((next_state[1]-.5)*100)**2))

                done_gv[id_env] = done_
                for _, intruder_id_ in envi.action_intruder.items():
                    entity_obj_ = envi.space.get_pedestrian(intruder_id_)
                    if entity_obj_ is not None:
                        intruders[id_env].center = (
                            entity_obj_.geometry.position.x,
                            entity_obj_.geometry.position.y)
                    else:
                        intruders[id_env].center = (.5, .5)
        return [goal] + obstacles + intruders

    anim = animation.FuncAnimation(fig,
                                   animate,
                                   init_func=init,
                                   frames=360,
                                   interval=20,
                                   blit=True)
    plt.show()
    for envo in envs:
        envo.close()
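plot_ac assumes each entry of actors is a callable mapping a raw environment state to an action. A minimal sketch of such a wrapper around a trained PolicyNetwork, following the inference pattern of code example #1 (make_actor is a hypothetical name, not part of the original code):

def make_actor(actor_net, device='cpu'):
    # Hypothetical wrapper: adapt a trained PolicyNetwork to the
    # state -> action callable expected by plot_ac.
    def act(state):
        with torch.no_grad():
            model_state = torch.FloatTensor(state).to(device)
            return actor_net(model_state).cpu().numpy()
    return act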
Code example #5
def train_model(args):
    seed_experiment(args['--train_seed'])
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # We create the environment
    env = make_intrusion_env(args)

    args['--port'] = env.link.connector.simu_port
    parser._write_options(args['--exp'], 'exp_options.json', args)

    args["--save_interval_performance"] = 1000

    action_dim = env.action_space.shape[0]
    state_dim = env.observation_space.shape[0]

    # We create the action noise process for exploration around the behavior policy
    # TODO: handle conditional parsers
    # if args["--noise"] == 'Gaussian':
    #    noise_process_exploration = Gaussian(env.action_space, args['--g_min_sigma'], args['--g_max_sigma'],
    #                                         decay_period=args['--g_decay'])
    if args["--noise"] == 'epsilon_greedy':
        noise_process_exploration = EpsilonGreedy(
            args['--epsilon_start'],
            args['--epsilon_end'],
            args['--decay_period'],
            action_space=env.action_space)

    # We create the value and policy networks as well as their target
    critic_net1, critic_net2, target_critic_net1, target_critic_net2 = [
        ActionValueNetwork(state_dim, action_dim, args['--hidden']).to(device)
        for _ in range(4)
    ]
    actor_net, target_actor_net = (PolicyNetwork(state_dim, action_dim,
                                                 args['--hidden']).to(device),
                                   PolicyNetwork(state_dim, action_dim,
                                                 args['--hidden']).to(device))

    # We create the optimizers
    actor_optimizer = torch.optim.Adam(actor_net.parameters(),
                                       lr=args['--policy_lr'])
    critic1_optimizer = torch.optim.Adam(critic_net1.parameters(),
                                         lr=args['--value_lr'])
    critic2_optimizer = torch.optim.Adam(critic_net2.parameters(),
                                         lr=args['--value_lr'])

    # We initialize the target models to be identical to the other models
    soft_update(critic_net1, target_critic_net1, soft_tau=1.)
    soft_update(critic_net2, target_critic_net2, soft_tau=1.)
    soft_update(actor_net, target_actor_net, soft_tau=1.)

    # We create the replay buffer
    if args["--replay_buffer_kickstart_file"] is not None:
        replay_buffer = ReplayBuffer.load_from_file(
            args["--replay_buffer_kickstart_file"])
    else:
        replay_buffer = ReplayBuffer(args['--buffer'])

    # We create the criterion
    td3_criterion = TD3Criterion(actor_net,
                                 target_actor_net,
                                 critic_net1,
                                 critic_net2,
                                 target_critic_net1,
                                 target_critic_net2,
                                 gamma=args['--gamma'],
                                 soft_tau=args['--soft_tau'],
                                 noise_std=args['--g_smooth_sigma'],
                                 noise_clip=args['--g_smooth_clip'],
                                 device=device)

    # We prepare the experiment
    exp_options = {
        'episode_reward_train': {
            'plot': 'line',
            'yscale': 'linear'
        },
        'episode_reward_test': {
            'plot': 'line',
            'yscale': 'linear'
        },
        'episode_reward_test_sparse': {
            'plot': 'line',
            'yscale': 'linear'
        },
        'episode_reward_test_sparse_ring': {
            'plot': 'line',
            'yscale': 'linear'
        },
        'score_train': {
            'plot': 'line',
            'yscale': 'linear'
        },
        'score_test': {
            'plot': 'line',
            'yscale': 'linear'
        },
        'score_test_sparse': {
            'plot': 'line',
            'yscale': 'linear'
        },
        'score': {
            'plot': 'line',
            'yscale': 'linear'
        },
        'actor_loss': {
            'plot': 'line',
            'yscale': 'log'
        },
        'critic_loss_1': {
            'plot': 'line',
            'yscale': 'log'
        },
        'critic_loss_2': {
            'plot': 'line',
            'yscale': 'log'
        },
    }
    agent_id = 0
    description = 'TD3: {} with {} frames for training'.format(
        args['--env_name'], args['--budget'])
    exp_id = create_experiment(args['--exp'], description, './', exp_options)
    print('exp_id', exp_id)

    storage = ExpLoggerAgent(
        exp_id, agent_id, os.path.join(args['--exp'], 'agent_0'), {
            'critic_model1': critic_net1,
            'critic_model2': critic_net2,
            'actor_model': actor_net
        }, {
            'critic1_optimizer': critic1_optimizer,
            'critic2_optimizer': critic2_optimizer,
            'actor_optimizer': actor_optimizer
        })
    reward_buffer_test = deque(maxlen=100)
    reward_buffer_test_sparse = deque(maxlen=100)
    reward_buffer_test_sparse_ring = deque(maxlen=100)
    reward_buffer_train = deque(maxlen=100)

    # We train the networks
    step_idx = 0
    episode_idx = 0
    episode_reward_train = 0
    state = env.reset()
    step_idx_in_episode = 0

    while step_idx < args['--budget']:

        actor_net.eval()
        # Do one step in the environment and save information
        model_state = torch.FloatTensor(state).to(device)
        action = actor_net(model_state).detach().cpu().numpy()
        action = noise_process_exploration.get_action(action, t=step_idx)
        next_state, reward, done, _ = env.step(action)
        if not done or step_idx_in_episode != 0:
            replay_buffer.push(state, action, reward, next_state, done)
        episode_reward_train += reward

        # Train/Update the actor and critic based on resampling transitions from the replay buffer
        if step_idx % args['--delay_policy_update'] == 0:
            actor_net.train()
        critic_net1.train()
        critic_net2.train()
        if len(replay_buffer) > args['--batch_size']:
            # Sample from the replay buffer
            state_replay, action_replay, reward_replay, next_state_replay, done_replay = replay_buffer.sample(
                args['--batch_size'])
            # Compute, store and optimize the losses
            critic_loss1, critic_loss2, actor_loss = td3_criterion.loss(
                state_replay, action_replay, reward_replay, next_state_replay,
                done_replay)

            critic1_optimizer.zero_grad()
            critic_loss1.backward(retain_graph=True,
                                  inputs=list(critic_net1.parameters()))
            critic2_optimizer.zero_grad()
            critic_loss2.backward(retain_graph=True,
                                  inputs=list(critic_net2.parameters()))
            if step_idx % args['--delay_policy_update'] == 0:
                actor_optimizer.zero_grad()
                actor_loss.backward(inputs=list(actor_net.parameters()))
            critic1_optimizer.step()
            critic2_optimizer.step()
            if step_idx % args['--delay_policy_update'] == 0:
                actor_optimizer.step()
                soft_update(critic_net1, target_critic_net1,
                            args['--soft_tau'])
                soft_update(critic_net2, target_critic_net2,
                            args['--soft_tau'])
                soft_update(actor_net, target_actor_net, args['--soft_tau'])

        # Save and print performance information
        if (step_idx % args["--save_interval_performance"] == 0
                and step_idx > 1 and len(reward_buffer_test) > 1):
            storage.performance(
                step_idx, {
                    'critic_loss_1': critic_loss1.item(),
                    'critic_loss_2': critic_loss2.item(),
                    'actor_loss': actor_loss.item()
                })
            mean_reward_train = sum(reward_buffer_train) / len(
                reward_buffer_train)
            mean_reward_test = sum(reward_buffer_test) / len(
                reward_buffer_test)
            mean_reward_test_sparse = sum(reward_buffer_test_sparse) / len(
                reward_buffer_test_sparse)

            storage.performance(step_idx, {'score_train': mean_reward_train})
            storage.performance(step_idx, {'score_test': mean_reward_test})
            storage.performance(step_idx,
                                {'score_test_sparse': mean_reward_test_sparse})
            storage.write()
            print('Loss at {}/{}: value1={:.4}, value2={:.4}, policy={:.4}.'.
                  format(step_idx, args['--budget'], critic_loss1.item(),
                         critic_loss2.item(), actor_loss.item()))
            print('Result train at {}/{}: {}.'.format(step_idx,
                                                      args['--budget'],
                                                      mean_reward_train))
            print('Result test at {}/{}: {}.'.format(step_idx,
                                                     args['--budget'],
                                                     mean_reward_test))

        # save the weights of the model
        if step_idx % args['--save_interval'] == 0:
            storage.state(step_idx)

        # do not forget to update time and state
        step_idx += 1
        step_idx_in_episode += 1
        state = next_state

        if done:  # the episode came to an end.
            episode_idx += 1
            step_idx_in_episode = 0
            storage.performance(step_idx,
                                {'episode_reward_train': episode_reward_train})
            reward_buffer_train.append(episode_reward_train)
            episode_reward_train = 0

            if episode_idx % args["--test_frequency"] == 0:
                # Testing the learned policy on one episode
                actor_net.eval()
                total_number_test = 1
                episode_reward_test = 0
                episode_reward_test_sparse = 0
                episode_reward_test_sparse_ring = 0

                for test_number in range(total_number_test):
                    state_test = env.reset(
                        intruder_position=args['--intruder_position_test'],
                        reward=args['--reward'])
                    episode_reward_test += one_episode(state_test, device,
                                                       actor_net, env)
                for test_number in range(total_number_test):
                    state_test = env.reset(
                        intruder_position=args['--intruder_position_test'],
                        reward='sparse')
                    episode_reward_test_sparse += one_episode(
                        state_test, device, actor_net, env)
                for test_number in range(total_number_test):
                    state_test = env.reset(intruder_position='ring',
                                           reward='sparse')
                    episode_reward_test_sparse_ring += one_episode(
                        state_test, device, actor_net, env)

                normalized_episode_reward_test = episode_reward_test / total_number_test
                normalized_episode_reward_test_sparse = episode_reward_test_sparse / total_number_test
                normalized_episode_reward_test_sparse_ring = episode_reward_test_sparse_ring / total_number_test

                reward_buffer_test.append(normalized_episode_reward_test)
                reward_buffer_test_sparse.append(
                    normalized_episode_reward_test_sparse)
                reward_buffer_test_sparse_ring.append(
                    normalized_episode_reward_test_sparse_ring)
                storage.performance(
                    step_idx, {'episode_reward_test': episode_reward_test})
                storage.performance(
                    step_idx,
                    {'episode_reward_test_sparse': episode_reward_test_sparse})
                storage.performance(
                    step_idx, {
                        'episode_reward_test_sparse_ring':
                        episode_reward_test_sparse_ring
                    })

            state = env.reset(
                intruder_position=args['--intruder_position_train'],
                reward=args['--reward'])

    env.close()

    storage.state(step_idx)
    print('Loss at {}/{}: value1={:.4}, value2={:.4}, policy={:.4}.'.format(
        step_idx, args['--budget'], critic_loss1.item(), critic_loss2.item(),
        actor_loss.item()))
    storage.close()
    stop_experiment(exp_id)
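Two helpers used above, soft_update and one_episode, are not included in these examples. The sketches below are assumptions: soft_update is taken to perform the usual Polyak averaging of target parameters (soft_tau=1.0 copies the online weights), and one_episode to roll out one greedy evaluation episode and return its cumulative reward.

def soft_update(net, target_net, soft_tau):
    # Assumed implementation: Polyak-average the online parameters into the
    # target network; soft_tau=1.0 makes the target an exact copy.
    for param, target_param in zip(net.parameters(), target_net.parameters()):
        target_param.data.copy_(soft_tau * param.data +
                                (1.0 - soft_tau) * target_param.data)


def one_episode(state, device, actor_net, env):
    # Assumed implementation: run one episode with the greedy policy and
    # return the total undiscounted reward, matching the calls above.
    episode_reward = 0.0
    done = False
    with torch.no_grad():
        while not done:
            model_state = torch.FloatTensor(state).to(device)
            action = actor_net(model_state).cpu().numpy()
            state, reward, done, _ = env.step(action)
            episode_reward += reward
    return episode_reward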
Code example #6
    args_path = sys.argv[1]
    weight_path = sys.argv[2]

    print('Loading options from', args_path, 'and weights from', weight_path)

    with open(args_path) as json_file:
        args = json.load(json_file)
    # Parse the options
    parser = TrainExperimentParser(ParserIntrusion(ParserIntruder(), ParserGuard(), ParserFixedGuard()),
                                   ParserQDRLIntrusion(ParserEpsilonGreedy()))
    _args = parser.parse()


    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # We create the environment
    env = make_intrusion_env(_args)

    # Quick rendering check
    from PIL import Image
    state = env.reset()
    renderer = Renderer(env.unwrapped)

    # We create the actor-critic network
    action_dim = env.action_space.shape[0]
    state_dim = env.observation_space.shape[0]
    actor_net = PolicyNetwork(state_dim, action_dim, _args['--hidden']).to(device)

    # We load the weights
    checkpoint = torch.load(weight_path, map_location=device)
    actor_net.load_state_dict(checkpoint['actor_model_state_dict'])
    actor_net.eval()
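    # The fragment stops after loading the weights. Sketch only (not from the
    # original): one greedy rollout with the loaded policy, mirroring the
    # evaluation loop of code example #1.
    done = False
    episode_reward = 0
    with torch.no_grad():
        while not done:
            model_state = torch.FloatTensor(state).to(device)
            action = actor_net(model_state).cpu().numpy()
            state, reward, done, _ = env.step(action)
            episode_reward += reward
    print('Episode reward:', episode_reward)
    env.close()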