Example #1
def visualize_rewards_from_reward_directory(directory_name, feature_extractor,
                                            env):
    '''
    Given a directory of reward networks, read each network, plot the reward
    it assigns for each action, and store the plots in per-action
    subdirectories under experiments/plots/<directory name>.
    '''

    #create parent directory

    dir_name = directory_name.split('/')
    cwd = os.getcwd()

    create_dir_path = os.path.join(cwd, 'experiments/plots/')

    parent_reward_directory = os.path.join(create_dir_path, dir_name[-1])
    try:
        os.mkdir(parent_reward_directory)
    except OSError:
        print("Creation of the directory failed.")
    else:
        print("Successfully created the directory.")

    reward_network_names = glob.glob(os.path.join(directory_name, '*.pt'))

    actions = ['left', 'right', 'up', 'down']

    #create directories for reward plots obtained from each of the actions

    for act in actions:
        action_dir = act
        try:
            os.mkdir(os.path.join(parent_reward_directory, action_dir))
        except OSError:
            print("cant create directory")

    for network_fname in reward_network_names:

        network_number = network_fname.split('/')[-1].split('.')[0]

        reward_network = RewardNet(
            feature_extractor.extract_features(env.reset()).shape[0])
        reward_network.load(network_fname)
        reward_network.eval()
        reward_network.to(DEVICE)

        #run function

        for act in actions:

            dir_to_save = os.path.join(parent_reward_directory, act)
            fname = dir_to_save + '/' + network_number + '.png'
            reward_values = visualize_rewards_in_environment(
                act, env, reward_network, feature_extractor)
            plt.figure(act)
            plt.imshow(reward_values)
            plt.colorbar()
            plt.savefig(fname)
            plt.clf()
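A minimal, self-contained sketch of the plotting pattern used above: it saves one reward heatmap per action into an action-named subdirectory, with synthetic reward grids standing in for the RewardNet outputs (the output directory name and grid size are illustrative, not from the repo).

import os
import numpy as np
import matplotlib
matplotlib.use('Agg')  # headless backend so savefig works without a display
from matplotlib import pyplot as plt

actions = ['left', 'right', 'up', 'down']
out_dir = 'reward_plots_demo'  # hypothetical output directory
for act in actions:
    os.makedirs(os.path.join(out_dir, act), exist_ok=True)
    reward_values = np.random.rand(10, 10)  # placeholder for per-state rewards
    plt.figure(act)
    plt.imshow(reward_values)
    plt.colorbar()
    plt.savefig(os.path.join(out_dir, act, '0.png'))
    plt.clf()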
Example #2
def view_reward_from_trajectory(reward_network_folder, trajectory_folder,
                                feature_extractor):
    '''
    Given a reward network folder, a trajectory folder and a feature extractor,
    plot the total reward assigned to each trajectory by each of the reward
    networks in the folder.
    '''
    hidden_dims = [1024, 256]
    reward_network_names = glob.glob(
        os.path.join(reward_network_folder, '*.pt'))
    trajectories = glob.glob(os.path.join(trajectory_folder, '*.states'))

    rewards_across_traj_model = np.zeros(
        (len(reward_network_names), len(trajectories)))
    reward_counter = 0
    for reward_net in sorted(reward_network_names, key=numericalSort):

        reward_net_model = RewardNet(feature_extractor.state_rep_size,
                                     hidden_dims)
        print("loading reward_net :", reward_net)
        reward_net_model.load(reward_net)
        reward_net_model.eval()
        reward_net_model.to('cuda')
        traj_counter = 0
        for trajectory in trajectories:

            state_list = torch.load(trajectory)
            cur_reward = 0
            for state in state_list:

                reward = reward_net_model(state)
                cur_reward += reward

            rewards_across_traj_model[reward_counter][
                traj_counter] = cur_reward
            traj_counter += 1

        reward_counter += 1

    fig2 = plt.figure()
    ax2 = fig2.add_subplot(111)
    cax = ax2.matshow(rewards_across_traj_model, interpolation='nearest')
    fig2.colorbar(cax)
    plt.show()
    pdb.set_trace()

    return rewards_across_traj_model
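The checkpoint files above are ordered with the repo's `numericalSort` key (imported from debugtools elsewhere in these examples). A common way to implement such a key, shown here as an assumption rather than the repo's exact code, is to split file names on digit runs and compare those runs as integers:

import re

_numbers = re.compile(r'(\d+)')

def numerical_sort_key(path):
    # split the path on digit runs and compare those runs numerically so
    # that 'reward_10.pt' sorts after 'reward_2.pt'
    parts = _numbers.split(path)
    parts[1::2] = map(int, parts[1::2])
    return parts

files = ['reward_10.pt', 'reward_2.pt', 'reward_1.pt']
print(sorted(files, key=numerical_sort_key))
# ['reward_1.pt', 'reward_2.pt', 'reward_10.pt']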
Example #3
def main():
    r = 10
    c = 10
    env = GridWorld(display=False, obstacles=[np.asarray([1, 2])])
    reward_network = RewardNet(env.reset().shape[0])
    reward_network.load('./experiments/saved-models-rewards/1.pt')
    reward_network.eval()
    reward_network.to(DEVICE)

    reward_values = getperStateReward(reward_network, rows=r, cols=c)

    irl_reward_values = per_state_reward(reward_network, r, c)

    pdb.set_trace()

    plt.imshow(reward_values)
    plt.colorbar()
    plt.show()
Example #4
def main():
    args = parser.parse_args()

    if args.render:
        from envs.gridworld import GridWorld
    else:
        from envs.gridworld_clockless import GridWorldClockless as GridWorld

    env = GridWorld(display=args.render,
                    obstacles=[np.asarray([1, 2])],
                    goal_state=np.asarray([5, 5]),
                    step_wrapper=step_wrapper,
                    reset_wrapper=reset_wrapper,
                    seed=3)
    loss_t = LBT(list_size=100, stop_threshold=1.5, log_interval=100)
    model = ActorCritic(env,
                        gamma=0.99,
                        log_interval=200,
                        max_episodes=5000,
                        max_ep_length=20,
                        termination=loss_t)

    if args.policy_path is not None:
        model.policy.load(args.policy_path)

    if args.reward_net is not None:
        reward_net = RewardNet(env.reset().shape[0])
        reward_net.to('cuda')
        reward_net.load('./saved-models-rewards/0.pt')
        reward_net.eval()
    else:
        reward_net = None

    if not args.play:
        model.train_mp(n_jobs=4, reward_net=reward_net, irl=args.irl)

        if not args.dont_save:
            model.policy.save('./saved-models/')

    if args.play:
        env.tickSpeed = 15
        assert args.policy_path is not None, 'pass a policy to play from!'

        model.generate_trajectory(args.num_trajs, './trajs/ac_gridworld/')


if args.agent_type == 'Default':

    env.external_control = False
    agent = None
    # the agent is the person from the video
    pass

#*************************************************
#load reward network if present

if args.reward_path is not None:
    from irlmethods.deep_maxent import RewardNet

    state_size = feat_ext.extract_features(env.reset()).shape[0]
    reward_net = RewardNet(state_size, args.reward_net_hidden_dims)
    reward_net.load(args.reward_path)

#*************************************************
#play


def reward_analysis():
    '''
    A function to analyze the rewards against actions for a given policy.
    A helpful visualization/debugging tool.
    '''
    for i in range(args.num_trajs):

        #reset the world
        state = env.reset()
def main():

    args = parser.parse_args()

    experiment_logger = Logger('temp_save.txt')

    experiment_logger.log_header('Arguments for the experiment :')
    experiment_logger.log_info(vars(args))

    mp.set_start_method('spawn')

    if args.render:
        from envs.gridworld import GridWorld
    else:
        from envs.gridworld_clockless import GridWorldClockless as GridWorld

    agent_width = 10
    step_size = 10
    obs_width = 10
    grid_size = 10

    if args.feat_extractor == 'Onehot':
        feat_ext = OneHot(grid_rows=10, grid_cols=10)
    if args.feat_extractor == 'SocialNav':
        feat_ext = SocialNav(fieldList=['agent_state', 'goal_state'])
    if args.feat_extractor == 'FrontBackSideSimple':
        feat_ext = FrontBackSideSimple(
            thresh1=1,
            thresh2=2,
            thresh3=3,
            thresh4=4,
            step_size=step_size,
            agent_width=agent_width,
            obs_width=obs_width,
        )

    if args.feat_extractor == 'LocalGlobal':
        feat_ext = LocalGlobal(
            window_size=3,
            grid_size=grid_size,
            agent_width=agent_width,
            obs_width=obs_width,
            step_size=step_size,
        )

    experiment_logger.log_header('Parameters of the feature extractor :')
    experiment_logger.log_info(feat_ext.__dict__)
    '''
    np.asarray([2,2]),np.asarray([7,4]),np.asarray([3,5]),
                                np.asarray([5,2]),np.asarray([8,3]),np.asarray([7,5]),
                                np.asarray([3,3]),np.asarray([3,7]),np.asarray([5,7])
                                '''
    env = GridWorld(display=args.render,
                    is_onehot=False,
                    is_random=True,
                    rows=100,
                    agent_width=agent_width,
                    step_size=step_size,
                    obs_width=obs_width,
                    width=grid_size,
                    cols=100,
                    seed=7,
                    buffer_from_obs=0,
                    obstacles=3,
                    goal_state=np.asarray([5, 5]))

    experiment_logger.log_header('Environment details :')
    experiment_logger.log_info(env.__dict__)

    model = ActorCritic(env,
                        feat_extractor=feat_ext,
                        gamma=0.99,
                        log_interval=100,
                        max_ep_length=40,
                        hidden_dims=args.policy_net_hidden_dims,
                        max_episodes=4000)

    experiment_logger.log_header('Details of the RL method :')
    experiment_logger.log_info(model.__dict__)

    pdb.set_trace()

    if args.policy_path is not None:
        model.policy.load(args.policy_path)

    if not args.play and not args.play_user:
        if args.reward_path is None:
            model.train_mp(n_jobs=4)
        else:
            from irlmethods.deep_maxent import RewardNet
            state_size = feat_ext.extract_features(env.reset()).shape[0]
            reward_net = RewardNet(state_size)
            reward_net.load(args.reward_path)
            print(next(reward_net.parameters()).is_cuda)
            model.train_mp(reward_net=reward_net, n_jobs=4)

        if not args.dont_save:
            model.policy.save('./saved-models/')

    if args.play:
        env.tickSpeed = 15
        assert args.policy_path is not None, 'pass a policy to play from!'

        model.generate_trajectory(args.num_trajs,
                                  './trajs/ac_fbs_simple4_static_map7/')

    if args.play_user:
        env.tickSpeed = 200

        model.generate_trajectory_user(args.num_trajs,
                                       './trajs/ac_gridworld_user/')
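The feature extractor is chosen here (and in several later examples) through a chain of `if args.feat_extractor == ...` checks; an equivalent, slightly tidier pattern is a name-to-constructor dictionary. A small sketch with stand-in constructors (the real classes take more parameters than shown):

# Stand-in constructors; the repo's OneHot / SocialNav classes take more arguments.
def OneHot(**kwargs):
    return ('OneHot', kwargs)

def SocialNav(**kwargs):
    return ('SocialNav', kwargs)

FEAT_EXTRACTORS = {
    'Onehot': lambda: OneHot(grid_rows=10, grid_cols=10),
    'SocialNav': lambda: SocialNav(fieldList=['agent_state', 'goal_state']),
}

choice = 'Onehot'  # stands in for args.feat_extractor
feat_ext = FEAT_EXTRACTORS[choice]()
print(feat_ext)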
Example #7
def plot_expert_state_reward_across_models(expert_trajectory_folder,
                                           reward_network_folder,
                                           feature_extractor,
                                           save_folder,
                                           max_time_steps=100):
    '''
    A function that helps visualize how the rewards of the states visited by
    the expert in the demonstrations evolve across reward networks.
    input : the expert trajectory folder, the folder/file for the reward network

    output : a set of plots showing the rewards for those states
    '''
    #create the save folder if it does not exist
    os.makedirs(save_folder, exist_ok=True)

    #initialize a reward network
    state_size = feature_extractor.state_rep_size
    hidden_dim_list = [128]

    reward_net = RewardNet(state_size, hidden_dims=hidden_dim_list)

    #prepare the array of states visited by the expert
    expert_svf_dict = calculate_expert_svf(expert_trajectory_folder,
                                           max_time_steps=max_time_steps,
                                           feature_extractor=feature_extractor,
                                           gamma=1)

    #zero-valued copy of the expert svf dict, used as the comparison baseline
    dummy_dict = {key: 0 for key in expert_svf_dict}

    states, diff = get_states_and_freq_diff(expert_svf_dict, dummy_dict,
                                            feature_extractor)
    state_tensors = torch.tensor(states, dtype=torch.float).to(DEVICE)

    #start reading the reward networks from the folder
    reward_network_list = []
    if os.path.isfile(reward_network_folder):
        reward_network_list.append(reward_network_folder)
    if os.path.isdir(reward_network_folder):
        reward_network_list = glob.glob(
            os.path.join(reward_network_folder, '*.pt'))
        reward_network_list = sorted(reward_network_list, key=numericalSort)

    #iterate through all the entries in the list
    bar_width = 0.3
    xaxis = np.arange(len(states))
    network_counter = 0
    for reward_network in reward_network_list:
        fig, ax = plt.subplots(figsize=(20, 3))
        print('Reading from network file :', reward_network)
        reward_net.load(reward_network)
        reward_net.eval()
        reward_net.to(DEVICE)
        rewards = reward_net(state_tensors)

        rewards = rewards.detach().cpu().numpy().squeeze()
        ax.bar(xaxis, rewards, width=bar_width)
        file_name = save_folder + str(network_counter) + '.jpg'
        plt.savefig(file_name)
        network_counter += 1
        plt.close()
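The core of the loop above is stacking the expert-visited states into a single tensor and scoring the whole batch with each reward network before bar-plotting the result. A self-contained sketch of that pattern with a stand-in two-layer network (sizes and data are made up; this is not the repo's RewardNet):

import numpy as np
import torch
import torch.nn as nn
from matplotlib import pyplot as plt

state_size, n_states = 16, 50
states = torch.rand(n_states, state_size)  # stand-in for the expert-visited states
reward_net = nn.Sequential(nn.Linear(state_size, 128), nn.ReLU(), nn.Linear(128, 1))

with torch.no_grad():  # only plotting, so no gradients are needed
    rewards = reward_net(states).squeeze(-1).numpy()

fig, ax = plt.subplots(figsize=(20, 3))
ax.bar(np.arange(n_states), rewards, width=0.3)
plt.show()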
Example #8
def compare_svf(expert_folder,
                agent_policy,
                env_reward=None,
                env=None,
                feat=None):
    '''
    expert_folder - folder containing expert trajectories
    agent_policy - a folder of policies or a single policy file
    Given these two pieces of information, compare_svf saves the state
    visitation frequencies (svf) for all the policies, which can then be
    compared visually.
    env_reward is the reward network corresponding to the policy network.
    '''

    dot_product_loss = []

    environment = env
    state_space = feat.extract_features(environment.reset()).shape[0]

    #plotting for the expert
    expert_svf_dict = calculate_expert_svf(expert_folder,
                                           feature_extractor=feat,
                                           gamma=0.99)
    #pdb.set_trace()
    exp_arr = np.zeros(len(expert_svf_dict.keys()))
    i = 0
    exp_state_key = {}
    for key in expert_svf_dict.keys():
        exp_arr[i] = expert_svf_dict[key]
        exp_state_key[key] = i
        i += 1
    '''####################
    expert = np.squeeze(exp_arr)

    print('The expert shape', expert.shape)

    print('The sum :', np.sum(expert))
    plt.plot(expert)
   
    #plt.show()
    #expert_file_name = expert_folder.split('/')[-2]
    #plt.savefig('./experiments/svf_visual/'+expert_file_name+'.jpg')
    '''###############
    #plotting for the agents

    if os.path.isfile(agent_policy):

        policy = Policy(state_space,
                        environment.action_space.n,
                        hidden_dims=[256])
        policy.load(agent_policy)
        policy.eval()
        policy.to(DEVICE)

        agent_file_name = agent_policy.strip().split('/')[-1].split('.')[0]
        agent_svf_dict = calculate_svf_from_sampling(no_of_samples=500,
                                                     env=environment,
                                                     policy_nn=policy,
                                                     reward_nn=None,
                                                     episode_length=40,
                                                     feature_extractor=feat,
                                                     gamma=.99)
        agent_arr = np.zeros(len(expert_svf_dict.keys()))
        i = 0
        for key in agent_svf_dict.keys():
            if key in exp_state_key.keys():
                agent_arr[exp_state_key[key]] = agent_svf_dict[key]

        agent = np.squeeze(agent_arr)
        #print(np.linalg.norm(np.asarray(diff), 1))
        plt.plot(agent)
        plt.show()

        states, diff = get_states_and_freq_diff(expert_svf_dict,
                                                agent_svf_dict, feat)
        #pdb.set_trace()

        plt.plot(diff)
        plt.show()
        #plt.savefig('./experiments/svf_visual/'+agent_file_name+'.jpg')
        #plt.clf()
        print(np.linalg.norm(np.asarray(diff), 1))
    if os.path.isdir(agent_policy):

        #read files from the directory
        model_names = glob.glob(os.path.join(agent_policy, '*.pt'))
        reward_names = glob.glob(os.path.join(env_reward, '*.pt'))
        reward_names = sorted(reward_names, key=numericalSort)
        counter = 0
        for name in sorted(model_names, key=numericalSort):

            #load the policy network
            policy = Policy(state_space,
                            environment.action_space.n,
                            hidden_dims=[256])
            print('Loading file:', name)
            policy.load(name)
            policy.eval()
            policy.to(DEVICE)

            #load the reward network
            reward_net_name = reward_names[counter]
            reward = RewardNet(state_space, hidden_dims=[256])
            print('Loading reward network :', reward_net_name)
            reward.load(reward_net_name)
            reward.eval()
            reward.to(DEVICE)
            counter += 1
            agent_file_name = name.split('/')[-1].split('.')[0]
            agent_svf_dict = calculate_svf_from_sampling(
                no_of_samples=3000,
                env=environment,
                policy_nn=policy,
                reward_nn=reward,
                episode_length=30,
                feature_extractor=feat,
                gamma=.99)
            states, diff = get_states_and_freq_diff(expert_svf_dict,
                                                    agent_svf_dict, feat)

            #pdb.set_trace()

            plt.plot(diff)
            #plt.show()
            #diff_arr = np.zeros(len(expert_svf_dict.keys()))
            plt.savefig('./experiments/results/svf_visual/' + agent_file_name +
                        '.jpg')
            '''
            agent_arr = np.zeros(len(expert_svf_dict.keys()))
            i = 0
            for key in agent_svf_dict.keys():
                if key in exp_state_key.keys():
                    agent_arr[exp_state_key[key]] = agent_svf_dict[key]

            agent = np.squeeze(agent_arr)
            plt.plot(expert, 'r')

            plt.plot(agent, 'b')
            #plt.show()
            plt.savefig('./experiments/svf_visual/'+agent_file_name+'.jpg')
            plt.clf()
            '''
            diff_arr = np.asarray(diff)
            svf_diff = np.linalg.norm(diff, 1)
            print('The SVF diff for this model:', svf_diff)
            dot_product_loss.append(svf_diff)

        plt.plot(dot_product_loss, 'g')
        plt.savefig('./experiments/svf_visual/dot_prod.jpg')
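What compare_svf ultimately reports per model is the L1 norm of the difference between the expert's and the agent's state visitation frequencies, aligned on the expert's state keys. A minimal sketch of that computation with synthetic dictionaries:

import numpy as np

# synthetic visitation frequencies keyed by a hashable state representation
expert_svf = {'s0': 0.4, 's1': 0.3, 's2': 0.2, 's3': 0.1}
agent_svf = {'s0': 0.5, 's2': 0.25, 's4': 0.25}

keys = list(expert_svf)
expert_arr = np.array([expert_svf[k] for k in keys])
agent_arr = np.array([agent_svf.get(k, 0.0) for k in keys])  # 0 if never visited

diff = expert_arr - agent_arr
print('SVF L1 difference:', np.linalg.norm(diff, 1))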
Example #9
def main():

    #####for the logger
    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d %H:%M:%S")
    ###################

    args = parser.parse_args()

    seed_all(args.seed)

    if args.on_server:

        matplotlib.use("Agg")
        # pygame without monitor
        os.environ["SDL_VIDEODRIVER"] = "dummy"

    from matplotlib import pyplot as plt

    mp.set_start_method("spawn")

    from rlmethods.b_actor_critic import ActorCritic
    from rlmethods.soft_ac import SoftActorCritic, QSoftActorCritic
    from rlmethods.rlutils import ReplayBuffer

    from envs.gridworld_drone import GridWorldDrone
    from featureExtractor.drone_feature_extractor import (
        DroneFeatureSAM1,
        DroneFeatureOccup,
        DroneFeatureRisk,
        DroneFeatureRisk_v2,
        VasquezF1,
        VasquezF2,
        VasquezF3,
        Fahad,
        GoalConditionedFahad,
    )
    from featureExtractor.gridworld_featureExtractor import (
        FrontBackSide,
        LocalGlobal,
        OneHot,
        SocialNav,
        FrontBackSideSimple,
    )
    from featureExtractor.drone_feature_extractor import (
        DroneFeatureRisk_speed,
        DroneFeatureRisk_speedv2,
    )

    save_folder = None

    if not args.dont_save and not args.play:

        if not args.save_folder:
            print("Provide save folder.")
            exit()

        policy_net_dims = "-policy_net-"
        for dim in args.policy_net_hidden_dims:
            policy_net_dims += str(dim)
            policy_net_dims += "-"

        reward_net_dims = "-reward_net-"
        for dim in args.reward_net_hidden_dims:
            reward_net_dims += str(dim)
            reward_net_dims += "-"

        save_folder = (
            "./results/"
            + args.save_folder
            + st
            + args.feat_extractor
            + "-seed-"
            + str(args.seed)
            + policy_net_dims
            + reward_net_dims
            + "-total-ep-"
            + str(args.total_episodes)
            + "-max-ep-len-"
            + str(args.max_ep_length)
        )

        experiment_logger = Logger(save_folder, "experiment_info.txt")
        experiment_logger.log_header("Arguments for the experiment :")
        repo = git.Repo(search_parent_directories=True)
        experiment_logger.log_info({'From branch': repo.active_branch.name})
        experiment_logger.log_info({'Commit number': repo.head.object.hexsha})
        experiment_logger.log_info(vars(args))

    window_size = 9
    step_size = 2
    agent_width = 10
    obs_width = 10
    grid_size = 10

    feat_ext = None
    # initialize the feature extractor to be used
    if args.feat_extractor == "Onehot":
        feat_ext = OneHot(grid_rows=10, grid_cols=10)
    if args.feat_extractor == "SocialNav":
        feat_ext = SocialNav(fieldList=["agent_state", "goal_state"])
    if args.feat_extractor == "FrontBackSideSimple":
        feat_ext = FrontBackSideSimple(
            thresh1=1,
            thresh2=2,
            thresh3=3,
            thresh4=4,
            step_size=step_size,
            agent_width=agent_width,
            obs_width=obs_width,
        )

    if args.feat_extractor == "LocalGlobal":
        feat_ext = LocalGlobal(
            window_size=11,
            grid_size=grid_size,
            agent_width=agent_width,
            obs_width=obs_width,
            step_size=step_size,
        )

    if args.feat_extractor == "DroneFeatureSAM1":

        feat_ext = DroneFeatureSAM1(
            agent_width=agent_width,
            obs_width=obs_width,
            step_size=step_size,
            grid_size=grid_size,
            thresh1=15,
            thresh2=30,
        )

    if args.feat_extractor == "DroneFeatureOccup":

        feat_ext = DroneFeatureOccup(
            agent_width=agent_width,
            obs_width=obs_width,
            step_size=step_size,
            grid_size=grid_size,
            window_size=window_size,
        )

    if args.feat_extractor == "DroneFeatureRisk":

        feat_ext = DroneFeatureRisk(
            agent_width=agent_width,
            obs_width=obs_width,
            step_size=step_size,
            grid_size=grid_size,
            show_agent_persp=False,
            thresh1=15,
            thresh2=30,
        )

    if args.feat_extractor == "DroneFeatureRisk_v2":

        feat_ext = DroneFeatureRisk_v2(
            agent_width=agent_width,
            obs_width=obs_width,
            step_size=step_size,
            grid_size=grid_size,
            show_agent_persp=False,
            thresh1=15,
            thresh2=30,
        )

    if args.feat_extractor == "DroneFeatureRisk_speed":

        feat_ext = DroneFeatureRisk_speed(
            agent_width=agent_width,
            obs_width=obs_width,
            step_size=step_size,
            grid_size=grid_size,
            show_agent_persp=False,
            return_tensor=False,
            thresh1=10,
            thresh2=15,
        )

    if args.feat_extractor == "DroneFeatureRisk_speedv2":

        feat_ext = DroneFeatureRisk_speedv2(
            agent_width=agent_width,
            obs_width=obs_width,
            step_size=step_size,
            grid_size=grid_size,
            show_agent_persp=False,
            return_tensor=False,
            thresh1=18,
            thresh2=30,
        )

    if args.feat_extractor == "VasquezF1":
        feat_ext = VasquezF1(agent_width * 6, 0.5, 1.0)

    if args.feat_extractor == "VasquezF2":
        feat_ext = VasquezF1(agent_width * 6, 0.5, 1.0)

    if args.feat_extractor == "VasquezF3":
        feat_ext = VasquezF3(agent_width)

    if args.feat_extractor == "Fahad":
        feat_ext = Fahad(36, 60, 0.5, 1.0)

    if args.feat_extractor == "GoalConditionedFahad":
        feat_ext = GoalConditionedFahad(36, 60, 0.5, 1.0)

    if feat_ext is None:
        print("Please enter proper feature extractor!")
        exit()
    # log feature extractor info

    if not args.dont_save and not args.play:

        experiment_logger.log_header("Parameters of the feature extractor :")
        experiment_logger.log_info(feat_ext.__dict__)

    # initialize the environment
    if args.replace_subject:
        replace_subject = True
    else:
        replace_subject = False

    env = GridWorldDrone(
        display=args.render,
        is_onehot=False,
        seed=args.seed,
        obstacles=None,
        show_trail=False,
        is_random=True,
        annotation_file=args.annotation_file,
        subject=args.subject,
        tick_speed=60,
        obs_width=10,
        step_size=step_size,
        agent_width=agent_width,
        replace_subject=replace_subject,
        segment_size=args.segment_size,
        external_control=True,
        step_reward=0.001,
        show_comparison=True,
        consider_heading=True,
        show_orientation=True,
        # rows=200, cols=200, width=grid_size)
        rows=576,
        cols=720,
        width=grid_size,
    )

    # env = gym.make('Acrobot-v1')
    # log environment info
    if not args.dont_save and not args.play:

        experiment_logger.log_header("Environment details :")
        experiment_logger.log_info(env.__dict__)

    # initialize RL

    if args.rl_method == "ActorCritic":
        model = ActorCritic(
            env,
            feat_extractor=feat_ext,
            gamma=1,
            log_interval=100,
            max_episode_length=args.max_ep_length,
            hidden_dims=args.policy_net_hidden_dims,
            save_folder=save_folder,
            lr=args.lr,
            entropy_coeff=args.entropy_coeff,
            max_episodes=args.total_episodes,
        )

    if args.rl_method == "SAC":

        replay_buffer = ReplayBuffer(args.replay_buffer_size)

        model = SoftActorCritic(
            env,
            replay_buffer,
            feat_ext,
            buffer_sample_size=args.replay_buffer_sample_size,
            entropy_tuning=True,
            play_interval=args.play_interval,
            entropy_target=args.entropy_target,
            gamma=args.gamma,
            learning_rate=args.lr,
        )

    if args.rl_method == "discrete_QSAC":

        replay_buffer = ReplayBuffer(args.replay_buffer_size)

        model = QSoftActorCritic(
            env,
            replay_buffer,
            feat_ext,
            buffer_sample_size=args.replay_buffer_sample_size,
            entropy_tuning=True,
            play_interval=args.play_interval,
            entropy_target=args.entropy_target,
            gamma=args.gamma,
            learning_rate=args.lr,
        )
    # log RL info
    if not args.dont_save and not args.play:

        experiment_logger.log_header("Details of the RL method :")
        experiment_logger.log_info(model.__dict__)

    if args.policy_path is not None:

        from debugtools import numericalSort

        policy_file_list = []
        reward_across_models = []
        # print(args.policy_path)
        if os.path.isfile(args.policy_path):
            policy_file_list.append(args.policy_path)
        if os.path.isdir(args.policy_path):
            policy_names = glob.glob(os.path.join(args.policy_path, "*.pt"))
            policy_file_list = sorted(policy_names, key=numericalSort)

        xaxis = np.arange(len(policy_file_list))

    if not args.play and not args.play_user:
        # no playing of any kind, so training

        if args.reward_path is None:

            if args.policy_path:
                model.policy.load(args.policy_path)

            if args.rl_method == "SAC" or args.rl_method == "discrete_QSAC":
                model.train(args.total_episodes, args.max_ep_length)

            else:
                model.train()

        else:
            from irlmethods.deep_maxent import RewardNet

            state_size = feat_ext.extract_features(env.reset()).shape[0]
            reward_net = RewardNet(state_size, args.reward_net_hidden_dims)
            reward_net.load(args.reward_path)
            print(next(reward_net.parameters()).is_cuda)
            model.train(reward_net=reward_net)

        if not args.dont_save:
            model.policy.save(save_folder + "/policy-models/")

    if args.play:
        # env.tickSpeed = 15
        from debugtools import compile_results

        xaxis = []
        counter = 1
        plt.figure(0)
        avg_reward_list = []
        frac_good_run_list = []
        print(policy_file_list)
        for policy_file in policy_file_list:

            print("Playing for policy :", policy_file)
            model.policy.load(policy_file)
            policy_folder = policy_file.strip().split("/")[0:-2]
            save_folder = ""
            for p in policy_folder:
                save_folder = save_folder + p + "/"

            print("The final save folder ", save_folder)
            # env.tickSpeed = 10
            assert args.policy_path is not None, "pass a policy to play from!"
            if args.exp_trajectory_path is not None:
                from irlmethods.irlUtils import calculate_expert_svf

                expert_svf = calculate_expert_svf(
                    args.exp_trajectory_path,
                    max_time_steps=args.max_ep_length,
                    feature_extractor=feat_ext,
                    gamma=1,
                )
            # reward_across_models.append(model.generate_trajectory(args.num_trajs, args.render))
            if args.exp_trajectory_path is None:

                if args.dont_save:
                    rewards, state_info, sub_info = model.generate_trajectory(
                        args.num_trajs, args.render
                    )
                else:
                    rewards, state_info, sub_info = model.generate_trajectory(
                        args.num_trajs,
                        args.render,
                        store_raw=args.store_raw_states,
                        path=save_folder + "/agent_generated_trajectories/",
                    )
            else:

                if args.dont_save:
                    rewards, state_info, sub_info = model.generate_trajectory(
                        args.num_trajs, args.render, expert_svf=expert_svf
                    )
                else:
                    rewards, state_info, sub_info = model.generate_trajectory(
                        args.num_trajs,
                        args.render,
                        path=save_folder + "/agent_generated_trajectories/",
                        expert_svf=expert_svf,
                    )

            avg_reward, good_run_frac = compile_results(
                rewards, state_info, sub_info
            )

            avg_reward_list.append(avg_reward)
            frac_good_run_list.append(good_run_frac)
            plt.plot(avg_reward_list, c="r")
            plt.plot(frac_good_run_list, c="g")
            plt.draw()
        plt.show()

    if args.play_user:
        env.tickSpeed = 200

        model.generate_trajectory_user(
            args.num_trajs, args.render, path="./user_generated_trajectories/"
        )
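The save-folder name used above is just a concatenation of the user-supplied folder, a timestamp, the feature-extractor name, the seed and the hyphen-joined network widths. A standalone sketch of that naming scheme with hypothetical values:

import datetime
import time

# hypothetical experiment settings standing in for the parsed args
save_folder, feat_extractor, seed = 'my_experiment', 'DroneFeatureRisk_speed', 0
policy_hidden, reward_hidden = [256, 256], [128]
total_episodes, max_ep_length = 1000, 500

st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
policy_net_dims = '-policy_net-' + ''.join(str(d) + '-' for d in policy_hidden)
reward_net_dims = '-reward_net-' + ''.join(str(d) + '-' for d in reward_hidden)

result_folder = ('./results/' + save_folder + st + feat_extractor
                 + '-seed-' + str(seed) + policy_net_dims + reward_net_dims
                 + '-total-ep-' + str(total_episodes)
                 + '-max-ep-len-' + str(max_ep_length))
print(result_folder)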
Example #10
def main():

    args = parser.parse_args()
    step_size = 2
    agent_width = 10
    obs_width = 10
    grid_size = 10

    #set up the feature extractor
    from featureExtractor.drone_feature_extractor import DroneFeatureRisk_speedv2
    from featureExtractor.drone_feature_extractor import VasquezF1, VasquezF2, VasquezF3

    feat_ext = None
    if args.feat_extractor == 'DroneFeatureRisk_speedv2':

        feat_ext = DroneFeatureRisk_speedv2(agent_width=agent_width,
                                            obs_width=obs_width,
                                            step_size=step_size,
                                            grid_size=grid_size,
                                            thresh1=18,
                                            thresh2=30)

    if args.feat_extractor == 'VasquezF1':
        feat_ext = VasquezF1(agent_width * 6, 0.5, 1.0)

    if args.feat_extractor == 'VasquezF2':
        feat_ext = VasquezF2(agent_width * 6, 0.5, 1.0)

    if args.feat_extractor == 'VasquezF3':
        feat_ext = VasquezF3(agent_width)

    #set up the environment
    from envs.gridworld_drone import GridWorldDrone

    env = GridWorldDrone(
        display=True,
        is_onehot=False,
        obstacles=None,
        show_trail=False,
        is_random=True,
        annotation_file=args.annotation_file,
        tick_speed=60,
        obs_width=10,
        step_size=step_size,
        agent_width=agent_width,
        replace_subject=False,
        consider_heading=True,
        show_orientation=True,
        rows=576,
        cols=720,
        width=grid_size,
    )

    #set up the policy network
    from rlmethods.b_actor_critic import Policy
    state_size = feat_ext.extract_features(env.reset()).shape[0]
    policy_net = Policy(state_size, env.action_space.n,
                        args.policy_net_hidden_dims)
    policy_net.load(args.policy_path)
    print(next(policy_net.parameters()).is_cuda)

    #set up the reward network
    from irlmethods.deep_maxent import RewardNet

    state_size = feat_ext.extract_features(env.reset()).shape[0]
    reward_net = RewardNet(state_size, args.reward_net_hidden_dims)
    reward_net.load(args.reward_path)
    print(next(reward_net.parameters()).is_cuda)
    #run stuff
    '''
    screenshot, reward_map = generate_reward_map(env, feat_ext, 
                        reward_net, 
                        render=args.render,
                        sample_rate=args.sample_rate, 
                        frame_id=args.frame_id)

    plot_map(reward_map, frame_img=screenshot)
    '''

    visualize_reward_per_spot(env,
                              feat_ext,
                              reward_net,
                              policy_net,
                              num_traj=20,
                              div=36,
                              render=True)
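The `print(next(net.parameters()).is_cuda)` lines above are a quick check of which device a network's parameters live on. A minimal sketch of that check with a plain torch module, moving to CUDA only when it is available:

import torch
import torch.nn as nn

net = nn.Linear(8, 4)                      # stand-in for a policy or reward network
print(next(net.parameters()).is_cuda)      # False: parameters start on the CPU

if torch.cuda.is_available():
    net.to('cuda')
    print(next(net.parameters()).is_cuda)  # True once the parameters have been moved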
Example #11
def main():

    # initalize summary writer
    tbx_writer = SummaryWriter(comment="_alpha_" + str(args.log_alpha))

    # initialize replay buffer
    replay_buffer = ReplayBuffer(args.replay_buffer_size)

    # initialize feature extractor
    feature_extractor = DroneFeatureRisk_speed(
        agent_width=agent_width,
        obs_width=obs_width,
        step_size=step_size,
        grid_size=grid_size,
        thresh1=18,
        thresh2=30,
    )

    # initialize checkpoint
    if args.checkpoint_path:
        checkpointer = Checkpointer.load_checkpointer(args.checkpoint_path)
    else:
        checkpointer = None

    # initialize environment
    env = GridWorldDrone(
        display=args.render,
        is_random=True,
        rows=576,
        cols=720,
        agent_width=agent_width,
        step_size=step_size,
        obs_width=obs_width,
        width=grid_size,
        annotation_file=args.annotation_file,
        external_control=True,
        continuous_action=True,
        consider_heading=True,
        is_onehot=False,
    )

    # initialize the reward network
    state_size = feature_extractor.extract_features(env.reset()).shape[0]
    reward_net = None
    if args.reward_path is not None:

        reward_net = RewardNet(state_size, args.reward_net_hidden_dims)
        reward_net.load(args.reward_path)

    # intialize the RL method
    soft_ac = SoftActorCritic(
        env,
        replay_buffer,
        feature_extractor,
        buffer_sample_size=args.replay_buffer_sample_size,
        tbx_writer=tbx_writer,
        tau=0.005,
        log_alpha=args.log_alpha,
        entropy_tuning=True,
        entropy_target=args.entropy_target,
        render=args.render,
        play_interval=args.play_interval,
        checkpointer=checkpointer,
    )
    soft_ac.train(
        args.rl_episodes, args.max_episode_length, reward_network=reward_net
    )

    soft_ac.policy.save("./cont_world_policies")
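The tbx_writer above is a standard `torch.utils.tensorboard.SummaryWriter` whose `comment` string is appended to the auto-generated run directory name. A minimal logging sketch (assumes the tensorboard package is installed):

from torch.utils.tensorboard import SummaryWriter

# the comment string is appended to the auto-generated run directory name
writer = SummaryWriter(comment='_alpha_-1.0')
for step in range(5):
    writer.add_scalar('episode_reward', float(step), step)  # tag, value, global step
writer.close()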
Example #12
def main():

    #####for the logger
    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
    ###################

    args = parser.parse_args()

    if args.on_server:

        matplotlib.use('Agg')
        # pygame without monitor
        os.environ['SDL_VIDEODRIVER'] = 'dummy'

    from matplotlib import pyplot as plt
    mp.set_start_method('spawn')

    from rlmethods.scott_SAC.SAC import SAC
    from envs.gridworld_drone import GridWorldDrone
    from featureExtractor.drone_feature_extractor import DroneFeatureSAM1, DroneFeatureOccup, DroneFeatureRisk, DroneFeatureRisk_v2
    from featureExtractor.gridworld_featureExtractor import FrontBackSide, LocalGlobal, OneHot, SocialNav, FrontBackSideSimple
    from featureExtractor.drone_feature_extractor import DroneFeatureRisk_speed

    save_folder = None

    if not args.dont_save and not args.play:

        if not args.save_folder:
            print('Provide save folder.')
            exit()

        policy_net_dims = '-policy_net-'
        for dim in args.policy_net_hidden_dims:
            policy_net_dims += str(dim)
            policy_net_dims += '-'

        reward_net_dims = '-reward_net-'
        for dim in args.reward_net_hidden_dims:
            reward_net_dims += str(dim)
            reward_net_dims += '-'

        save_folder = './results/'+ args.save_folder +st + args.feat_extractor + \
                      '-seed-'+str(args.seed) + policy_net_dims + reward_net_dims + \
                      '-total-ep-'+str(args.total_episodes)+'-max-ep-len-'+ str(args.max_ep_length)

        experiment_logger = Logger(save_folder, 'experiment_info.txt')
        experiment_logger.log_header('Arguments for the experiment :')
        experiment_logger.log_info(vars(args))

    window_size = 9
    step_size = 2
    agent_width = 10
    obs_width = 10
    grid_size = 10

    feat_ext = None
    #initialize the feature extractor to be used
    if args.feat_extractor == 'Onehot':
        feat_ext = OneHot(grid_rows=10, grid_cols=10)
    if args.feat_extractor == 'SocialNav':
        feat_ext = SocialNav(fieldList=['agent_state', 'goal_state'])
    if args.feat_extractor == 'FrontBackSideSimple':
        feat_ext = FrontBackSideSimple(
            thresh1=1,
            thresh2=2,
            thresh3=3,
            thresh4=4,
            step_size=step_size,
            agent_width=agent_width,
            obs_width=obs_width,
        )

    if args.feat_extractor == 'LocalGlobal':
        feat_ext = LocalGlobal(
            window_size=11,
            grid_size=grid_size,
            agent_width=agent_width,
            obs_width=obs_width,
            step_size=step_size,
        )

    if args.feat_extractor == 'DroneFeatureSAM1':

        feat_ext = DroneFeatureSAM1(agent_width=agent_width,
                                    obs_width=obs_width,
                                    step_size=step_size,
                                    grid_size=grid_size,
                                    thresh1=15,
                                    thresh2=30)

    if args.feat_extractor == 'DroneFeatureOccup':

        feat_ext = DroneFeatureOccup(agent_width=agent_width,
                                     obs_width=obs_width,
                                     step_size=step_size,
                                     grid_size=grid_size,
                                     window_size=window_size)

    if args.feat_extractor == 'DroneFeatureRisk':

        feat_ext = DroneFeatureRisk(agent_width=agent_width,
                                    obs_width=obs_width,
                                    step_size=step_size,
                                    grid_size=grid_size,
                                    show_agent_persp=True,
                                    thresh1=15,
                                    thresh2=30)

    if args.feat_extractor == 'DroneFeatureRisk_v2':

        feat_ext = DroneFeatureRisk_v2(agent_width=agent_width,
                                       obs_width=obs_width,
                                       step_size=step_size,
                                       grid_size=grid_size,
                                       show_agent_persp=True,
                                       thresh1=15,
                                       thresh2=30)

    if args.feat_extractor == 'DroneFeatureRisk_speed':

        feat_ext = DroneFeatureRisk_speed(agent_width=agent_width,
                                          obs_width=obs_width,
                                          step_size=step_size,
                                          grid_size=grid_size,
                                          show_agent_persp=False,
                                          thresh1=10,
                                          thresh2=15)

    if feat_ext is None:
        print('Please enter proper feature extractor!')
        exit()
    #log feature extractor info

    if not args.dont_save and not args.play:

        experiment_logger.log_header('Parameters of the feature extractor :')
        experiment_logger.log_info(feat_ext.__dict__)

    #initialize the environment
    if args.replace_subject:
        replace_subject = True
    else:
        replace_subject = False

    env = GridWorldDrone(
        display=args.render,
        is_onehot=False,
        seed=args.seed,
        obstacles=None,
        show_trail=False,
        is_random=True,
        annotation_file=args.annotation_file,
        subject=args.subject,
        tick_speed=60,
        obs_width=10,
        step_size=step_size,
        agent_width=agent_width,
        replace_subject=replace_subject,
        segment_size=args.segment_size,
        external_control=True,
        step_reward=0.001,
        show_comparison=True,
        consider_heading=True,
        show_orientation=True,

        #rows=200, cols=300, width=grid_size)
        rows=576,
        cols=720,
        width=grid_size)

    #log environment info
    if not args.dont_save and not args.play:

        experiment_logger.log_header('Environment details :')
        experiment_logger.log_info(env.__dict__)

    #initialize RL
    model = SAC(env,
                feat_extractor=feat_ext,
                log_interval=100,
                max_ep_length=args.max_ep_length,
                hidden_dims=args.policy_net_hidden_dims,
                save_folder=save_folder,
                max_episodes=args.total_episodes)

    #log RL info
    if not args.dont_save and not args.play:

        experiment_logger.log_header('Details of the RL method :')
        experiment_logger.log_info(model.__dict__)

    if args.policy_path is not None:

        from debugtools import numericalSort
        policy_file_list = []
        reward_across_models = []
        if os.path.isfile(args.policy_path):
            policy_file_list.append(args.policy_path)
        if os.path.isdir(args.policy_path):
            policy_names = glob.glob(os.path.join(args.policy_path, '*.pt'))
            policy_file_list = sorted(policy_names, key=numericalSort)

        xaxis = np.arange(len(policy_file_list))

    if not args.play and not args.play_user:
        #no playing of any kind, so training

        if args.reward_path is None:
            if args.policy_path:
                model.policy.load(args.policy_path)
            model.train()
        else:
            from irlmethods.deep_maxent import RewardNet
            state_size = feat_ext.extract_features(env.reset()).shape[0]
            reward_net = RewardNet(state_size, args.reward_net_hidden_dims)
            reward_net.load(args.reward_path)
            print(next(reward_net.parameters()).is_cuda)
            model.train(reward_net=reward_net)

        if not args.dont_save:
            model.policy.save(save_folder + '/policy-models/')

    if args.play:
        #env.tickSpeed = 15
        from debugtools import compile_results
        xaxis = []
        counter = 1
        plt.figure(0)
        avg_reward_list = []
        frac_good_run_list = []
        for policy_file in policy_file_list:

            print('Playing for policy :', policy_file)
            model.policy.load(policy_file)
            policy_folder = policy_file.strip().split('/')[0:-2]
            save_folder = ''
            for p in policy_folder:
                save_folder = save_folder + p + '/'

            print('The final save folder ', save_folder)
            #env.tickSpeed = 10
            assert args.policy_path is not None, 'pass a policy to play from!'
            if args.exp_trajectory_path is not None:
                from irlmethods.irlUtils import calculate_expert_svf
                expert_svf = calculate_expert_svf(
                    args.exp_trajectory_path,
                    max_time_steps=args.max_ep_length,
                    feature_extractor=feat_ext,
                    gamma=1)
            #reward_across_models.append(model.generate_trajectory(args.num_trajs, args.render))
            if args.exp_trajectory_path is None:

                if args.dont_save:
                    rewards, state_info, sub_info = model.generate_trajectory(
                        args.num_trajs, args.render)
                else:
                    rewards, state_info, sub_info = model.generate_trajectory(
                        args.num_trajs,
                        args.render,
                        path=save_folder + '/agent_generated_trajectories/')
            else:

                if args.dont_save:
                    rewards, state_info, sub_info = model.generate_trajectory(
                        args.num_trajs, args.render, expert_svf=expert_svf)
                else:
                    rewards, state_info, sub_info = model.generate_trajectory(
                        args.num_trajs,
                        args.render,
                        path=save_folder + '/agent_generated_trajectories/',
                        expert_svf=expert_svf)

            avg_reward, good_run_frac = compile_results(
                rewards, state_info, sub_info)
            #pdb.set_trace()
            avg_reward_list.append(avg_reward)
            frac_good_run_list.append(good_run_frac)
            plt.plot(avg_reward_list, c='r')
            plt.plot(frac_good_run_list, c='g')
            plt.draw()
        plt.show()

    if args.play_user:
        env.tickSpeed = 200

        model.generate_trajectory_user(args.num_trajs,
                                       args.render,
                                       path='./user_generated_trajectories/')
Example #13
def main():

    args = parser.parse_args()

    utils.seed_all(args.seed)

    if args.on_server:
        # matplotlib without monitor
        matplotlib.use('Agg')

        # pygame without monitor
        os.environ['SDL_VIDEODRIVER'] = 'dummy'
    from matplotlib import pyplot as plt

    save_folder = None
    if not args.dont_save:
        save_folder = './results/' + args.save_folder
        experiment_logger = Logger(save_folder, 'experiment_info.txt')

        experiment_logger.log_header('Arguments for the experiment :')
        experiment_logger.log_info(vars(args))
    

    mp.set_start_method('spawn')

    if args.render:
        from envs.gridworld import GridWorld
    else:
        from envs.gridworld_clockless import GridWorldClockless as GridWorld
        

    if args.feat_extractor == 'MCFeatures':
        feat_ext = MCFeatures(args.state_discretization[0],
                              args.state_discretization[1])

    elif args.feat_extractor == 'MCFeaturesOnehot':
        feat_ext = MCFeaturesOnehot(args.state_discretization[0],
                                    args.state_discretization[1])

    else:
        print('Enter proper feature extractor value.')
        exit()

    if not args.dont_save:
        experiment_logger.log_header('Parameters of the feature extractor :')
        experiment_logger.log_info(feat_ext.__dict__)

    '''
    np.asarray([2,2]),np.asarray([7,4]),np.asarray([3,5]),
                                np.asarray([5,2]),np.asarray([8,3]),np.asarray([7,5]),
                                np.asarray([3,3]),np.asarray([3,7]),np.asarray([5,7])
                                
    env = GridWorld(display=args.render, is_onehot= False,is_random=True,
                    rows=100, agent_width=agent_width,step_size=step_size,
                    obs_width=obs_width,width=grid_size,
                    cols=100,
                    seed=7,
                    buffer_from_obs=0,
                    obstacles=3,
                                
                    goal_state=np.asarray([5,5]))
    '''
    env = gym.make('MountainCar-v0')
    env = env.unwrapped

    if not args.dont_save:

        experiment_logger.log_header('Environment details :')
        experiment_logger.log_info(env.__dict__)


    model = ActorCritic(env, feat_extractor=feat_ext, gamma=0.99,
                        plot_loss=False, log_interval=10, max_ep_length=300,
                        hidden_dims=args.policy_net_hidden_dims,
                        max_episodes=30, save_folder=save_folder)

    if not args.dont_save:

        experiment_logger.log_header('Details of the RL method :')
        experiment_logger.log_info(model.__dict__)
    
    #pdb.set_trace()

    if args.policy_path is not None:
        policy_file_list =  []
        reward_across_models = []
        if os.path.isfile(args.policy_path):
            policy_file_list.append(args.policy_path)
        if os.path.isdir(args.policy_path):
            policy_names = glob.glob(os.path.join(args.policy_path, '*.pt'))
            policy_file_list = sorted(policy_names, key=numericalSort)
        
        xaxis = np.arange(len(policy_file_list))

    if not args.play and not args.play_user:
        if args.reward_path is None:
            model.train_mp(n_jobs=4)
        else:

            from irlmethods.deep_maxent import RewardNet
            state_size = feat_ext.state_rep_size
            reward_net = RewardNet(state_size, args.policy_net_hidden_dims)
            reward_net.load(args.reward_path)
            print(next(reward_net.parameters()).is_cuda)
            model.train_mp(reward_net=reward_net, n_jobs=4)

        if not args.dont_save:
            model.policy.save(save_folder + '/policy/')

    if args.play:
        xaxis = []
        counter = 1
        print(policy_file_list)
        for policy_file in policy_file_list:

            model.policy.load(policy_file)

            env.tickSpeed = 15
            assert args.policy_path is not None, 'pass a policy to play from!'

            reward_across_models.append(model.generate_trajectory(args.num_trajs, args.render))

            #plotting the 2d list
            xaxis.append(counter)
            counter += 1
            reward_across_models_np = np.array(reward_across_models)
            mean_rewards = np.mean(reward_across_models_np, axis=1)
            std_rewards = np.std(reward_across_models_np, axis=1)
            plt.plot(xaxis, mean_rewards, color='r', label='IRL trained agent')
            plt.fill_between(xaxis, mean_rewards - std_rewards,
                             mean_rewards + std_rewards, alpha=0.5, facecolor='r')
            plt.draw()
            plt.pause(0.001)
            '''
            print('RAM usage :')
            display_memory_usage(process.memory_info().rss)
            print('GPU usage :')
            display_memory_usage(torch.cuda.memory_allocated())
            torch.cuda.empty_cache()
            display_memory_usage(torch.cuda.memory_allocated())
            '''
            #plt.show()
        plt.show()
    if args.play_user:
        env.tickSpeed = 200

        model.generate_trajectory_user(args.num_trajs, './trajs/ac_gridworld_user/')
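The play branch above plots the mean return of each policy checkpoint with a ±1 standard deviation band. A self-contained sketch of that plot built from synthetic per-checkpoint returns:

import numpy as np
from matplotlib import pyplot as plt

# synthetic returns: 6 policy checkpoints, 20 evaluation trajectories each
reward_across_models = np.random.randn(6, 20) * 5 + np.arange(6)[:, None]

xaxis = np.arange(1, reward_across_models.shape[0] + 1)
mean_rewards = np.mean(reward_across_models, axis=1)
std_rewards = np.std(reward_across_models, axis=1)

plt.plot(xaxis, mean_rewards, color='r', label='IRL trained agent')
plt.fill_between(xaxis, mean_rewards - std_rewards,
                 mean_rewards + std_rewards, alpha=0.5, facecolor='r')
plt.legend()
plt.show()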
Example #14
def main():

    args = parser.parse_args()
    mp.set_start_method('spawn')

    from envs.gridworld_drone import GridWorldDrone

    agent_width = 10
    step_size = 2
    obs_width = 10
    grid_size = 10

    if args.feat_extractor == 'Onehot':
        feat_ext = OneHot(grid_rows=10, grid_cols=10)
    if args.feat_extractor == 'SocialNav':
        feat_ext = SocialNav(fieldList=['agent_state', 'goal_state'])
    if args.feat_extractor == 'FrontBackSideSimple':
        feat_ext = FrontBackSideSimple(
            thresh1=1,
            thresh2=2,
            thresh3=3,
            thresh4=4,
            step_size=step_size,
            agent_width=agent_width,
            obs_width=obs_width,
            fieldList=['agent_state', 'goal_state', 'obstacles'])

    if args.feat_extractor == 'LocalGlobal':
        feat_ext = LocalGlobal(
            window_size=3,
            grid_size=grid_size,
            agent_width=agent_width,
            obs_width=obs_width,
            step_size=step_size,
            fieldList=['agent_state', 'goal_state', 'obstacles'])

    #featExtract = OneHot(grid_rows=10,grid_cols=10)
    #featExtract = FrontBackSideSimple(thresh1 = 1,fieldList =  ['agent_state','goal_state','obstacles'])

    #featExtract = SocialNav(fieldList = ['agent_state','goal_state'])
    '''
    np.asarray([2,2]),np.asarray([7,4]),np.asarray([3,5]),
                                np.asarray([5,2]),np.asarray([8,3]),np.asarray([7,5]),
                                np.asarray([3,3]),np.asarray([3,7]),np.asarray([5,7])
                               
    env = GridWorld(display=args.render, is_onehot= False,is_random=True,
                    rows=10, agent_width=agent_width,step_size=step_size,
                    obs_width=obs_width,width=grid_size,
                    cols=10,
                    seed = 7,
                    obstacles = '../envs/map3.jpg',
                                
                    goal_state = np.asarray([5,5]))
    '''

    env = GridWorldDrone(display=args.render,
                         is_onehot=False,
                         seed=999,
                         obstacles=None,
                         show_trail=False,
                         is_random=False,
                         annotation_file=args.annotation_file,
                         subject=None,
                         tick_speed=90,
                         obs_width=10,
                         step_size=step_size,
                         agent_width=agent_width,
                         show_comparison=True,
                         rows=576,
                         cols=720,
                         width=grid_size)

    model = ActorCritic(env,
                        feat_extractor=feat_ext,
                        gamma=0.99,
                        log_interval=50,
                        max_ep_length=500,
                        max_episodes=2000)

    if args.policy_path is not None:
        model.policy.load(args.policy_path)

    if not args.play and not args.play_user:
        if args.reward_path is None:
            model.train_mp(n_jobs=4)
        else:
            from irlmethods.deep_maxent import RewardNet
            state_size = feat_ext.extract_features(env.reset()).shape[0]
            reward_net = RewardNet(state_size)
            reward_net.load(args.reward_path)
            print(next(reward_net.parameters()).is_cuda)
            model.train_mp(reward_net=reward_net, n_jobs=4)

        if not args.dont_save:
            model.policy.save('./saved-models/')

    if args.play:
        #env.tickSpeed = 15
        assert args.policy_path is not None, 'pass a policy to play from!'

        model.generate_trajectory(
            args.num_trajs, './trajs/ac_loc_glob_rectified_win_3_static_map3/')

    if args.play_user:
        env.tickSpeed = 200

        model.generate_trajectory_user(args.num_trajs,
                                       './trajs/ac_gridworld_user/')
Example #15
def plot_true_and_network_reward(reward_network_folder, feature_extractor):

    env = gym.make('MountainCar-v0')
    reward_network_names = glob.glob(
        os.path.join(reward_network_folder, '*.pt'))

    exhaustive_state_list, true_reward = get_exhaustive_state_list(
        feature_extractor)
    reward_holder = np.zeros(
        [len(reward_network_names) + 1,
         len(exhaustive_state_list)])

    hidden_dims = [1024, 256]
    net_counter = 0
    for reward_net in sorted(reward_network_names, key=numericalSort):

        reward_net_model = RewardNet(feature_extractor.state_rep_size,
                                     hidden_dims)
        print("loading reward_net :", reward_net)
        reward_net_model.load(reward_net)
        reward_net_model.eval()
        reward_net_model.to('cuda')

        for j in range(len(exhaustive_state_list)):

            state = exhaustive_state_list[j]
            reward = reward_net_model(state)
            reward_holder[net_counter][j] = reward

        net_counter += 1

    reward_holder[-1][:] = np.array(true_reward)
    pdb.set_trace()

    ##################for visualizing the rewards###############

    conv_arr = np.array([2**i for i in range(7, -1, -1)])
    conv_arr_vel = np.array([2**i for i in range(3, -1, -1)])
    print(conv_arr_vel)
    reward_mat = np.zeros((128, 8))
    for i in range(reward_holder.shape[0] - 1):
        state_arr = np.zeros(128)
        fig = plt.figure()
        fig2 = plt.figure()
        ax = Axes3D(fig)
        ax2 = fig2.add_subplot(111)
        lx = reward_mat.shape[0]
        ly = reward_mat.shape[1]

        xpos = np.arange(0, lx, 1)
        ypos = np.arange(0, ly, 1)

        xpos, ypos = np.meshgrid(xpos, ypos)
        xpos = xpos.flatten()  # Convert positions to 1D array
        ypos = ypos.flatten()
        zpos = np.zeros(lx * ly)

        dx = 0.5 * np.ones_like(zpos)
        dy = dx.copy()

        #cs = ['r', 'g', 'b', 'y', 'c'] * ly
        for j in range(reward_holder.shape[1]):

            state = exhaustive_state_list[j].cpu().numpy()
            pos = state[0:8]
            vel = state[8:]
            print('pos', pos)
            print('vel', vel)
            print(conv_arr)
            reward_mat[int(pos.dot(conv_arr)) -
                       1][int(vel.dot(conv_arr_vel) - 1)] = reward_holder[i][j]

        ax.bar3d(xpos, ypos, zpos, dx, dy, reward_mat.flatten())
        cax = ax2.matshow(reward_mat, interpolation='nearest')
        fig2.colorbar(cax)
        plt.show()
    #print(reward_holder[:,0:200])
    plt.pcolor(reward_holder)
    #plt.matshow(reward_holder[:,:])
    plt.show()
    return reward_holder
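The indexing above turns a binary feature block into an integer bucket by dotting it with descending powers of two. A tiny standalone sketch of that conversion (the 8-bit position / 4-bit velocity split mirrors the loop above; the bit values are made up):

import numpy as np

conv_arr = np.array([2 ** i for i in range(7, -1, -1)])      # weights for the 8 position bits
conv_arr_vel = np.array([2 ** i for i in range(3, -1, -1)])  # weights for the 4 velocity bits

state = np.array([0, 0, 0, 1, 0, 0, 1, 0,   # position bits
                  0, 1, 0, 1])              # velocity bits
pos, vel = state[0:8], state[8:]

pos_index = int(pos.dot(conv_arr))      # 0b00010010 -> 18
vel_index = int(vel.dot(conv_arr_vel))  # 0b0101     -> 5
print(pos_index, vel_index)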