Code Example #1
    print(sorted_returns)

    # Now we load a pretrained network that provides the state features \phi(s); the reward is then w^T \phi(s)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    reward_net = EmbeddingNet(args.encoding_dims)
    reward_net.load_state_dict(
        torch.load(args.pretrained_network, map_location=device))
    #reinitialize last layer
    num_features = reward_net.fc2.in_features

    print("reward is linear combination of ", num_features, "features")
    reward_net.fc2 = nn.Linear(
        num_features, 1,
        bias=False)  #last layer just outputs the scalar reward = w^T \phi(s)
    reward_net.to(device)
    #freeze all weights so there are no gradients (we'll manually update the last layer via proposals so no grads required)
    for param in reward_net.parameters():
        param.requires_grad = False

    #get a num_demos x num_features tensor of (un-discounted) feature counts from the pretrained network
    demo_cnts = generate_feature_counts(demonstrations, reward_net)
    print("demo counts")
    print(demo_cnts)
    if args.plot:
        plotable_cnts = demo_cnts.cpu().numpy()
        import matplotlib.pyplot as plt
        for f in range(num_features):
            #plt.figure(f)
            if plotable_cnts[0, f] < plotable_cnts[-1, f]:  #increasing
                plt.figure(0)
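
The generate_feature_counts call in the example above is the step that turns demonstrations into the feature counts on which the linear reward w^T \phi(s) operates. The following is a minimal sketch of such a helper, included only for orientation: it assumes each demonstration is a list of preprocessed observation tensors and that the embedding network exposes a hypothetical state_features method returning \phi(s); neither assumption is taken from the repository.

    import torch

    def feature_counts_sketch(demonstrations, embedding_net, device):
        # Hypothetical sketch (not the repository's implementation): sum the
        # penultimate-layer features phi(s) over every frame of each demo to
        # obtain one un-discounted feature-count vector per demonstration.
        # Assumes demonstrations is a list of lists of observation tensors and
        # that embedding_net.state_features(obs) returns phi(s) -- both names
        # are assumptions for illustration only.
        counts = []
        with torch.no_grad():
            for demo in demonstrations:
                fcount = None
                for obs in demo:
                    phi = embedding_net.state_features(obs.to(device))
                    fcount = phi if fcount is None else fcount + phi
                counts.append(fcount.squeeze(0))
        return torch.stack(counts)  # shape: (num_demos, num_features)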
Code Example #2
                       })

    env = VecFrameStack(env, 4)
    agent = PPO2Agent(env, env_type, stochastic)

    demonstrations, learning_returns, learning_rewards = generate_mean_map_noop_demos(
        env)

    # Now we load a pretrained network that provides the state features \phi(s); the reward is then w^T \phi(s)
    print("loading policy", args.pretrained_network)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    reward_net = EmbeddingNet(args.encoding_dims)
    reward_net.load_state_dict(
        torch.load(args.pretrained_network, map_location=device))
    #reinitialize last layer
    num_features = reward_net.fc2.in_features

    print("reward is linear combination of ", num_features, "features")
    reward_net.fc2 = nn.Linear(
        num_features, 1,
        bias=False)  #last layer just outputs the scalar reward = w^T \phi(s)
    reward_net.to(device)
    #freeze all weights so there are no gradients (we'll manually update the last layer via proposals so no grads required)
    for param in reward_net.parameters():
        param.requires_grad = False

    #get a num_demos x num_features tensor of (un-discounted) feature counts from the pretrained network
    demo_cnts = generate_feature_counts([demonstrations], reward_net)
    print("demo counts")
    print(demo_cnts)
    env_name = args.env_name
    #set seeds
    seed = int(args.seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)

    network_file_loc = args.pretrained_network
    print("Using network at", network_file_loc, "for features.")
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    feature_net = EmbeddingNet(args.encoding_dims)
    state_dict = torch.load(network_file_loc, map_location=device)
    print(state_dict.keys())
    feature_net.load_state_dict(state_dict)
    feature_net.to(device)

    print("evaluating", args.checkpointpath)
    print("*" * 10)
    print(env_name)
    print("*" * 10)
    returns, ave_feature_counts, fcounts, num_steps = get_policy_feature_counts(
        env_name, args.checkpointpath, feature_net, args.num_rollouts,
        args.max_length, args.no_op)
    print("returns", returns)
    print("feature counts", ave_feature_counts)
    with open(args.fcount_filepath, 'w') as writer:
        utils.write_line(ave_feature_counts, writer)
        for fc in fcounts:
            utils.write_line(fc, writer)
        utils.write_line(returns, writer)
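
Because every parameter of the reward network is frozen, the comment about updating the last layer "via proposals" implies that candidate weight vectors w are written into fc2 by hand and then scored against the stored feature counts. The sketch below shows one way that could look; it assumes only the frozen fc2 layer from the examples above and a feature-count tensor like demo_cnts, and the helper names are hypothetical.

    import torch

    def set_reward_weights(reward_net, w):
        # Hypothetical helper: copy a proposed weight vector w into the frozen
        # last layer so the network's scalar output becomes w^T phi(s).
        with torch.no_grad():
            reward_net.fc2.weight.copy_(w.view(1, -1))

    def predicted_returns(demo_cnts, w):
        # With un-discounted feature counts F of shape (num_demos, num_features),
        # the predicted return of each demonstration under reward w^T phi(s)
        # is the matrix-vector product F @ w.
        return demo_cnts @ w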