print(sorted_returns)

# Now we load a pretrained network to form \phi(s), the state features, so that the reward is w^T \phi(s)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
reward_net = EmbeddingNet(args.encoding_dims)
reward_net.load_state_dict(
    torch.load(args.pretrained_network, map_location=device))

# reinitialize the last layer
num_features = reward_net.fc2.in_features
print("reward is linear combination of", num_features, "features")
reward_net.fc2 = nn.Linear(
    num_features, 1, bias=False)  # last layer just outputs the scalar reward = w^T \phi(s)
reward_net.to(device)

# freeze all weights so there are no gradients
# (we'll manually update the last layer via proposals, so no grads are required)
for param in reward_net.parameters():
    param.requires_grad = False

# get a (num_demos, num_features) tensor of (un-discounted) feature counts from the pretrained network
demo_cnts = generate_feature_counts(demonstrations, reward_net)
print("demo counts")
print(demo_cnts)

if args.plot:
    plotable_cnts = demo_cnts.cpu().numpy()
    import matplotlib.pyplot as plt
    for f in range(num_features):
        # plt.figure(f)
        if plotable_cnts[0, f] < plotable_cnts[-1, f]:  # increasing
            plt.figure(0)
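# ---------------------------------------------------------------------------
# Aside: a minimal, self-contained sketch (not this repository's
# generate_feature_counts) of the identity the code above relies on. With a
# frozen feature map phi and a bias-free final linear layer, the return of a
# trajectory is w^T sum_t phi(s_t), so per-demonstration feature counts are a
# sufficient statistic for evaluating any proposed reward weights w. The
# toy_phi projection below is purely hypothetical.
# ---------------------------------------------------------------------------
import torch

PROJ = torch.randn(84 * 84, 4)  # stands in for the frozen EmbeddingNet parameters

def toy_phi(state):
    # phi(s): map one flattened 84x84 frame to a 4-dimensional feature vector
    return state.reshape(-1) @ PROJ

def toy_feature_counts(trajectory):
    # un-discounted feature counts: the sum of phi(s) over every state in the trajectory
    return torch.stack([toy_phi(s) for s in trajectory]).sum(dim=0)

if __name__ == "__main__":
    traj = [torch.rand(84, 84) for _ in range(10)]  # dummy grayscale "states"
    w = torch.randn(4)                              # one candidate last-layer weight vector
    mu = toy_feature_counts(traj)
    per_step_return = sum(w @ toy_phi(s) for s in traj)
    # summing rewards state by state gives the same number as w^T mu,
    # but w^T mu needs no forward passes once mu is cached
    assert torch.isclose(w @ mu, per_step_return)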
})
env = VecFrameStack(env, 4)
agent = PPO2Agent(env, env_type, stochastic)

demonstrations, learning_returns, learning_rewards = generate_mean_map_noop_demos(
    env)

# Now we load a pretrained network to form \phi(s), the state features, so that the reward is w^T \phi(s)
print("loading pretrained network", args.pretrained_network)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
reward_net = EmbeddingNet(args.encoding_dims)
reward_net.load_state_dict(
    torch.load(args.pretrained_network, map_location=device))

# reinitialize the last layer
num_features = reward_net.fc2.in_features
print("reward is linear combination of", num_features, "features")
reward_net.fc2 = nn.Linear(
    num_features, 1, bias=False)  # last layer just outputs the scalar reward = w^T \phi(s)
reward_net.to(device)

# freeze all weights so there are no gradients
# (we'll manually update the last layer via proposals, so no grads are required)
for param in reward_net.parameters():
    param.requires_grad = False

# get a (num_demos, num_features) tensor of (un-discounted) feature counts from the pretrained network
demo_cnts = generate_feature_counts([demonstrations], reward_net)
print("demo counts")
print(demo_cnts)
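# ---------------------------------------------------------------------------
# Aside: a hedged sketch of why demo_cnts is precomputed. Once the feature
# counts are cached, any proposed last-layer weight vector w can be scored with
# a single matrix product instead of new forward passes. The Bradley-Terry
# pairwise-preference log-likelihood below is one common way to score such
# proposals over return-ranked demonstrations; it is an illustrative
# assumption, not necessarily the exact proposal evaluation used elsewhere in
# this code.
# ---------------------------------------------------------------------------
import torch

def predicted_returns(demo_cnts, w):
    # demo_cnts: (num_demos, num_features) feature counts; w: (num_features,) weights
    # the return of demo i under reward w^T phi(s) is w^T mu_i
    return demo_cnts @ w

def rank_loglik(demo_cnts, w, beta=1.0):
    # log-likelihood that every higher-ranked demo beats every lower-ranked one,
    # assuming demos are ordered from lowest to highest ground-truth return
    rets = beta * predicted_returns(demo_cnts, w)
    ll = torch.zeros(())
    n = rets.shape[0]
    for i in range(n):
        for j in range(i + 1, n):
            ll = ll + rets[j] - torch.logsumexp(torch.stack([rets[i], rets[j]]), dim=0)
    return ll

if __name__ == "__main__":
    cnts = torch.rand(5, 8)                 # dummy feature counts for 5 ranked demos
    w_cur = torch.randn(8)
    w_prop = w_cur + 0.1 * torch.randn(8)   # a small perturbation, as an MCMC proposal might make
    print(rank_loglik(cnts, w_cur), rank_loglik(cnts, w_prop))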
env_name = args.env_name

# set seeds
seed = int(args.seed)
torch.manual_seed(seed)
np.random.seed(seed)
tf.set_random_seed(seed)

network_file_loc = args.pretrained_network
print("Using network at", network_file_loc, "for features.")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
feature_net = EmbeddingNet(args.encoding_dims)
state_dict = torch.load(network_file_loc, map_location=device)
print(state_dict.keys())
feature_net.load_state_dict(state_dict)
feature_net.to(device)

print("evaluating", args.checkpointpath)
print("*" * 10)
print(env_name)
print("*" * 10)
returns, ave_feature_counts, fcounts, num_steps = get_policy_feature_counts(
    env_name, args.checkpointpath, feature_net, args.num_rollouts,
    args.max_length, args.no_op)
print("returns", returns)
print("feature counts", ave_feature_counts)

# write average feature counts, then per-rollout feature counts, then returns
writer = open(args.fcount_filepath, 'w')
utils.write_line(ave_feature_counts, writer)
for fc in fcounts:
    utils.write_line(fc, writer)
utils.write_line(returns, writer)
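# ---------------------------------------------------------------------------
# Aside: a sketch of how a file written above might be consumed downstream. The
# point of saving feature counts is that the evaluated policy's expected return
# under any linear reward w is just the dot product w . mu, with no further
# environment rollouts. The comma-separated parsing below mirrors the write
# order (average counts, then per-rollout counts, then returns), but the exact
# delimiter used by utils.write_line is an assumption.
# ---------------------------------------------------------------------------
import numpy as np

def read_fcounts(path):
    # parse the fcount file back into (ave_counts, per-rollout counts, returns)
    with open(path) as f:
        rows = [np.array([float(x) for x in line.strip().split(",") if x])
                for line in f if line.strip()]
    ave_counts, per_rollout, returns = rows[0], rows[1:-1], rows[-1]
    return ave_counts, np.stack(per_rollout), returns

def policy_return_under_w(ave_counts, w):
    # expected un-discounted return of the evaluated policy if the reward is w^T phi(s)
    return float(np.dot(ave_counts, w))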