    # Normalize the returns to stabilize the policy-gradient update
    rewards = torch.Tensor(rewards)
    rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
    for (action, value), r in zip(saved_actions, rewards):
        # Advantage: discounted return minus the critic's value estimate
        reward = r - value.data[0, 0]
        action.reinforce(reward)
        value_loss += F.smooth_l1_loss(value, Variable(torch.Tensor([r])))
    optimizer.zero_grad()
    # Backpropagate the critic loss and the REINFORCE gradients in a single call
    final_nodes = [value_loss] + list(map(lambda p: p.action, saved_actions))
    gradients = [torch.ones(1)] + [None] * len(saved_actions)
    autograd.backward(final_nodes, gradients)
    optimizer.step()
    del model.rewards[:]
    del model.saved_actions[:]

# train
env = SenseEnv(vars(args))
print("action space: ", env.action_space())
model = Policy(env.observation_space(), env.action_space_n())
cnn = CNN(env.classification_n())
if args.gpu and torch.cuda.is_available():
    model.cuda()
    cnn.cuda()
if args.model_path:
    if os.path.exists(args.model_path + "/model.pkl"):
        print("loading pretrained models")
        model.load_state_dict(torch.load(args.model_path + "/model.pkl"))
        cnn.load_state_dict(torch.load(args.model_path + "/cnn.pkl"))
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
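Note that the `action.reinforce()` / `autograd.backward(final_nodes, gradients)` pattern above only exists in pre-0.4 PyTorch. On current releases the same actor-critic update is usually written with explicit log-probabilities. The following is a minimal sketch under that assumption: `finish_episode_modern` is a hypothetical name, and it assumes `saved_actions` stores `(log_prob, value)` pairs (e.g. collected via `torch.distributions.Categorical`) and `returns` holds the normalized discounted returns.

import torch
import torch.nn.functional as F

def finish_episode_modern(saved_actions, returns, optimizer):
    # saved_actions: list of (log_prob, value) pairs collected during the episode
    # returns: 1-D tensor of already-normalized discounted returns
    policy_losses, value_losses = [], []
    for (log_prob, value), R in zip(saved_actions, returns):
        advantage = R - value.item()                 # same baseline as value.data[0, 0] above
        policy_losses.append(-log_prob * advantage)  # REINFORCE term, replaces action.reinforce()
        value_losses.append(F.smooth_l1_loss(value.squeeze(), R))
    optimizer.zero_grad()
    (torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()).backward()
    optimizer.step()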
    'moves': 5,
    'action': 1
}, {
    'moves': 10,
    'action': 3
}, {
    'moves': 15,
    'action': 4
}, {
    'moves': 10,
    'action': 3
}]  # top of pyramid

if len(sys.argv) == 2:
    obj_file = sys.argv[1]
    print("loading", obj_file)
    env = SenseEnv({'render': True, 'debug': True, 'obj_path': obj_file})

action_plan_counter = 0
action_step = 0

def max_action_plan_steps(action_plan):
    # Total number of simulator steps the whole plan will take
    return sum([x['moves'] for x in action_plan])

def process_action_plan(action_plan, action_plan_counter=0, action_step=0):
    max_steps = max_action_plan_steps(action_plan)
    if action_step == 0:
        start_for_current_action = 0
    else:
        # Offset of the current action: moves consumed by all previous actions
        start_for_current_action = sum(
            [x['moves'] for x in action_plan[:action_step]])
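For the four-entry plan above these helpers are easy to check by hand: the plan spans 5 + 10 + 15 + 10 = 40 simulator steps in total, and each action starts at the running sum of the preceding 'moves' values. A quick standalone check (the plan literal is copied from above):

plan = [{'moves': 5, 'action': 1}, {'moves': 10, 'action': 3},
        {'moves': 15, 'action': 4}, {'moves': 10, 'action': 3}]

total = sum(x['moves'] for x in plan)                                   # 40 steps overall
starts = [sum(x['moves'] for x in plan[:i]) for i in range(len(plan))]  # per-action offsets
print(total, starts)                                                    # 40 [0, 5, 15, 30]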
        R = r + args.gamma * R
        rewards.insert(0, R)
    # Normalize the discounted returns before the policy-gradient update
    rewards = torch.Tensor(rewards)
    rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
    for action, r in zip(model.saved_actions, rewards):
        action.reinforce(r)
    optimizer.zero_grad()
    autograd.backward(model.saved_actions, [None for _ in model.saved_actions])
    optimizer.step()
    del model.rewards[:]
    del model.saved_actions[:]

# Training:
env = SenseEnv(vars(args))
print("action space: ", env.action_space())
print("class count: ", env.classification_n())
model = Policy(env.observation_space(), env.action_space_n())
cnn_lstm = CNNLSTM(env.classification_n())
if args.gpu and torch.cuda.is_available():
    model.cuda()
    cnn_lstm.cuda()
if model_path:
    if os.path.exists(model_path + "/model.pkl"):
        print("loading pretrained models")
        model.load_state_dict(torch.load(model_path + "/model.pkl"))
        cnn_lstm.load_state_dict(torch.load(model_path + "/cnn_lstm.pkl"))
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
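The two lines at the top implement the usual backwards recursion for discounted returns, R_t = r_t + gamma * R_{t+1}, walking the episode's rewards from last to first. As a quick standalone illustration of what that loop produces (toy numbers, not part of the training script):

gamma = 0.99
episode_rewards = [0.0, 0.0, 1.0]   # hypothetical per-step rewards

returns, R = [], 0.0
for r in reversed(episode_rewards):
    R = r + gamma * R
    returns.insert(0, R)
print(returns)   # [0.9801, 0.99, 1.0]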
                    type=str, help='log experiment to tensorboard')
parser.add_argument('--model_path', type=str,
                    help='path to store/retrieve model at')
parser.add_argument('--mode', type=str, default="train",
                    help='train/test/all model')
parser.add_argument('--data_path', type=str,
                    default="../../touchable_data/objects/")
args = parser.parse_args()

env = SenseEnv(vars(args))
num_games = 20
game_length = 1000
# want to increase epsilon by 0.05 per game so we spend enough time exploring
e_greedy_inc = 0.05 / game_length
mem_size = num_games * game_length
cnn_features_TD = np.zeros((num_games, 40000), dtype=np.int8)
cnn_labels_TD = np.zeros(num_games, dtype=np.int8)
cnn_features_ED = np.zeros((num_games, 40000), dtype=np.int8)
cnn_labels_ED = np.zeros(num_games, dtype=np.int8)
# counter to keep track of how many times we touch in the training phase
TD_cnt = 0
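The excerpt does not show where e_greedy_inc is consumed, but the comment implies epsilon is nudged upward on every environment step so that roughly 0.05 of greediness is added per game. A minimal sketch of that schedule, under that assumption (epsilon, e_greedy_max, and the loop variables are hypothetical names, not from the original script):

epsilon, e_greedy_max = 0.0, 0.9   # hypothetical starting value and cap

for game in range(num_games):
    for step in range(game_length):
        # act greedily with probability epsilon, randomly otherwise
        # ... choose action, call env.step(action), store the transition ...
        epsilon = min(epsilon + e_greedy_inc, e_greedy_max)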