# Tail of the episode-finishing routine: normalize the discounted returns and
# apply the REINFORCE update to every saved action.
rewards = torch.Tensor(rewards)
rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
for action, r in zip(model.saved_actions, rewards):
    action.reinforce(r)
optimizer.zero_grad()
autograd.backward(model.saved_actions, [None for _ in model.saved_actions])
optimizer.step()
del model.rewards[:]
del model.saved_actions[:]

# Training: build the environment, the policy, and the classifier network.
env = SenseEnv(vars(args))
print("action space: ", env.action_space())
print("class count: ", env.classification_n())
model = Policy(env.observation_space(), env.action_space_n())
cnn_lstm = CNNLSTM(env.classification_n())
if args.gpu and torch.cuda.is_available():
    model.cuda()
    cnn_lstm.cuda()
if args.model_path:
    if os.path.exists(args.model_path + "/model.pkl"):
        print("loading pretrained models")
        model.load_state_dict(torch.load(args.model_path + "/model.pkl"))
        cnn_lstm.load_state_dict(torch.load(args.model_path + "/cnn_lstm.pkl"))
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
classifier_criterion = nn.CrossEntropyLoss()
classifier_optimizer = torch.optim.Adam(cnn_lstm.parameters(), lr=args.lr)
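The update above starts from a rewards list that already holds the discounted returns of the episode; that preceding computation is not shown here. The sketch below is a minimal stand-alone version of it, assuming model.rewards holds the raw per-step rewards and args.gamma is the discount factor; the name discounted_returns is illustrative, not taken from the listing.

def discounted_returns(step_rewards, gamma):
    # Accumulate R_t = r_t + gamma * R_{t+1} from the last step backwards,
    # then return the values in chronological order.
    R = 0.0
    returns = []
    for r in reversed(step_rewards):
        R = r + gamma * R
        returns.insert(0, R)
    return returns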
# Actor-critic variant: weight each saved action by its advantage and regress
# the value estimate toward the observed return.
for (action, value), r in zip(saved_actions, rewards):
    reward = r - value.data[0, 0]          # advantage = return - predicted value
    action.reinforce(reward)
    value_loss += F.smooth_l1_loss(value, Variable(torch.Tensor([r])))
optimizer.zero_grad()
final_nodes = [value_loss] + list(map(lambda p: p.action, saved_actions))
gradients = [torch.ones(1)] + [None] * len(saved_actions)
autograd.backward(final_nodes, gradients)
optimizer.step()
del model.rewards[:]
del model.saved_actions[:]

# train
env = SenseEnv(vars(args))
print("action space: ", env.action_space())
model = Policy(env.observation_space(), env.action_space_n())
cnn = CNN(env.classification_n())
if args.gpu and torch.cuda.is_available():
    model.cuda()
    cnn.cuda()
if args.model_path:
    if os.path.exists(args.model_path + "/model.pkl"):
        print("loading pretrained models")
        model.load_state_dict(torch.load(args.model_path + "/model.pkl"))
        cnn.load_state_dict(torch.load(args.model_path + "/cnn.pkl"))
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
classifier_criterion = nn.CrossEntropyLoss()
classifier_optimizer = torch.optim.Adam(cnn.parameters(), lr=0.001)
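Neither snippet shows the rollout that fills model.rewards and model.saved_actions. The loop below is a hypothetical sketch of how the pieces could fit together: run_episode, select_action, and max_steps are illustrative names, finish_episode stands for the episode-finishing routine whose tail is shown above, and SenseEnv is assumed to follow a Gym-style reset()/step() convention, which this section does not confirm.

def run_episode(env, model, select_action, finish_episode, max_steps=200):
    observation = env.reset()
    for _ in range(max_steps):
        action = select_action(observation)              # sample an action from the Policy
        observation, reward, done, _ = env.step(action)
        model.rewards.append(reward)                     # consumed by the update shown above
        if done:
            break
    finish_episode()                                     # apply the policy-gradient update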