# update model-based module
action_np_vec = np.zeros([1, 6])
action_np_vec[0, action - 1] = 1.          # one-hot encoding of the chosen action
action_vec = torch.from_numpy(action_np_vec).float().cuda()
current_state_action = torch.cat([state, action_vec], 1)
BBN_dynamic.train(current_state_action, next_state)

# info gain: compare hyperparameters before and after the dynamics update
hyperparameters = BBN_dynamic.dump_hyparameters()
info_gain = BBN_dynamic.get_info_gain(hyperparameters, pre_hyperparameters)

# Store the transition in memory; the info-gain bonus is annealed linearly over training
agent.store_transition(state, action - 1,
                       reward + info_gain * ratio * (1 - epoch / epochs))
print('epoch: %d, image: %d, step: %d, reward: %d' % (epoch, i, step, reward))

# Move to the next state
state = next_state

# Perform the optimization
if done:
    print("updating model !")
    agent.REINFORCE()
    print("finish updating model !")
    break
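# --- illustrative sketch (not part of the training loop) --------------------
# A minimal sketch of how an information-gain bonus can be computed from the
# Bayesian network's hyperparameters. It assumes each parameter's posterior is
# a diagonal Gaussian described by (mean, std) arrays, so the gain is the KL
# divergence from the previous posterior to the updated one; the actual
# BBN_dynamic.get_info_gain may use a different measure.
import numpy as np

def gaussian_info_gain(pre_mean, pre_std, mean, std):
    """KL( N(mean, std^2) || N(pre_mean, pre_std^2) ), summed over parameters."""
    kl = (np.log(pre_std / std)
          + (std ** 2 + (mean - pre_mean) ** 2) / (2.0 * pre_std ** 2)
          - 0.5)
    return float(np.sum(kl))

# The bonus would then be annealed exactly as in the loop above:
#     reward + gaussian_info_gain(...) * ratio * (1 - epoch / epochs)
# -----------------------------------------------------------------------------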
else:
    offset, region_image, size_mask, region_mask = get_crop_image_and_mask(
        original_shape, offset, region_image, size_mask, action)
    # update history vector and get next state
    history_vector = update_history_vector(history_vector, action)
    next_state = get_state(region_image, history_vector, model_vgg)
    # find the max bounding box in the region image
    new_iou = find_max_bounding_box(gt_masks, region_mask,
                                    classes_gt_objects, CLASS_OBJECT)
    reward = get_reward_movement(iou, new_iou)
    iou = new_iou

# Store the transition in memory
agent.store_transition(state, action - 1, reward)
print('epoch: %d, image: %d, step: %d, reward: %d' % (epoch, i, step, reward))

# Move to the next state
state = next_state

# Perform the optimization
if done:
    print("updating model !")
    agent.REINFORCE()
    print("finish updating model !")
    break
#==================== loop of training procedure ==========================================#
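# --- illustrative sketch (not part of the training loop) --------------------
# agent.REINFORCE() is called above but its body is not shown in this listing.
# Below is a minimal sketch of a Monte-Carlo policy-gradient step over one
# finished episode; the buffers `log_probs` and `rewards`, the discount
# `gamma`, and the optimizer are assumed names, not the agent's actual
# attributes.
import torch

def reinforce_update(log_probs, rewards, optimizer, gamma=0.99):
    """One REINFORCE step from a list of log pi(a|s) tensors and scalar rewards."""
    # Discounted returns, computed backwards through the episode.
    returns, G = [], 0.0
    for r in reversed(rewards):
        G = r + gamma * G
        returns.insert(0, G)
    returns = torch.tensor(returns)
    # Normalising the returns is a common variance-reduction trick.
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    # Policy-gradient loss: -log pi(a|s) * return, summed over the episode.
    loss = torch.stack([-lp * g for lp, g in zip(log_probs, returns)]).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
# -----------------------------------------------------------------------------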
TUC_dynamic.train_enc_dec(state, next_state, action_vec)
if step > 0:
    mean, std = TUC_dynamic.dump_z_mean_std(state, action_vec)
    intrinsic_reward = TUC_dynamic.dump_exploration_reward(
        pre_mean, pre_std, mean, std)
if i > 0:
    penalty = TUC_dynamic.dump_regret(state, action - 1)
# print(intrinsic_reward, penalty)
# print(ratio_1 * ((epochs - epoch) / epochs) * intrinsic_reward - ratio_2 * (epoch / epochs) * penalty)

# Store the transition in memory; the exploration bonus decays and the
# regret penalty grows linearly as training progresses
agent.store_transition(
    state, action - 1,
    reward + ratio_1 * ((epochs - epoch) / epochs) * intrinsic_reward
    - ratio_2 * (epoch / epochs) * penalty)
print('epoch: %d, image: %d, step: %d, reward: %d' % (epoch, i, step, reward))

# Move to the next state
state = next_state

# Perform the optimization
if done:
    states = torch.cat(agent.states)
    values = torch.tensor(np.expand_dims(np.array(agent.get_values()), axis=1),