import numpy as np
# PolicyInfo is defined elsewhere in this codebase (GPS algorithm utilities).


def train_data(condition_data, T, dO, dU):
    # Assemble policy-training targets (observations, target actions, precisions,
    # and weights) from each condition's samples and its fitted local trajectory
    # distribution.
    pol_info = PolicyInfo({'init_pol_wt': 0.01, 'T': T, 'dU': dU, 'dX': 32})
    obs_data, tgt_mu = np.zeros((0, T, dO)), np.zeros((0, T, dU))
    tgt_prc, tgt_wt = np.zeros((0, T, dU, dU)), np.zeros((0, T))
    # for m in condition_data:
    #     samples, traj = condition_data[m]['samples'], condition_data[m]['traj_distr']
    for data in condition_data:
        samples, traj = data['samples'], data['traj_distr']
        X = samples.get_X()
        N = len(samples)
        mu = np.zeros((N, T, dU))
        prc = np.zeros((N, T, dU, dU))
        wt = np.zeros((N, T))
        # Get time-indexed actions.
        for t in range(T):
            # Compute actions along this trajectory.
            prc[:, t, :, :] = np.tile(traj.inv_pol_covar[t, :, :], [N, 1, 1])
            for i in range(N):
                mu[i, t, :] = \
                    (traj.K[t, :, :].dot(X[i, t, :]) + traj.k[t, :]) - \
                    np.linalg.solve(
                        prc[i, t, :, :] / pol_info.pol_wt[t],
                        pol_info.lambda_K[t, :, :].dot(X[i, t, :]) + \
                        pol_info.lambda_k[t, :]
                    )
            wt[:, t].fill(pol_info.pol_wt[t])
        tgt_mu = np.concatenate((tgt_mu, mu))
        tgt_prc = np.concatenate((tgt_prc, prc))
        tgt_wt = np.concatenate((tgt_wt, wt))
        obs_data = np.concatenate((obs_data, samples.get_obs()))
    return obs_data, tgt_mu, tgt_prc, tgt_wt
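# --- Usage sketch (not part of the original code) ---
# A minimal, self-contained illustration of the target-stacking pattern used in
# train_data(): starting from zero-length arrays, each condition appends its N
# samples along axis 0, so the final leading dimension is the total sample count
# across all conditions. The shapes and sample counts below are hypothetical.
import numpy as np

T, dO, dU = 5, 4, 2
obs_data, tgt_mu = np.zeros((0, T, dO)), np.zeros((0, T, dU))
for N in (3, 2):  # e.g. two conditions with 3 and 2 samples each
    obs_data = np.concatenate((obs_data, np.random.randn(N, T, dO)))
    tgt_mu = np.concatenate((tgt_mu, np.random.randn(N, T, dU)))
print(obs_data.shape, tgt_mu.shape)  # -> (5, 5, 4) (5, 5, 2)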
def __init__(self, hyperparams):
    # ALG_BADMM = {
    #     'inner_iterations': 4,
    #     'policy_dual_rate': 0.1,
    #     'policy_dual_rate_covar': 0.0,
    #     'fixed_lg_step': 0,
    #     'lg_step_schedule': 10.0,
    #     'ent_reg_schedule': 0.0,
    #     'init_pol_wt': 0.01,
    #     'policy_sample_mode': 'add',
    #     'exp_step_increase': 2.0,
    #     'exp_step_decrease': 0.5,
    #     'exp_step_upper': 0.5,
    #     'exp_step_lower': 1.0,
    # }
    # Copy and update parameters.
    config = copy.deepcopy(ALG_BADMM)
    # update() merges the experiment hyperparameters' key-value pairs into config.
    config.update(hyperparams)
    # Initialize the base class (algorithm.py).
    Algorithm.__init__(self, config)
    # algorithm['policy_prior'] = {
    #     'type': PolicyPriorGMM,
    #     'max_clusters': 20,
    #     'min_samples_per_cluster': 40,
    #     'max_samples': 40,
    # }
    policy_prior = self._hyperparams['policy_prior']
    # self._cond_idx = hyperparams['train_conditions']
    # self.M = hyperparams['conditions'] = 2
    for m in range(self.M):
        # self.cur = [IterationData() for _ in range(self.M)]
        # Initialize policy information.
        self.cur[m].pol_info = PolicyInfo(self._hyperparams)
        self.cur[m].pol_info.policy_prior = \
            policy_prior['type'](policy_prior)
    # algorithm['policy_opt'] = {
    #     'type': PolicyOptTf,
    #     'network_params': {
    #         'obs_include': [JOINT_ANGLES, JOINT_VELOCITIES],
    #         'obs_vector_data': [JOINT_ANGLES, JOINT_VELOCITIES],
    #         'sensor_dims': SENSOR_DIMS,
    #     },
    #     'network_model': tf_network,
    #     'iterations': 1000,
    #     'weights_file_prefix': EXP_DIR + 'policy',
    # }
    self.policy_opt = self._hyperparams['policy_opt']['type'](
        self._hyperparams['policy_opt'], self.dO, self.dU)
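# --- Usage sketch (not part of the original code) ---
# The deepcopy-then-update idiom used above merges experiment hyperparameters over
# the module-level defaults without mutating the shared default dict. DEFAULTS and
# its values here are hypothetical stand-ins for ALG_BADMM.
import copy

DEFAULTS = {'inner_iterations': 4, 'init_pol_wt': 0.01, 'policy_sample_mode': 'add'}

def make_config(overrides):
    config = copy.deepcopy(DEFAULTS)  # copy so DEFAULTS itself is never modified
    config.update(overrides)          # overrides win on key collisions
    return config

print(make_config({'inner_iterations': 2}))
# -> {'inner_iterations': 2, 'init_pol_wt': 0.01, 'policy_sample_mode': 'add'}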
def __init__(self, hyperparams):
    config = copy.deepcopy(ALG_BADMM)
    config.update(hyperparams)
    Algorithm.__init__(self, config)
    policy_prior = self._hyperparams['policy_prior']
    for m in range(self.M):
        self.cur[m].pol_info = PolicyInfo(self._hyperparams)
        self.cur[m].pol_info.policy_prior = \
            policy_prior['type'](policy_prior)
    self.policy_opt = self._hyperparams['policy_opt']['type'](
        self._hyperparams['policy_opt'], self.dO, self.dU)
def __init__(self, hyperparams):
    config = copy.deepcopy(ALG_OLGPS)
    config.update(hyperparams)
    Algorithm.__init__(self, config)
    self.policy_opt = self._hyperparams['policy_opt']['type'](
        self._hyperparams['policy_opt'], self.dO, self.dU
    )
    self.flag_reset = False
    policy_prior = self._hyperparams['policy_prior']
    for m in range(self.M):
        self.cur[m].last_pol = PolicyInfo(self._hyperparams)
        self.cur[m].last_pol.policy_prior = \
            policy_prior['type'](policy_prior)
def __init__(self, hyperparams):
    config = copy.deepcopy(ALG)
    config.update(hyperparams)
    self._hyperparams = config

    if 'train_conditions' in hyperparams:
        self._cond_idx = hyperparams['train_conditions']
        self.M = len(self._cond_idx)
    else:
        self.M = hyperparams['conditions']
        self._cond_idx = range(self.M)
        self._hyperparams['train_conditions'] = self._cond_idx
        self._hyperparams['test_conditions'] = self._cond_idx
    self.iteration_count = 0

    # Grab a few values from the agent.
    agent = self._hyperparams['agent']
    #print(agent)
    self.agent = agent
    self.T = self._hyperparams['T'] = agent.T
    self.dU = self._hyperparams['dU'] = agent.dU
    self.dX = self._hyperparams['dX'] = agent.dX
    self.dO = self._hyperparams['dO'] = agent.dO

    init_traj_distr = config['init_traj_distr']
    init_traj_distr['x0'] = agent.x0
    init_traj_distr['dX'] = agent.dX
    init_traj_distr['dU'] = agent.dU
    del self._hyperparams['agent']  # Don't want to pickle this.

    # IterationData objects for each condition.
    self.cur = [IterationData() for _ in range(self.M)]
    self.prev = [IterationData() for _ in range(self.M)]

    dynamics = self._hyperparams['dynamics']
    for m in range(self.M):
        self.cur[m].traj_info = TrajectoryInfo()
        self.cur[m].traj_info.dynamics = dynamics['type'](dynamics)
        cur_init_traj_distr = extract_condition(init_traj_distr,
                                                self._cond_idx[m])
        cur_init_traj_distr['cur_cond_idx'] = self._cond_idx[m]
        #print(cur_init_traj_distr)
        self.cur[m].traj_distr = cur_init_traj_distr['type'](
            cur_init_traj_distr, agent)

    self.traj_opt = hyperparams['traj_opt']['type'](
        hyperparams['traj_opt'])

    self.cost = []
    for m in range(self.M):
        cost_hyperparams = hyperparams['cost'].copy()
        cost_hyperparams['cur_cond_idx'] = self._cond_idx[m]
        self.cost.append(hyperparams['cost']['type'](cost_hyperparams))

    self.base_kl_step = self._hyperparams['kl_step']

    policy_prior = self._hyperparams['policy_prior']
    for m in range(self.M):
        self.cur[m].pol_info = PolicyInfo(self._hyperparams)
        self.cur[m].pol_info.policy_prior = \
            policy_prior['type'](policy_prior)

    self.policy_opt = self._hyperparams['policy_opt']['type'](
        self._hyperparams['policy_opt'], self.dO, self.dU)
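# --- Usage sketch (not part of the original code) ---
# Several hyperparameter dicts above (dynamics, traj_opt, cost, policy_prior,
# policy_opt) follow the same convention: the 'type' entry holds a class object,
# and the dict itself is passed to that class as its configuration. DummyPrior is
# a hypothetical stand-in for classes such as PolicyPriorGMM.
class DummyPrior(object):
    def __init__(self, hyperparams):
        self.max_clusters = hyperparams.get('max_clusters', 20)

policy_prior = {'type': DummyPrior, 'max_clusters': 10}
prior = policy_prior['type'](policy_prior)  # instantiate the class named in the dict
print(prior.max_clusters)  # -> 10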
def re_init_pol_info(self, hyperparams):
    policy_prior = self._hyperparams['policy_prior']
    for m in range(self.M):
        self.cur[m].pol_info = PolicyInfo(self._hyperparams)
        self.cur[m].pol_info.policy_prior = \
            policy_prior['type'](policy_prior)