# Assumed module-level imports for the run() variants below; `policies`,
# `Estimators`, and `generate_filename` are project-local names imported
# elsewhere in the repository:
import math
import time

import numpy as np


def run(self,
        policy,
        use_local_stats=False,  # Update task prop only with local stats
        parallel=True,
        filename=generate_filename(),
        verbose=False):
    """Budget-constrained learning loop. Alternates a theta step and a
    sigma (w) step while tracking the safety budget B, and evaluates a
    low-variance policy whose w is chosen in closed form from the
    performance bound."""
    self.use_local_stats = True  # NOTE: forced on, regardless of the use_local_stats argument
    self.initial_configuration = self.get_param_list(locals())
    self.estimator = Estimators(self.task_prop, self.constr)

    N = N_old = self.constr.N_min  # Total number of trajectories to take in this iteration
    N1, N2, N3 = self.split_trajectory_count(N)

    # Multiprocessing preparation
    if parallel:
        self._enable_parallel()

    # COMPUTE BASELINES
    features, actions, rewards, prevJ, gradients = self.get_trajectories_data(
        policy, N, parallel=parallel)
    # prevJ_det = self.estimate_policy_performance(policy, N, parallel=parallel, deterministic=True, get_min=False)
    prevJ_det = prevJ

    # Learning
    iteration = 0
    N_tot = 0
    start_time = time.time()
    J_hat = prevJ
    J_journey = (2 * prevJ + prevJ_det) / 3
    policy_low_variance = policy

    while iteration < self.constr.max_iter:
        iteration += 1

        J_det_exact = self.env.computeJ(policy.theta_mat, 0)
        self.make_checkpoint(locals())  # CHECKPOINT BEFORE SIGMA STEP
        J_journey = 0

        # PRINT
        if verbose:
            if iteration % 50 == 1:
                print('IT\tN\t\tJ\t\t\tJ_DET\t\t\tTHETA\t\tSIGMA\t\t\tBUDGET')
            print(iteration, '\t', N, '\t', J_hat, '\t', prevJ_det, '\t',
                  policy.get_theta(), '\t', policy.sigma, '\t',
                  self.budget / N, '\t', time.time() - start_time)
            start_time = time.time()

        # PERFORM FIRST STEP (theta update)
        features, actions, rewards, J_hat, gradients = self.get_trajectories_data(
            policy, N1, parallel=parallel)
        J_journey += J_hat * N1
        if iteration > 1:
            self.budget += N3 * (J_hat - prevJ)  # B += J(theta, sigma') - J(theta, sigma)
        prevJ = J_hat

        alpha, N1, safe = self.meta_selector.select_alpha(
            policy, gradients, self.task_prop, N1, iteration, self.budget)
        policy.update(alpha * gradients['grad_theta'])

        # PERFORM THIRD STEP (sigma gradient; the deterministic evaluation
        # that is the second step in the other variants happens further below)
        features, actions, rewards, J_hat, gradients = self.get_trajectories_data(
            policy, N3, parallel=parallel)
        J_journey += J_hat * N3
        self.budget += N1 * (J_hat - prevJ)  # B += J(theta', sigma) - J(theta, sigma)
        prevJ = J_hat

        beta, N3, safe = self.meta_selector.select_beta(
            policy, gradients, self.task_prop, N3, iteration, self.budget)
        # policy.update_w(beta * gradients['gradDeltaW'])

        # COMPUTE OPTIMAL SIGMA_0 USING THE BOUND
        d = policy.penaltyCoeffSigma(self.task_prop.R, self.task_prop.M,
                                     self.task_prop.gamma, self.task_prop.volume)
        if self.budget / N2 >= -(gradients['grad_w']**2) / (4 * d):
            beta_minus = (1 - math.sqrt(1 - (4 * d * (-self.budget / N2)) /
                                        (gradients['grad_w']**2))) / (2 * d)
            beta_plus = (1 + math.sqrt(1 - (4 * d * (-self.budget / N2)) /
                                       (gradients['grad_w']**2))) / (2 * d)
            w_det = min(policy.w + beta_minus * gradients['grad_w'],
                        policy.w + beta_plus * gradients['grad_w'])
            policy_low_variance = policies.ExpGaussPolicy(
                np.copy(policy.theta_mat), w_det)
        else:
            policy_low_variance = policy

        newJ_det = self.estimate_policy_performance(
            policy_low_variance, N2, parallel=parallel)
        J_journey += newJ_det * N2
        self.budget += N2 * (newJ_det - prevJ_det)  # B += J(theta', 0) - J(theta, 0)
        prevJ_det = newJ_det

        # beta, N3, safe = self.meta_selector.select_beta(policy, gradients, self.task_prop, N3, iteration, self.budget)
        policy.update_w(beta * gradients['gradDeltaW'])

        N_old = N
        J_journey /= N

        # Check if done
        N_tot += N
        if N_tot >= self.constr.N_tot:
            print('Total N reached')
            print('End experiment')
            break

    # def signal_handler(signal, frame):
    #     self.save_data(filename)
    #     sys.exit(0)
    #
    # # Manual stop
    # signal.signal(signal.SIGINT, signal_handler)

    # SAVE DATA
    self.save_data(filename)
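# The "COMPUTE OPTIMAL SIGMA_0 USING THE BOUND" branch above takes the roots
# of the quadratic d*beta**2*g**2 - beta*g**2 - B/N2 = 0 (with g = grad_w),
# i.e. the step sizes at which the guaranteed performance change
# beta*g**2 - d*beta**2*g**2 from the bound exactly equals -B/N2. A minimal
# standalone sketch of that root computation (the function name and argument
# list are illustrative, not part of this class):
import math


def budget_beta_roots(grad_w, d, budget, n):
    """Return (beta_minus, beta_plus), the step sizes whose guaranteed
    performance change equals -budget/n, or None when the discriminant is
    negative (the budget is too tight for any safe sigma step)."""
    g2 = grad_w ** 2
    disc = 1 - (4 * d * (-budget / n)) / g2
    if disc < 0:
        return None  # mirrors the else-branch above: keep the current sigma
    root = math.sqrt(disc)
    return (1 - root) / (2 * d), (1 + root) / (2 * d)

# The loop then keeps min(w + beta_minus * g, w + beta_plus * g), i.e. the
# lower-variance of the two candidate w values, for the evaluation policy.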
def run(self,
        policy,
        use_local_stats=False,  # Update task prop only with local stats
        parallel=True,
        filename=generate_filename(),
        verbose=False):
    """Budgeted variant with an explicit deterministic evaluation (SECOND
    STEP) between the theta and sigma updates; the sigma step applies the
    meta-selector's beta directly."""
    self.use_local_stats = True
    self.initial_configuration = self.get_param_list(locals())
    self.estimator = Estimators(self.task_prop, self.constr)

    N = N_old = self.constr.N_min  # Total number of trajectories to take in this iteration
    N1, N2, N3 = self.split_trajectory_count(N)

    # Multiprocessing preparation
    if parallel:
        self._enable_parallel()

    # COMPUTE BASELINES
    features, actions, rewards, prevJ, gradients = self.get_trajectories_data(
        policy, N, parallel=parallel)
    prevJ_det = self.estimate_policy_performance(
        policy, N, parallel=parallel, deterministic=True, get_min=False)

    # Learning
    iteration = 0
    N_tot = 0
    start_time = time.time()
    J_hat = prevJ
    J_journey = (2 * prevJ + prevJ_det) / 3

    while iteration < self.constr.max_iter:
        iteration += 1

        J_det_exact = self.evaluate(policy, deterministic=True)
        self.make_checkpoint(locals())  # CHECKPOINT BEFORE SIGMA STEP
        J_journey = 0

        # PRINT
        if verbose:
            if iteration % 50 == 1:
                print('IT\tN\t\tJ\t\t\tJ_DET\t\t\tTHETA\t\tSIGMA\t\t\tBUDGET')
            print(iteration, '\t', N, '\t', J_hat, '\t', prevJ_det, '\t',
                  policy.get_theta(), '\t', policy.sigma, '\t',
                  self.budget / N, '\t', time.time() - start_time)
            start_time = time.time()

        # PERFORM FIRST STEP (theta update)
        features, actions, rewards, J_hat, gradients = self.get_trajectories_data(
            policy, N1, parallel=parallel)
        J_journey += J_hat * N1
        if iteration > 1:
            self.budget += N3 * (J_hat - prevJ)  # B += J(theta, sigma') - J(theta, sigma)
        prevJ = J_hat

        alpha, N1, safe = self.meta_selector.select_alpha(
            policy, gradients, self.task_prop, N1, iteration, self.budget)
        policy.update(alpha * gradients['grad_theta'])

        # PERFORM SECOND STEP (deterministic evaluation)
        newJ_det = self.estimate_policy_performance(
            policy, N2, parallel=parallel, deterministic=True)
        J_journey += newJ_det * N2
        self.budget += N2 * (newJ_det - prevJ_det)  # B += J(theta', 0) - J(theta, 0)
        prevJ_det = newJ_det

        # PERFORM THIRD STEP (sigma update)
        features, actions, rewards, J_hat, gradients = self.get_trajectories_data(
            policy, N3, parallel=parallel)
        J_journey += J_hat * N3
        self.budget += N1 * (J_hat - prevJ)  # B += J(theta', sigma) - J(theta, sigma)
        prevJ = J_hat

        beta, N3, safe = self.meta_selector.select_beta(
            policy, gradients, self.task_prop, N3, iteration, self.budget)
        policy.update_w(beta * gradients['gradDeltaW'])

        N_old = N
        J_journey /= N

        # Check if done
        N_tot += N
        if N_tot >= self.constr.N_tot:
            print('Total N reached')
            print('End experiment')
            break

    # SAVE DATA
    self.save_data(filename)
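# Both budgeted variants above draw three sub-batches per iteration through
# self.split_trajectory_count(N). A plausible minimal implementation (an
# assumption; the repository's actual split may weight the three phases
# differently) is an even split that conserves the total trajectory count:
def split_trajectory_count(N):
    """Split N trajectories into three sub-batches with N1 + N2 + N3 == N."""
    N1 = N // 3
    N2 = N // 3
    N3 = N - N1 - N2  # remainder goes to the third (sigma) phase
    return N1, N2, N3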
def run(self,
        policy,
        use_local_stats=False,
        parallel=True,
        filename=generate_filename(),
        verbose=False):
    """Simplified two-step variant: no persistent budget. Each
    meta-selection uses the improvement of the current estimate over the
    initial baseline J_baseline as its budget."""
    self.use_local_stats = True
    self.initial_configuration = self.get_param_list(locals())
    self.estimator = Estimators(self.task_prop, self.constr)

    N = self.constr.N_min  # Total number of trajectories to take in this iteration

    # Multiprocessing preparation
    if parallel:
        self._enable_parallel()

    # COMPUTE BASELINES
    features, actions, rewards, prevJ, gradients = self.get_trajectories_data(
        policy, N, parallel=parallel)
    J_baseline = prevJ

    # Learning
    iteration = 0
    N_tot = 0
    J_journey = prevJ
    start_time = time.time()
    N1 = N // 2
    N2 = N - N1

    while iteration < self.constr.max_iter:
        iteration += 1

        J_det_exact = self.evaluate(policy, deterministic=True)
        self.make_checkpoint(locals())  # CHECKPOINT BEFORE SIGMA STEP
        J_journey = 0

        # PRINT
        if verbose:
            if iteration % 50 == 1:
                print('IT\tN\t\tJ\t\t\tJ_DET\t\t\tTHETA\t\tSIGMA\t\t\tBUDGET')
            print(iteration, '\t', N, '\t', prevJ, '\t', 0, '\t',
                  policy.get_theta(), '\t', policy.sigma, '\t',
                  self.budget / N, '\t', time.time() - start_time)
            start_time = time.time()

        # PERFORM FIRST STEP (theta update)
        features, actions, rewards, prevJ, gradients = self.get_trajectories_data(
            policy, N1, parallel=parallel)
        J_journey += prevJ * N1
        budget = prevJ - J_baseline

        alpha, N1, safe = self.meta_selector.select_alpha(
            policy, gradients, self.task_prop, N1, iteration, budget=budget)
        policy.update(alpha * gradients['grad_theta'])

        # PERFORM SECOND STEP (sigma update)
        features, actions, rewards, prevJ, gradients = self.get_trajectories_data(
            policy, N2, parallel=parallel)
        J_journey += prevJ * N2
        budget = prevJ - J_baseline

        beta, _, _ = self.meta_selector.select_beta(
            policy, gradients, self.task_prop, N2, iteration, budget=budget)
        policy.update_w(beta * gradients['gradDeltaW'])

        J_journey /= N

        # Check if done
        N_tot += N
        if N_tot >= self.constr.N_tot:
            print('Total N reached\nEnd experiment')
            break

    # SAVE DATA
    self.save_data(filename)
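# All three variants score policies through Monte Carlo estimates of the
# expected return J. A minimal sketch of such an estimator, assuming a
# gym-style env with reset()/step() and a policy exposing act(state); both
# are assumptions, since the repository's estimate_policy_performance and
# get_trajectories_data wrap this rollout logic together with gradient
# estimation:
def estimate_return(env, policy, n_episodes, gamma, horizon):
    """Average discounted return over n_episodes independent rollouts."""
    total = 0.0
    for _ in range(n_episodes):
        state = env.reset()
        discount, ret = 1.0, 0.0
        for _ in range(horizon):
            action = policy.act(state)
            state, reward, done, _ = env.step(action)
            ret += discount * reward
            discount *= gamma
            if done:
                break
        total += ret
    return total / n_episodes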