class MBExperiment:
    def __init__(self, params):
        """Initializes class instance.

        Argument:
            params (DotMap): A DotMap containing the following:
                .sim_cfg:
                    .env (gym.env): Environment for this experiment
                    .task_hor (int): Task horizon
                    .stochastic (bool): (optional) If True, agent adds noise to its actions.
                        Must provide noise_std (see below). Defaults to False.
                    .noise_std (float): for stochastic agents, noise of the form
                        N(0, noise_std^2 I) will be added.

                .exp_cfg:
                    .ntrain_iters (int): Number of training iterations to be performed.
                    .nrollouts_per_iter (int): (optional) Number of rollouts done between
                        training iterations. Defaults to 1.
                    .ninit_rollouts (int): (optional) Number of initial rollouts. Defaults to 1.
                    .policy (controller): Policy that will be trained.

                .log_cfg:
                    .logdir (str): Parent of directory path where experiment data will be saved.
                        Experiment will be saved in logdir/<date+time of experiment start>.
                    .nrecord (int): (optional) Number of rollouts to record for every iteration.
                        Defaults to 0.
                    .neval (int): (optional) Number of rollouts for performance evaluation.
                        Defaults to 1.
        """
        self.env = get_required_argument(params.sim_cfg, "env", "Must provide environment.")
        self.task_hor = get_required_argument(params.sim_cfg, "task_hor", "Must provide task horizon.")

        if params.sim_cfg.get("stochastic", False):
            self.agent = Agent(DotMap(
                env=self.env,
                noisy_actions=True,
                noise_stddev=get_required_argument(
                    params.sim_cfg, "noise_std",
                    "Must provide noise standard deviation in the case of a stochastic environment."
                )))
        else:
            self.agent = Agent(DotMap(env=self.env, noisy_actions=False))

        self.ntrain_iters = get_required_argument(
            params.exp_cfg, "ntrain_iters", "Must provide number of training iterations.")
        self.nrollouts_per_iter = params.exp_cfg.get("nrollouts_per_iter", 1)
        self.ninit_rollouts = params.exp_cfg.get("ninit_rollouts", 1)
        self.policy = get_required_argument(params.exp_cfg, "policy", "Must provide a policy.")

        self.logdir = os.path.join(
            get_required_argument(params.log_cfg, "logdir", "Must provide log parent directory."),
            strftime("%Y-%m-%d--%H_%M_%S", localtime())  # Windows-compatible timestamp (no colons)
        )
        self.nrecord = params.log_cfg.get("nrecord", 0)
        self.neval = params.log_cfg.get("neval", 1)

    def run_experiment(self):
        """Perform experiment."""
        os.makedirs(self.logdir, exist_ok=True)

        traj_obs, traj_acs, traj_rets, traj_rews = [], [], [], []

        # Perform initial rollouts
        samples = []
        for i in range(self.ninit_rollouts):
            samples.append(self.agent.sample(self.task_hor, self.policy))
            traj_obs.append(samples[-1]["obs"])
            traj_acs.append(samples[-1]["ac"])
            traj_rews.append(samples[-1]["rewards"])

        if self.ninit_rollouts > 0:
            self.policy.train([sample["obs"] for sample in samples],
                              [sample["ac"] for sample in samples],
                              [sample["rewards"] for sample in samples])

        # Training loop
        for i in range(self.ntrain_iters):
            print("####################################################################")
            print("Starting training iteration %d." % (i + 1))

            iter_dir = os.path.join(self.logdir, "train_iter%d" % (i + 1))
            os.makedirs(iter_dir, exist_ok=True)

            samples = []
            for j in range(self.nrecord):
                samples.append(
                    self.agent.sample(
                        self.task_hor, self.policy,
                        os.path.join(iter_dir, "rollout%d.mp4" % j)))
            if self.nrecord > 0:
                for item in filter(lambda f: f.endswith(".json"), os.listdir(iter_dir)):
                    os.remove(os.path.join(iter_dir, item))
            for j in range(max(self.neval, self.nrollouts_per_iter) - self.nrecord):
                samples.append(self.agent.sample(self.task_hor, self.policy))
            print("Rewards obtained:",
                  [sample["reward_sum"] for sample in samples[:self.neval]])

            traj_obs.extend([sample["obs"] for sample in samples[:self.nrollouts_per_iter]])
            traj_acs.extend([sample["ac"] for sample in samples[:self.nrollouts_per_iter]])
            traj_rets.extend([sample["reward_sum"] for sample in samples[:self.neval]])
            traj_rews.extend([sample["rewards"] for sample in samples[:self.nrollouts_per_iter]])
            samples = samples[:self.nrollouts_per_iter]

            self.policy.dump_logs(self.logdir, iter_dir)
            savemat(
                os.path.join(self.logdir, "logs.mat"),
                {
                    "observations": traj_obs,
                    "actions": traj_acs,
                    "returns": traj_rets,
                    "rewards": traj_rews
                })
            # Delete iteration directory if not used
            if len(os.listdir(iter_dir)) == 0:
                os.rmdir(iter_dir)

            if i < self.ntrain_iters - 1:
                self.policy.train([sample["obs"] for sample in samples],
                                  [sample["ac"] for sample in samples],
                                  [sample["rewards"] for sample in samples])
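# ---------------------------------------------------------------------------
# Usage sketch (not part of the original file): a minimal, runnable example of
# the DotMap configuration pattern that the docstring above describes.
# `get_required_argument_sketch` is a stand-in with assumed behavior (raise if
# a key was never provided, otherwise return it); the real helper lives
# elsewhere in this codebase and may differ in detail.
from dotmap import DotMap


def get_required_argument_sketch(dotmap, key, message):
    """Return dotmap[key], raising ValueError(message) if it was never set."""
    val = dotmap.get(key, None)
    if val is None:
        raise ValueError(message)
    return val


if __name__ == "__main__":
    cfg = DotMap()
    cfg.exp_cfg.ntrain_iters = 50  # required key: __init__ errors out if missing
    # Optional keys fall back to defaults through .get(), exactly as in __init__ above.
    nrollouts_per_iter = cfg.exp_cfg.get("nrollouts_per_iter", 1)
    ntrain_iters = get_required_argument_sketch(
        cfg.exp_cfg, "ntrain_iters", "Must provide number of training iterations.")
    print(ntrain_iters, nrollouts_per_iter)  # -> 50 1
# ---------------------------------------------------------------------------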
class MBExperiment:
    def __init__(self, params):
        """Initializes class instance.

        Argument:
            params (DotMap): A DotMap containing the following:
                .sim_cfg:
                    .env (gym.env): Environment for this experiment
                    .task_hor (int): Task horizon
                    .stochastic (bool): (optional) If True, agent adds noise to its actions.
                        Must provide noise_std (see below). Defaults to False.
                    .noise_std (float): for stochastic agents, noise of the form
                        N(0, noise_std^2 I) will be added.

                .exp_cfg:
                    .ntrain_iters (int): Number of training iterations to be performed.
                    .nrollouts_per_iter (int): (optional) Number of rollouts done between
                        training iterations. Defaults to 1.
                    .ninit_rollouts (int): (optional) Number of initial rollouts. Defaults to 1.
                    .policy (controller): Policy that will be trained.
                    .demo_low_cost (int): Minimum allowed cost for demonstrations.
                    .demo_high_cost (int): Maximum allowed cost for demonstrations.
                    .num_demos (int): Number of demonstrations.
                    .ss_buffer_size (int): Size of the buffer of safe states that the density
                        model is trained on.
                    .gym_robotics (bool): Indicates whether env is a gym robotics env, in which
                        case there are some small differences in data loading and environment
                        parameters.

                .log_cfg:
                    .logdir (str): Parent of directory path where experiment data will be saved.
                        Experiment will be saved in logdir/<date+time of experiment start>.
                    .nrecord (int): (optional) Number of rollouts to record for every iteration.
                        Defaults to 0.
                    .neval (int): (optional) Number of rollouts for performance evaluation.
                        Defaults to 1.
        """
        self.env = get_required_argument(params.sim_cfg, "env", "Must provide environment.")
        self.demo_low_cost = params.exp_cfg.demo_low_cost
        self.demo_high_cost = params.exp_cfg.demo_high_cost
        self.num_demos = params.exp_cfg.num_demos
        self.ss_buffer_size = params.exp_cfg.ss_buffer_size
        self.gym_robotics = params.exp_cfg.gym_robotics
        self.task_hor = get_required_argument(params.sim_cfg, "task_hor", "Must provide task horizon.")

        if params.sim_cfg.get("stochastic", False):
            self.agent = Agent(DotMap(
                env=self.env,
                noisy_actions=True,
                noise_stddev=get_required_argument(
                    params.sim_cfg, "noise_std",
                    "Must provide noise standard deviation in the case of a stochastic environment."
                )))
        else:
            self.agent = Agent(DotMap(env=self.env, noisy_actions=False))

        self.ntrain_iters = get_required_argument(
            params.exp_cfg, "ntrain_iters", "Must provide number of training iterations.")
        self.nrollouts_per_iter = params.exp_cfg.get("nrollouts_per_iter", 1)
        self.policy = get_required_argument(params.exp_cfg, "policy", "Must provide a policy.")
        self.value = get_required_argument(params.exp_cfg, "value", "Must provide a value function.")
        self.target = get_required_argument(params.exp_cfg, "value_target",
                                            "Must provide a target value function.")
        self.value.target = self.target

        self.logdir = os.path.join(
            get_required_argument(params.log_cfg, "logdir", "Must provide log parent directory."),
            strftime("%Y-%m-%d--%H:%M:%S", localtime()))
        self.nrecord = params.log_cfg.get("nrecord", 0)
        self.neval = params.log_cfg.get("neval", 1)

        self.load_samples = params.exp_cfg.get("load_samples", True)
        self.demo_load_path = params.exp_cfg.get("demo_load_path", None)
        self.use_value = params.exp_cfg.get("use_value", True)
        self.teacher = params.exp_cfg.get("teacher")

        self.stabilizable_observations = []
        self.tvalue_schedule = LinearSchedule(3, 3, 500)
        self.stabilized_model = knn(n_neighbors=1)
        self.target_update_freq = 1

    def run_experiment(self):
        """Perform experiment."""
        os.makedirs(self.logdir, exist_ok=True)

        traj_obs, traj_acs, traj_rets, traj_costs = [], [], [], []

        # Perform initial rollouts
        samples = []
        if self.load_samples:
            if not self.gym_robotics:
                samples = load_teacher_samples_gym(self.demo_load_path, self.env,
                                                   self.demo_low_cost, self.demo_high_cost,
                                                   self.num_demos)
            else:
                samples = load_teacher_samples_gym_robotics(self.demo_load_path, self.env,
                                                            self.demo_low_cost, self.demo_high_cost,
                                                            self.num_demos)
            for i in range(len(samples)):
                traj_obs.append(samples[i]["obs"])
                traj_acs.append(samples[i]["ac"])
                traj_costs.append(samples[i]["costs"])
                self.stabilizable_observations.extend(samples[i]["stabilizable_obs"])
        else:
            for i in range(self.num_demos):
                s = self.teacher.get_rollout()
                samples.append(s)
                traj_obs.append(samples[-1]["obs"])
                traj_acs.append(samples[-1]["ac"])
                traj_costs.append(samples[-1]["costs"])
                self.stabilizable_observations.extend(samples[-1]["stabilizable_obs"])

        # Fit density model to demonstrations
        if self.stabilized_model is not None:
            self.stabilized_model.fit(np.array(self.stabilizable_observations))
        else:
            self.stabilized_model = None
        self.policy.set_stabilized_model(self.stabilized_model)

        if self.num_demos > 0:
            self.policy.train([sample["obs"] for sample in samples],
                              [sample["ac"] for sample in samples],
                              [sample["costs"] for sample in samples],
                              np.array(self.stabilizable_observations))
            if self.use_value:
                # Train value function using teacher rollouts
                self.value.train([sample["obs"][:-1] for sample in samples],
                                 [sample["costs"] for sample in samples],
                                 [sample["obs"][1:] for sample in samples],
                                 [sample["values"] for sample in samples],
                                 use_TD=False,
                                 terminal_states=[sample["obs"][-1] for sample in samples],
                                 copy_target=True)

        demo_samples = deepcopy(samples)

        # Training loop
        for i in range(self.ntrain_iters):
            print("####################################################################")
            print("Starting training iteration %d." % (i + 1))

            iter_dir = os.path.join(self.logdir, "train_iter%d" % (i + 1))
            os.makedirs(iter_dir, exist_ok=True)

            samples = []
            for j in range(self.nrecord):
                samples.append(
                    self.agent.sample(
                        self.task_hor, self.policy,
                        os.path.join(iter_dir, "rollout%d.mp4" % j)))
            if self.nrecord > 0:
                for item in filter(lambda f: f.endswith(".json"), os.listdir(iter_dir)):
                    os.remove(os.path.join(iter_dir, item))
            for j in range(max(self.neval, self.nrollouts_per_iter) - self.nrecord):
                samples.append(self.agent.sample(self.task_hor, self.policy))
            print("Costs obtained:",
                  [sample["cost_sum"] for sample in samples[:self.neval]])

            traj_obs.extend([sample["obs"] for sample in samples[:self.nrollouts_per_iter]])
            traj_acs.extend([sample["ac"] for sample in samples[:self.nrollouts_per_iter]])
            traj_rets.extend([sample["cost_sum"] for sample in samples[:self.neval]])
            traj_costs.extend([sample["costs"] for sample in samples[:self.nrollouts_per_iter]])
            samples = samples[:self.nrollouts_per_iter]

            self.policy.dump_logs(self.logdir, iter_dir)
            if self.use_value:
                self.value.dump_logs(self.logdir, iter_dir)
            savemat(
                os.path.join(self.logdir, "logs.mat"),
                {
                    "observations": traj_obs,
                    "actions": traj_acs,
                    "returns": traj_rets,
                    "costs": traj_costs
                })
            # Delete iteration directory if not used
            if len(os.listdir(iter_dir)) == 0:
                os.rmdir(iter_dir)

            if i < self.ntrain_iters - 1:
                self.policy.train([sample["obs"] for sample in samples],
                                  [sample["ac"] for sample in samples],
                                  [sample["costs"] for sample in samples],
                                  np.array(self.stabilizable_observations))

                if self.gym_robotics:
                    current_stabilizable_obs = np.array(
                        [sample["stabilizable_obs"] for sample in samples]).reshape(
                            (-1, self.env.observation_space.spaces['observation'].low.size))
                else:
                    current_stabilizable_obs = np.array(
                        [sample["stabilizable_obs"] for sample in samples]).reshape(
                            (-1, self.env.observation_space.shape[0]))

                if self.use_value:
                    copy_target = i % self.target_update_freq == 0
                    # Train value function using the newly collected agent rollouts
                    self.value.train([sample["obs"][:-1] for sample in samples],
                                     [sample["costs"] for sample in samples],
                                     [sample["obs"][1:] for sample in samples],
                                     [sample["values"] for sample in samples],
                                     use_TD=True,
                                     terminal_states=[sample["obs"][-1] for sample in samples],
                                     copy_target=copy_target)

                if len(current_stabilizable_obs):
                    current_stabilizable_obs = [c for c in current_stabilizable_obs]
                    self.stabilizable_observations.extend(current_stabilizable_obs)
                self.stabilizable_observations = self.stabilizable_observations[-self.ss_buffer_size:]

                if self.stabilized_model is not None:
                    self.stabilized_model.fit(np.array(self.stabilizable_observations))
                self.policy.set_stabilized_model(self.stabilized_model)
                pickle.dump(self.stabilized_model,
                            open(os.path.join(self.logdir, "stabilized_model.pkl"), "wb"))
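# ---------------------------------------------------------------------------
# Illustration (not part of the original file): how a 1-nearest-neighbor model
# like `stabilized_model` above can serve as a crude density model over the
# buffer of safe states. This sketch assumes `knn` is an sklearn-style
# estimator and uses sklearn.neighbors.NearestNeighbors explicitly; the
# distance threshold is a made-up value purely for illustration.
import numpy as np
from sklearn.neighbors import NearestNeighbors


def in_safe_set_sketch(stabilized_model, obs, threshold=0.1):
    """Return True if obs lies within `threshold` of some buffered safe state."""
    dist, _ = stabilized_model.kneighbors(obs.reshape(1, -1), n_neighbors=1)
    return float(dist[0, 0]) <= threshold


if __name__ == "__main__":
    safe_states = np.random.randn(500, 4)       # stand-in for stabilizable_observations
    model = NearestNeighbors(n_neighbors=1).fit(safe_states)
    query = safe_states[0] + 0.01               # a state close to the buffer
    print(in_safe_set_sketch(model, query))     # -> True for small perturbations
# ---------------------------------------------------------------------------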
class MBExperiment:
    def __init__(self, params):
        """Initializes class instance.

        Argument:
            params (DotMap): A DotMap containing the following:
                .sim_cfg:
                    .env (gym.env): Environment for this experiment
                    .task_hor (int): Task horizon
                    .stochastic (bool): (optional) If True, agent adds noise to its actions.
                        Must provide noise_std (see below). Defaults to False.
                    .noise_std (float): for stochastic agents, noise of the form
                        N(0, noise_std^2 I) will be added.

                .exp_cfg:
                    .ntrain_iters (int): Number of training iterations to be performed.
                    .nrollouts_per_iter (int): (optional) Number of rollouts done between
                        training iterations. Defaults to 1.
                    .ninit_rollouts (int): (optional) Number of initial rollouts. Defaults to 1.
                    .policy (controller): Policy that will be trained.

                .log_cfg:
                    .logdir (str): Parent of directory path where experiment data will be saved.
                        Experiment will be saved in logdir/<date+time of experiment start>.
                    .nrecord (int): (optional) Number of rollouts to record for every iteration.
                        Defaults to 0.
                    .neval (int): (optional) Number of rollouts for performance evaluation.
                        Defaults to 1.
        """
        self.env = get_required_argument(params.sim_cfg, "env", "Must provide environment.")
        self.task_hor = get_required_argument(params.sim_cfg, "task_hor", "Must provide task horizon.")

        if params.sim_cfg.get("stochastic", False):
            self.agent = Agent(DotMap(
                env=self.env,
                noisy_actions=True,
                noise_stddev=get_required_argument(
                    params.sim_cfg, "noise_std",
                    "Must provide noise standard deviation in the case of a stochastic environment."
                )))
        else:
            self.agent = Agent(DotMap(env=self.env, noisy_actions=False))

        self.ntrain_iters = get_required_argument(
            params.exp_cfg, "ntrain_iters", "Must provide number of training iterations.")
        self.nrollouts_per_iter = params.exp_cfg.get("nrollouts_per_iter", 1)
        self.ninit_rollouts = params.exp_cfg.get("ninit_rollouts", 1)
        self.policy = get_required_argument(params.exp_cfg, "policy", "Must provide a policy.")

        self.logdir = os.path.join(
            get_required_argument(params.log_cfg, "logdir", "Must provide log parent directory."),
            strftime("%Y-%m-%d--%H:%M:%S", localtime()))
        self.nrecord = params.log_cfg.get("nrecord", 0)
        self.neval = params.log_cfg.get("neval", 1)

    def run_experiment(self):
        """Perform experiment."""
        os.makedirs(self.logdir, exist_ok=True)

        traj_obs, traj_acs, traj_rets, traj_rews, traj_cost = [], [], [], [], []

        # Perform initial rollouts.
        # Uses policy.act() to come up with actions, which should be uniform random here.
        samples = []
        print(f"Acting randomly for {self.ninit_rollouts} rollouts")
        for i in range(self.ninit_rollouts):
            samples.append(self.agent.sample(self.task_hor, self.policy))
            traj_obs.append(samples[-1]["obs"])
            traj_acs.append(samples[-1]["ac"])
            traj_rews.append(samples[-1]["rewards"])
            traj_cost.append(samples[-1]["cost"])
        print(f"Number of initial samples: {len(traj_rews)}")

        # jsw: "Initialize data D with a random controller for one trial"
        if self.ninit_rollouts > 0:
            print("Training on random actions")
            # jsw: this trains the NN model for the very first time.
            # policy is of type Controller, which MPC inherits from.
            self.policy.train([sample["obs"] for sample in samples],
                              [sample["ac"] for sample in samples],
                              [sample["rewards"] for sample in samples])
            # Plot the train/validation curves
            self.policy.model.plot_train_val(self.logdir)

        # Training loop
        # jsw: "for Trial k = 1 to K do:"
        for i in range(self.ntrain_iters):
            print("####################################################################")
            print("Starting training iteration %d." % (i + 1))
            # jsw: note that the NN model was trained above, initialized with actions
            # from a random policy.

            iter_dir = os.path.join(self.logdir, "train_iter%d" % (i + 1))
            os.makedirs(iter_dir, exist_ok=True)

            samples = []
            for j in range(self.nrecord):
                samples.append(
                    #####################
                    # This call does a lot! It uses policy.act() to come up with an action;
                    # policy.act() solves an open-loop finite-horizon problem, and the
                    # environment is used to actually act.
                    #####################
                    self.agent.sample(
                        self.task_hor, self.policy,
                        os.path.join(iter_dir, "rollout%d.mp4" % j)))
            if self.nrecord > 0:
                for item in filter(lambda f: f.endswith(".json"), os.listdir(iter_dir)):
                    os.remove(os.path.join(iter_dir, item))
            # jsw: actually execute the optimized actions and log them.
            # sample() calls Agent.py's sample(), which gets the best action from the policy
            # and uses the environment to see what happens when that action is taken. It
            # repeats this for the entire horizon, actually exploring the true environment.
            for j in range(max(self.neval, self.nrollouts_per_iter) - self.nrecord):
                samples.append(
                    # jsw: for time t = 0 to task horizon
                    self.agent.sample(self.task_hor, self.policy))
            print("Rewards obtained:",
                  [sample["reward_sum"] for sample in samples[:self.neval]])

            traj_obs.extend([sample["obs"] for sample in samples[:self.nrollouts_per_iter]])
            traj_acs.extend([sample["ac"] for sample in samples[:self.nrollouts_per_iter]])
            traj_rets.extend([sample["reward_sum"] for sample in samples[:self.neval]])
            traj_rews.extend([sample["rewards"] for sample in samples[:self.nrollouts_per_iter]])
            traj_cost.extend([sample["cost"] for sample in samples[:self.nrollouts_per_iter]])
            samples = samples[:self.nrollouts_per_iter]

            self.policy.dump_logs(self.logdir, iter_dir)
            savemat(
                os.path.join(self.logdir, "logs.mat"),
                {
                    "observations": traj_obs,
                    "actions": traj_acs,
                    "returns": traj_rets,
                    "rewards": traj_rews,
                    "cost": traj_cost,
                })
            # Delete iteration directory if not used
            if len(os.listdir(iter_dir)) == 0:
                os.rmdir(iter_dir)

            if i < self.ntrain_iters - 1:
                self.policy.train([sample["obs"] for sample in samples],
                                  [sample["ac"] for sample in samples],
                                  [sample["rewards"] for sample in samples])
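# ---------------------------------------------------------------------------
# Illustration (not part of the original file): a simplified version of what
# the jsw comments above describe Agent.sample doing. At every step the policy
# is asked for an action (policy.act solves its open-loop problem internally),
# the real environment is stepped, and the trajectory is returned as a dict
# whose keys mirror those used in run_experiment ("obs", "ac", "rewards",
# "reward_sum"). The policy interface (reset(), act(obs, t)) and the classic
# 4-tuple gym step API are assumptions; the real Agent also handles video
# recording, action noise, and similar details.
import numpy as np


def sample_rollout_sketch(env, policy, horizon):
    obs_list, ac_list, rew_list = [], [], []
    obs = env.reset()
    policy.reset()                      # assumed: controller clears its warm start
    for t in range(horizon):
        ac = policy.act(obs, t)         # assumed signature: observation and timestep
        next_obs, reward, done, _ = env.step(ac)
        obs_list.append(obs)
        ac_list.append(ac)
        rew_list.append(reward)
        obs = next_obs
        if done:
            break
    return {
        "obs": np.array(obs_list),
        "ac": np.array(ac_list),
        "rewards": np.array(rew_list),
        "reward_sum": float(np.sum(rew_list)),
    }
# ---------------------------------------------------------------------------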
class MBExperiment:
    def __init__(self, params):
        """Initializes class instance.

        Argument:
            params (DotMap): A DotMap containing the following:
                .sim_cfg:
                    .env (gym.env): Environment for this experiment
                    .task_hor (int): Task horizon
                    .stochastic (bool): (optional) If True, agent adds noise to its actions.
                        Must provide noise_std (see below). Defaults to False.
                    .noise_std (float): for stochastic agents, noise of the form
                        N(0, noise_std^2 I) will be added.

                .exp_cfg:
                    .ntrain_iters (int): Number of training iterations to be performed.
                    .nrollouts_per_iter (int): (optional) Number of rollouts done between
                        training iterations. Defaults to 1.
                    .ninit_rollouts (int): (optional) Number of initial rollouts. Defaults to 1.
                    .policy (controller): Policy that will be trained.

                .log_cfg:
                    .logdir (str): Parent of directory path where experiment data will be saved.
                        Experiment will be saved in logdir/<date+time of experiment start>.
                    .nrecord (int): (optional) Number of rollouts to record for every iteration.
                        Defaults to 0.
                    .neval (int): (optional) Number of rollouts for performance evaluation.
                        Defaults to 1.
        """
        self.env = get_required_argument(params.sim_cfg, "env", "Must provide environment.")
        self.task_hor = get_required_argument(params.sim_cfg, "task_hor", "Must provide task horizon.")
        self._params = params
        params.sim_cfg.misc = copy.copy(params)

        if params.sim_cfg.get("stochastic", False):
            self.agent = Agent(DotMap(
                env=self.env,
                noisy_actions=True,
                noise_stddev=get_required_argument(
                    params.sim_cfg, "noise_std",
                    "Must provide noise standard deviation in the case of a stochastic environment."
                ),
                params=params))
        else:
            self.agent = Agent(DotMap(env=self.env, noisy_actions=False, params=params))

        self.ntrain_iters = get_required_argument(
            params.exp_cfg, "ntrain_iters", "Must provide number of training iterations.")
        self.nrollouts_per_iter = params.exp_cfg.get("nrollouts_per_iter", 1)
        self.ninit_rollouts = params.exp_cfg.get("ninit_rollouts", 1)
        self.policy = get_required_argument(params.exp_cfg, "policy", "Must provide a policy.")

        self.logdir = os.path.join(
            get_required_argument(params.log_cfg, "logdir", "Must provide log parent directory."),
            strftime("%Y-%m-%d--%H-%M-%S", localtime()))
        logger.set_file_handler(path=self.logdir)
        logger.info('Starting the experiments')
        self.nrecord = params.log_cfg.get("nrecord", 0)
        self.neval = params.log_cfg.get("neval", 1)
        self.delay_hor = params.sim_cfg.get("delay_hor", 0)

    def run_experiment(self):
        """Perform experiment."""
        os.makedirs(self.logdir, exist_ok=True)

        traj_obs, traj_acs, traj_rets, traj_rews = [], [], [], []
        test_traj_obs, test_traj_acs, test_traj_rets = [], [], []
        episode_iter_id = []

        # Perform initial rollouts
        samples = []
        needed_num_steps = self.ninit_rollouts * self.task_hor
        finished_num_steps = 0
        """
        # TODO DEBUG
        needed_num_steps = 64
        self.task_hor = 64
        """
        while True:
            samples.append(self.agent.sample(self.task_hor, self.policy, self.delay_hor))
            traj_obs.append(samples[-1]["obs"])
            traj_acs.append(samples[-1]["ac"])
            traj_rews.append(samples[-1]["rewards"])
            finished_num_steps += len(samples[-1]["ac"])
            print(finished_num_steps)
            if finished_num_steps >= needed_num_steps:
                break

        if self.ninit_rollouts > 0:
            self.policy.train([sample["obs"] for sample in samples],
                              [sample["ac"] for sample in samples],
                              [sample["rewards"] for sample in samples])

        # Training loop
        for i in range(self.ntrain_iters):
            logger.info("####################################################################")
            logger.info("Starting training iteration %d." % (i + 1))

            iter_dir = os.path.join(self.logdir, "train_iter%d" % (i + 1))
            os.makedirs(iter_dir, exist_ok=True)

            samples = []
            assert self.nrecord == 0
            needed_num_steps = self.task_hor * (
                max(self.neval, self.nrollouts_per_iter) - self.nrecord)
            finished_num_steps = 0
            while True:
                samples.append(self.agent.sample(self.task_hor, self.policy, self.delay_hor))
                finished_num_steps += len(samples[-1]["ac"])
                if finished_num_steps >= needed_num_steps:
                    break
            logger.info("Rewards obtained: {}".format(
                [sample["reward_sum"] for sample in samples[:self.neval]]))

            # Test the policy if needed
            if self._params.misc.ctrl_cfg.cem_cfg.test_policy > 0:
                test_data = []
                for _ in range(5):
                    test_data.append(
                        self.agent.sample(self.task_hor, self.policy,
                                          test_policy=True, average=False))
                test_traj_rets.extend([
                    np.mean([i_test_data["reward_sum"] for i_test_data in test_data])
                ])
                test_traj_obs.extend([i_test_data["obs"] for i_test_data in test_data])
                test_traj_acs.extend([i_test_data["ac"] for i_test_data in test_data])

            traj_obs.extend([sample["obs"] for sample in samples])
            traj_acs.extend([sample["ac"] for sample in samples])
            traj_rets.extend([sample["reward_sum"] for sample in samples])
            traj_rews.extend([sample["rewards"] for sample in samples])
            episode_iter_id.extend([i] * len(samples))
            samples = samples[:self.nrollouts_per_iter]

            self.policy.dump_logs(self.logdir, iter_dir)
            savemat(
                os.path.join(self.logdir, "logs.mat"),
                {
                    "observations": traj_obs,
                    "actions": traj_acs,
                    "returns": traj_rets,
                    "rewards": traj_rews,
                    "test_returns": test_traj_rets,
                    "test_obs": test_traj_obs,
                    "test_acs": test_traj_acs,
                    "episode_iter_id": episode_iter_id
                })
            # Delete iteration directory if not used
            if len(os.listdir(iter_dir)) == 0:
                os.rmdir(iter_dir)

            if i < self.ntrain_iters - 1:
                self.policy.train([sample["obs"] for sample in samples],
                                  [sample["ac"] for sample in samples],
                                  [sample["rewards"] for sample in samples])
class MBExperiment:
    def __init__(self, params):
        """Initializes class instance.

        Argument:
            params (DotMap): A DotMap containing the following:
                .sim_cfg:
                    .env (gym.env): Environment for this experiment
                    .task_hor (int): Task horizon
                    .stochastic (bool): (optional) If True, agent adds noise to its actions.
                        Must provide noise_std (see below). Defaults to False.
                    .noise_std (float): for stochastic agents, noise of the form
                        N(0, noise_std^2 I) will be added.

                .exp_cfg:
                    .ntrain_iters (int): Number of training iterations to be performed.
                    .nrollouts_per_iter (int): (optional) Number of rollouts done between
                        training iterations. Defaults to 1.
                    .ninit_rollouts (int): (optional) Number of initial rollouts. Defaults to 1.
                    .policy (controller): Policy that will be trained.

                .log_cfg:
                    .logdir (str): Parent of directory path where experiment data will be saved.
                        Experiment will be saved in logdir/<date+time of experiment start>.
                    .nrecord (int): (optional) Number of rollouts to record for every iteration.
                        Defaults to 0.
                    .neval (int): (optional) Number of rollouts for performance evaluation.
                        Defaults to 1.
        """
        self.env = get_required_argument(params.sim_cfg, "env", "Must provide environment.")
        self.task_hor = get_required_argument(params.sim_cfg, "task_hor", "Must provide task horizon.")
        self._params = params
        # params.sim_cfg.misc = copy.copy(params)

        if params.sim_cfg.get("stochastic", False):
            self.agent = Agent(DotMap(
                env=self.env,
                noisy_actions=True,
                noise_stddev=get_required_argument(
                    params.sim_cfg, "noise_std",
                    "Must provide noise standard deviation in the case of a stochastic environment."
                ),
                params=params))
        else:
            self.agent = Agent(DotMap(env=self.env, noisy_actions=False, params=params))

        self.ntrain_iters = get_required_argument(
            params.exp_cfg, "ntrain_iters", "Must provide number of training iterations.")
        self.nrollouts_per_iter = params.exp_cfg.get("nrollouts_per_iter", 1)
        self.ninit_rollouts = params.exp_cfg.get("ninit_rollouts", 1)
        self.policy = get_required_argument(params.exp_cfg, "policy", "Must provide a policy.")

        self.logdir = os.path.join(
            get_required_argument(params.log_cfg, "logdir", "Must provide log parent directory."),
            strftime("%Y-%m-%d--%H:%M:%S", localtime()))
        logger.set_file_handler(path=self.logdir)
        logger.info('Starting the experiments')
        self.nrecord = params.log_cfg.get("nrecord", 0)
        self.neval = params.log_cfg.get("neval", 1)

    def run_experiment(self):
        """Perform experiment."""
        os.makedirs(self.logdir, exist_ok=True)

        traj_obs, traj_acs, traj_rets, traj_rews = [], [], [], []
        test_traj_obs, test_traj_acs, test_traj_rets = [], [], []
        episode_iter_id = []

        # Perform initial rollouts
        samples = []
        needed_num_steps = self.ninit_rollouts * self.task_hor
        finished_num_steps = 0
        """
        # TODO DEBUG
        needed_num_steps = 64
        self.task_hor = 64
        """
        # logger.info("Collecting n_init rollouts before policy training")
        while True:
            samples.append(self.agent.sample(self.task_hor, self.policy))
            traj_obs.append(samples[-1]["obs"])
            traj_acs.append(samples[-1]["ac"])
            traj_rews.append(samples[-1]["rewards"])
            finished_num_steps += len(samples[-1]["ac"])
            if finished_num_steps >= needed_num_steps:
                break

        if self.ninit_rollouts > 0:
            # logger.info("Performing initial policy training")
            self.policy.train([sample["obs"] for sample in samples],
                              [sample["ac"] for sample in samples],
                              [sample["rewards"] for sample in samples])

        # Training loop
        for i in range(self.ntrain_iters):
            logger.info("####################################################################")
            logger.info("Starting training iteration %d." % (i + 1))

            iter_dir = os.path.join(self.logdir, "train_iter%d" % (i + 1))
            os.makedirs(iter_dir, exist_ok=True)

            samples = []
            assert self.nrecord == 0
            needed_num_steps = self.task_hor * (
                max(self.neval, self.nrollouts_per_iter) - self.nrecord)
            finished_num_steps = 0
            while True:
                samples.append(self.agent.sample(self.task_hor, self.policy))
                finished_num_steps += len(samples[-1]["ac"])
                if finished_num_steps >= needed_num_steps:
                    break
            logger.info("Rewards obtained: {}".format(
                [sample["reward_sum"] for sample in samples[:self.neval]]))

            # Test the policy if needed.
            # Commented out by ShenShuo: passing the whole config through misc is much too
            # messy. If policy testing is needed, consider a smarter way to pass the
            # test_policy arg.
            # if self._params.misc.ctrl_cfg.cem_cfg.test_policy > 0:
            #     test_data = []
            #     for _ in range(5):
            #         test_data.append(
            #             self.agent.sample(self.task_hor, self.policy,
            #                               test_policy=True, average=False))
            #     test_traj_rets.extend([
            #         np.mean([i_test_data["reward_sum"] for i_test_data in test_data])
            #     ])
            #     test_traj_obs.extend([i_test_data["obs"] for i_test_data in test_data])
            #     test_traj_acs.extend([i_test_data["ac"] for i_test_data in test_data])

            traj_obs.extend([sample["obs"] for sample in samples])
            traj_acs.extend([sample["ac"] for sample in samples])
            traj_rets.extend([sample["reward_sum"] for sample in samples])
            traj_rews.extend([sample["rewards"] for sample in samples])
            episode_iter_id.extend([i] * len(samples))
            samples = samples[:self.nrollouts_per_iter]

            self.policy.dump_logs(self.logdir, iter_dir)
            savemat(
                os.path.join(self.logdir, "logs.mat"),
                {
                    "observations": traj_obs,
                    "actions": traj_acs,
                    "returns": traj_rets,
                    "rewards": traj_rews,
                    "test_returns": test_traj_rets,
                    "test_obs": test_traj_obs,
                    "test_acs": test_traj_acs,
                    "episode_iter_id": episode_iter_id
                })
            # Delete iteration directory if not used
            if len(os.listdir(iter_dir)) == 0:
                os.rmdir(iter_dir)

            # Train policy and model together
            if i < self.ntrain_iters - 1:
                self.policy.train([sample["obs"] for sample in samples],
                                  [sample["ac"] for sample in samples],
                                  [sample["rewards"] for sample in samples])

            if i % 10 == 0:
                self.log_model_predictions(i)

    def log_model_predictions(self, itr):
        import matplotlib.pyplot as plt

        action_sequence = self.policy.sample_random_action_sequences(
            num_sequences=1, horizon=200)  # 20 for reacher
        action_sequence = action_sequence[0]

        mpe, true_states, pred_states = self.agent.calculate_mean_prediction_error(
            action_sequence, self.policy)
        os.makedirs(os.path.join(self.logdir, str(itr)), exist_ok=True)  # ensure output dir exists
        savemat(os.path.join(self.logdir, str(itr), "states.mat"), {
            "t_state": true_states,
            "p_state": pred_states,
        })
        assert self.env.observation_space.shape[0] == true_states.shape[1] == pred_states.shape[1]
        ob_dim = self.env.observation_space.shape[0]

        self.fig = plt.figure(figsize=(10, 1 * ob_dim))
        if ob_dim > 16:
            ob_dim = 16
        if ob_dim % 2 == 1:
            ob_dim -= 1

        # Plot the predictions
        self.fig.clf()
        for i in range(ob_dim):
            plt.subplot(ob_dim // 2, 2, i + 1)  # subplot indices must be integers
            plt.plot(true_states[:, i], 'g')
            plt.plot(pred_states[:, i], 'r')
        self.fig.suptitle('MPE: ' + str(mpe))
        self.fig.savefig(self.logdir + '/itr_' + str(itr) + '_predictions.png',
                         dpi=200, bbox_inches='tight')
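# ---------------------------------------------------------------------------
# Illustration (not part of the original file): a plausible sketch of what a
# helper like Agent.calculate_mean_prediction_error could compute for the plot
# above: roll the same open-loop action sequence through the real environment
# and through the learned dynamics model, then report the mean squared error
# between the two state trajectories. The one-step model interface
# (model.predict_next_obs) and the 4-tuple gym step API are assumptions, not
# the actual API of this codebase.
import numpy as np


def mean_prediction_error_sketch(env, model, action_sequence):
    # Ground-truth rollout in the real environment.
    obs = env.reset()
    true_states = [obs]
    for ac in action_sequence:
        obs, _, done, _ = env.step(ac)
        true_states.append(obs)
        if done:
            break
    true_states = np.array(true_states)

    # Open-loop rollout through the learned model from the same start state.
    pred = true_states[0]
    pred_states = [pred]
    for ac in action_sequence[:len(true_states) - 1]:
        pred = model.predict_next_obs(pred, ac)  # assumed one-step prediction call
        pred_states.append(pred)
    pred_states = np.array(pred_states)

    mpe = float(np.mean((true_states - pred_states) ** 2))
    return mpe, true_states, pred_states
# ---------------------------------------------------------------------------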